## **Exercise 1**

#### **The three key elements**
- Overview first: First give a good overview of the data
- Zoom and filter: Allow for exploration in the data
- Details on demand: Show more if needed

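#### **How is explanatory different from exploratory?**
Explanatory visualization is used when the end result is already known and you want to communicate your findings to the audience. Exploratory visualization, by contrast, is used to discover what the data contains before the findings are known.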

## **Exercise 2**

In [175]:

import pandas as pd

from bokeh.plotting import figure
from bokeh.io import show, output_notebook, reset_output
from bokeh.models import ColumnDataSource, Legend
from bokeh.models.widgets import Tabs, Panel
from bokeh.io import curdoc
from bokeh.layouts import layout
import numpy as np


# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()

### **Visualization 1**

In [176]:
# Columns retained from the raw dataset: the predictor columns first,
# followed by the label column that is later used as the model target.
variables_of_interest = [
    # Predictors
    "age", "sex", "race",
    "juv_fel_count", "juv_misd_count", "juv_other_count",
    "priors_count",

    # Label
    "two_year_recid",
]

In [177]:
# Load the raw table from a path relative to the notebook.
df = pd.read_csv("../files/recidivism_dataset_sub.csv")

# Row filter: is_recid greater than -1, c_charge_degree different from "0",
# and days_b_screening_arrest within [-30, 30] (both ends inclusive).
keep_rows = (
    df["is_recid"].gt(-1)
    & df["c_charge_degree"].ne("0")
    & df["days_b_screening_arrest"].between(-30, 30)
)

# Apply the filter, keep only the columns of interest and renumber rows from 0.
df = df.loc[keep_rows, variables_of_interest].reset_index(drop=True)
df.shape

(6172, 8)

In [178]:
# Bucket ages into 20-year bands. Intervals are closed on the right, so an
# age of exactly 20 falls into "(0,20]".
bins = [0, 20, 40, 60, 80, 100]
labels = ["({},{}]".format(lo, hi) for lo, hi in zip(bins, bins[1:])]
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=True)

# Spot-check the bin boundary: display one random row whose age is exactly 20.
df.loc[df["age"] == 20].sample()

Unnamed: 0,age,sex,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,two_year_recid,age_group
2611,20,Male,African-American,0,0,0,0,1,"(0,20]"


In [179]:
# Per age group: sum of two_year_recid ("Recomitted") and row count ("Total").
df_age = (
    df.groupby("age_group")
    .agg(
        Recomitted=("two_year_recid", "sum"),
        Total=("two_year_recid", "count"),
    )
    .reset_index()
)

# frac_recid: Recomitted relative to the group's own size.
# frac_tot:   Recomitted relative to the Recomitted total over all groups.
df_age["frac_recid"] = df_age["Recomitted"] / df_age["Total"]
df_age["frac_tot"] = df_age["Recomitted"] / df_age["Recomitted"].sum()

In [180]:
# One ColumnDataSource shared by both bar glyphs, so the hover tooltips and
# the bars read from the same columns.
src = ColumnDataSource(df_age)

kwargs = dict(
    plot_width = 800,
    plot_height = 500,
    title = "Fraction of recommitted crimes",
    x_axis_label = "Age group",
    # NOTE(review): the bars below plot the "Total"/"Recomitted" columns while
    # this label says "Fraction" - confirm which one is intended.
    y_axis_label = "Fraction",
    x_range=df_age["age_group"].tolist(),
    tooltips = [
        ("Fraction recommitted", "@frac_recid{0.00}"),
        ("Fraction Total", "@frac_tot{0.00}")
    ]
)

p_age = figure(**kwargs)

# One bar series per column, drawn at the same x position; each series gets
# its own legend entry through legend_label.
bar_cols = ["Total", "Recomitted"]
cols = ["darkturquoise", "mediumseagreen"]
bar = {}
for col_name, colour in zip(bar_cols, cols):
    bar[col_name] = p_age.vbar(
        x="age_group",
        top=col_name,
        source=src,
        width=.5,
        legend_label=col_name,
        muted=False,
        color=colour,
    )

# The legend entries already exist (created by legend_label), so no separate
# Legend object is needed; clicking an entry mutes the matching series.
p_age.legend.visible = True
p_age.legend.click_policy = "mute"

show(p_age)

In [181]:
# Per sex: sum of two_year_recid ("Recomitted") and row count ("Total").
df_sex = (
    df.groupby("sex")
    .agg(
        Recomitted=("two_year_recid", "sum"),
        Total=("two_year_recid", "count"),
    )
    .reset_index()
)

# frac_recid: Recomitted relative to the group's own size.
# frac_tot:   Recomitted relative to the Recomitted total over all groups.
df_sex["frac_recid"] = df_sex["Recomitted"] / df_sex["Total"]
df_sex["frac_tot"] = df_sex["Recomitted"] / df_sex["Recomitted"].sum()
df_sex

Unnamed: 0,sex,Recomitted,Total,frac_recid,frac_tot
0,Female,413,1175,0.351489,0.147027
1,Male,2396,4997,0.479488,0.852973


In [182]:
# Per race: sum of two_year_recid ("Recomitted") and row count ("Total").
df_race = (
    df.groupby("race")
    .agg(
        Recomitted=("two_year_recid", "sum"),
        Total=("two_year_recid", "count"),
    )
    .reset_index()
)

# frac_recid: Recomitted relative to the group's own size.
# frac_tot:   Recomitted relative to the Recomitted total over all groups.
df_race["frac_recid"] = df_race["Recomitted"] / df_race["Total"]
df_race["frac_tot"] = df_race["Recomitted"] / df_race["Recomitted"].sum()
df_race

Unnamed: 0,race,Recomitted,Total,frac_recid,frac_tot
0,African-American,1661,3175,0.52315,0.591314
1,Asian,8,31,0.258065,0.002848
2,Caucasian,822,2103,0.39087,0.292631
3,Hispanic,189,509,0.371316,0.067284
4,Native American,5,11,0.454545,0.00178
5,Other,124,343,0.361516,0.044144


In [183]:
# One data source per grouping; each is shared by the two bar series of its
# figure so bars and hover tooltips read from the same columns.
age_src = ColumnDataSource(df_age)
sex_src = ColumnDataSource(df_sex)
rac_src = ColumnDataSource(df_race)

# Figure options common to all three tabs.
kwargs = dict(
    plot_width = 800,
    plot_height = 500,
    title = "Fraction of recommitted crimes",
    # NOTE(review): the bars below plot the "Total"/"Recomitted" columns while
    # this label says "Fraction" - confirm which one is intended.
    y_axis_label = "Fraction",
    tooltips = [
        ("Fraction recommitted", "@frac_recid{0.00}"),
        ("Fraction Total", "@frac_tot{0.00}")
    ]
)

p_age = figure(x_axis_label="Age Group", x_range=df_age["age_group"].tolist(), **kwargs)
p_sex = figure(x_axis_label="Sex", x_range=df_sex["sex"].tolist(), **kwargs)
p_rac = figure(x_axis_label="Race", x_range=df_race["race"].tolist(), **kwargs)

bar_cols = ["Total", "Recomitted"]
cols = ["darkturquoise", "mediumseagreen"]

# Renderers per figure, keyed by the column they display.
bar_rac, bar_age, bar_sex = {}, {}, {}
for col_name, colour in zip(bar_cols, cols):
    # Glyph options identical across the three figures.
    shared = dict(top=col_name, width=.5, legend_label=col_name, muted=False, color=colour)
    bar_age[col_name] = p_age.vbar(x="age_group", source=age_src, **shared)
    bar_sex[col_name] = p_sex.vbar(x="sex", source=sex_src, **shared)
    bar_rac[col_name] = p_rac.vbar(x="race", source=rac_src, **shared)

# legend_label already created each figure's legend entries. Apply the mute
# policy to every figure in the loop (not only p_age) so that clicking a
# legend entry works on all three tabs.
for p in [p_age, p_sex, p_rac]:
    p.legend.visible = True
    p.legend.click_policy = "mute"

l1 = layout([[p_age]])
l2 = layout([[p_sex]])
l3 = layout([[p_rac]])

tab1 = Panel(child=l1, title="Age")
tab2 = Panel(child=l2, title="Sex")
tab3 = Panel(child=l3, title="Race")

tabs = Tabs(tabs=[tab1, tab2, tab3])

# Register the tabs with the current document and render them inline.
curdoc().add_root(tabs)
show(tabs)

### **Visualization 2**

In [199]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [187]:
# Reload the raw table so this section starts from the unmodified columns.
df = pd.read_csv("../files/recidivism_dataset_sub.csv")

# Row filter: is_recid greater than -1, c_charge_degree different from "0",
# and days_b_screening_arrest within [-30, 30] (both ends inclusive).
keep_rows = (
    df["is_recid"].gt(-1)
    & df["c_charge_degree"].ne("0")
    & df["days_b_screening_arrest"].between(-30, 30)
)
df = df.loc[keep_rows, variables_of_interest].reset_index(drop=True)

# Replace the "sex" and "race" columns with their one-hot indicator columns,
# appended after the remaining columns.
dummies = pd.get_dummies(df[["sex", "race"]])
data = df.drop(columns=["sex", "race"]).join(dummies)
data.sample()

Unnamed: 0,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,two_year_recid,sex_Female,sex_Male,race_African-American,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other
914,38,0,0,0,0,0,0,1,0,0,0,1,0,0


In [197]:
# Separate the label column from the model inputs.
target_col = "two_year_recid"
features = data.drop(columns=[target_col])
labels = data[target_col].to_numpy()

# Input column names, in the same order as the columns of `features`.
feature_list = features.columns.tolist()

In [203]:
# Hold out 30% of the rows for testing. The split is stratified on the target
# column and uses a fixed random_state so it is repeatable.
train_x, test_x, train_Y, test_Y = train_test_split(
    features,
    labels,
    test_size=.3,
    random_state=42,
    stratify=data[target_col],
)

# Display the split sizes; a bare assignment produces no cell output, so this
# expression is what makes the recorded shapes reproducible on re-run.
train_x.shape, test_x.shape


((4320, 13), (1852, 13))