<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
###############################################################################
###                     1.  Define Working Directory                        ###
###############################################################################
import os
# Make the project folder the working directory, so the relative CSV filename
# read further down resolves against it.
# NOTE(review): this is a machine-specific absolute path; os.chdir will fail on
# any other machine — consider a configurable project/data directory instead.
abspath = os.path.abspath("C:/Users/miqui/OneDrive/CSU Classes/Consulting/NLS")
os.chdir(abspath)
# Lists the folder contents; the return value is not displayed because more
# statements follow in this cell.
os.listdir()
###############################################################################
###                    2. Import Libraries and Data                       ###
###############################################################################
# Machine learning stuff
import pandas as pd
import numpy as np
import plotly.express as px
import plotly
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
# Display limits for printed frames: at most 10 columns and 200 rows.
pd.options.display.max_columns = 10
pd.options.display.max_rows = 200

# Load the cleaned dataset exported from R; the first row holds the column names.
df = pd.read_csv("NEWMallett.csv", header=0)

In [3]:

# Columns pandas stored as object/category; print how often each value occurs.
Categorical = list(df.select_dtypes(include=['object', 'category']))

for name in Categorical:
    print(name, ":")
    print(df[name].value_counts(), "\n")

# Treat the letter codes I, V, D, R and +/- infinity as missing values.
df = (
    df.replace(to_replace=["I", "V", "D", "R"], value=np.nan)
      .replace(to_replace=[np.inf, -np.inf], value=np.nan)
)

# Columns to cast to float now that the letter codes have been replaced.
numeric = [
    "number_grades_repeated", "number_grades_skipped", "highest_degree",
    "SAT_math_score_2007", "SAT_verbal_score_2007", "ACT_score_2007",
    "number_schools_attended", "parent_expectation_in_jail_by20",
    "age_first_incarcerated", "debts_20", "debts_25", "debts_30", "debts_35",
    "total_n_incarcerated", "months_longest_incarceration",
    "months_first_incarceration", "number_jobs_since20",
]

# errors="ignore" suppresses conversion errors instead of raising, so data that
# cannot be cast to float is returned unchanged.
df[numeric] = df[numeric].astype(float, errors="ignore")

# Re-list the columns that are still object/category after the cast and print
# the distinct values each one holds.
Categorical = list(df.select_dtypes(include=['object', 'category']))

for name in Categorical:
    print(name, ":")
    print(df[name].unique(), "\n")

###############################################################################
###                     7. Train-Test Split                                 ###
###############################################################################

"Define X and Y variables"
# Keep only the rows where the response, hs_grad, is observed.
df = df.loc[df["hs_grad"].notna()]

# Response variable.
y = df["hs_grad"]

# Predictor columns used for the train/test split (add as needed).
predictors = [
    "immigrant", "days_ms_suspension", "black", "NumSchoolsAttended",
    "female", "VictViolentCrime", "Homeless", "HHHospital", "HHJail",
    "ever_in_gang",
]
x = df[predictors]  # Select only the predictor columns

# 80/20 split, shuffled, with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=123, shuffle=True)



def Impute(df, fit_on=None):
    """Fill the missing values of `df` with scikit-learn's IterativeImputer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose NaN entries should be imputed.
    fit_on : pandas.DataFrame, optional
        Frame the imputation model is fitted on. When omitted, the imputer is
        fitted on `df` itself. Pass the training predictors when imputing a
        hold-out set, so the hold-out rows are filled with a model learned
        from the training rows only.

    Returns
    -------
    pandas.DataFrame
        Imputed values with the same column names and the same row index as
        `df`, so the result stays aligned with the matching response Series.
    """
    imputer = IterativeImputer(max_iter=10, random_state=123)
    if fit_on is None:
        imputed = imputer.fit_transform(X=df)
    else:
        imputed = imputer.fit(fit_on).transform(df)
    # Carry the index over as well as the columns: a fresh 0..n-1 index would no
    # longer match the index of y_train / y_test.
    return pd.DataFrame(imputed, columns=df.columns, index=df.index)


# Training predictors: fit the imputer on the training rows and fill them.
x_train1 = Impute(x_train)
# Test predictors: fill with the imputation model learned from the training rows.
x_test = Impute(x_test, fit_on=x_train)
###############################################################################
###                      9. Running Our Models                              ###
###############################################################################
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

"Logistic Regression Model:"
# NOTE(review): this rebinds the module-level `predictors` and `x` that were used
# to build the train/test split above; after this point they describe the
# logistic model only.
predictors = ["immigrant", "hispanic"]  # Add as needed
# Selected from the full `df`, not from the training split.
x = df[predictors]
# Prepend the constant (intercept) column expected by statsmodels.
exog = sm.add_constant(x)

# Fit the logit of y on the predictors; missing="drop" drops observations with
# missing values before fitting.
log_reg = sm.Logit(endog=y, exog=exog.astype(float),
                   missing="drop").fit()
print(log_reg.summary())

# Get the odds ratios for each of the parameters:
# (the value is neither assigned nor printed, and more statements follow in the
# cell, so nothing is displayed)
np.exp(log_reg.params)
###############################################################################
###                      10. Hyper-parameter Tuning for RF                   ###
###############################################################################
# Random forest classifier on the imputed training predictors.
BestModel = RandomForestClassifier(
    n_estimators=100,      # explicit: a later cell plots estimators_[99], the 100th tree
    criterion="gini",      # Use gini impurity to measure split quality
    max_features="sqrt",   # sqrt(n_features); the old "auto" alias meant this and was removed from scikit-learn
    min_samples_split=3,   # the min. number of samples to split an internal node
    max_depth=3,
    oob_score=True,        # also score the forest on the out-of-bag samples
    random_state=123,      # same seed as the split and the imputer, so the forest is reproducible
    verbose=1,
    warm_start=True)       # reuse the previous fit and add more estimators to it

# Run the model:
BestModel.fit(X=x_train1, y=y_train)

# Predicted classes and class probabilities for the test predictors.
RF_preds = BestModel.predict(X=x_test)
RF_probs = BestModel.predict_proba(X=x_test)

ever_in_gang :
0    8490
1     472
R      16
D       4
I       2
Name: ever_in_gang, dtype: int64 

victim_breakin_lt12yrs :
0    7476
1    1345
I     126
D      22
R      14
V       1
Name: victim_breakin_lt12yrs, dtype: int64 

victim_bully_lt12yrs :
0    7120
1    1713
I     126
R      13
D      11
V       1
Name: victim_bully_lt12yrs, dtype: int64 

victim_shooting_lt12yrs :
0    7859
1     975
I     126
R      14
D       9
V       1
Name: victim_shooting_lt12yrs, dtype: int64 

special_education_history :
2    5159
V    2752
I     620
1     453
Name: special_education_history, dtype: int64 

bilingual_education_history :
2    5352
V    2752
I     676
1     203
D       1
Name: bilingual_education_history, dtype: int64 

gifted_education_history :
2    4510
V    2752
1    1006
I     715
D       1
Name: gifted_education_history, dtype: int64 

number_grades_repeated :
0    4753
I    1788
V    1253
1     976
2     183
3      29
4       2
Name: number_grades_repeated, dtype: int64 

nu

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [4]:
import graphviz
from sklearn import tree

In [8]:
# Export one fitted tree of the forest as Graphviz DOT text (out_file=None makes
# export_graphviz return the text instead of writing a file), then wrap it in a
# graphviz.Source object so it can be rendered.
# NOTE(review): index 99 is hard-coded, so this assumes the forest holds at
# least 100 trees.
dot_data = tree.export_graphviz(BestModel.estimators_[99], out_file=None)
graph = graphviz.Source(dot_data)

In [7]:
# Save the DOT source, render it, and open the result in the default viewer.
# NOTE(review): the filename literal here was garbled (only ".gv'" survived),
# which made the line a syntax error; 'rf_tree.gv' is a placeholder — confirm
# the intended output path.
graph.render('rf_tree.gv', view=True)