## Notebook Purpose: Model Selection & Model Comparisons
### Tree Based Models
###### - XGBoost 
###### - Random Forest Classifier

### Deep Neural Networks
###### - These models need synthetic data

In [150]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

# Standard Library
import os
import sys

### Loading In Dataset

In [151]:
# Load the cleaned recidivism dataset; the first CSV column becomes the index.
# The file location can be overridden with the RECIDIVISM_CSV environment variable
# so the notebook is runnable on other machines; when the variable is unset the
# original local path is used unchanged.
DATA_PATH = os.environ.get(
    "RECIDIVISM_CSV",
    'C:/Users/12678/OneDriveDrexelUniversity/Documents/Computer Vision Engineer/Case_Competitions/PhillyCodeFest/dsProject/recidivism_cleaned.csv',
)
df = pd.read_csv(DATA_PATH, sep=',', index_col=0)

### Encoding Categorical Variables

In [152]:
# Show the DataFrame's column labels.
df.columns

Index(['Year Released', 'Year Reviewed', 'Race', 'Age', 'Degree', 'Offense',
       'SubOffense', 'District', 'Release Type', 'Relapse', 'HopefulCase'],
      dtype='object')

In [153]:
# Encoder instances used by the encoding cells below.
label_encoder = LabelEncoder()

# Ask for a dense (non-sparse) one-hot output. scikit-learn 1.2 renamed the
# `sparse` argument to `sparse_output` and later removed the old name, so try the
# new spelling first and fall back to the old one on older releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

#### Label Encoding Nominal Categorical Values

In [154]:
# Label-encode the nominal columns: each column is pulled out of the DataFrame as a
# NumPy array and passed through the shared LabelEncoder, in the order listed.
# The encoder is re-fit on every column, so once this cell finishes it is fitted
# on "Release Type" only.
(race, degree, offense, suboffense, district, release_type) = (
    label_encoder.fit_transform(np.array(df[column_name]))
    for column_name in (
        "Race",
        "Degree",
        "Offense",
        "SubOffense",
        "District",
        "Release Type",
    )
)

#### Label Encoding Year Columns

In [155]:
# Treat both year columns as categories: cast them to strings in the DataFrame
# (this overwrites the columns in place), then keep NumPy copies of the strings.
for year_column in ("Year Released", "Year Reviewed"):
    df[year_column] = df[year_column].astype('str')

year_reviewed, year_released = (
    np.array(df[year_column]) for year_column in ("Year Reviewed", "Year Released")
)

# Label-encode the string years (reviewed first, then released, re-fitting the
# shared encoder each time).
year_reviewed_encoded, year_released_encoded = (
    label_encoder.fit_transform(year_strings)
    for year_strings in (year_reviewed, year_released)
)

In [156]:
# Copy the two binary columns out of the DataFrame as NumPy arrays; no encoding
# is applied to them.
reincarcerated, hopeful_case = (
    np.array(df[binary_column]) for binary_column in ("Relapse", "HopefulCase")
)

#### Combining all the numpy arrays

In [157]:
# Combine the per-column arrays into one float matrix, one column per variable.
# Column order matters: the modelling cell takes the last column (`reincarcerated`,
# i.e. "Relapse") as the target and everything before it as features.
# NOTE(review): `year_reviewed` / `year_released` are the raw string arrays and rely
# on NumPy converting them to floats on assignment; the `*_encoded` versions computed
# earlier are not used here — confirm this is intended.
variables = [year_reviewed, year_released, race, degree, offense, suboffense, district, release_type, hopeful_case, reincarcerated]

# Derive the shape from the data instead of hard-coding (16415, 10), so the cell
# keeps working when the CSV row count or the variable list changes.
arr = np.zeros((len(variables[0]), len(variables)))

for column_index, column_values in enumerate(variables):
    arr[:, column_index] = column_values

In [158]:
# Show the (rows, columns) shape of the combined array.
arr.shape

(16415, 10)

### XGBoost 

In [None]:
# Features are every column except the last; the target is the last column.
X, y = arr[:, :-1], arr[:, -1]

N_SEEDS = 100  # number of random 80/20 train/test splits to evaluate

best_result = 0
best_xgb = None  # initialised so the names exist even if no split improves on 0
best_x = None
accuracies = []

for seed in range(N_SEEDS):

    # A different random_state gives a different 80/20 split on every iteration.
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=seed)

    # Implement XGBClassifier on the NumPy feature matrix X and target y
    xgb = XGBClassifier(booster='gbtree', learning_rate=0.1, max_depth=5)

    # Fitting Data
    xgb.fit(xtrain, ytrain)

    y_pred = xgb.predict(xtest)
    prediction = [round(value) for value in y_pred]

    accuracy = accuracy_score(ytest, prediction)
    accuracies.append(accuracy)

    # Keep the first model that reaches the highest test accuracy seen so far.
    if best_xgb is None or accuracy > best_result:
        best_result = accuracy
        best_xgb = xgb
        best_x = seed

print("Accuracy: %.2f%%" % (best_result * 100.0))
print("Best X: %d" % best_x)

# The figure above is the maximum over N_SEEDS test splits, so it is an optimistic
# estimate; the mean and spread across splits give a fairer picture.
print("Mean accuracy over %d splits: %.2f%% (std %.2f%%)" % (
    len(accuracies), np.mean(accuracies) * 100.0, np.std(accuracies) * 100.0))

In [168]:
# Saving Model