    IMPORTING DATASET

In [None]:
import pandas as pd
# Load the cleaned dataset. Passing index_col = 0 to read_csv would remove the index column,
# but it messes with other columns (see change log), so the default index is kept.
df = pd.read_csv('AZ_cleaned.csv')
df  # a bare expression on the last line of a cell is rendered by Jupyter; print(df.head(10)) is the plain-script alternative

    IDENTIFYING COLUMN NAMES

In [None]:
# Gather every column label of the dataframe into a plain list and show it.
column_names = list(df.columns)

print(column_names)

    REMOVING ALL ROWS WHICH DO NOT CONTAIN VALUES IN THE 'STOP OUTCOME' OR 'DRIVER RACE' COLUMNS

In [None]:
# Drop every row (axis = 0) that has a missing value (NaN) in 'stop_outcome' or 'driver_race'.
# The dataframe is modified in place.
df.dropna(how = 'any', subset = ['stop_outcome', 'driver_race'], axis = 0, inplace = True)
df # this removes ca. 200 rows: not many, but leaving them in causes problems further down.

In [None]:
# This cell changes nothing: the rename is commented out and the triple-quoted string below is
# only a note about the open 'id' column problem.
#df = df.rename_axis('id').rename_axis('id', axis='columns')
""" if having used the index_col = 0 function when importing, this sets the otherwise blank name 
of the 'id' column to be 'id' however the system does not actually read it's name as 'id'. 
No solution found thus far. """
#df

    REMOVING ALL COLUMNS WHICH EITHER DO NOT CONTAIN ANY VALUES AT ALL, OR ARE THE RAW VERSIONS OF OTHER COLUMNS

In [None]:
def remove_redundancies(df_name):
    """Drop the columns that are empty or that are raw duplicates of other columns.

    The columns are removed from ``df_name`` itself (in place), so code that keeps using
    the same dataframe object sees the reduced frame.

    Parameters
    ----------
    df_name : pandas.DataFrame
        The dataframe to clean. It must contain every column listed in
        ``redundant_columns``.

    Returns
    -------
    pandas.DataFrame
        The same dataframe object that was passed in, without the redundant columns.

    Raises
    ------
    KeyError
        If one of the listed columns is not present in ``df_name``.
    """
    # note: stop_time is temporarily removed because the current code does not read it properly
    # note 2: driver_age can be taken off this list if values are present in your dataset
    redundant_columns = ['id','state', 'stop_date', 'location_raw', 'county_name', 'county_fips', 'fine_grained_location',
    'driver_age_raw', 'driver_race_raw', 'violation_raw', 'search_type_raw', 'officer_id', 'stop_duration', 'road_number',
    'milepost','vehicle_type', 'police_department', 'stop_time', 'driver_age']
    # Operate on the argument, not on the global df. drop(..., inplace = True) returns None,
    # so its result must not be assigned; the mutated frame itself is returned instead.
    df_name.drop(columns = redundant_columns, inplace = True)
    return df_name

# Strip the redundant columns from df; the drop happens in place, so df itself is changed.
remove_redundancies(df)


In [None]:
# Show the dataframe as it looks after the redundant columns were dropped.
df

In [None]:
# Count the missing values in each column.
df.isnull().sum()

    BECAUSE WE SUSPECT A CORRELATION BETWEEN DRIVER RACE AND STOP OUTCOME, PLOTTING THE DRIVER RACE DISTRIBUTION TO SEE IF IT IS SKEWED TOWARDS A CERTAIN GROUP

In [None]:
# Number of stops per driver race, most frequent group first. Counting the column itself avoids
# passing a whole Series as the `subset` argument of DataFrame.value_counts, which expects column labels.
print(df['driver_race'].value_counts())

In [None]:
import numpy as np
import matplotlib.pyplot as plt
 
# Count the stops per driver race (most frequent group first).
height = df['driver_race'].value_counts()
# Take the tick labels from the counted groups themselves, so that every bar carries the label
# of the group it represents. A hard-coded label tuple mislabels the bars as soon as the
# frequency order, or the number of groups, differs from what was typed in.
bars = tuple(str(race) for race in height.index)
x_pos = np.arange(len(bars))

# The y axis is labelled in millions, so the raw counts are converted to that unit.
plt.bar(x_pos, height / 1_000_000, color = (0.2,0.7,0.3,0.9))

plt.title('Distribution of driver race')
plt.xlabel('Driver race')
plt.ylabel('Instances (in millions)')

plt.xticks(x_pos, bars)  # name each bar on the x axis after its group

plt.show()

In [None]:
# Disabled draft of the split / one-hot encoding on the full dataframe (df). It is wrapped in a
# string literal, so none of it is executed.
""" from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import warnings

X = df.drop(['stop_outcome'], axis = 1)
y = df['stop_outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 0)

Xd = pd.get_dummies(X, columns = ['driver_gender', 'driver_race','violation', 'search_conducted', 'search_type', 
                                  'contraband_found', 'is_arrested', 'consent_search', 'ethnicity'])
print(Xd.shape[1] - X.shape[1], 'columns added') """

    CREATING TESTING DATAFRAME USING FIRST 100000 ROWS TO DECREASE PROCESSING TIME WHILE TESTING CODE AND PARAMETERS

In [None]:
# Work on the first 100000 rows only, to keep run times short while testing code and parameters.
df_testing = df.head(100000)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import warnings

# Predictors (everything except the target) and the target column.
X = df_testing.drop(['stop_outcome'], axis = 1)
y = df_testing['stop_outcome']

# One-hot encode the categorical predictors BEFORE splitting: the scikit-learn estimators fitted
# on X_train need numeric input, so the split has to be made on the encoded frame Xd rather
# than on the raw X with its text columns.
Xd = pd.get_dummies(X, columns = ['driver_gender', 'driver_race','violation', 'search_conducted', 'search_type',
                                  'contraband_found', 'is_arrested', 'consent_search', 'ethnicity'])
print(Xd.shape[1] - X.shape[1], 'columns added')

X_train, X_test, y_train, y_test = train_test_split(Xd, y, random_state = 0)

In [None]:
# Drop the rows of the encoded frame that still contain NaN, and keep the target aligned with
# the rows that survive. Dropping rows from Xd alone would leave Xd and y with different
# lengths, and they could then no longer be fitted or split together.
Xd = Xd.dropna(axis = 0)
y = y.loc[Xd.index]
Xd

    HYPERPARAMETER TUNING (TESTING ALL COMBINATIONS OF DIFFERENT PARAMETER VALUES, TO SEE WHAT GIVES THE BEST TRAIN AND TEST RESULTS)

In [None]:
import matplotlib as plt
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn import model_selection, metrics
from sklearn import metrics
%matplotlib inline  

rf_model = RandomForestClassifier()
knn_model = KNeighborsClassifier()
gbm_model = GradientBoostingClassifier()

def rforest_tuning():
    """Grid-search the random forest hyperparameters and report the best model.

    Runs a 3-fold GridSearchCV around the module-level ``rf_model`` on the module-level
    ``X_train`` / ``y_train``, prints the best parameter combination together with its
    train and test accuracy, then prints and plots the feature importances of the best
    estimator found.

    Returns
    -------
    GridSearchCV
        The fitted search object, so that the caller can keep it (for example as
        ``rf_grid``) and inspect ``best_params_`` or ``best_estimator_`` afterwards.
    """
    Trees = [1, 10, 25, 50, 100, 150] #number of trees considered in random forest
    maxFeatures = [1, 5, 10, 15] #number of features considered at each split
    maxDepth = [1, 5, 10, 25, 50, 100] #max number of levels in a tree
    # NOTE(review): scikit-learn reads a float min_samples_split as a fraction of the samples,
    # so 1.0 only allows a split on a node holding all samples -- confirm this is intended.
    minSamplesSplit = [1.0, 2, 3, 4] #minimum number of samples for a node to split
    minSamplesLeaf = [1, 2, 3] #minimum number of samples required at each leaf node

    parameter_grid = {'n_estimators' : Trees,
                    'max_features' : maxFeatures,
                    'max_depth' : maxDepth,
                    'min_samples_split' : minSamplesSplit,
                    'min_samples_leaf' : minSamplesLeaf}

    rf_grid = GridSearchCV(estimator = rf_model, param_grid = parameter_grid, cv = 3, verbose = 2, n_jobs = 4)
    rf_grid.fit(X_train, y_train)

    print(rf_grid.best_params_)
    print (f'Train Accuracy : {rf_grid.score(X_train,y_train):.3f}')
    print (f'Test Accuracy : {rf_grid.score(X_test,y_test):.3f}')

    # identifying and plotting feature importance
    # GridSearchCV fits clones of rf_model, so rf_model itself is not fitted by the search;
    # the importances have to be read from the refitted best estimator.
    importance = rf_grid.best_estimator_.feature_importances_
    for i,j in enumerate(importance):
        print(f'Feature: {i}, Score: {j:.5f}')

    plt.bar(range(len(importance)), importance)
    plt.show()

    return rf_grid


def knn_tuning():
    """Grid-search the k-nearest-neighbours settings and print the best result.

    Runs a 3-fold GridSearchCV around the module-level ``knn_model`` on ``X_train`` /
    ``y_train`` using all available cores, then prints the winning parameters and the
    accuracy on the train and test splits.
    """
    # odd neighbour counts from 1 to 15, both weighting schemes, every search algorithm
    search_space = {'n_neighbors' : list(range(1, 16, 2)),
                    'weights' : ['uniform', 'distance'],
                    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute']}

    # fit() returns the search object itself, so it can be chained onto the constructor
    knn_grid = GridSearchCV(knn_model, search_space, cv = 3, verbose = 2, n_jobs = -1).fit(X_train, y_train)

    print(knn_grid.best_params_)
    print (f'Train Accuracy : {knn_grid.score(X_train,y_train):.3f}')
    print (f'Test Accuracy : {knn_grid.score(X_test,y_test):.3f}')


def GBM_tuning():
    """Grid-search the gradient boosting settings and print the best result.

    Runs a 3-fold GridSearchCV around the module-level ``gbm_model`` on ``X_train`` /
    ``y_train`` with four parallel jobs, then prints the winning parameters and the
    accuracy on the train and test splits.
    """
    # values that are searched
    split_sizes = range(200, 1001, 200)
    leaf_sizes = range(30, 71, 10)
    depths = range(5, 16, 2)

    search_space = {'min_samples_split': split_sizes,
                    'min_samples_leaf' : leaf_sizes,
                    'max_depth' : depths,
                    # single-valued entries: held fixed during the search
                    'max_features' : ['sqrt'],
                    'subsample' : [0.8]}

    # fit() returns the search object itself, so it can be chained onto the constructor
    GBM_grid = GridSearchCV(gbm_model, search_space, cv = 3, verbose = 2, n_jobs = 4).fit(X_train, y_train)

    print(GBM_grid.best_params_)
    print (f'Train Accuracy : {GBM_grid.score(X_train,y_train):.3f}')
    print (f'Test Accuracy : {GBM_grid.score(X_test,y_test):.3f}')

In [None]:
# Fit the untuned forest on the training split and read its feature importances
# (fit() returns the fitted estimator, so the attribute can be taken straight from the call).
importance = rf_model.fit(X_train, y_train).feature_importances_

# One line per feature position, then the same numbers as a bar chart.
for i,j in enumerate(importance):
    print(f'Feature: {i}, Score: {j:.5f}')

plt.bar(range(len(importance)), importance)
plt.show()

In [None]:
# Refit on the complete encoded frame and rank its columns by importance.
# feature_importances_ has one entry per column of the frame the model was fitted on (Xd),
# so the labels must come from Xd.columns. X.columns lists the un-encoded columns, which
# generally differ in number and make the DataFrame constructor fail.
rf_model.fit(Xd, y)
feature_importances=pd.DataFrame({'features':Xd.columns,'feature_importance':rf_model.feature_importances_})
feature_importances.sort_values('feature_importance',ascending=False)

In [None]:
# Run the gradient boosting grid search: 5 x 5 x 6 = 150 parameter combinations, each
# cross-validated on 3 folds, so expect this cell to take a while.
GBM_tuning()

In [None]:
# Run the random forest grid search: 6 x 4 x 6 x 4 x 3 = 1728 parameter combinations, each
# cross-validated on 3 folds, so expect this cell to take a long time.
rforest_tuning()

In [None]:
#rf_grid = GridSearchCV(estimator = rf_model, param_grid = parameter_grid, cv = 3, verbose = 2, n_jobs = 4) 
#rf_grid.fit(X_train, y_train) #'cv' is the number of cross-validation folds: increasing it evaluates every parameter combination on more
                              #train/validation splits, which can improve accuracy, however it does take longer to run. With a cv of 10, the test accuracy is 0.851.

In [None]:
# NOTE(review): in the visible notebook rf_grid is only assigned inside rforest_tuning() and in
# a commented-out cell, so on a fresh kernel this raises NameError. A module-level rf_grid
# (a fitted GridSearchCV) has to exist before this cell runs.
rf_grid.best_params_

In [None]:
# Report the best parameter combination and its accuracy on the train and test splits.
# NOTE(review): this needs a fitted, module-level rf_grid; in the visible notebook the name is
# only assigned inside rforest_tuning(), so confirm it is defined before running this cell.
print(rf_grid.best_params_)
print (f'Train Accuracy : {rf_grid.score(X_train,y_train):.3f}')
print (f'Test Accuracy : {rf_grid.score(X_test,y_test):.3f}')

                                THE CODE BELOW CONTAINS THE FIRST VERSIONS OF BOTH THE PARAMETER OPTIMIZATION AND THE FEATURE IMPORTANCE GRAPHICS

In [None]:
# First-version baseline: split the one-hot encoded features, then fit a single random forest
# with a fixed maximum depth and compare its accuracy on both splits.
X_train, X_test, y_train, y_test = train_test_split(Xd, y, random_state = 0)

# Other parameters that could be set here: min_samples_split, max_leaf_nodes, min_samples_leaf,
# n_estimators, max_samples (bootstrap sample size), max_features.
# fit() returns the fitted estimator, so construction and fitting are chained.
rforest = RandomForestClassifier(max_depth = 50, random_state = 0).fit(X_train, y_train)

train_score, test_score = rforest.score(X_train, y_train), rforest.score(X_test, y_test)

# Open question from the first version: are these results correct, and can this be made faster?
print('Train   Test')
print('{:.3f} {:7.3f}'.format(train_score, test_score))

In [None]:
import matplotlib.pyplot as plt

def specific_parameters():
    """Ask the user for a value for each model parameter.

    Each parameter name is prompted for in turn. An answer is accepted when it can be
    parsed as a number; anything else (including an empty answer) is replaced by the
    placeholder string 'None'.

    Returns
    -------
    list of str
        One entry per parameter, in prompt order: the text exactly as typed when it is
        numeric, otherwise 'None'.
    """
    # scikit-learn spells these 'min_samples_split' and 'max_samples'
    parameters = ['max_depth', 'min_samples_split', 'max_leaf_nodes', 'min_samples_leaf', 'n_estimators', 'max_samples', 'max_features']
    user_inputs = []
    for item in parameters:
        user_input = input(f'Please select a value for the parameter {item}:')
        # input() always returns a str (which has no .type() method), so "is it a number?"
        # is decided by trying to parse the text.
        try:
            float(user_input)
        except ValueError:
            user_inputs.append('None')
        else:
            user_inputs.append(user_input)
    return user_inputs

def test_parameter():
    """Fit a k-nearest-neighbours model for several neighbour counts and compare the scores.

    For every value in the candidate list a distance-weighted KNeighborsClassifier is
    fitted on ``X_train`` / ``y_train``. The train and test accuracy are printed as one
    table row per value, and both score curves are drawn against the candidate values.
    """
    number = [1, 5, 10, 25, 50, 100]
    results_train, results_test = [], []

    print('Parameter size   Train   Test')
    for n_neighbors in number:
        # random forest settings tried earlier: max_features = 10, min_samples_leaf = 1,
        # min_samples_split = 3, n_estimators = 150, random_state = 0
        model = KNeighborsClassifier(n_neighbors = n_neighbors, weights = 'distance', algorithm = 'auto')
        model.fit(X_train, y_train)

        # accuracy on the data the model was fitted on ...
        train_score = model.score(X_train, y_train)
        results_train.append(train_score)

        # ... and on the held-out data
        test_score = model.score(X_test, y_test)
        results_test.append(test_score)

        print('{:8d} {:13.3f} {:7.3f}'.format(n_neighbors, train_score, test_score))

    # train curve first, test curve second
    plt.plot(number, results_train)
    plt.plot(number, results_test)

#test_parameter() #note --> random grid can be used to optimize finding the best feature values

In [None]:
def modelfit(algorithm, dtrain, predictors, performCV=True, printFeatureImportance=True, cv_folds=5, target='Disbursed'):
    """Fit a classifier on a training frame and report how well it does on that frame.

    Parameters
    ----------
    algorithm : estimator
        Object providing ``fit``, ``predict`` and ``predict_proba`` (and
        ``feature_importances_`` when ``printFeatureImportance`` is True).
    dtrain : pandas.DataFrame
        Training data holding both the predictor columns and the target column.
    predictors : list
        Names of the columns of ``dtrain`` used as features.
    performCV : bool, default True
        Also compute and print cross-validated ROC AUC scores.
    printFeatureImportance : bool, default True
        Also plot the feature importances as a bar chart.
    cv_folds : int, default 5
        Number of cross-validation folds used when ``performCV`` is True.
    target : str, default 'Disbursed'
        Name of the label column in ``dtrain``. The default is the column name that used
        to be hard-coded, so existing calls behave as before.

    Notes
    -----
    The AUC is computed from the second column of ``predict_proba``; this looks like it
    assumes a binary target -- confirm before using it on a multi-class label.
    """
    labels = dtrain[target]

    #fit the data to the algorithm
    algorithm.fit(dtrain[predictors], labels)

    #predict training set
    dtrain_predictions = algorithm.predict(dtrain[predictors])
    dtrain_probabilities = algorithm.predict_proba(dtrain[predictors])[:,1]

    #perform cross validation
    if performCV:
        cv_score = model_selection.cross_val_score(algorithm, dtrain[predictors], labels, cv = cv_folds, scoring = 'roc_auc')

    #model results
    print('\nModel Results')
    # '%.4g' gives 4 significant digits; the previous '%4g' only set a minimum field width
    print('Accuracy : %.4g' %metrics.accuracy_score(labels.values, dtrain_predictions))
    print('AUC score (train): %f' %metrics.roc_auc_score(labels, dtrain_probabilities))

    if performCV:
        print('CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g' % (np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score)))

    #feature importance
    if printFeatureImportance:
        feat_imp = pd.Series(algorithm.feature_importances_, predictors).sort_values(ascending = False)
        feat_imp.plot(kind = 'bar', title = 'Feature Importances')
        plt.ylabel('Feature Importance Score')
        
