In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [None]:
#importing packages we know we will need upfront
import seaborn as sns

from scipy.stats import iqr

from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [None]:
# Read the stops CSV into df, using the file's first column as the row index.
# NOTE(review): the earlier comment described this file as already "imputed", yet missing
# values are filled in the next cell — confirm which state the CSV is actually in.
df = pd.read_csv('LMPDStopsDatFrame.csv', index_col=0)
df.head()

# Data Imputation & Cleaning #

In [None]:
# Impute every column with its own most frequent value (mode).
# df.mode() may return several rows when values tie; row 0 holds the first mode per column.
column_modes = df.mode().iloc[0]
df = df.fillna(column_modes)
# Remaining missing values per column.
df.isna().sum()

In [None]:
# Work on an independent copy (DataFrame.copy is deep by default) so that recoding values
# to numbers for the analysis leaves df untouched.
df2 = df.copy()
df2.head()

In [None]:
# Number of distinct values in each column of df2.
print(df2.nunique())

In [None]:
# Recode the 'ACTIVITY RESULTS' labels as numbers: WARNING -> 0, CITATION ISSUED -> 1.
# replace() leaves any value that is not a key of the mapping unchanged.
result_codes = {'WARNING': 0, 'CITATION ISSUED': 1}
df2['ACTIVITY RESULTS'] = df2['ACTIVITY RESULTS'].replace(result_codes)
df2.head()

In [None]:
# Columns that are one-hot encoded afterwards and used as the model features.
categorical_cols = [
    'TYPE_OF_STOP', 'OFFICER_GENDER', 'DRIVER_GENDER', 'ACTIVITY_BEAT',
    'OFFICER_AGE_RANGE', 'DRIVER_AGE_RANGE', 'WAS_VEHCILE_SEARCHED',
    'NUMBER OF PASSENGERS', 'ACTIVITY_DIVISION',
]
# Columns kept in their original form for cleaning and exploration.
passthrough_cols = [
    'ID', 'CITATION_CONTROL_NUMBER', 'ACTIVITY_DATE', 'ACTIVITY_TIME', 'ACTIVITY_LOCATION',
    'ACTIVITY_DIVISION', 'REASON_FOR_SEARCH', 'OFFICER_AGE_RANGE', 'DRIVER_AGE_RANGE',
]

# DataFrame.filter keeps the listed labels that exist and silently skips any that do not.
X1 = df2.filter(items=categorical_cols)
X2 = df2.filter(items=passthrough_cols)

# The target is taken by position (fourth column of df2).
# NOTE(review): presumably this is 'ACTIVITY RESULTS' — confirm, or select it by name.
y = df2.iloc[:, 3]


In [None]:
# One-hot encode X1 with pandas' default get_dummies settings; X1 is overwritten with the
# encoded frame, so re-running this cell operates on the already-encoded data.
X1 = pd.get_dummies(X1)
X1.head()

In [None]:
# Place the encoded features (X1) and the untouched columns (X2) side by side,
# aligned on the shared row index.
frames = [X1, X2]
df_concat = pd.concat(frames, axis='columns', sort=False)
df_concat.head()

In [None]:
# Preview the first rows of the target series.
y.head()

In [None]:
# Replace the text "2025" with "2015" in the date strings; the year 2025 is assumed to be a typo.
# Note: this is a substring replacement, so "2025" is rewritten wherever it occurs in the
# string, not only in the year position.
df_concat['ACTIVITY_DATE'] = df_concat['ACTIVITY_DATE'].str.replace("2025","2015")

In [None]:
# Parse the ACTIVITY_DATE strings into pandas datetime values so the .dt accessor can be used.
# pd.to_datetime is used instead of astype('Datetime64'): that capitalised dtype alias is not
# accepted by current NumPy/pandas releases and makes the cell fail.
df_concat['ACTIVITY_DATE'] = pd.to_datetime(df_concat['ACTIVITY_DATE'])

In [None]:
# Derive zero-padded string date parts from ACTIVITY_DATE:
#   ACTIVITY_MONTH_YEAR -> 'mm/yy', ACTIVITY_MONTH -> 'mm',
#   ACTIVITY_DAY -> 'dd', ACTIVITY_YEAR -> 'yy' (two-digit year).
date_part_formats = {
    'ACTIVITY_MONTH_YEAR': '%m/%y',
    'ACTIVITY_MONTH': '%m',
    'ACTIVITY_DAY': '%d',
    'ACTIVITY_YEAR': '%y',
}
activity_date = df_concat['ACTIVITY_DATE']
for column_name, date_format in date_part_formats.items():
    df_concat[column_name] = activity_date.dt.strftime(date_format)
df_concat.head()

In [None]:
# Preview the frame. The date-part columns (ACTIVITY_MONTH_YEAR, ACTIVITY_MONTH, ACTIVITY_DAY,
# ACTIVITY_YEAR) are created in the preceding cell, so they are not recomputed here.
df_concat.head()

In [None]:
# Reformat ACTIVITY_TIME from 24-hour 'HH:MM:SS' strings to 12-hour 'HH:MM AM/PM' strings
# (the seconds are dropped by the output format).
import datetime
times = df_concat['ACTIVITY_TIME']
df_concat['ACTIVITY_TIME'] = times.map(
    lambda clock: datetime.datetime.strptime(clock, "%H:%M:%S").strftime("%I:%M %p")
)

In [None]:
# Rewrite the open-ended age labels as 'a - b' ranges so every label has the same form:
#   'OVER 60'  -> '61 - 70' (officers and drivers)
#   'UNDER 16' -> '15 - 10' (drivers only; the bounds are written high-to-low)
age_label_fixes = {
    'OFFICER_AGE_RANGE': {'OVER 60': '61 - 70'},
    'DRIVER_AGE_RANGE': {'OVER 60': '61 - 70', 'UNDER 16': '15 - 10'},
}
df_concat = df_concat.replace(age_label_fixes)
df_concat.head()

In [None]:
# split_mean() turns an age-range label into its midpoint; it is used to add the
# 'OFFICER_AGE_MEAN' and 'DRIVER_AGE_MEAN' columns.
def split_mean(x):
    """Return the midpoint of a range label such as '21 - 30'.

    The label is split on '-'; the first two pieces are converted with float()
    (which tolerates surrounding spaces) and averaged.

    Raises:
        ValueError: if one of the first two pieces is not numeric.
        IndexError: if the label has no second piece (no '-' present).
    """
    pieces = x.split('-')
    lower = float(pieces[0])
    upper = float(pieces[1])
    return (lower + upper) / 2

# Add the midpoint of each age range as a numeric column.
age_columns = (
    ('OFFICER_AGE_RANGE', 'OFFICER_AGE_MEAN'),
    ('DRIVER_AGE_RANGE', 'DRIVER_AGE_MEAN'),
)
for range_col, mean_col in age_columns:
    df_concat[mean_col] = df_concat[range_col].apply(split_mean)

In [None]:
# Preview the frame, now including the OFFICER_AGE_MEAN and DRIVER_AGE_MEAN columns.
df_concat.head()

In [None]:
# Bar chart of the number of stops per 'mm/yy' label, in value_counts order (most frequent first).
from matplotlib.pyplot import figure

# Create the figure BEFORE plotting and draw on its axes, so the size settings apply to the
# chart itself. Calling figure() after the plot only opens a second, empty figure, and the
# former 100x60 inch size at 80 dpi would have been an 8000x4800 pixel canvas.
fig = figure(num=None, figsize=(12, 6), dpi=80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111)
df_concat['ACTIVITY_MONTH_YEAR'].value_counts().plot.bar(ax=ax, title="Number of Stops by Month")
ax.set_xlabel('Month/Year')
ax.set_ylabel('Number of stops')

In [None]:
# Histogram of the officers' age-range midpoints.
officer_ax = df_concat['OFFICER_AGE_MEAN'].hist(color='blue')

officer_ax.set_title('Officer Average Age')
officer_ax.set_xlabel('Average Age')
officer_ax.set_ylabel('values')


In [None]:
# Histogram of the drivers' age-range midpoints.
driver_ax = df_concat['DRIVER_AGE_MEAN'].hist(color='red')

driver_ax.set_title('Driver Average Age')
driver_ax.set_xlabel('Average Age')
driver_ax.set_ylabel('values')


In [None]:
#df_concat.to_csv('ImputedData', ',')

# Logistic Regression 

In [None]:
# Importing sklearn package to run logistic regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
# Independent variables: the one-hot encoded feature frame X1.
# The dependent variable y is not set here; it was defined in the cell that created X1.
X = X1

In [None]:
# Train/test split: hold out 25% of the rows as the test set.
# The fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2019,
)

In [None]:
# The target classes are not split 50/50, so randomly oversample the TRAINING split only
# (X_test / y_test are left untouched) to even out the class counts.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=2019)
X_train_resample, y_train_resample = ros.fit_resample(X_train, y_train)

In [None]:
# Logistic regression baseline.
from sklearn.linear_model import LogisticRegression

# liblinear solver, every other hyper-parameter at its default;
# trained on the oversampled training split (fit returns the fitted estimator).
logreg = LogisticRegression(solver='liblinear').fit(X_train_resample, y_train_resample)

# Predicted class labels for the held-out test split.
y_pred = logreg.predict(X_test)

In [None]:
# Confusion matrix of the test-set predictions (rows: actual class, columns: predicted class).
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

In [None]:

# Draw the confusion matrix as an annotated heatmap.
class_names = [0, 1]  # class labels
tick_marks = np.arange(len(class_names))

fig, ax = plt.subplots()
ax.set_xticks(tick_marks)
ax.set_xticklabels(class_names)
ax.set_yticks(tick_marks)
ax.set_yticklabels(class_names)

# fmt='g' prints the cell counts as plain numbers instead of scientific notation.
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g', ax=ax)

# Put the x-axis label above the plot; the title is lifted (y=1.1) to leave room for it.
ax.xaxis.set_label_position("top")
fig.tight_layout()
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

In [None]:
# Hold-out metrics for the logistic regression model.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("F1_Score:", f1_score(y_test, y_pred, average='weighted'))
# ROC AUC measures how well the model ranks the positive class, so it is computed from the
# predicted probability of that class (second column of predict_proba) rather than from the
# hard 0/1 predictions, which collapse the ROC curve to a single operating point.
# NOTE(review): this line used to be marked "This does not work"; roc_auc_score in this form
# needs a binary target — confirm y contains only the two classes 0 and 1.
print("AUC:", roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1]))

In [None]:
# Recursive Feature Elimination (RFE) with a logistic-regression base model.
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Run RFE once for every subset size, from 1 feature up to all columns of X1.
for i in range(1, X1.shape[1]+1):
    print(i)  # progress marker, printed before the fit starts
    # base classifier used to evaluate a subset of attributes
    model = LogisticRegression(solver='liblinear')
    # Keep the best i attributes. n_features_to_select is passed by keyword because
    # current scikit-learn releases reject it as a positional argument.
    rfe = RFE(model, n_features_to_select=i)
    rfe = rfe.fit(X1, y)
    # Summarise the selection: rank 1 marks a selected feature, larger ranks were eliminated.
    print('Model with the best', i, 'features')
    print(dict(zip(X1.columns, rfe.ranking_)))

# DecisionTreeClassifier

In [None]:
# Decision tree classifier trained on the oversampled training split.
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted tree (and therefore the metrics below) repeatable;
# 2019 matches the seed used for the split and the oversampler.
clf = DecisionTreeClassifier(random_state=2019)
clf = clf.fit(X_train_resample, y_train_resample)

# Predicted class labels for the held-out test split.
y_pred = clf.predict(X_test)

# Hold-out metrics.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("F1_Score:", f1_score(y_test, y_pred, average='weighted'))
# ROC AUC is computed from the predicted probability of the positive class (second column of
# predict_proba) rather than from hard labels.
print("AUC:", roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

# KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier with default settings, trained on the oversampled
# training split and evaluated on the held-out test split.
from sklearn.neighbors import KNeighborsClassifier
kneigh = KNeighborsClassifier().fit(X_train_resample, y_train_resample)
y_pred = kneigh.predict(X_test)

In [None]:
# Hold-out metrics for the k-nearest-neighbours model.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("F1_Score:", f1_score(y_test, y_pred, average='weighted'))
# ROC AUC is computed from the predicted probability of the positive class (second column of
# predict_proba) rather than from hard labels, which collapse the ROC curve to one point.
print("AUC:", roc_auc_score(y_test, kneigh.predict_proba(X_test)[:, 1]))

# AdaBoostClassifier

In [None]:
# AdaBoost ensemble with 70 estimators and a fixed seed, trained on the oversampled
# training split and evaluated on the held-out test split.
from sklearn.ensemble import AdaBoostClassifier
ADA = AdaBoostClassifier(n_estimators=70, random_state=2019).fit(
    X_train_resample, y_train_resample
)
y_pred = ADA.predict(X_test)

In [None]:
# Hold-out metrics for the AdaBoost model.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("F1_Score:", f1_score(y_test, y_pred, average='weighted'))
# ROC AUC is computed from the predicted probability of the positive class (second column of
# predict_proba) rather than from hard labels, which collapse the ROC curve to one point.
print("AUC:", roc_auc_score(y_test, ADA.predict_proba(X_test)[:, 1]))

# Cross Validation  

In [None]:
# Randomly oversample the COMPLETE data set (X, y), not just a training split.
# NOTE(review): if this resampled data is fed to cross-validation, copies of the same row can
# end up in both the training and the validation part of a fold, which can inflate the scores;
# resampling inside each training fold avoids that.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=2019)
X_resample, y_resample = ros.fit_resample(X, y)

In [None]:
# Compare several classifiers with stratified 10-fold cross-validation (ROC AUC and F1).

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline

# Cross-validate on the ORIGINAL rows. Oversampling happens inside each training fold (via the
# pipeline built in the loop), so a validation fold never contains duplicates of rows the
# model was trained on; oversampling before the folds are drawn would leak such duplicates.
features, targets = X, y

# Stochastic models get a fixed random_state so the scores are repeatable.
models = []
models.append(('LogisticRegression', LogisticRegression(solver='liblinear')))
#models.append(('KNeighborsClassifier', KNeighborsClassifier()))
#models.append(('SVC', SVC(kernel='rbf', gamma='auto')))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier(random_state=2019)))
models.append(('AdaClassifier', AdaBoostClassifier(random_state=2019)))
#models.append(('RandomForest', RandomForestClassifier(n_estimators=100)))

# Stratified folds; with an integer random_state the same splits are produced for every metric.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2019)

# One pass per metric: (scikit-learn scoring name, label used in the printed line).
for scoring, label in (('roc_auc', 'AUC'), ('f1', 'F1')):
    for name, model in models:
        # The sampler is applied when the pipeline is fitted and skipped when it predicts.
        balanced_model = make_pipeline(RandomOverSampler(random_state=2019), model)
        score = cross_val_score(balanced_model, features, targets, cv=cv, scoring=scoring)
        print("Model:{0}, Score: {1}={2:0.5f}, var={3:0.5f}".format(
            name,
            label,
            score.mean(),
            score.var()
            )
        )

In [None]:
# Cross-validate a decision tree with hand-picked hyper-parameters (ROC AUC, F1 and accuracy).
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline

# Cross-validate on the ORIGINAL rows. Oversampling happens inside each training fold (via the
# pipeline built in the loop), so a validation fold never contains duplicates of rows the
# model was trained on; oversampling before the folds are drawn would leak such duplicates.
features, targets = X, y
models = []
# random_state fixes the tree's tie-breaking so the scores are repeatable.
models.append(('DecisionTreeClassifier', DecisionTreeClassifier(
    criterion='entropy', max_depth=6, min_samples_leaf=20, min_samples_split=15,
    random_state=2019)))

# Stratified folds; with an integer random_state the same splits are produced for every metric.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2019)

# One pass per metric: (scikit-learn scoring name, label used in the printed line).
for scoring, label in (('roc_auc', 'AUC'), ('f1', 'F1'), ('accuracy', 'Accuracy')):
    for name, model in models:
        # The sampler is applied when the pipeline is fitted and skipped when it predicts.
        balanced_model = make_pipeline(RandomOverSampler(random_state=2019), model)
        score = cross_val_score(balanced_model, features, targets, cv=cv, scoring=scoring)
        print("Model:{0}, Score: {1}={2:0.5f}, var={3:0.5f}".format(
            name,
            label,
            score.mean(),
            score.var()
            )
        )