# Predicting a Burglary and Finding the Best Model with Cost Benefit Analysis

## Section 1 - Import Libraries 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style

import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Show up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# and later releases dropped the old name, so use whichever one is available.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

## Section 2 - Import Data from a URL

In [None]:
# Load the full Canton Police Department dataset from a remote CSV file.
# NOTE(review): the URL carries a shared-access signature whose `se=` expiry
# is 2023-06-18, so the download presumably fails after that date — confirm
# and refresh the link or read a local copy instead.
url1 = 'https://data63206330.file.core.windows.net/data6320/CantonPoliceDept_HW05.csv?sp=rl&st=2021-02-17T17:59:40Z&se=2023-06-18T17:59:00Z&sv=2020-02-10&sig=OByF%2BPfEuCHPMSlspflhxezpcRUecv3bqqSNSn1Kpp8%3D&sr=f'
# The first CSV column becomes the index; the first row supplies the header.
df_all=pd.read_csv(url1, index_col=0, header=0)
df_all

In [None]:
# Load the separate "Week49" file that is scored with the chosen model in Section 7.
# NOTE(review): the URL carries a shared-access signature whose `se=` expiry
# is 2023-06-18, so the download presumably fails after that date — confirm
# and refresh the link or read a local copy instead.
url2 = 'https://data63206330.file.core.windows.net/data6320/CantonPoliceDept_HW05_Week49.csv?sp=rl&st=2021-02-17T18:33:51Z&se=2023-06-18T18:33:00Z&sv=2020-02-10&sig=UnolmTRuVCs1mzx%2FyciUlEW8WntrkPdkg5njluGfi%2BM%3D&sr=f'
# The first CSV column becomes the index; the first row supplies the header.
df_canton_new=pd.read_csv(url2, index_col=0, header=0)
df_canton_new

## Section 3 - Create the Datasets 

### Section 3.1 Create the X and y Datasets

In [None]:
# Target vector: the BurgStatus column of the full dataset.
y = df_all['BurgStatus']
# Display the number of rows in the target.
y.shape

In [None]:
df_all.columns.values

In [None]:
# Columns that are removed from the feature matrix, grouped by naming pattern.
id_cols = ['Subzone', 'YEAR_WEEK', 'SUB_YEAR_WEEK']
comp_cols = [
    'All_comp', 'FalseAlarm_comp', 'Arrest_comp', 'Cleared_comp',
    'NoContact_comp', 'NoReport_comp', 'Resolved_comp', 'Filed_comp',
    'Calls_comp', 'BurgAlarm_comp', 'Suspicious_comp', 'Shots_comp',
    'Intox_comp', 'Drugs_comp', 'Assault_comp', 'Armed_comp',
    'Disturb_comp', 'Fireworks_comp', 'Noise_comp', 'Stalking_comp',
]
# BurgStatus is the target; the other two burglary columns are dropped with it.
burg_cols = ['ActualBurg', 'BurgStatus', 'BurgStatus2']
weekday_cols = ['Friday', 'Monday', 'Saturday', 'Sunday',
                'Thursday', 'Tuesday', 'Wednesday']
month_cols = [f'month_{m}' for m in range(1, 13)]
other_cols = [
    'call_FW FIREWORKS', 'disp_Runaway juvenile (entered NCIC)',
    'disp_SAT-SETTLED AMONG SELVES', 'disp_TES-TEST',
    'disp_TI -TOW IN', 'disp_Truancy', 'disp_VA Hospital Alarm (Fire)',
]

# Everything that is left after the drop is used as a model feature.
X = df_all.drop(
    columns=id_cols + comp_cols + burg_cols + weekday_cols + month_cols + other_cols
)
print(X.shape)
X.head()


### Section 3.2 - Create the Training and Test Datasets

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [None]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# Split the data into training and test datasets

In [None]:
# Only the scaler object is created here; it is fitted on X_train and applied
# to the training and test data later, in Section 5.1.
sc = preprocessing.StandardScaler()

# Create the StandardScaler (the data itself is not scaled in this cell)

## Section 4 - Classification using Random Forest

### Section 4.1 - Using Random Forest create a model using the raw data

In [None]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score

In [None]:
def modeltraintest(vartrain, vartest, y_train, y_test, model):
    """Fit ``model`` and print a full evaluation report for the test data.

    The report contains the accuracy score, the confusion matrix, the
    classification report, a ROC curve with its AUC score, and 5-fold
    cross-validated accuracy and ROC AUC computed on the training data.

    Parameters
    ----------
    vartrain : features used to fit the model and for cross-validation.
    vartest : features used for every test-set metric.
    y_train : target values matching ``vartrain``.
    y_test : target values matching ``vartest``.
    model : estimator offering ``fit``, ``predict``, ``predict_proba`` and
        ``score``; it is fitted in place.

    Returns
    -------
    None. Everything is printed or plotted.

    Notes
    -----
    Column 1 of ``predict_proba`` is used as the score for the ROC curve.
    """
    # Fit on the training data, then predict labels and scores once; the
    # scores are reused for both the ROC curve and the AUC.
    model.fit(vartrain, y_train)
    model_pred = model.predict(vartest)
    y_pred_prob = model.predict_proba(vartest)[:, 1]

    score = model.score(vartest, y_test)

    print('XXXXXXXXXXXXXXXX ACCURACY SCORE XXXXXXXXXXXXXXXXXX')
    print(round(score, 6))
    print("")

    print('XXXXXXXXXXXXXXXX CONFUSION MATRIX XXXXXXXXXXXXXXXX')
    print(confusion_matrix(y_test, model_pred))
    print("")

    print('XXXXXXXXXXXXXX CLASSIFICATION REPORT XXXXXXXXXXXXXX')
    print(classification_report(y_test, model_pred))
    print('')

    print('XXXXXXXXXXXXXX ROC AUC SCORE AND CHART XXXXXXXXXXXXXXXXXX')
    print('')
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)

    # The dashed diagonal is the reference line of a no-skill classifier.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label='Classification Model')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    # Without a legend the curve label above would never be displayed.
    plt.legend()
    plt.show()

    roc_auc = roc_auc_score(y_test, y_pred_prob)
    print('ROC AUC Score: {0:.4f}'.format(roc_auc))
    print('')

    print('XXXXXXXXXXXXXX CROSS VALIDATION XXXXXXXXXXXXXXXXXX')
    print('')
    cv_scores = cross_val_score(model, vartrain, y_train, cv=5,
                                scoring='accuracy')
    print('CV Accuracy Scores:')
    print(cv_scores)
    print('')
    cv_rocauc = cross_val_score(model, vartrain, y_train, cv=5,
                                scoring='roc_auc')
    print('CV ROC AUC:')
    print(cv_rocauc)

    print('')
    print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

In [None]:
def shorttraintest(vartrain, vartest, y_train, y_test, model):
    """Fit ``model`` and print a compact set of test metrics.

    Prints the confusion matrix followed by the accuracy, recall, precision
    and ROC AUC scores, each formatted to four decimals.

    Parameters
    ----------
    vartrain : features used to fit the model.
    vartest : features the metrics are computed on.
    y_train : target values matching ``vartrain``.
    y_test : target values matching ``vartest``.
    model : estimator offering ``fit``, ``predict``, ``predict_proba`` and
        ``score``; it is fitted in place.

    Returns
    -------
    None. Everything is printed.

    Notes
    -----
    Column 1 of ``predict_proba`` is used as the score for the ROC AUC.
    """
    model.fit(vartrain, y_train)
    model_pred = model.predict(vartest)

    print('Confusion Matrix:')
    print(confusion_matrix(y_test, model_pred))
    print("")

    score = model.score(vartest, y_test)
    print('Accuracy Score: {0:.4f}'.format(score))

    recall = recall_score(y_test, model_pred)
    print('Recall Score: {0:.4f}'.format(recall))

    precision = precision_score(y_test, model_pred)
    print('Precision Score: {0:.4f}'.format(precision))

    # Only the AUC is reported, so the ROC curve points are not computed.
    y_pred_prob = model.predict_proba(vartest)[:, 1]
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    print('ROC AUC Score: {0:.4f}'.format(roc_auc))
    print('')

In [None]:
# Baseline: random forest with default hyper-parameters, full report on the test split.
model_raw = RandomForestClassifier(random_state=21)
vartrain, vartest = X_train, X_test

modeltraintest(vartrain, vartest, y_train, y_test, model_raw)

# Random forest classifier with default parameters. 
# Goal is to have accuracy score as close to 1 as possible. 81% is decent, but not good.
# Note that recall score needs to be improved.
# Note that folds hover around 81%. Comfortable enough with data to move on.
# Since the folds are mostly even - with no outliers - the data is worthy of training.

In [None]:
# Same baseline forest, this time with the compact report.
model_raw = RandomForestClassifier(random_state=21)
vartrain, vartest = X_train, X_test

shorttraintest(vartrain, vartest, y_train, y_test, model_raw)

# More concise viewing of data, showing confusion matrix as well as scores from above.

In [None]:
# Baseline forest scored on the data it was trained on (overfitting check).
model_raw = RandomForestClassifier(random_state=21)
vartrain, vartest = X_train, X_test

# The training split is passed as both the training and the "test" data.
shorttraintest(vartrain, vartrain, y_train, y_train, model_raw)

# Score the model on the training data itself to reveal overfitting. The result shows that the hyperparameters need to be set.

### Section 4.2 - Fine-tune the Model to Find the Optimal model

In [None]:
# Candidate max_depth values: an explicit list is used instead of range(6, 25).
depth = [6, 8, 10, 12, 14, 16, 17, 18, 19, 20, 21, 22, 23, 30]

# Test accuracy for each candidate, in the same order as `depth`.
scores = []

# NOTE(review): the depth is chosen by test-set accuracy, so later scores on
# the same test set are optimistic; a validation split would avoid this.
for d in depth:
    classifier = RandomForestClassifier(max_depth=d, random_state=21)
    classifier.fit(X_train, y_train)
    # Score once and reuse the value for both the plot and the log line.
    score = classifier.score(X_test, y_test)
    scores.append(score)
    print("iteration {} done. Accuracy = ".format(d) + str(score))


plt.plot(depth, scores, '-o')
plt.xlabel('depth, d')
plt.ylabel('scores')
plt.xticks(depth)
plt.show()

# Loop over a set of max_depth values to see how the tree depth affects test accuracy.
# Note that max_depth = 18 shows the highest accuracy and is therefore chosen as the best depth.

In [None]:
# Forest limited to the depth selected by the max_depth sweep.
model_opt = RandomForestClassifier(random_state=21, max_depth=18)
vartrain, vartest = X_train, X_test

shorttraintest(vartrain, vartest, y_train, y_test, model_opt)

# Run a new short train with set max depth to 18.
# Note results are shown quicker. Computer is thankful.

In [None]:
# Depth-18 forest scored on its own training data (overfitting check).
model_opt = RandomForestClassifier(random_state=21, max_depth=18)
vartrain, vartest = X_train, X_test

# The training split is passed as both the training and the "test" data.
shorttraintest(vartrain, vartrain, y_train, y_train, model_opt)

# New short train to test for overfit. Note that the new test is slightly better.

In [None]:
# Wide figure so that all 24 x-axis ticks stay readable.
figsize = (20, 5)
# Candidate max_features values: 1 through 24.
maxf = range(1, 25)
# Test accuracy for each candidate, in the same order as `maxf`.
scores = []

for d in maxf:
    classifier = RandomForestClassifier(max_depth=18, max_features=d, random_state=21)
    classifier.fit(X_train, y_train)
    # Score once and reuse the value for both the plot and the log line.
    score = classifier.score(X_test, y_test)
    scores.append(score)
    print("iteration {} done. Accuracy = ".format(d) + str(score))


# Apply the figure size; assigning `figsize` alone has no effect on the plot.
plt.figure(figsize=figsize)
plt.plot(maxf, scores, '-o')
plt.xlabel('maxf, d')
plt.ylabel('scores')
plt.xticks(maxf)
plt.show()

# Run the random forest classifier with max_depth fixed at 18 while varying max_features from 1 to 24.
# Note that from around max_features = 14 the differences in accuracy are minimal.

In [None]:
# Wide figure for the accuracy-versus-tree-count plot.
figsize = (20, 5)
# Candidate n_estimators values (number of trees in the forest).
est = [1, 20, 50, 100, 150, 500]
# Test accuracy for each candidate, in the same order as `est`.
scores = []

for d in est:
    classifier = RandomForestClassifier(max_depth=18, max_features=14,
                                        n_estimators=d, random_state=21)
    classifier.fit(X_train, y_train)
    # Score once and reuse the value for both the plot and the log line.
    score = classifier.score(X_test, y_test)
    scores.append(score)
    print("iteration {} done. Accuracy = ".format(d) + str(score))


# Apply the figure size; assigning `figsize` alone has no effect on the plot.
plt.figure(figsize=figsize)
plt.plot(est, scores, '-o')
plt.xlabel('est, d')
plt.ylabel('scores')
plt.xticks(est)
plt.show()

# New test with max_depth set to 18 and max_features set to 14, varying n_estimators.

In [None]:
# Forest with the depth, feature count and tree count picked in the sweeps.
opt_params = dict(max_depth=18, max_features=14, n_estimators=50, random_state=21)
model_opt = RandomForestClassifier(**opt_params)
vartrain, vartest = X_train, X_test

shorttraintest(vartrain, vartest, y_train, y_test, model_opt)

# Short test with n_estimators set to 50. Takes longer than 20, but is slightly more accurate

In [None]:
vartrain = X_train
vartest = X_test
# random_state is fixed, as for the other forests in this notebook, so the
# scores are repeatable between runs.
model_opt = RandomForestClassifier(max_depth=18, max_features=14, n_estimators=50,
                                   class_weight=None, random_state=21)

# The training split is passed as both the training and the "test" data to
# check how far the model overfits.
shorttraintest(vartrain, vartrain, y_train, y_train, model_opt)

# The model still overfits the training data

In [None]:
vartrain = X_train
vartest = X_test
# random_state is fixed, as for the other forests in this notebook, so the
# effect of class_weight='balanced' is not mixed with run-to-run randomness.
model_opt = RandomForestClassifier(max_depth=18, max_features=14, n_estimators=50,
                                   class_weight='balanced', random_state=21)

shorttraintest(vartrain, vartest, y_train, y_test, model_opt)

# Run test with class weight set to balanced.

In [None]:
# Class-weight settings to compare: no weighting, sklearn's 'balanced' mode,
# and increasingly heavy weights on class 1.
cw = [None, 'balanced', {0:1, 1:2}, {0:1, 1:3}, {0:1, 1:5}, {0:1, 1:10},
      {0:1, 1:20}, {0:1, 1:25}, {0:1, 1:30}, {0:1, 1:50}, {0:1, 1:100}]

vartrain = X_train
vartest = X_test

for w in cw:
    print('----------------------')
    # random_state is fixed so that differences between the reports come
    # from the class weight only, not from the forest's randomness.
    model = RandomForestClassifier(max_depth=18, max_features=14, n_estimators=50,
                                   class_weight=w, random_state=21)
    print("Model with Class Weight: " + str(w))
    print('')
    shorttraintest(vartrain, vartest, y_train, y_test, model)

print('----------------------')


In [None]:
vartrain = X_train
vartest = X_test
# random_state is fixed, as for the other forests in this notebook, so the
# full report is repeatable between runs.
model_opt = RandomForestClassifier(max_depth=18, max_features=14, n_estimators=50,
                                   class_weight={0: 1, 1: 10}, random_state=21)

modeltraintest(vartrain, vartest, y_train, y_test, model_opt)

# run random forest using manually set properties. 
# chose class weight 0: 1, 1: 10 due to best combined scores

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [None]:
vartrain = X_train
vartest = X_test

# Search space; n_estimators is pinned to 20 to keep the search fast.
grid = {"criterion": ['gini', 'entropy'], "max_depth": [10, 13, 15, 17, 19, 21, 23],
        "n_estimators": [20], "max_features": [10, 11, 13, 14, 17],
        "class_weight": [None]}
model_random = RandomForestClassifier(random_state=21)
# The search only tries a random sample of the combinations; random_state
# fixes that sample so the reported best parameters are repeatable.
model_cv = RandomizedSearchCV(model_random, grid, cv=5, random_state=21)
model_cv.fit(vartrain, y_train)

print("tuned hyperparameters :(best parameters) ", model_cv.best_params_)
print("accuracy :", model_cv.best_score_)

# Do a randomized search with n_estimators = 20 for time

In [None]:
vartrain = X_train
vartest = X_test

# Search space; n_estimators is pinned to 20 to keep the search fast.
grid = {"criterion": ['gini', 'entropy'], "max_depth": [10, 13, 15, 17, 19, 21, 23],
        "n_estimators": [20], "max_features": [10, 11, 13, 14, 17],
        "class_weight": [None]}
# n_jobs=-2 lets the forest train its trees in parallel.
model_random = RandomForestClassifier(random_state=21, n_jobs=-2)
# The search only tries a random sample of the combinations; random_state
# fixes that sample so the reported best parameters are repeatable.
model_cv = RandomizedSearchCV(model_random, grid, cv=5, random_state=21)
model_cv.fit(vartrain, y_train)

print("tuned hyperparameters :(best parameters) ", model_cv.best_params_)
print("accuracy :", model_cv.best_score_)

# Set n_jobs to a negative value to train in parallel and save processing time (n_jobs=-2 uses all CPU cores but one)

In [None]:
# Exhaustive search over every combination in the grid with 5-fold cross-validation.
grid = {
    "max_depth": [9, 11, 13, 15, 19, 21],
    "criterion": ['gini', 'entropy'],
    "n_estimators": [20],
    "max_features": [11, 13, 17, 21],
    "class_weight": [None],
}
model_grid = RandomForestClassifier(random_state=21, n_jobs=-2)
model_cv = GridSearchCV(estimator=model_grid, param_grid=grid, cv=5)
# `vartrain` is the training split assigned in the preceding cells.
model_cv.fit(vartrain, y_train)

print("tuned hyperparameters :(best parameters) ", model_cv.best_params_)
print("accuracy :", model_cv.best_score_)

# further tune parameters to create the optimal model

 ### Section 4.3 - Fine Tune the Model to Find the Decision Model

In [None]:
# Decision model with 20 trees: full report on the test split.
dec_params = dict(max_depth=11, max_features=17, n_estimators=20,
                  class_weight=None, criterion='entropy', random_state=21)
model_dec = RandomForestClassifier(**dec_params)

# Training and test features used for the report.
vartrain, vartest = X_train, X_test

modeltraintest(vartrain, vartest, y_train, y_test, model_dec)

In [None]:
# Same decision model, now with 100 trees instead of 20.
dec_params = dict(max_depth=11, max_features=17, n_estimators=100,
                  class_weight=None, criterion='entropy', random_state=21)
model_dec = RandomForestClassifier(**dec_params)

# Training and test features used for the report.
vartrain, vartest = X_train, X_test

modeltraintest(vartrain, vartest, y_train, y_test, model_dec)

In [None]:
# One row per feature: the importance values of the fitted decision model,
# in the column order the model was trained on.
df_fi = pd.DataFrame(model_dec.feature_importances_)
df_fi

In [None]:
# Feature names as a one-column frame, in the same order as the columns of X.
X_names = pd.DataFrame(list(X.columns))

In [None]:
# Put each importance value next to its feature name (rows align by position).
df_feat_imp = pd.concat([df_fi, X_names], axis=1)
df_feat_imp.columns = ['Importance', 'Features']

# Show only features the model actually used, most important first.
used = df_feat_imp['Importance'].ne(0)
df_feat_imp.loc[used].sort_values(by='Importance', ascending=False)

# show feature importance compared to features

In [None]:
# Class-weight settings compared for the decision model.
cw = [None, 'balanced', {0:1, 1:2}, {0:1, 1:3}, {0:1, 1:5}, {0:1, 1:7.5},
      {0:1, 1:10}, {0:1, 1:25}, {0:1, 1:100}]

vartrain, vartest = X_train, X_test

# Hyper-parameters shared by every run; only class_weight changes.
fixed_params = dict(max_depth=11, max_features=17, n_estimators=20,
                    criterion='entropy', random_state=21)

for w in cw:
    print('----------------------')
    model = RandomForestClassifier(class_weight=w, **fixed_params)
    print(f"Model with Class Weight: {w}")
    print('')
    shorttraintest(vartrain, vartest, y_train, y_test, model)

print('----------------------')

# include class weights

### Section 4.4 - Comments throughout Section 4

The models mostly hovered within a tenth of a percent of each other for the raw, optimal, and decision models, but the decision model performed the best

## Section 5 - Cost Benefit Analysis

### Section 5.1 - Create a Summary Table

In [None]:
from sklearn.tree import DecisionTreeClassifier # to build a classification tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split # to split data into training and testing sets
from sklearn.model_selection import cross_val_score # for cross validation
from sklearn.metrics import confusion_matrix, classification_report # to create a confusion matrix and classification report
from sklearn.metrics import plot_confusion_matrix # to draw a confusion matrix
from sklearn import tree
from sklearn.tree import plot_tree # to draw a classification tree
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# Fit the scaler on the training data only and reuse the same transformation
# for the test data, so nothing about the test set leaks into the scaling.
# The original row index is kept so that the scaled frames still line up with
# y_train / y_test by label as well as by position.
X_train_sc = pd.DataFrame(sc.fit_transform(X_train),
                          columns=X_train.columns, index=X_train.index)

X_test_sc = pd.DataFrame(sc.transform(X_test),
                         columns=X_test.columns, index=X_test.index)

In [None]:
# Each entry: (training features, test features, training target,
# short model name, model family, estimator).  The logistic models use the
# scaled features; the tree-based models use the raw features.
model = [

#Logistic Regression - 2 models
         (X_train_sc, X_test_sc, y_train,'log_opt','Logistic',
          LogisticRegression(C=0.1, class_weight = None, penalty = 'l1', solver = 'liblinear', random_state=21)),
         (X_train_sc, X_test_sc, y_train,'log_dec', 'Logistic',
          LogisticRegression(C=0.1, class_weight = {0: 1, 1: 4}, penalty = 'l1', solver = 'liblinear', random_state=21)),

#Decision Tree - 2 models
         (X_train, X_test, y_train,'dt_opt', 'DecisionTree',
          DecisionTreeClassifier(max_depth = 9, max_leaf_nodes = 44, class_weight=None, criterion = 'gini', random_state = 21)),
         (X_train, X_test, y_train,'dt_dec', 'DecisionTree',
          DecisionTreeClassifier(max_depth = 9, max_leaf_nodes = 44, class_weight={0: 1, 1: 3}, criterion = 'gini', random_state = 21)),

#Random Forest -2 models
         (X_train, X_test, y_train,'rf_opt', 'RandomForest',
          RandomForestClassifier(max_depth = 11, max_features = 17, n_estimators = 20, class_weight=None, criterion = 'entropy', random_state=21)),
         (X_train, X_test, y_train,'rf_dec', 'RandomForest',
          RandomForestClassifier(max_depth = 17, max_features = 12, n_estimators = 20, class_weight={0: 1, 1: 7.5}, criterion = 'entropy', random_state=21)),
]

# Column order of the summary table.
summary_cols = ['Model', 'Type', 'pred_noburg', 'pred_burg',
                'Score', 'Recall', 'Precision', 'F1']

# Per-model results keyed by the short model name.
roc_curves = {}     # name -> (fpr, tpr, thresholds)
cm_by_model = {}    # name -> labelled confusion-matrix frame
cm_frames = []      # the same frames in loop order, concatenated once below

for tr, tst, yt, n, mod, m in model:
    m.fit(tr, yt)
    model_pred = m.predict(tst)
    # Column 1 of predict_proba is used as the score for the ROC curve.
    y_pred_prob = m.predict_proba(tst)[:, 1]

    # The metrics are stored as 4-decimal strings; a later cell converts
    # them back to floats.
    score = m.score(tst, y_test)
    score_format = '{0:.4f}'.format(score)

    recall = recall_score(y_test, model_pred)
    recall_format = '{0:.4f}'.format(recall)

    f1 = f1_score(y_test, model_pred)
    f1_format = '{0:.4f}'.format(f1)

    precision = precision_score(y_test, model_pred)
    precision_format = '{0:.4f}'.format(precision)

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    roc_curves[n] = (fpr, tpr, thresholds)

    # Rows are the actual classes, columns the predicted classes.
    cm = pd.DataFrame(confusion_matrix(y_test, model_pred))
    cm = cm.rename(columns={0: 'pred_noburg', 1: 'pred_burg'})
    cm['Model'] = mod
    cm['Type'] = n
    cm['Score'] = score_format
    cm['Recall'] = recall_format
    cm['Precision'] = precision_format
    cm['F1'] = f1_format
    cm_by_model[n] = cm
    cm_frames.append(cm)

    # Keep publishing the per-model notebook variables (fpr_<name>,
    # tpr_<name>, thresholds_<name>, cm_<name>, <name>) without building and
    # executing code strings; new code should read the dictionaries above.
    globals().update({
        f'fpr_{n}': fpr,
        f'tpr_{n}': tpr,
        f'thresholds_{n}': thresholds,
        f'cm_{n}': cm,
        n: n,
    })

    print(f"{n} - Score: {score_format} - Recall: {recall_format}"
          f" - Precision: {precision_format} - F1: {f1_format}")
    print('------------------------------------------------------------------------')

# Concatenate once, after the loop, and restore the summary column order.
cm_all = pd.concat(cm_frames, axis=0)[summary_cols]

# The old index (0 or 1) is the actual class of each confusion-matrix row.
cm_all = cm_all.reset_index()
cm_all['index'] = np.where(cm_all['index']==0, 'no_burglary', 'burglary')
cm_all = cm_all.rename(columns={'index':'actual'})

display(cm_all)
print('------------------------------------------------------------------------')

# Summary table of the 6 chosen models using parameters from the optimal
# and decision models from Section 4 and HW04

### Section 5.2 - Understanding the Additional Officer Hours for Making a Wrong Decision

Numbers for  a confusion matrix for the optimal decision tree model are as follows:

-False Negative - 359 X 3 = 1077 additional hours

-False Positive - 303 X 1 = 303 additional hours

Total additional hrs for opt decision tree = 1380

### Section 5.3 - Creating the Additional Officer Hours for Making a Wrong Prediction

In [None]:
cm_all

In [None]:
cm_all.info()

In [None]:
# Row selectors for the two actual outcomes.
no_burglary_rows = cm_all['actual'].eq('no_burglary')
burglary_rows = cm_all['actual'].eq('burglary')

# 1 hour for each burglary predicted where none happened,
# 3 hours for each burglary that happened but was not predicted.
cm_all['noburglary_hrs'] = np.where(no_burglary_rows, cm_all['pred_burg'] * 1, 0)
cm_all['burglary_hrs'] = np.where(burglary_rows, cm_all['pred_noburg'] * 3, 0)
cm_all

# Create two new columns with the additional officer hours caused by wrong
# predictions: 1 hour per false positive (no burglary) and 3 hours per false negative (burglary)

In [None]:
# Counts and metric columns become floats; the hour columns become integers.
float_cols = ['pred_noburg', 'pred_burg', 'Score', 'Recall', 'Precision', 'F1']
int_cols = ['noburglary_hrs', 'burglary_hrs']

for col in float_cols:
    cm_all[col] = cm_all[col].astype(float)
for col in int_cols:
    cm_all[col] = cm_all[col].astype(int)

cm_all.info()

# Change Dtypes in order to perform calculations

### Section 5.4 - Group Additional Officer Hours for Making a Wrong Prediction

In [None]:
# Total additional hours per model: the two confusion-matrix rows of each
# Type are added together.
hours_by_type = cm_all.groupby('Type')[['noburglary_hrs', 'burglary_hrs']]
cm_all_ah = hours_by_type.sum().reset_index()
cm_all_ah

# Sum the no-burglary and burglary hours per model type, which reduces the number of rows by half.

In [None]:
# One row of metrics per model: both rows of a Type carry the same values,
# so the mean simply collapses them.
metrics_by_type = cm_all.groupby('Type')[['Score', 'Recall', 'Precision', 'F1']]
cm_all_score = metrics_by_type.mean().reset_index()
cm_all_score

# perform a calculation to find the mean of the listed columns

In [None]:
# Model family of each Type (the first value in each group).
family_by_type = cm_all.groupby('Type')['Model']
cm_all_model = family_by_type.first().reset_index()
cm_all_model

# Look up the model family (Model) for each model Type

In [None]:
# Join the three per-Type tables: model family, then metrics, then hours.
scores_and_hours = cm_all_score.merge(cm_all_ah, on='Type', how='inner')
model_cost = cm_all_model.merge(scores_and_hours, on='Type', how='inner')
model_cost

# merge columns to create updated summary table 

In [None]:
# Total additional officer hours per model: both hour columns added row-wise.
hour_cols = ['noburglary_hrs', 'burglary_hrs']
model_cost['Pred_hrs'] = model_cost[hour_cols].sum(axis=1)
model_cost

# create a new column showing the total predicted cost by adding
# no burglary hours to burglary hours

## Section 6 - Summarize and Visualize the Result

In [None]:
# Models ordered from fewest to most additional officer hours.
model_result = model_cost.sort_values('Pred_hrs')
model_result

# Sort data in ascending order. This shows the optimal number of allocated
# hours spent per model. Note that dt_dec is the best performing model

In [None]:
# One bar per model type, in the sorted order of model_result.
fig, ax = plt.subplots(figsize=(16, 6))
sns.barplot(x="Type", y="Pred_hrs", data=model_result,
            palette='deep', dodge=False, ax=ax)
# Rotate the model names so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize='10')

# bar plot of the above data

In [None]:
# Refit the chosen model (decision tree with class weight {0: 1, 1: 3}) and
# show its confusion matrix, classification report and ROC curve.
vartrain = X_train
vartest = X_test

model_dt_dec = DecisionTreeClassifier(max_depth = 9, max_leaf_nodes = 44, class_weight={0: 1, 1: 3}, criterion = 'gini', random_state = 21)
model_dt_dec.fit(vartrain, y_train)

# Predict labels and class probabilities for the test split.
model_pred = model_dt_dec.predict(vartest)
model_prob = model_dt_dec.predict_proba(vartest)
# Scores of THIS model for the ROC curve. They must be taken from model_prob:
# a `y_pred_prob` left over from an earlier cell belongs to a different model.
y_pred_prob = model_prob[:, 1]


print('Confusion Matrix:')
cm_best = confusion_matrix(y_test, model_pred)
print(cm_best)
print("")

print('')
print('-----------------------------------------------------------------')
print('----------------------CLASSIFICATION REPORT----------------------')
print('-----------------------------------------------------------------')
print(classification_report(y_test, model_pred))
print('-----------------------------------------------------------------')

plt.figure(figsize=(16,6))

# Left panel: confusion matrix as a heat map.
plt.subplot(121)

# The title states the class weight the model above was actually built with.
plt.title('Decision Tree - {0: 1, 1: 3} - CM', fontweight='bold', color = 'black', fontsize='16', horizontalalignment='center')
chart = sns.heatmap(cm_best, annot=True, cmap="Blues", annot_kws={"size": 16}, fmt="g")
chart.set_xlabel('Predicted', fontsize=15)
chart.set_ylabel('True', fontsize=15)
# Class 0 is "no burglary" and class 1 is "burglary".
chart.xaxis.set_ticklabels(['No Burglary', 'Burglary'], fontsize=12)
chart.yaxis.set_ticklabels(['No Burglary', 'Burglary'], fontsize=12, va='center')

# Right panel: ROC curve of the decision tree.
plt.subplot(122)
rfd_fpr, rfd_tpr, thresholds = roc_curve(y_test, y_pred_prob)

# The dashed diagonal is the reference line of a no-skill classifier.
plt.plot([0, 1], [0, 1],'k--')
plt.plot(rfd_fpr, rfd_tpr, label='Decision Tree')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Decision Tree ROC Curve')
# Without a legend the curve label above would never be displayed.
plt.legend()
plt.show()

# Show a confusion matrix for the best model, show classification report,
# show heat map version of confusion matrix, and ROC curve of optimal model

I chose this as the best model because it predicts the minimum amount of additional officer hours of all the models.

The confusion matrix shows that while there are a lot of false positives, the low number of false negatives optimizes the amount of officer hours for the department. 

The model shows that it is better to predict a burglary and spend the officer hours preventing one, than to predict there will not be a burglary and have to spend more officer hours solving it. Although I think it needs some work in making accurate predictions. 

## Section 7 - Predict Week 50

In [None]:
model_dt_dec

In [None]:
df_canton_new

In [None]:
df_canton_new['BurgStatus'].value_counts()

# data from week 49 shows 19 no burglaries and 4 burglaries

In [None]:
# Actual burglary status of the new-week data.
y_new_test = df_canton_new['BurgStatus']
# Select exactly the columns the model was trained on, in training order,
# instead of repeating the long drop list used to build X.
vartest = df_canton_new[X.columns]

model = model_dt_dec
# Fit on the training split explicitly, so the result does not depend on what
# the notebook-wide `vartrain` variable happens to hold when this cell runs.
model.fit(X_train, y_train)

model_pred = model.predict(vartest)
model_prob = model.predict_proba(vartest)
print('Confusion Matrix:')
cm = confusion_matrix(y_new_test, model_pred)
print(cm)
print("")

print('')
print('-----------------------------------------------------------------')
print('----------------------CLASSIFICATION REPORT----------------------')
print('-----------------------------------------------------------------')
print(classification_report(y_new_test, model_pred))
print('-----------------------------------------------------------------')


plt.figure(figsize=(8,6))

# Heat map of the confusion matrix for the new-week data.
plt.title('Week 49 - Decision Tree - CM', fontweight='bold', color = 'black', fontsize='16', horizontalalignment='center')
chart = sns.heatmap(cm, annot=True, cmap="Blues", annot_kws={"size": 16}, fmt="g")
chart.set_xlabel('Predicted', fontsize=15)
chart.set_ylabel('True', fontsize=15)
# Class 0 is "no burglary" and class 1 is "burglary".
chart.xaxis.set_ticklabels(['No Burglary', 'Burglary'], fontsize=12)
chart.yaxis.set_ticklabels(['No Burglary', 'Burglary'], fontsize=12, va='center')

# fit new data to best model and run confusion matrix, classification report,
# and heat map.
# Note that the model predicted perfectly

In [None]:
# Wrap the predicted labels and the class probabilities in DataFrames so
# they can be joined to the subzone table.
# NOTE(review): 'Prob_Good' / 'Prob_Bad' presumably mean P(no burglary) /
# P(burglary), following the class order of the model — confirm and consider
# renaming to match this notebook's subject.
model_pred = pd.DataFrame(model_pred, columns= ['Pred'])
model_prob = pd.DataFrame(model_prob, columns= ['Prob_Good', 'Prob_Bad'])

# Build the prediction and probability data frames for the final table

In [None]:
# Move the current index into a regular column, leaving a default 0..n-1
# index on the new-week data.
# NOTE(review): the cell is not idempotent — each re-run resets the index
# again, so run it once per data load.
df_canton_new=df_canton_new.reset_index()
df_canton_new

# Reset the index of the new-week data

In [None]:
# Keep only the subzone name and the actual burglary status.
df_canton_final=df_canton_new[['Subzone','BurgStatus']]

# Select the subzone and actual status columns for the final table

In [None]:
df_canton_final

# set dataframes for final model

In [None]:
# Place actuals, predicted labels and class probabilities side by side;
# the frames are joined on their index.
df_final = pd.concat([df_canton_final, model_pred, model_prob], axis = 1)

# Finally, concatenate the data frames into the final table

In [None]:
df_final

# show final model first draft

In [None]:
# Use the subzone name as the row label of the final table.
df_final=df_final.set_index('Subzone')

In [None]:
df_final

# final table showing matching BurgStatus and Pred columns for an accurate
# prediction, as well as the probabilities for further scrutiny

According to the data and the model, there was a predicted burglary at subzones 'ZONE1B', 'ZONE3E', 'ZONE5A', and 'ZONE5B'. According to the model, Chief Croft should allocate officer hours to the subzones mentioned in order to stop a burglary next week.

This was a good prediction. The confusion matrix shows that the model correctly predicted all burglaries and non burglaries.