In [1]:
import pandas as pd
import numpy as np
from path import Path
from sklearn import tree
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, balanced_accuracy_score
from sklearn.svm import SVC
import warnings
# Suppress every warning raised from this point on.
# NOTE(review): this also hides library warnings (e.g. deprecations) that may
# be worth seeing -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Import the database password and the SQLAlchemy engine factory
from config import db_password
from sqlalchemy import create_engine

# Engine for the local 'chicago_crime' PostgreSQL database
engine = create_engine(f'postgresql://postgres:{db_password}@127.0.0.1:5432/chicago_crime')

# Read the 'chicago_crime_master' table into a dataframe. The connection is
# opened in a context manager so it is released as soon as the read finishes
# instead of staying open for the lifetime of the kernel.
with engine.connect() as cnx:
    df = pd.read_sql_table('chicago_crime_master', cnx)
df.head()

Unnamed: 0,ID,Primary_Type,Violence_Status,Description,Location_Description,Arrest,Community_Area,Latitude,Longitude,Date,Time,Average_Wind_Speed,Average_Temperature,Fog_Ice_Freezing_Fog,Thunder,Smoke_or_Haze,Community_Name
0,11555945,MOTOR VEHICLE THEFT,NON-VIOLENT,AUTOMOBILE,STREET,False,61.0,41.795512,-87.651271,2019-01-01,15:20:00,7.61,31.0,1.0,,,New City
1,11556016,DECEPTIVE PRACTICE,NON-VIOLENT,FINANCIAL IDENTITY THEFT OVER $ 300,"VEHICLE - OTHER RIDE SHARE SERVICE (E.G., UBER...",False,6.0,41.942727,-87.640166,2019-01-01,02:00:00,7.61,31.0,1.0,,,Lake View
2,11553355,CRIMINAL DAMAGE,NON-VIOLENT,CRIMINAL DEFACEMENT,COMMERCIAL / BUSINESS OFFICE,False,5.0,41.937637,-87.688799,2019-01-01,09:30:00,7.61,31.0,1.0,,,North Center
3,11553003,BATTERY,VIOLENT,AGGRAVATED: OTHER DANG WEAPON,"VEHICLE - OTHER RIDE SHARE SERVICE (E.G., UBER...",False,69.0,41.765004,-87.6255,2019-01-01,04:33:00,7.61,31.0,1.0,,,Greater Grand Crossing
4,11553357,OTHER OFFENSE,NON-VIOLENT,GUN OFFENDER: ANNUAL REGISTRATION,STREET,True,43.0,41.758999,-87.565468,2019-01-01,17:27:00,7.61,31.0,1.0,,,South Shore


In [3]:
# Row count per class of the raw target column
df.Violence_Status.value_counts()

NON-VIOLENT    448861
VIOLENT        219316
Name: Violence_Status, dtype: int64

In [4]:
# List the column labels of the loaded dataframe
df.columns

Index(['ID', 'Primary_Type', 'Violence_Status', 'Description',
       'Location_Description', 'Arrest', 'Community_Area', 'Latitude',
       'Longitude', 'Date', 'Time', 'Average_Wind_Speed',
       'Average_Temperature', 'Fog_Ice_Freezing_Fog', 'Thunder',
       'Smoke_or_Haze', 'Community_Name'],
      dtype='object')

In [5]:
# Count the distinct values in each column
df.nunique()

ID                      668177
Primary_Type                33
Violence_Status              2
Description                457
Location_Description       177
Arrest                       2
Community_Area              77
Latitude                233127
Longitude               233075
Date                      1096
Time                      1773
Average_Wind_Speed          83
Average_Temperature         88
Fog_Ice_Freezing_Fog         1
Thunder                      1
Smoke_or_Haze                1
Community_Name              77
dtype: int64

In [6]:
# Keep only the target plus the community-area and weather columns
model_columns = [
    'Violence_Status',
    'Community_Area',
    'Average_Wind_Speed',
    'Average_Temperature',
    'Fog_Ice_Freezing_Fog',
    'Smoke_or_Haze',
]
# Missing values are replaced with 0
df = df[model_columns].replace(np.nan, 0)
df.head()

Unnamed: 0,Violence_Status,Community_Area,Average_Wind_Speed,Average_Temperature,Fog_Ice_Freezing_Fog,Smoke_or_Haze
0,NON-VIOLENT,61.0,7.61,31.0,1.0,0.0
1,NON-VIOLENT,6.0,7.61,31.0,1.0,0.0
2,NON-VIOLENT,5.0,7.61,31.0,1.0,0.0
3,VIOLENT,69.0,7.61,31.0,1.0,0.0
4,NON-VIOLENT,43.0,7.61,31.0,1.0,0.0


In [7]:
# Target: the violent / non-violent label
y = df['Violence_Status']
# Features: every remaining column, passed through one-hot encoding
X = pd.get_dummies(df.drop(columns='Violence_Status'))

In [8]:
# Print the dimensions of the features and the target, then preview the features
print(X.shape, y.shape, sep='\n')
X.head()


(668177, 5)
(668177,)


Unnamed: 0,Community_Area,Average_Wind_Speed,Average_Temperature,Fog_Ice_Freezing_Fog,Smoke_or_Haze
0,61.0,7.61,31.0,1.0,0.0
1,6.0,7.61,31.0,1.0,0.0
2,5.0,7.61,31.0,1.0,0.0
3,69.0,7.61,31.0,1.0,0.0
4,43.0,7.61,31.0,1.0,0.0


In [9]:
# Summary statistics for each feature column
X.describe()

Unnamed: 0,Community_Area,Average_Wind_Speed,Average_Temperature,Fog_Ice_Freezing_Fog,Smoke_or_Haze
count,668177.0,668177.0,668177.0,668177.0,668177.0
mean,37.120007,9.414902,53.397425,0.39148,0.150047
std,21.506503,3.414343,19.456951,0.488082,0.357118
min,1.0,0.0,-15.0,0.0,0.0
25%,23.0,6.93,37.0,0.0,0.0
50%,32.0,8.95,54.0,0.0,0.0
75%,54.0,11.41,71.0,1.0,0.0
max,77.0,27.96,87.0,1.0,1.0


In [10]:
# Check the balance of our target values (row count per class)
y.value_counts()

NON-VIOLENT    448861
VIOLENT        219316
Name: Violence_Status, dtype: int64

In [11]:
# Hold out a test partition (default split size, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Fit the scaler on the training partition only ...
scaler = StandardScaler()
scaler.fit(X_train)

# ... then apply the fitted scaler to both partitions
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

#### Balanced Random Forest Classifier

In [12]:
# Fit a balanced random forest on the scaled training data,
# then predict labels for the scaled test data
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf.fit(X_train_scaled, y_train)
y_pred = brf.predict(X_test_scaled)
brf

BalancedRandomForestClassifier(random_state=1)

In [13]:
# Report balanced accuracy, the confusion matrix and the imbalanced
# classification report for the Balanced Random Forest predictions.
print('Data for Balanced Random Forest Classifier')
print('---------------------------------------------------------------------')
print(f'Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_pred)}')
print('---------------------------------------------------------------------')
print('Confusion Matrix:')
# Pin the class order explicitly so the matrix rows/columns always line up
# with the hand-written captions below instead of relying on implicit sorting.
class_labels = ['NON-VIOLENT', 'VIOLENT']
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(cm, index=["Actual Non-violent", "Actual Violent"],
                     columns=["Predicted Non-violent", "Predicted Violent"])
display(cm_df)
print('---------------------------------------------------------------------')
print('Imbalanced Classification Report:')
print(classification_report_imbalanced(y_test, y_pred))

Data for Balanced Random Forest Classifier
---------------------------------------------------------------------
Balanced Accuracy Score: 0.5363748616668333
---------------------------------------------------------------------
Confusion Matrix:


Unnamed: 0,Predicted Non-violent,Predicted Violent
Actual Non-voilent,55977,56439
Actual Violent,23228,31401


---------------------------------------------------------------------
Imbalanced Classification Report:
                   pre       rec       spe        f1       geo       iba       sup

NON-VIOLENT       0.71      0.50      0.57      0.58      0.53      0.28    112416
    VIOLENT       0.36      0.57      0.50      0.44      0.53      0.29     54629

avg / total       0.59      0.52      0.55      0.54      0.53      0.29    167045



In [14]:
# Rank the features by the forest's importance score, highest first
ranked_importances = sorted(
    zip(X.columns, brf.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
for feat, importance in ranked_importances:
    print(f'{feat}: ({importance})')

Community_Area: (0.458550259826015)
Average_Wind_Speed: (0.2923289171914792)
Average_Temperature: (0.2298782535709739)
Fog_Ice_Freezing_Fog: (0.010754582710139658)
Smoke_or_Haze: (0.008487986701392366)


In [15]:
# Test prediction
# The forest was fitted on standardized features, so a raw sample must pass
# through the same fitted scaler before it is scored; feeding unscaled values
# would place the sample far outside the range the trees were trained on.
sample = pd.DataFrame([[25, 10, 33, 0, 0]], columns=X.columns)
brf.predict(scaler.transform(sample))

array(['NON-VIOLENT'], dtype=object)

#### Easy Ensemble AdaBoost Classifier

In [16]:
# Fit the easy-ensemble classifier on the scaled training data,
# then predict labels for the scaled test data
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train_scaled, y_train)
y_pred = eec.predict(X_test_scaled)
eec

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [17]:
# Report balanced accuracy, the confusion matrix and the imbalanced
# classification report for the Easy Ensemble AdaBoost predictions.
print('Data for Easy Ensemble AdaBoost Classifier')
print('---------------------------------------------------------------------')
print(f'Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_pred)}')
print('---------------------------------------------------------------------')
print('Confusion Matrix:')
# Pin the class order explicitly so the matrix rows/columns always line up
# with the hand-written captions below instead of relying on implicit sorting.
class_labels = ['NON-VIOLENT', 'VIOLENT']
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(cm, index=["Actual Non-violent", "Actual Violent"],
                     columns=["Predicted Non-violent", "Predicted Violent"])
display(cm_df)
print('---------------------------------------------------------------------')
print('Imbalanced Classification Report:')
print(classification_report_imbalanced(y_test, y_pred))

Data for Easy Ensemble AdaBoost Classifier
---------------------------------------------------------------------
Balanced Accuracy Score: 0.5526133910516294
---------------------------------------------------------------------
Confusion Matrix:


Unnamed: 0,Predicted Non-violent,Predicted Violent
Actual Non-voilent,52475,59941
Actual Violent,19752,34877


---------------------------------------------------------------------
Imbalanced Classification Report:
                   pre       rec       spe        f1       geo       iba       sup

NON-VIOLENT       0.73      0.47      0.64      0.57      0.55      0.29    112416
    VIOLENT       0.37      0.64      0.47      0.47      0.55      0.30     54629

avg / total       0.61      0.52      0.58      0.54      0.55      0.30    167045



In [18]:
# Test prediction
# The ensemble was fitted on standardized features, so a raw sample must pass
# through the same fitted scaler before it is scored; feeding unscaled values
# would place the sample far outside the range the model was trained on.
sample = pd.DataFrame([[25, 10, 33, 0, 0]], columns=X.columns)
eec.predict(scaler.transform(sample))

array(['VIOLENT'], dtype=object)