In [1]:
import pandas as pd
import numpy as np
from path import Path
from sklearn import tree
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, balanced_accuracy_score
from sklearn.svm import SVC

In [2]:
# # import the modules
# import pandas as pd
# from sqlalchemy import create_engine

# # SQLAlchemy connectable
# crm = create_engine('sqlite:///chicago_crime.db').connect()

# # table named 'crime_data' will be returned as a dataframe.
# df = pd.read_sql_table('crime_data', crm)
# print(df)

In [3]:
# Loading crime data
file_path = Path("Resources/sample_crime_data.csv")
df = pd.read_csv(file_path)
df = df.set_index("ID")
print(df.shape)
df.head()

(206997, 18)


Unnamed: 0_level_0,ID.1,Block,IUCR,Primary Type,Violence Status,Description,Location Description,Arrest,Domestic,District,Ward,Community Area,FBI Code,Year,Latitude,Longitude,Date,Time
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
6909918,12260346,070XX S EGGLESTON AVE,486,BATTERY,VIOLENT,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,7.0,6.0,68.0,08B,2021,41.766435,-87.635964,2021-01-03,13:23:00
6927718,12263464,080XX S YALE AVE,820,THEFT,NON-VIOLENT,$500 AND UNDER,RESIDENCE,False,False,6.0,17.0,44.0,06,2021,41.748474,-87.630607,2021-01-03,06:59:00
6927807,12259990,056XX W WASHINGTON BLVD,486,BATTERY,VIOLENT,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,15.0,29.0,25.0,08B,2021,41.882224,-87.766076,2021-01-03,00:20:00
6931849,12260669,057XX S RACINE AVE,2022,NARCOTICS,NON-VIOLENT,POSSESS - COCAINE,STREET,True,False,7.0,16.0,67.0,18,2021,41.790069,-87.654769,2021-01-03,20:47:00
6931854,25702,068XX S STONY ISLAND AVE,110,HOMICIDE,VIOLENT,FIRST DEGREE MURDER,STREET,False,False,3.0,5.0,43.0,01A,2021,41.771062,-87.586271,2021-01-03,20:09:00


In [4]:
df['Violence Status'].value_counts()

NON-VIOLENT    135435
VIOLENT         71562
Name: Violence Status, dtype: int64

In [5]:
street_df = df[(df['Location Description'] == 'STREET') & (df['Date'] >= '2015-01-01')]

In [6]:
street_df = street_df.drop(columns=['ID.1', 'Block', 'IUCR', 'Description', 'Primary Type','Location Description','Arrest','Domestic',
                                    'FBI Code', 'Year', 'Ward', 'District', 'Time', 'Latitude', 'Longitude'])
print(street_df.shape)
street_df.head()

(52382, 3)


Unnamed: 0_level_0,Violence Status,Community Area,Date
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6931849,NON-VIOLENT,67.0,2021-01-03
6931854,VIOLENT,43.0,2021-01-03
6940399,NON-VIOLENT,43.0,2021-01-03
6940417,NON-VIOLENT,41.0,2021-01-03
6955690,NON-VIOLENT,75.0,2021-01-03


In [7]:
# Loading weather data
file_path = Path("Resources/clean_weather_data.csv")
weather_df = pd.read_csv(file_path)
#df = df.set_index("ID")
print(df.shape)
weather_df.head()

(206997, 18)


Unnamed: 0,Date,Average_Wind_Speed,Average_Temperature,Maximum_Temperature,Minimum_Temperature,Fog_Ice_Freezing_Fog,Heavy_Fog_or_Heavy_Freezing_Fog,Thunder,Hail,Smoke_or_Haze,Mist,Rain,Freezing_Rain,Snow_Snow_Pellets_or_Ice Crystals
0,2010-01-01,10.29,,16.0,5.0,,,,,,,,,1.0
1,2010-01-02,11.86,,11.0,2.0,,,,,,,,,
2,2010-01-03,10.29,,18.0,-1.0,,,,,,,1.0,,1.0
3,2010-01-04,11.41,,18.0,7.0,,,,,,,1.0,,1.0
4,2010-01-05,8.5,,23.0,13.0,,,,,,,1.0,,1.0


In [8]:
weather_df = weather_df[(weather_df['Date'] >= '2015-01-01')]

In [9]:
weather_df = weather_df.drop(columns = ['Heavy_Fog_or_Heavy_Freezing_Fog', 'Mist', 'Freezing_Rain', 'Thunder',
                                        'Snow_Snow_Pellets_or_Ice Crystals', 'Hail', 'Rain',
                                       'Maximum_Temperature', 'Minimum_Temperature'])
print(weather_df.shape)
weather_df.head()

(2566, 5)


Unnamed: 0,Date,Average_Wind_Speed,Average_Temperature,Fog_Ice_Freezing_Fog,Smoke_or_Haze
1826,2015-01-01,14.54,20.0,,
1827,2015-01-02,4.7,26.0,,
1828,2015-01-03,5.59,31.0,1.0,
1829,2015-01-04,15.88,29.0,1.0,1.0
1830,2015-01-05,10.29,2.0,1.0,1.0


In [10]:
merged_df = pd.merge(street_df, weather_df, how='left', on='Date')

In [11]:
print(merged_df.shape)
merged_df.head()

(52382, 7)


Unnamed: 0,Violence Status,Community Area,Date,Average_Wind_Speed,Average_Temperature,Fog_Ice_Freezing_Fog,Smoke_or_Haze
0,NON-VIOLENT,67.0,2021-01-03,6.93,32.0,1.0,
1,VIOLENT,43.0,2021-01-03,6.93,32.0,1.0,
2,NON-VIOLENT,43.0,2021-01-03,6.93,32.0,1.0,
3,NON-VIOLENT,41.0,2021-01-03,6.93,32.0,1.0,
4,NON-VIOLENT,75.0,2021-01-03,6.93,32.0,1.0,


In [12]:
merged_df.columns

Index(['Violence Status', 'Community Area', 'Date', 'Average_Wind_Speed',
       'Average_Temperature', 'Fog_Ice_Freezing_Fog', 'Smoke_or_Haze'],
      dtype='object')

In [13]:
df = merged_df.replace(np.nan,0).drop(columns=['Date'])
df.head()

Unnamed: 0,Violence Status,Community Area,Average_Wind_Speed,Average_Temperature,Fog_Ice_Freezing_Fog,Smoke_or_Haze
0,NON-VIOLENT,67.0,6.93,32.0,1.0,0.0
1,VIOLENT,43.0,6.93,32.0,1.0,0.0
2,NON-VIOLENT,43.0,6.93,32.0,1.0,0.0
3,NON-VIOLENT,41.0,6.93,32.0,1.0,0.0
4,NON-VIOLENT,75.0,6.93,32.0,1.0,0.0


In [14]:
df.columns

Index(['Violence Status', 'Community Area', 'Average_Wind_Speed',
       'Average_Temperature', 'Fog_Ice_Freezing_Fog', 'Smoke_or_Haze'],
      dtype='object')

In [15]:
df.nunique()

Violence Status          2
Community Area          77
Average_Wind_Speed      68
Average_Temperature     76
Fog_Ice_Freezing_Fog     2
Smoke_or_Haze            2
dtype: int64

In [16]:
y = df['Violence Status']
X = df.drop(columns='Violence Status')
X = pd.get_dummies(X)

In [17]:
print(X.shape)
print(y.shape)
X.head()


(52382, 5)
(52382,)


Unnamed: 0,Community Area,Average_Wind_Speed,Average_Temperature,Fog_Ice_Freezing_Fog,Smoke_or_Haze
0,67.0,6.93,32.0,1.0,0.0
1,43.0,6.93,32.0,1.0,0.0
2,43.0,6.93,32.0,1.0,0.0
3,41.0,6.93,32.0,1.0,0.0
4,75.0,6.93,32.0,1.0,0.0


In [18]:
X.describe()

Unnamed: 0,Community Area,Average_Wind_Speed,Average_Temperature,Fog_Ice_Freezing_Fog,Smoke_or_Haze
count,52382.0,52382.0,52382.0,52382.0,52382.0
mean,37.714902,9.086781,55.4303,0.34676,0.150949
std,21.005551,3.508695,19.110416,0.475943,0.358002
min,1.0,0.0,0.0,0.0,0.0
25%,24.0,6.71,40.0,0.0,0.0
50%,32.0,8.72,57.0,0.0,0.0
75%,56.0,10.96,72.0,1.0,0.0
max,77.0,27.96,84.0,1.0,1.0


In [19]:
y.value_counts()

NON-VIOLENT    38149
VIOLENT        14233
Name: Violence Status, dtype: int64

In [20]:
X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=1)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### Balanced Random Forest Classifier

In [21]:
# Resample the training data with the BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1).fit(X_train_scaled, y_train)
y_pred=brf.predict(X_test_scaled)
brf

BalancedRandomForestClassifier(random_state=1)

In [22]:
print('Data for Balanced Random Forest Classifier')
print('---------------------------------------------------------------------')
print(f'Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_pred)}')
print('---------------------------------------------------------------------')
print('Confusion Matrix:')
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual Non-voilent", "Actual Violent"], 
                     columns=["Predicted Non-violent", "Predicted Violent"])
display(cm_df)
print('---------------------------------------------------------------------')
print('Imbalanced Classification Report:')
print(classification_report_imbalanced(y_test, y_pred))

Data for Balanced Random Forest Classifier
---------------------------------------------------------------------
Balanced Accuracy Score: 0.5156765637724742
---------------------------------------------------------------------
Confusion Matrix:


Unnamed: 0,Predicted Non-violent,Predicted Violent
Actual Non-voilent,4886,4679
Actual Violent,1693,1838


---------------------------------------------------------------------
Imbalanced Classification Report:
                   pre       rec       spe        f1       geo       iba       sup

NON-VIOLENT       0.74      0.51      0.52      0.61      0.52      0.27      9565
    VIOLENT       0.28      0.52      0.51      0.37      0.52      0.27      3531

avg / total       0.62      0.51      0.52      0.54      0.52      0.27     13096



In [23]:
for feat, importance in sorted(zip(X.columns, brf.feature_importances_), key = lambda x: x[1], reverse=True):
    print(f'{feat}: ({importance})')

Community Area: (0.5677144037287015)
Average_Wind_Speed: (0.21224657603323094)
Average_Temperature: (0.19642592253054492)
Fog_Ice_Freezing_Fog: (0.014785142222777834)
Smoke_or_Haze: (0.00882795548474484)


In [24]:
brf.predict([[25, 10, 33, 0, 0]])

array(['NON-VIOLENT'], dtype=object)

#### Easy Ensemble Adaboost Classifier

In [25]:
# Train the EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1).fit(X_train_scaled, y_train)
y_pred=eec.predict(X_test_scaled)
eec

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [26]:
print('Data for Easy Ensemble AdaBoost Classifier')
print('---------------------------------------------------------------------')
print(f'Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_pred)}')
print('---------------------------------------------------------------------')
print('Confusion Matrix:')
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual Non-voilent", "Actual Violent"], 
                     columns=["Predicted Non-violent", "Predicted Violent"])
display(cm_df)
print('---------------------------------------------------------------------')
print('Imbalanced Classification Report:')
print(classification_report_imbalanced(y_test, y_pred))

Data for Easy Ensemble AdaBoost Classifier
---------------------------------------------------------------------
Balanced Accuracy Score: 0.5385203091785209
---------------------------------------------------------------------
Confusion Matrix:


Unnamed: 0,Predicted Non-violent,Predicted Violent
Actual Non-voilent,4681,4884
Actual Violent,1456,2075


---------------------------------------------------------------------
Imbalanced Classification Report:
                   pre       rec       spe        f1       geo       iba       sup

NON-VIOLENT       0.76      0.49      0.59      0.60      0.54      0.28      9565
    VIOLENT       0.30      0.59      0.49      0.40      0.54      0.29      3531

avg / total       0.64      0.52      0.56      0.54      0.54      0.29     13096



In [27]:
eec.predict([[25, 10, 33, 0, 0]])

array(['NON-VIOLENT'], dtype=object)

#### Naive Random Oversampling

In [28]:
from collections import Counter
Counter(y_train)

Counter({'VIOLENT': 10702, 'NON-VIOLENT': 28584})

In [29]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'VIOLENT': 28584, 'NON-VIOLENT': 28584})

In [30]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

ros_model = LogisticRegression(solver='lbfgs', random_state=1)
ros_model.fit(X_resampled, y_resampled)
y_pred = ros_model.predict(X_test)

In [31]:
print('Data for Naive Random Oversampling')
print('---------------------------------------------------------------------')
print(f'Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_pred)}')
print('---------------------------------------------------------------------')
print('Confusion Matrix:')
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual Non-voilent", "Actual Violent"], 
                     columns=["Predicted Non-violent", "Predicted Violent"])
display(cm_df)
print('---------------------------------------------------------------------')
print('Imbalanced Classification Report:')
print(classification_report_imbalanced(y_test, y_pred))

Data for Naive Random Oversampling
---------------------------------------------------------------------
Balanced Accuracy Score: 0.5366165823044728
---------------------------------------------------------------------
Confusion Matrix:


Unnamed: 0,Predicted Non-violent,Predicted Violent
Actual Non-voilent,5078,4487
Actual Violent,1616,1915


---------------------------------------------------------------------
Imbalanced Classification Report:
                   pre       rec       spe        f1       geo       iba       sup

NON-VIOLENT       0.76      0.53      0.54      0.62      0.54      0.29      9565
    VIOLENT       0.30      0.54      0.53      0.39      0.54      0.29      3531

avg / total       0.63      0.53      0.54      0.56      0.54      0.29     13096



In [32]:
ros_model.predict([[25, 10, 33, 0, 0]])

  "X does not have valid feature names, but"


array(['NON-VIOLENT'], dtype=object)

#### SMOTE Oversampling

In [33]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({'VIOLENT': 28584, 'NON-VIOLENT': 28584})

In [34]:
# Train the Logistic Regression model using the resampled data
so_model = LogisticRegression(solver='lbfgs', random_state=1)
so_model.fit(X_resampled, y_resampled)
y_pred = so_model.predict(X_test)

In [35]:
print('Data for SMOTE Oversampling')
print('---------------------------------------------------------------------')
print(f'Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_pred)}')
print('---------------------------------------------------------------------')
print('Confusion Matrix:')
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual Non-voilent", "Actual Violent"], 
                     columns=["Predicted Non-violent", "Predicted Violent"])
display(cm_df)
print('---------------------------------------------------------------------')
print('Imbalanced Classification Report:')
print(classification_report_imbalanced(y_test, y_pred))

Data for SMOTE Oversampling
---------------------------------------------------------------------
Balanced Accuracy Score: 0.5346989838193652
---------------------------------------------------------------------
Confusion Matrix:


Unnamed: 0,Predicted Non-violent,Predicted Violent
Actual Non-voilent,5128,4437
Actual Violent,1648,1883


---------------------------------------------------------------------
Imbalanced Classification Report:
                   pre       rec       spe        f1       geo       iba       sup

NON-VIOLENT       0.76      0.54      0.53      0.63      0.53      0.29      9565
    VIOLENT       0.30      0.53      0.54      0.38      0.53      0.29      3531

avg / total       0.63      0.54      0.53      0.56      0.53      0.29     13096



In [36]:
so_model.predict([[25, 10, 33, 0, 0]])

  "X does not have valid feature names, but"


array(['NON-VIOLENT'], dtype=object)

#### Undersampling

In [37]:
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'NON-VIOLENT': 10702, 'VIOLENT': 10702})

In [38]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
cc_model = LogisticRegression(solver='lbfgs', random_state=1)
cc_model.fit(X_resampled, y_resampled)
y_pred = cc_model.predict(X_test)

In [39]:
print('Data for Undersampling')
print('---------------------------------------------------------------------')
print(f'Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_pred)}')
print('---------------------------------------------------------------------')
print('Confusion Matrix:')
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual Non-voilent", "Actual Violent"], 
                     columns=["Predicted Non-violent", "Predicted Violent"])
display(cm_df)
print('---------------------------------------------------------------------')
print('Imbalanced Classification Report:')
print(classification_report_imbalanced(y_test, y_pred))

Data for Undersampling
---------------------------------------------------------------------
Balanced Accuracy Score: 0.5259024282425409
---------------------------------------------------------------------
Confusion Matrix:


Unnamed: 0,Predicted Non-violent,Predicted Violent
Actual Non-voilent,4678,4887
Actual Violent,1544,1987


---------------------------------------------------------------------
Imbalanced Classification Report:
                   pre       rec       spe        f1       geo       iba       sup

NON-VIOLENT       0.75      0.49      0.56      0.59      0.52      0.27      9565
    VIOLENT       0.29      0.56      0.49      0.38      0.52      0.28      3531

avg / total       0.63      0.51      0.54      0.54      0.52      0.27     13096



In [40]:
cc_model.predict([[25, 10, 33, 0, 0]])

  "X does not have valid feature names, but"


array(['NON-VIOLENT'], dtype=object)

#### Combination Sampling

In [41]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({'NON-VIOLENT': 15616, 'VIOLENT': 10506})

In [42]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
cs_model = LogisticRegression(solver='lbfgs', random_state=1)
cs_model.fit(X_resampled, y_resampled)
y_pred = cs_model.predict(X_test)

In [43]:
print('Data for Combination (Over and Under) Sampling')
print('---------------------------------------------------------------------')
print(f'Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_pred)}')
print('---------------------------------------------------------------------')
print('Confusion Matrix:')
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index = ['Actual High Risk', 'Actual Low Risk'], 
                     columns = ['Predicted High Risk', 'Predicted Low Risk'])
display(cm_df)
print('---------------------------------------------------------------------')
print('Imbalanced Classification Report:')
print(classification_report_imbalanced(y_test, y_pred))

Data for Combination (Over and Under) Sampling
---------------------------------------------------------------------
Balanced Accuracy Score: 0.5103734927576719
---------------------------------------------------------------------
Confusion Matrix:


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,8344,1221
Actual Low Risk,3007,524


---------------------------------------------------------------------
Imbalanced Classification Report:
                   pre       rec       spe        f1       geo       iba       sup

NON-VIOLENT       0.74      0.87      0.15      0.80      0.36      0.14      9565
    VIOLENT       0.30      0.15      0.87      0.20      0.36      0.12      3531

avg / total       0.62      0.68      0.34      0.64      0.36      0.13     13096



In [46]:
cs_model.predict([[25, 0, 89, 0, 0]])

  "X does not have valid feature names, but"


array(['NON-VIOLENT'], dtype=object)