In [1]:
import pandas as pd
import numpy as np
from path import Path
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.svm import SVC

In [None]:
# # import the modules
# import pandas as pd
# from sqlalchemy import create_engine

# # SQLAlchemy connectable
# cnx = create_engine('sqlite:///contacts.db').connect()

# # table named 'contacts' will be returned as a dataframe.
# df = pd.read_sql_table('contacts', cnx)
# print(df)

In [2]:
# Loading crime data
file_path = Path("Resources/sample_crime_data.csv")
df = pd.read_csv(file_path)
df = df.set_index("ID")
print(df.shape)
df.head()

(206997, 18)


Unnamed: 0_level_0,ID.1,Block,IUCR,Primary Type,Violence Status,Description,Location Description,Arrest,Domestic,District,Ward,Community Area,FBI Code,Year,Latitude,Longitude,Date,Time
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
6909918,12260346,070XX S EGGLESTON AVE,486,BATTERY,VIOLENT,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,7.0,6.0,68.0,08B,2021,41.766435,-87.635964,2021-01-03,13:23:00
6927718,12263464,080XX S YALE AVE,820,THEFT,NON-VIOLENT,$500 AND UNDER,RESIDENCE,False,False,6.0,17.0,44.0,06,2021,41.748474,-87.630607,2021-01-03,06:59:00
6927807,12259990,056XX W WASHINGTON BLVD,486,BATTERY,VIOLENT,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,15.0,29.0,25.0,08B,2021,41.882224,-87.766076,2021-01-03,00:20:00
6931849,12260669,057XX S RACINE AVE,2022,NARCOTICS,NON-VIOLENT,POSSESS - COCAINE,STREET,True,False,7.0,16.0,67.0,18,2021,41.790069,-87.654769,2021-01-03,20:47:00
6931854,25702,068XX S STONY ISLAND AVE,110,HOMICIDE,VIOLENT,FIRST DEGREE MURDER,STREET,False,False,3.0,5.0,43.0,01A,2021,41.771062,-87.586271,2021-01-03,20:09:00


In [3]:
df['Violence Status'].value_counts()

NON-VIOLENT    135435
VIOLENT         71562
Name: Violence Status, dtype: int64

In [4]:
street_df = df[(df['Location Description'] == 'STREET')]

In [5]:
street_df.head()

Unnamed: 0_level_0,ID.1,Block,IUCR,Primary Type,Violence Status,Description,Location Description,Arrest,Domestic,District,Ward,Community Area,FBI Code,Year,Latitude,Longitude,Date,Time
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
6931849,12260669,057XX S RACINE AVE,2022,NARCOTICS,NON-VIOLENT,POSSESS - COCAINE,STREET,True,False,7.0,16.0,67.0,18,2021,41.790069,-87.654769,2021-01-03,20:47:00
6931854,25702,068XX S STONY ISLAND AVE,0110,HOMICIDE,VIOLENT,FIRST DEGREE MURDER,STREET,False,False,3.0,5.0,43.0,01A,2021,41.771062,-87.586271,2021-01-03,20:09:00
6940399,12260693,016XX E 69TH ST,141A,WEAPONS VIOLATION,NON-VIOLENT,UNLAWFUL USE - HANDGUN,STREET,False,False,3.0,5.0,43.0,15,2021,41.769708,-87.585264,2021-01-03,23:47:00
6940417,12262250,055XX S Blackstone Ave,1360,CRIMINAL TRESPASS,NON-VIOLENT,TO VEHICLE,STREET,False,False,2.0,5.0,41.0,26,2021,,,2021-01-03,01:00:00
6955690,12261308,111XX S ASHLAND AVE,0820,THEFT,NON-VIOLENT,$500 AND UNDER,STREET,False,False,22.0,34.0,75.0,06,2021,,,2021-01-03,03:20:00


In [6]:
# Loading weather data
file_path = Path("Resources/clean_weather_data.csv")
weather_df = pd.read_csv(file_path)
#df = df.set_index("ID")
print(df.shape)
weather_df.head()

(206997, 18)


Unnamed: 0,Date,Average_Wind_Speed,Average_Temperature,Maximum_Temperature,Minimum_Temperature,Fog_Ice_Freezing_Fog,Heavy_Fog_or_Heavy_Freezing_Fog,Thunder,Hail,Smoke_or_Haze,Mist,Rain,Freezing_Rain,Snow_Snow_Pellets_or_Ice Crystals
0,2010-01-01,10.29,,16.0,5.0,,,,,,,,,1.0
1,2010-01-02,11.86,,11.0,2.0,,,,,,,,,
2,2010-01-03,10.29,,18.0,-1.0,,,,,,,1.0,,1.0
3,2010-01-04,11.41,,18.0,7.0,,,,,,,1.0,,1.0
4,2010-01-05,8.5,,23.0,13.0,,,,,,,1.0,,1.0


In [7]:
merged_df = pd.merge(street_df, weather_df, how='left', on='Date')

In [8]:
print(merged_df.shape)
merged_df.head()

(52382, 31)


Unnamed: 0,ID.1,Block,IUCR,Primary Type,Violence Status,Description,Location Description,Arrest,Domestic,District,...,Minimum_Temperature,Fog_Ice_Freezing_Fog,Heavy_Fog_or_Heavy_Freezing_Fog,Thunder,Hail,Smoke_or_Haze,Mist,Rain,Freezing_Rain,Snow_Snow_Pellets_or_Ice Crystals
0,12260669,057XX S RACINE AVE,2022,NARCOTICS,NON-VIOLENT,POSSESS - COCAINE,STREET,True,False,7.0,...,26.0,1.0,,,,,,,,
1,25702,068XX S STONY ISLAND AVE,0110,HOMICIDE,VIOLENT,FIRST DEGREE MURDER,STREET,False,False,3.0,...,26.0,1.0,,,,,,,,
2,12260693,016XX E 69TH ST,141A,WEAPONS VIOLATION,NON-VIOLENT,UNLAWFUL USE - HANDGUN,STREET,False,False,3.0,...,26.0,1.0,,,,,,,,
3,12262250,055XX S Blackstone Ave,1360,CRIMINAL TRESPASS,NON-VIOLENT,TO VEHICLE,STREET,False,False,2.0,...,26.0,1.0,,,,,,,,
4,12261308,111XX S ASHLAND AVE,0820,THEFT,NON-VIOLENT,$500 AND UNDER,STREET,False,False,22.0,...,26.0,1.0,,,,,,,,


In [9]:
merged_df.dtypes

ID.1                                   int64
Block                                 object
IUCR                                  object
Primary Type                          object
Violence Status                       object
Description                           object
Location Description                  object
Arrest                                  bool
Domestic                                bool
District                             float64
Ward                                 float64
Community Area                       float64
FBI Code                              object
Year                                   int64
Latitude                             float64
Longitude                            float64
Date                                  object
Time                                  object
Average_Wind_Speed                   float64
Average_Temperature                  float64
Maximum_Temperature                  float64
Minimum_Temperature                  float64
Fog_Ice_Fr

In [10]:
df = merged_df[['Violence Status', 'Community Area', 'Maximum_Temperature']]
print(df.shape)
df.head()

(52382, 3)


Unnamed: 0,Violence Status,Community Area,Maximum_Temperature
0,NON-VIOLENT,67.0,33.0
1,VIOLENT,43.0,33.0
2,NON-VIOLENT,43.0,33.0
3,NON-VIOLENT,41.0,33.0
4,NON-VIOLENT,75.0,33.0


In [11]:
df = df.replace(np.nan,0)
df.head()

Unnamed: 0,Violence Status,Community Area,Maximum_Temperature
0,NON-VIOLENT,67.0,33.0
1,VIOLENT,43.0,33.0
2,NON-VIOLENT,43.0,33.0
3,NON-VIOLENT,41.0,33.0
4,NON-VIOLENT,75.0,33.0


In [12]:
df.columns

Index(['Violence Status', 'Community Area', 'Maximum_Temperature'], dtype='object')

In [13]:
df.nunique()

Violence Status         2
Community Area         77
Maximum_Temperature    81
dtype: int64

In [14]:
y = df['Violence Status']
X = df.drop(columns='Violence Status')
X = pd.get_dummies(X)

In [15]:
print(X.shape)
print(y.shape)
X.head()


(52382, 2)
(52382,)


Unnamed: 0,Community Area,Maximum_Temperature
0,67.0,33.0
1,43.0,33.0
2,43.0,33.0
3,41.0,33.0
4,75.0,33.0


In [16]:
X.describe()

Unnamed: 0,Community Area,Maximum_Temperature
count,52382.0,52382.0
mean,37.714902,63.752071
std,21.005551,20.673494
min,1.0,0.0
25%,24.0,47.0
50%,32.0,67.0
75%,56.0,82.0
max,77.0,95.0


In [17]:
y.value_counts()

NON-VIOLENT    38149
VIOLENT        14233
Name: Violence Status, dtype: int64

In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y, random_state=42)

In [19]:
# Creating StandardScaler instance
scaler = StandardScaler()
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)
# Scaling data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [21]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [22]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [23]:
predictions

array(['NON-VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', ..., 'NON-VIOLENT',
       'NON-VIOLENT', 'NON-VIOLENT'], dtype=object)

In [24]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Non-voilent", "Actual Violent"], columns=["Predicted Non-violent", "Predicted Violent"])

cm_df

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [25]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted Non-violent,Predicted Violent
Actual Non-voilent,8967,571
Actual Violent,3310,248


Accuracy Score : 0.7036499694563225
Classification Report
              precision    recall  f1-score   support

 NON-VIOLENT       0.73      0.94      0.82      9538
     VIOLENT       0.30      0.07      0.11      3558

    accuracy                           0.70     13096
   macro avg       0.52      0.50      0.47     13096
weighted avg       0.61      0.70      0.63     13096



In [26]:
for feat, importance in sorted(zip(X.columns, rf_model.feature_importances_), key = lambda x: x[1], reverse=True):
    print(f'{feat}: ({importance})')

Community Area: (0.5082498013984034)
Maximum_Temperature: (0.49175019860159663)
