In [1]:
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.svm import SVC

In [2]:
# Loading data
file_path = Path("Resources/sample_crime_data.csv")
df = pd.read_csv(file_path)
df = df.set_index("ID")
print(df.shape)
df.head()

(206997, 18)


Unnamed: 0_level_0,ID.1,Block,IUCR,Primary Type,Violence Status,Description,Location Description,Arrest,Domestic,District,Ward,Community Area,FBI Code,Year,Latitude,Longitude,Date,Time
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
6909918,12260346,070XX S EGGLESTON AVE,486,BATTERY,VIOLENT,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,7.0,6.0,68.0,08B,2021,41.766435,-87.635964,2021-01-03,13:23:00
6927718,12263464,080XX S YALE AVE,820,THEFT,NON-VIOLENT,$500 AND UNDER,RESIDENCE,False,False,6.0,17.0,44.0,06,2021,41.748474,-87.630607,2021-01-03,06:59:00
6927807,12259990,056XX W WASHINGTON BLVD,486,BATTERY,VIOLENT,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,15.0,29.0,25.0,08B,2021,41.882224,-87.766076,2021-01-03,00:20:00
6931849,12260669,057XX S RACINE AVE,2022,NARCOTICS,NON-VIOLENT,POSSESS - COCAINE,STREET,True,False,7.0,16.0,67.0,18,2021,41.790069,-87.654769,2021-01-03,20:47:00
6931854,25702,068XX S STONY ISLAND AVE,110,HOMICIDE,VIOLENT,FIRST DEGREE MURDER,STREET,False,False,3.0,5.0,43.0,01A,2021,41.771062,-87.586271,2021-01-03,20:09:00


In [3]:
df.dtypes

ID.1                      int64
Block                    object
IUCR                     object
Primary Type             object
Violence Status          object
Description              object
Location Description     object
Arrest                     bool
Domestic                   bool
District                float64
Ward                    float64
Community Area          float64
FBI Code                 object
Year                      int64
Latitude                float64
Longitude               float64
Date                     object
Time                     object
dtype: object

In [4]:
# Generate our categorical variable list
df_cat = df.dtypes[df.dtypes == "object"].index.tolist()


# Check the number of unique values in each column
df[df_cat].nunique()

Block                   26955
IUCR                      300
Primary Type               31
Violence Status             2
Description               278
Location Description      126
FBI Code                   26
Date                      370
Time                     1450
dtype: int64

In [5]:
df['Violence Status'].value_counts()

NON-VIOLENT    135435
VIOLENT         71562
Name: Violence Status, dtype: int64

In [6]:
df = df[['Date','Violence Status']]
df.head()

Unnamed: 0_level_0,Date,Violence Status
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
6909918,2021-01-03,VIOLENT
6927718,2021-01-03,NON-VIOLENT
6927807,2021-01-03,VIOLENT
6931849,2021-01-03,NON-VIOLENT
6931854,2021-01-03,VIOLENT


In [7]:
# create sample weather data
import numpy as np

Chicago_Weather_df = pd.DataFrame({'Temp':np.random.uniform(-10,70,206997),'Date':df.Date}).set_index('Date').reset_index()

In [8]:
df = df.set_index('Date').reset_index()

In [9]:
merged_df = df.merge(Chicago_Weather_df,on='Date').drop(columns='Date')

In [10]:
merged_df.head()

Unnamed: 0,Violence Status,Temp
0,VIOLENT,38.202938
1,VIOLENT,59.132091
2,VIOLENT,49.818465
3,VIOLENT,-3.623086
4,VIOLENT,30.943212


In [15]:
sample_df = merged_df.sample(10000).reset_index()
sample_df = sample_df.drop(columns='index')
sample_df

Unnamed: 0,Violence Status,Temp
0,NON-VIOLENT,44.425220
1,VIOLENT,-8.674899
2,VIOLENT,22.898026
3,VIOLENT,27.636420
4,NON-VIOLENT,27.661422
...,...,...
9995,VIOLENT,36.494386
9996,NON-VIOLENT,64.575210
9997,NON-VIOLENT,34.729582
9998,NON-VIOLENT,68.479947


In [25]:
y = sample_df['Violence Status']
X = sample_df[['Temp']]

In [26]:
X_train,X_test,y_train,y_test = train_test_split(X,y, random_state=42)

In [18]:
#merged_df.plot.scatter('Temp','Violence Status')

In [27]:
# Create a logistic regression model, fit (train) the model, and validate the model.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6488
Testing Data Score: 0.6568


In [28]:
# Create a confusion matrix and print out the confusion matrix for the predicted outcome.
y_true = y_test
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Violent", "Actual Non-Violent"], columns=["Predicted Violent", "Predicted Non-Violent"])

cm_df

Unnamed: 0,Predicted Violent,Predicted Non-Violent
Actual Violent,1642,0
Actual Non-Violent,858,0


In [21]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

 NON-VIOLENT       0.66      1.00      0.79      1642
     VIOLENT       0.00      0.00      0.00       858

    accuracy                           0.66      2500
   macro avg       0.33      0.50      0.40      2500
weighted avg       0.43      0.66      0.52      2500



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
