In [85]:
# Import dependencies
import pandas as pd
import numpy as np
from path import Path
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import warnings

In [86]:
df = pd.read_csv('crime.csv')
df.head()

Unnamed: 0,ID,Primary_Type,Violence_Status,Description,Location_Description,Arrest,Community_Area,Latitude,Longitude,Date,Time,Average_Wind_Speed,Average_Temperature,Fog_Ice_Freezing_Fog,Thunder,Smoke_or_Haze,Community_Name
0,11698035,ASSAULT,VIOLENT,SIMPLE,APARTMENT,f,44,41.747196,-87.602638,1/1/19,8:00:00,7.61,31,1.0,,,Chatham
1,11884966,OFFENSE INVOLVING CHILDREN,NON-VIOLENT,CHILD PORNOGRAPHY,RESIDENCE,f,55,41.683664,-87.532842,1/1/19,0:01:00,7.61,31,1.0,,,Hegewisch
2,11878985,THEFT,NON-VIOLENT,OVER $500,OTHER,f,18,41.923613,-87.793512,1/1/19,0:00:00,7.61,31,1.0,,,Montclare
3,11682859,OFFENSE INVOLVING CHILDREN,NON-VIOLENT,SEX ASSLT OF CHILD BY FAM MBR,RESIDENCE,f,69,41.759333,-87.633307,1/1/19,0:00:00,7.61,31,1.0,,,Greater Grand Crossing
4,11739161,OFFENSE INVOLVING CHILDREN,NON-VIOLENT,AGG CRIM SEX ABUSE FAM MEMBER,APARTMENT,f,54,41.656345,-87.60513,1/1/19,0:00:00,7.61,31,1.0,,,Riverdale


In [87]:
# Set up dataframe with only the features we are pridicting
df = df[['Date', 'Violence_Status', 'Community_Area', 'Average_Temperature']]
df = df.replace(np.nan,0)
df.head()

Unnamed: 0,Date,Violence_Status,Community_Area,Average_Temperature
0,1/1/19,VIOLENT,44,31
1,1/1/19,NON-VIOLENT,55,31
2,1/1/19,NON-VIOLENT,18,31
3,1/1/19,NON-VIOLENT,69,31
4,1/1/19,NON-VIOLENT,54,31


In [88]:
# correct column format


print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668177 entries, 0 to 668176
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Date                 668177 non-null  object
 1   Violence_Status      668177 non-null  object
 2   Community_Area       668177 non-null  int64 
 3   Average_Temperature  668177 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 20.4+ MB
None


In [89]:
df.isnull().values.any()

False

In [90]:
# correct date format

df['Date'] = pd.to_datetime(df['Date']).dt.date

In [91]:
# correct date format

df['Date'] =  pd.to_datetime(df['Date'], format="%Y-%m-%d")

In [92]:
# correct date format

df['Day of year'] =  pd.to_datetime(df['Date'], format="%m-%d")

In [93]:
df

Unnamed: 0,Date,Violence_Status,Community_Area,Average_Temperature,Day of year
0,2019-01-01,VIOLENT,44,31,2019-01-01
1,2019-01-01,NON-VIOLENT,55,31,2019-01-01
2,2019-01-01,NON-VIOLENT,18,31,2019-01-01
3,2019-01-01,NON-VIOLENT,69,31,2019-01-01
4,2019-01-01,NON-VIOLENT,54,31,2019-01-01
...,...,...,...,...,...
668172,2021-12-31,NON-VIOLENT,8,37,2021-12-31
668173,2021-12-31,VIOLENT,47,37,2021-12-31
668174,2021-12-31,NON-VIOLENT,34,37,2021-12-31
668175,2021-12-31,NON-VIOLENT,35,37,2021-12-31


In [94]:
def wrangle(X):
    X = X.copy()

    # Create 'year' feature
    X['Year'] = X['Date'].dt.year

    # Create 'month' feature
    X['Month'] = X['Date'].dt.month

    # Create 'day' feature
    X['Day'] = X['Date'].dt.day

    # Create 'day of week' feature
    X['Weekday'] = X['Date'].dt.weekday
    return X
df=wrangle(df)

In [95]:
crimes_per_day=df.groupby(["Date", "Community_Area"]).size().reset_index(name="Crimes_per_day")
crimes_per_day

Unnamed: 0,Date,Community_Area,Crimes_per_day
0,2019-01-01,1,18
1,2019-01-01,2,9
2,2019-01-01,3,14
3,2019-01-01,4,5
4,2019-01-01,5,4
...,...,...,...
79691,2021-12-31,73,9
79692,2021-12-31,74,2
79693,2021-12-31,75,7
79694,2021-12-31,76,2


In [101]:
df=pd.merge(df, crimes_per_day, on=['Date', 'Community_Area'])

In [102]:
df

Unnamed: 0,Date,Violence_Status,Community_Area,Average_Temperature,Day of year,Year,Month,Day,Weekday,Crimes_per_day
0,2019-01-01,VIOLENT,44,31,2019-01-01,2019,1,1,1,31
1,2019-01-01,NON-VIOLENT,44,31,2019-01-01,2019,1,1,1,31
2,2019-01-01,VIOLENT,44,31,2019-01-01,2019,1,1,1,31
3,2019-01-01,VIOLENT,44,31,2019-01-01,2019,1,1,1,31
4,2019-01-01,NON-VIOLENT,44,31,2019-01-01,2019,1,1,1,31
...,...,...,...,...,...,...,...,...,...,...
668172,2021-12-31,NON-VIOLENT,17,37,2021-12-31,2021,12,31,4,6
668173,2021-12-31,NON-VIOLENT,17,37,2021-12-31,2021,12,31,4,6
668174,2021-12-31,NON-VIOLENT,17,37,2021-12-31,2021,12,31,4,6
668175,2021-12-31,NON-VIOLENT,36,37,2021-12-31,2021,12,31,4,1


In [104]:
df = df[['Date','Community_Area', 'Average_Temperature',
         'Year', 'Month', 'Day', 'Weekday', 'Crimes_per_day']]
df = df.replace(np.nan,0)
df

Unnamed: 0,Date,Community_Area,Average_Temperature,Year,Month,Day,Weekday,Crimes_per_day
0,2019-01-01,44,31,2019,1,1,1,31
1,2019-01-01,44,31,2019,1,1,1,31
2,2019-01-01,44,31,2019,1,1,1,31
3,2019-01-01,44,31,2019,1,1,1,31
4,2019-01-01,44,31,2019,1,1,1,31
...,...,...,...,...,...,...,...,...
668172,2021-12-31,17,37,2021,12,31,4,6
668173,2021-12-31,17,37,2021,12,31,4,6
668174,2021-12-31,17,37,2021,12,31,4,6
668175,2021-12-31,36,37,2021,12,31,4,1


In [61]:
# df = crimes_per_day.merge(df,on='Date')

In [105]:
# df

In [106]:
features = df.columns.tolist()

In [107]:
features.remove('Crimes_per_day')

In [108]:
features.remove('Year')

In [110]:
features.remove('Date')

In [111]:
features

['Community_Area', 'Average_Temperature', 'Month', 'Day', 'Weekday']

In [112]:
# # split data for training and testing
# X = df
# X = pd.get_dummies(X)

# y = df
# y = pd.get_dummies(Y)
# #Split the data into training and testing
# X_train,X_test,y_train,y_test = train_test_split(X,y, random_state=42)

# X_train.shape

In [113]:
train = df[df['Year'] < 2020]
test = df[df['Year'] >= 2020]
train.shape, test.shape

((258152, 8), (410025, 8))

In [114]:

#Set target

target='Crimes_per_day'

#create dataframes for model (w/ features)

X_train = train[features]
y_train = train[target]
X_test = test[features]
y_test = test[target]

In [115]:
X_train

Unnamed: 0,Community_Area,Average_Temperature,Month,Day,Weekday
0,44,31,1,1,1
1,44,31,1,1,1
2,44,31,1,1,1
3,44,31,1,1,1
4,44,31,1,1,1
...,...,...,...,...,...
258147,37,26,12,31,1
258148,37,26,12,31,1
258149,13,26,12,31,1
258150,64,26,12,31,1


In [116]:
y_test

258152    15
258153    15
258154    15
258155    15
258156    15
          ..
668172     6
668173     6
668174     6
668175     1
668176     1
Name: Crimes_per_day, Length: 410025, dtype: int64

In [117]:
features

['Community_Area', 'Average_Temperature', 'Month', 'Day', 'Weekday']

In [118]:
guess = df['Crimes_per_day'].mean()
errors = guess - df['Crimes_per_day']
mean_absolute_error = errors.abs().mean()
print(f'If we just guessed every community had {guess:}, crimes per day')
print(f'we would be off by {mean_absolute_error:} crimes on average.')

If we just guessed every community had 15.55104261296034, crimes per day
we would be off by 8.39106275274902 crimes on average.


In [119]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: -0.78 %.


In [120]:
X = X_train
y = y_train

In [121]:
from sklearn.pipeline import make_pipeline
from sklearn import datasets, svm
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [122]:
# create random forest regressor pipeline


pipeline = make_pipeline(
    RandomForestRegressor(n_estimators = 100,
                          max_depth = 15,
                          random_state = 42, n_jobs=-1)
)
pipeline.fit(X,y)



Pipeline(steps=[('randomforestregressor',
                 RandomForestRegressor(max_depth=15, n_jobs=-1,
                                       random_state=42))])

In [123]:
# create predict function for testing and deployment

def predict (Community_Area,
             Average_Temperature,
             Month,
             Day,
             Weekday):
    df9=pd.DataFrame(
        columns = ['Community_Area',
                   'Average_Temperature',
                   'Month',
                   'Day',
                   'Weekday'],
                    data =[[Community_Area,
                            Average_Temperature,
                            Month,
                            Day,
                            Weekday]]
    )
    df9pred = pipeline.predict(df9)[0]
    return f'{df9pred:.0f}'

In [124]:
# Use the forest's predict method on the test data
predictions = pipeline.predict(X)
# Calculate the absolute errors
errors = abs(predictions - y)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 5), 'crimes.')

Mean Absolute Error: 1.27842 crimes.


In [125]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 5), '%.')

Accuracy: 86.87068 %.


In [126]:
errors.describe()

count    258152.000000
mean          1.278421
std           1.474929
min           0.000000
25%           0.240825
50%           0.775276
75%           1.806127
max          19.153950
Name: Crimes_per_day, dtype: float64

In [127]:
# test model for deployment

predict (1,14,3,0,0)

'9'

In [128]:
# export model for deployment

from joblib import dump
dump(pipeline, 'model2.joblib', compress=True)

['model2.joblib']