In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from path import Path
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import warnings

In [2]:
df = pd.read_csv('data/crime.csv')
df.head()

Unnamed: 0,ID,Primary_Type,Violence_Status,Description,Location_Description,Arrest,Community_Area,Latitude,Longitude,Date,Time,Average_Wind_Speed,Average_Temperature,Fog_Ice_Freezing_Fog,Thunder,Smoke_or_Haze,Community_Name
0,11698035,ASSAULT,VIOLENT,SIMPLE,APARTMENT,f,44,41.747196,-87.602638,1/1/19,8:00:00,7.61,31,1.0,,,Chatham
1,11884966,OFFENSE INVOLVING CHILDREN,NON-VIOLENT,CHILD PORNOGRAPHY,RESIDENCE,f,55,41.683664,-87.532842,1/1/19,0:01:00,7.61,31,1.0,,,Hegewisch
2,11878985,THEFT,NON-VIOLENT,OVER $500,OTHER,f,18,41.923613,-87.793512,1/1/19,0:00:00,7.61,31,1.0,,,Montclare
3,11682859,OFFENSE INVOLVING CHILDREN,NON-VIOLENT,SEX ASSLT OF CHILD BY FAM MBR,RESIDENCE,f,69,41.759333,-87.633307,1/1/19,0:00:00,7.61,31,1.0,,,Greater Grand Crossing
4,11739161,OFFENSE INVOLVING CHILDREN,NON-VIOLENT,AGG CRIM SEX ABUSE FAM MEMBER,APARTMENT,f,54,41.656345,-87.60513,1/1/19,0:00:00,7.61,31,1.0,,,Riverdale


In [3]:
# Set up dataframe with only the features we are pridicting
df = df[['Date', 'Violence_Status', 'Community_Area', 'Average_Wind_Speed', 'Average_Temperature', 'Fog_Ice_Freezing_Fog',
         'Smoke_or_Haze']]
df = df.replace(np.nan,0)
df.head()

Unnamed: 0,Date,Violence_Status,Community_Area,Average_Wind_Speed,Average_Temperature,Fog_Ice_Freezing_Fog,Smoke_or_Haze
0,1/1/19,VIOLENT,44,7.61,31,1.0,0.0
1,1/1/19,NON-VIOLENT,55,7.61,31,1.0,0.0
2,1/1/19,NON-VIOLENT,18,7.61,31,1.0,0.0
3,1/1/19,NON-VIOLENT,69,7.61,31,1.0,0.0
4,1/1/19,NON-VIOLENT,54,7.61,31,1.0,0.0


In [4]:
# correct column format

# df['DailyWeather'] = df['DailyWeather'].astype(str)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668177 entries, 0 to 668176
Data columns (total 7 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Date                  668177 non-null  object 
 1   Violence_Status       668177 non-null  object 
 2   Community_Area        668177 non-null  int64  
 3   Average_Wind_Speed    668177 non-null  float64
 4   Average_Temperature   668177 non-null  int64  
 5   Fog_Ice_Freezing_Fog  668177 non-null  float64
 6   Smoke_or_Haze         668177 non-null  float64
dtypes: float64(3), int64(2), object(2)
memory usage: 35.7+ MB
None


In [5]:
# correct date format

df['Date'] = pd.to_datetime(df['Date']).dt.date

In [6]:
# correct date format

df['Date'] =  pd.to_datetime(df['Date'], format="%Y-%m-%d")

In [7]:
# correct date format

df['Day of year'] =  pd.to_datetime(df['Date'], format="%m-%d")

In [8]:
df.head()

Unnamed: 0,Date,Violence_Status,Community_Area,Average_Wind_Speed,Average_Temperature,Fog_Ice_Freezing_Fog,Smoke_or_Haze,Day of year
0,2019-01-01,VIOLENT,44,7.61,31,1.0,0.0,2019-01-01
1,2019-01-01,NON-VIOLENT,55,7.61,31,1.0,0.0,2019-01-01
2,2019-01-01,NON-VIOLENT,18,7.61,31,1.0,0.0,2019-01-01
3,2019-01-01,NON-VIOLENT,69,7.61,31,1.0,0.0,2019-01-01
4,2019-01-01,NON-VIOLENT,54,7.61,31,1.0,0.0,2019-01-01


In [9]:
def wrangle(X):
    X = X.copy()

    # Create 'year' feature
    X['Year'] = X['Date'].dt.year

    # Create 'month' feature
    X['Month'] = X['Date'].dt.month

    # Create 'day' feature
    X['Day'] = X['Date'].dt.day

    # Create 'day of week' feature
    X['Weekday'] = X['Date'].dt.weekday
    return X
df=wrangle(df)

In [10]:
crimes_per_day=df.groupby(["Date"]).size().reset_index(name="Crimes_per_day")
crimes_per_day

Unnamed: 0,Date,Crimes_per_day
0,2019-01-01,1005
1,2019-01-02,645
2,2019-01-03,748
3,2019-01-04,759
4,2019-01-05,789
...,...,...
1091,2021-12-27,446
1092,2021-12-28,403
1093,2021-12-29,451
1094,2021-12-30,482


In [11]:
df = crimes_per_day.merge(df,on='Date')

In [12]:
df = df[['Crimes_per_day', 'Community_Area', 'Average_Temperature',
         'Year', 'Month', 'Day', 'Weekday']]
df = df.replace(np.nan,0)
df.head()

Unnamed: 0,Crimes_per_day,Community_Area,Average_Temperature,Year,Month,Day,Weekday
0,1005,44,31,2019,1,1,1
1,1005,55,31,2019,1,1,1
2,1005,18,31,2019,1,1,1
3,1005,69,31,2019,1,1,1
4,1005,54,31,2019,1,1,1


In [13]:
df.dtypes

Crimes_per_day         int64
Community_Area         int64
Average_Temperature    int64
Year                   int64
Month                  int64
Day                    int64
Weekday                int64
dtype: object

In [19]:
features = df.columns.tolist()

In [38]:
features.remove('Crimes_per_day')

In [40]:
features.remove('Year')

In [41]:
features

['Community_Area', 'Average_Temperature', 'Month', 'Day', 'Weekday']

In [98]:
# # split data for training and testing
# X = df
# X = pd.get_dummies(X)

# y = df
# y = pd.get_dummies(Y)
# #Split the data into training and testing
# X_train,X_test,y_train,y_test = train_test_split(X,y, random_state=42)

# X_train.shape

(501132, 7)

In [42]:
train = df[df['Year'] < 2020]
test = df[df['Year'] >= 2020]
train.shape, test.shape

((258152, 7), (410025, 7))

In [43]:

#Set target

target='Crimes_per_day'

#create dataframes for model (w/ features)

X_train = train[features]
y_train = train[target]
X_test = test[features]
y_test = test[target]

In [44]:
X_train

Unnamed: 0,Community_Area,Average_Temperature,Month,Day,Weekday
0,44,31,1,1,1
1,55,31,1,1,1
2,18,31,1,1,1
3,69,31,1,1,1
4,54,31,1,1,1
...,...,...,...,...,...
258147,66,26,12,31,1
258148,51,26,12,31,1
258149,34,26,12,31,1
258150,26,26,12,31,1


In [45]:
y_test

258152    865
258153    865
258154    865
258155    865
258156    865
         ... 
668172    523
668173    523
668174    523
668175    523
668176    523
Name: Crimes_per_day, Length: 410025, dtype: int64

In [46]:
features

['Community_Area', 'Average_Temperature', 'Month', 'Day', 'Weekday']

In [47]:
guess = df['Crimes_per_day'].mean()
errors = guess - df['Crimes_per_day']
mean_absolute_error = errors.abs().mean()
print(f'If we just guessed every community had {guess:}, crimes per day')
print(f'we would be off by {mean_absolute_error:} crimes on average.')

If we just guessed every community had 630.9038338643802, crimes per day
we would be off by 89.26093000317402 crimes on average.


In [48]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 87.52 %.


In [49]:
X = X_train
y = y_train

In [50]:
from sklearn.pipeline import make_pipeline
from sklearn import datasets, svm
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [51]:
# create random forest regressor pipeline


pipeline = make_pipeline(
    RandomForestRegressor(n_estimators = 100,
                          max_depth = 15,
                          random_state = 42, n_jobs=-1)
)
pipeline.fit(X,y)



Pipeline(steps=[('randomforestregressor',
                 RandomForestRegressor(max_depth=15, n_jobs=-1,
                                       random_state=42))])

In [52]:
# create predict function for testing and deployment

def predict (Community_Area,
             Average_Temperature,
             Month,
             Day,
             Weekday):
    df9=pd.DataFrame(
        columns = ['Community_Area',
                   'Average_Temperature',
                   'Month',
                   'Day',
                   'Weekday'],
                    data =[[Community_Area,
                            Average_Temperature,
                            Month,
                            Day,
                            Weekday]]
    )
    df9pred = pipeline.predict(df9)[0]
    return f'{df9pred:.0f}'

In [53]:
# Use the forest's predict method on the test data
predictions = pipeline.predict(X)
# Calculate the absolute errors
errors = abs(predictions - y)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 5), 'crimes.')

Mean Absolute Error: 0.26738 crimes.


In [54]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 5), '%.')

Accuracy: 99.96215 %.


In [55]:
errors.describe()

count    258152.000000
mean          0.267384
std           1.142198
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          10.000112
Name: Crimes_per_day, dtype: float64

In [56]:
# test model for deployment

predict (1,14,3,0,0)

'674'

In [57]:
# export model for deployment

from joblib import dump
dump(pipeline, 'feature_model.joblib', compress=True)

['feature_model.joblib']

In [121]:
# # correct column format

# # df['DailyWeather'] = df['DailyWeather'].astype(str)
# print(df.info())

In [122]:
# # Create our features
# X = df.drop(columns='Violence_Status')
# X = pd.get_dummies(X)
# # Create our target
# y = df2['Violence_Status']

In [123]:
# print(X.shape)
# print(y.shape)
# X.head()

In [124]:
# #Split the data into training and testing
# X_train,X_test,y_train,y_test = train_test_split(X,y, random_state=42)

# X_train.shape


In [125]:
# # Creating StandardScaler instance
# scaler = StandardScaler()
# # Fitting Standard Scaller
# X_scaler = scaler.fit(X_train)
# # Scaling data
# X_train_scaled = scaler.transform(X_train)
# X_test_scaled = scaler.transform(X_test)

In [126]:
# # Create a random forest classifier.
# rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [127]:
# # Fitting the model
# rf_model = rf_model.fit(X_train_scaled, y_train)

In [128]:
# # Making predictions using the testing data.
# predictions = rf_model.predict(X_test_scaled)

In [129]:
# predictions

In [130]:
# # Calculating the confusion matrix.
# cm = confusion_matrix(y_test, predictions)

# # Create a DataFrame from the confusion matrix.
# cm_df = pd.DataFrame(
#     cm, index=["Actual Non-voilent", "Actual Violent"], columns=["Predicted Non-violent", "Predicted Violent"])

# cm_df

# # Calculating the accuracy score
# acc_score = accuracy_score(y_test, predictions)

In [131]:
# # Displaying results
# print("Confusion Matrix")
# display(cm_df)
# print(f"Accuracy Score : {acc_score}")
# print("Classification Report")
# print(classification_report(y_test, predictions))

In [132]:
# # Displaying the feature importances
# feature_importances = rf_model.feature_importances_
# %matplotlib inline
# from matplotlib import pyplot as plt
# features = sorted(zip(X.columns, rf_model.feature_importances_), key = lambda x: x[1])
# cols = [f[0] for f in features]
# width = [f[1] for f in features]

# fig, ax = plt.subplots()

# fig.set_size_inches(10,5)
# plt.margins(y=0.001)

# ax.barh(y=cols, width=width)

# plt.show()

In [133]:
# # Test prediction 
# rf_model.predict([[1, 0, 80, 0, 0, 4, 21, 3, 0]])

In [134]:
# from sklearn.pipeline import make_pipeline
# from sklearn import datasets, svm
# from sklearn.svm import SVC
# from sklearn.pipeline import Pipeline

In [135]:
# # convert string to integer

# df['Community_Area'] = df['Community_Area'].apply(np.int64)

In [136]:
# def wrangle(X):
#     X = X.copy()

#     # Create 'year' feature
#     X['Year'] = X['DATE'].dt.year

#     # Create 'month' feature
#     X['Month'] = X['DATE'].dt.month

#     # Create 'day' feature
#     X['Day'] = X['DATE'].dt.day

#     # Create 'day of week' feature
#     X['Weekday'] = X['DATE'].dt.weekday
#     return X
# train=wrangle(df)
# train.head()

In [137]:
# df.head()

In [138]:
# df = df[['Year', 'Month', 'Day', 'Weekday', 'Community_Area', 'Average_Wind_Speed', 'Average_Temperature', 'Fog_Ice_Freezing_Fog',
#          'Smoke_or_Haze']]
# df.head()

In [139]:
# # create random forest class pipeline


# # pipeline = make_pipeline(
# #     RandomForestClassifier(n_estimators = 100,
# #                           max_depth = 15,
# #                           random_state = 42, n_jobs=-1)
# # )
# # pipeline.fit(X,y)

# pipeline = Pipeline([('feature_selection', SelectKBest(chi2, k=2)), ('classification', RandomForestClassifier()) ])
# print(type(pipeline))

In [140]:
# # create predict function for testing and deployment

# def predict (Community_Area,
#              Average_Temperature,
#              Month,
#              Day,
#              Weekday):
#     df9=pd.DataFrame(
#         columns = ['Community_Area',
#                    'Average_Temperature',
#                    'Month',
#                    'Day',
#                    'Weekday'],
#                     data =[[Community_Area,
#                             Average_Temperature,
#                             Month,
#                             Day,
#                             Weekday]]
#     )
#     df9pred = pipeline.predict(df9)[0]
#     return f'{df9pred:.0f}'

In [141]:
# # Use the forest's predict method on the test data
# predictions = pipeline.predict(X)
# # Calculate the absolute errors
# errors = abs(predictions - y)
# # Print out the mean absolute error (mae)
# print('Mean Absolute Error:', round(np.mean(errors), 5), 'crimes.')

In [142]:
# # export model for deployment

# from joblib import dump
# dump(pipeline, 'pipeline3.joblib', compress=True)

In [143]:
# # Use the forest's predict method on the test data
# predictions = pipeline.predict(X)
# # Calculate the absolute errors
# errors = abs(predictions - y)
# # Print out the mean absolute error (mae)
# print('Mean Absolute Error:', round(np.mean(errors), 5), 'crimes.')

In [144]:
# # Calculate mean absolute percentage error (MAPE)
# mape = 100 * (errors / y)
# # Calculate and display accuracy
# accuracy = 100 - np.mean(mape)
# print('Accuracy:', round(accuracy, 5), '%.')