# Project File


In [38]:
# Importing common libraries 
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

import warnings 
warnings.simplefilter(action='ignore', category=FutureWarning)


In [39]:
# Load the raw incident records.
# A forward-slash relative path resolves on Windows, macOS and Linux alike;
# the previous backslash-separated literal only resolved on Windows.
data = pd.read_csv("data/crime.csv", encoding='latin1')



In [40]:
# Number of rows to randomly sample from the loaded data for this analysis.
subset_size = 500

In [41]:
# Draw a reproducible random subset (fixed seed).
# The sample size is capped at the number of rows available, so the cell does
# not raise when the frame holds fewer than `subset_size` rows -- e.g. with a
# small input file, or when the cell is re-run after rows were already removed.
data = data.sample(n=min(subset_size, len(data)), random_state=42)

In [42]:
# Preview the first rows of the sampled frame.
data.head()


Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
35710,I182032558,3803,Motor Vehicle Accident Response,M/V ACCIDENT - PERSONAL INJURY,B2,179,,2018-05-01 08:52:00,2018,5,Tuesday,8,Part Three,GERARD ST,42.330628,-71.072107,"(42.33062755, -71.07210653)"
270509,I152102923,614,Larceny From Motor Vehicle,LARCENY THEFT FROM MV - NON-ACCESSORY,D4,170,,2015-12-13 17:07:00,2015,12,Sunday,17,Part One,MASSACHUSETTS AVE,42.335621,-71.075824,"(42.33562126, -71.07582361)"
74362,I172098993,1402,Vandalism,VANDALISM,C11,357,,2017-11-28 22:49:00,2017,11,Tuesday,22,Part Two,WESTVILLE ST,42.300106,-71.071027,"(42.30010597, -71.07102710)"
968,I182069871,802,Simple Assault,ASSAULT SIMPLE - BATTERY,B2,319,,2018-08-31 08:22:06,2018,8,Friday,8,Part Two,SEAVER ST,42.30604,-71.086212,"(42.30603974, -71.08621240)"
318360,I152049968,3301,Verbal Disputes,VERBAL DISPUTE,D14,939,,2015-06-16 17:25:00,2015,6,Tuesday,17,Part Three,FANEUIL ST,42.355553,-71.152747,"(42.35555336, -71.15274721)"


In [43]:
# Remove the SHOOTING column, then discard every row that still has a missing
# value.
# NOTE(review): SHOOTING is empty in all previewed rows, so it is presumably
# dropped first to stop dropna() from discarding most of the sample -- confirm
# against the full data.
# `data` is rebound instead of mutated in place, and errors='ignore' tolerates
# an already-removed column, so re-running this cell no longer raises KeyError.
data = data.drop(columns=['SHOOTING'], errors='ignore').dropna()

In [44]:
# Integer-encode each categorical column with its own LabelEncoder.
#
# Every fitted encoder is stored in `encoders`, keyed by the name of the new
# column, so integer codes (in particular predicted DISTRICT codes) can be
# mapped back to the original labels with
# `encoders['DISTRICT_ENCODED'].inverse_transform(...)`. Previously one encoder
# object was refit for every column, which discarded every mapping but the last.
#
# NOTE(review): LabelEncoder assigns codes by sorted label order, so the feature
# encodings carry an arbitrary ordering -- confirm this is acceptable for a
# distance-based model.
columns_to_encode = {
    'DAYS_ENCODED': 'DAY_OF_WEEK',
    'DISTRICT_ENCODED': 'DISTRICT',
    'OFFENSE_CODE_GROUP_ENCODED': 'OFFENSE_CODE_GROUP',
    'REPORTING_AREA_ENCODED': 'REPORTING_AREA',
    'UCR_PART_ENCODED': 'UCR_PART',
    'LOCATION_ENCODED': 'Location',
    'STREET_ENCODED': 'STREET',
}

encoders = {}
for encoded_column, source_column in columns_to_encode.items():
    # `label_encoder` stays bound after the loop (to the STREET encoder), as before.
    label_encoder = LabelEncoder()
    data[encoded_column] = label_encoder.fit_transform(data[source_column])
    encoders[encoded_column] = label_encoder

# Target vector: the integer-encoded DISTRICT column.
y = data['DISTRICT_ENCODED']


In [45]:
# Columns excluded from the feature matrix: the incident identifier, the raw
# text/timestamp columns, the original categoricals that now have encoded
# counterparts, STREET_ENCODED, and the target (DISTRICT / DISTRICT_ENCODED).
columns_to_drop = ['INCIDENT_NUMBER', 'OFFENSE_CODE_GROUP', 'DISTRICT', 'OFFENSE_DESCRIPTION','REPORTING_AREA', 'OCCURRED_ON_DATE',
                  'DAY_OF_WEEK','UCR_PART','STREET_ENCODED','STREET','Location','DISTRICT_ENCODED']

# DataFrame.drop(..., inplace=True) returns None, so the previous
# `X = data.drop(..., inplace=True)` bound X to None. Rebind `data` to the
# reduced frame and use that frame as the feature matrix.
data = data.drop(columns=columns_to_drop)
X = data
data.head()

Unnamed: 0,OFFENSE_CODE,YEAR,MONTH,HOUR,Lat,Long,DAYS_ENCODED,OFFENSE_CODE_GROUP_ENCODED,REPORTING_AREA_ENCODED,UCR_PART_ENCODED,LOCATION_ENCODED
35710,3803,2018,5,8,42.330628,-71.072107,5,24,34,2,266
270509,614,2015,12,17,42.335621,-71.075824,3,17,31,1,289
74362,1402,2017,11,22,42.300106,-71.071027,5,39,122,3,117
968,802,2018,8,8,42.30604,-71.086212,0,37,94,3,141
318360,3301,2015,6,17,42.355553,-71.152747,5,40,296,2,369


In [46]:
# Feature matrix: `data` after the column drops above (same object, not a copy).
X = data

In [47]:
# Display the target vector (the DISTRICT_ENCODED column).
y

35710     3
270509    8
74362     5
968       3
318360    7
         ..
205204    6
192624    0
270980    3
63591     6
233748    0
Name: DISTRICT_ENCODED, Length: 463, dtype: int32

In [49]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Split data into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for GridSearchCV.
# Keys carry the `knn__` prefix because the classifier is tuned as the `knn`
# step of a pipeline.
# NOTE: 'minkowski' with KNeighborsClassifier's default p=2 is the Euclidean
# distance, so the 'euclidean' and 'minkowski' candidates evaluate the same model.
param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan', 'minkowski']
}

# Create a KNN classifier
knn_classifier = KNeighborsClassifier()

# Scale inside a pipeline so that, in every cross-validation split, the scaler
# is fitted on the training folds only. Scaling all of X_train up front (as was
# done before) lets each validation fold influence the mean/variance used to
# preprocess it, which biases the cross-validated scores.
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', knn_classifier),
])

# Perform GridSearchCV for hyperparameter tuning (on the unscaled training data;
# the pipeline applies the scaling itself)
grid_search = GridSearchCV(estimator=knn_pipeline, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters, reported without the pipeline step prefix
best_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search.best_params_.items()
}
print("Best Hyperparameters:", best_params)

# The refitted best pipeline holds a scaler and a classifier that were both
# fitted on the whole of X_train.
best_pipeline = grid_search.best_estimator_
scaler = best_pipeline.named_steps['scaler']
best_knn = best_pipeline.named_steps['knn']

# Scaled arrays, kept under their previous names for any cell that uses them
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Use the best estimator to make predictions
y_pred = best_knn.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Hyperparameters: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Accuracy: 0.6989247311827957


In [51]:
from sklearn.metrics import classification_report

# Print the classification report.
# Some classes in the test split are never predicted, which makes their
# precision/F1 undefined. zero_division=0 reports those metrics as 0 -- the
# same value the default setting produces -- without emitting an
# undefined-metric warning for each of them.
print(classification_report(y_test, y_pred, zero_division=0))


              precision    recall  f1-score   support

           0       0.50      0.75      0.60         4
           1       0.00      0.00      0.00         3
           2       1.00      1.00      1.00         2
           3       0.67      0.82      0.74        17
           4       0.67      0.80      0.73        15
           5       0.62      0.71      0.67        14
           6       0.75      0.75      0.75         4
           7       1.00      0.62      0.77         8
           8       0.64      0.75      0.69        12
           9       1.00      0.50      0.67         8
          10       1.00      0.25      0.40         4
          11       1.00      1.00      1.00         2

    accuracy                           0.70        93
   macro avg       0.74      0.66      0.67        93
weighted avg       0.72      0.70      0.68        93



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
