In [106]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib

In [107]:
#https://en.wikipedia.org/wiki/Levenberg–Marquardt_algorithm
#Levenberg–Marquardt algorithm

In [108]:
# Location of the raw Chicago crime export, relative to this notebook.
data_path = "../data/chicago_data.csv"

# Row cap applied BEFORE cleaning. Truncating the data set does not prevent
# overfitting; it only bounds how many rows reach the later cells
# (presumably to keep training time manageable -- TODO confirm intent).
regulation = 10000

# Build the cleaned frame as one chain instead of calling in-place methods on
# a slice of the loaded frame, which can trigger pandas' SettingWithCopyWarning.
# The order of operations is unchanged: cap rows, drop rows with any missing
# value, drop exact duplicate rows, then remove the two identifier columns.
data_file = (
    pd.read_csv(data_path)
    .iloc[:regulation]
    .dropna(axis=0)
    .drop_duplicates()
    .drop(columns=['ID', 'Case Number'])
)

In [109]:
# Feature extraction: derive calendar fields from the "Date" column, then
# discard the raw column so only the derived fields remain.
timestamps = pd.to_datetime(data_file["Date"])
for feature_name, feature_values in (
    ("DayOfTheWeek", timestamps.dt.dayofweek),
    ('Month', timestamps.dt.month),
    ('Day', timestamps.dt.day),
    ('Hour', timestamps.dt.hour),
):
    # New columns are appended in the order listed above.
    data_file[feature_name] = feature_values
data_file.drop(columns=['Date'], inplace=True)
data_file.head()

Unnamed: 0,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,...,Y Coordinate,Year,Updated On,Latitude,Longitude,Location,DayOfTheWeek,Month,Day,Hour
11,035XX S INDIANA AVE,0820,THEFT,$500 AND UNDER,APARTMENT,False,False,212,2.0,3.0,...,1881621.0,2020,05/14/2020 08:47:15 AM,41.830482,-87.621752,"(41.830481843, -87.621751752)",3,5,7,10
12,005XX W 32ND ST,0460,BATTERY,SIMPLE,APARTMENT,True,False,915,9.0,11.0,...,1883705.0,2020,04/23/2020 03:45:11 PM,41.83631,-87.639624,"(41.836310224, -87.639624112)",3,4,16,5
13,081XX S COLES AVE,051A,ASSAULT,AGGRAVATED - HANDGUN,STREET,True,False,422,4.0,7.0,...,1851595.0,2020,07/08/2020 03:41:45 PM,41.74761,-87.549179,"(41.747609555, -87.549179329)",2,7,1,10
14,065XX S WOLCOTT AVE,0460,BATTERY,SIMPLE,RESIDENCE - PORCH / HALLWAY,False,False,726,7.0,15.0,...,1861251.0,2020,10/04/2020 03:43:55 PM,41.774878,-87.671375,"(41.77487752, -87.671374872)",6,9,27,23
15,062XX S ABERDEEN ST,0430,BATTERY,AGGRAVATED: OTHER DANG WEAPON,STREET,False,False,712,7.0,16.0,...,1863524.0,2005,02/28/2018 03:56:25 PM,41.781003,-87.652107,"(41.781002663, -87.652107119)",6,7,10,15


In [110]:
# Columns fed to the model and the columns it has to predict.
features = ["DayOfTheWeek", "Month", "Hour", "Year"]
targets = ["Location Description", "Description", "Primary Type", "Arrest"]

# One LabelEncoder per target column, keyed by column name; each is fitted
# in the loop below.
label_encoders = {col: LabelEncoder() for col in targets}
# Not referenced anywhere in this section; kept because it is a notebook-level name.
pca_encoders = {}

# Overwrite every target column with the integer codes from its encoder.
for col, encoder in label_encoders.items():
    data_file[col] = encoder.fit_transform(data_file[col])

In [111]:
# Split the prepared frame into the model inputs (X) and the encoded targets (Y).
X, Y = data_file[features], data_file[targets]

In [112]:
# Persist the dict of label encoders to disk with joblib.
joblib.dump(
    value=label_encoders,
    filename="../models/label_encoders.pickle",
)

['../models/label_encoders.pickle']

In [113]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [114]:
# Report (rows, columns) for each half of the train/test split.
print("Shapes of training data:")
for label, frame in (("X_train:", X_train), ("y_train:", y_train)):
    print(label, frame.shape)

print("\nShapes of testing data:")
for label, frame in (("x_test:", x_test), ("y_test:", y_test)):
    print(label, frame.shape)

Shapes of training data:
X_train: (6274, 4)
y_train: (6274, 4)

Shapes of testing data:
x_test: (2690, 4)
y_test: (2690, 4)


# SVM BELOW

In [115]:
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV

In [116]:
# Hyper-parameter grid for the SVC wrapped by MultiOutputClassifier.
# Every key needs the "estimator__" prefix (double underscore) so the value is
# routed to the inner SVC; the previous "estimator_kernal" key was misspelled
# and used a single underscore, so it was rejected as an invalid parameter of
# the wrapper when the search was fitted.
# "precomputed" is intentionally not searched: that kernel expects a square
# (n_samples, n_samples) Gram matrix as input, not the raw feature table.
# gamma has no effect on the linear kernel, so those combinations repeat work,
# but a single flat grid guarantees best_params_ always carries
# 'estimator__C' and 'estimator__gamma' for code that reads them.
param_grid = {
    'estimator__C': [0.1, 1, 10, 100, 1000],
    'estimator__gamma': ['scale', 'auto'],
    'estimator__kernel': ["rbf", "linear", "poly"],
}

# 5-fold cross-validated exhaustive search over the grid above.
grid_search = GridSearchCV(
    estimator=MultiOutputClassifier(SVC(kernel='rbf')),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

In [117]:
# Winning parameter combination, kept as its own name for inspection.
best_params = grid_search.best_params_

# The search was created without overriding refit, so GridSearchCV has already
# re-trained the best configuration on all of X_train / y_train. Reusing that
# fitted estimator avoids a second, redundant fit and -- unlike rebuilding
# SVC(kernel='rbf', C=..., gamma=...) by hand -- keeps every tuned parameter,
# not only C and gamma.
best_svm_model = grid_search.best_estimator_
best_svm_model

In [118]:
# Persist the fitted multi-output SVM to disk with joblib.
joblib.dump(
    value=best_svm_model,
    filename="../models/typeLocation.pickle",
)

['../models/typeLocation.pickle']

In [119]:
# Predict all four encoded targets for the held-out rows.
predictions = best_svm_model.predict(x_test)

# Show the first rows of the predictions, then the matching slice of the
# true (encoded) targets, for an eyeball comparison.
preview_rows = 50
print(predictions[:preview_rows])
print("\n", y_test[:preview_rows])

[[103  40   2   0]
 [106  75  16   0]
 [ 14 227   6   0]
 [ 14 203  25   0]
 [ 14 117  17   0]
 [106 203   2   0]
 [ 88 203  25   0]
 [ 88 100   2   0]
 [106 160  25   0]
 [106 203   2   0]
 [ 14 100   2   0]
 [106 232   6   0]
 [ 14 203   2   0]
 [ 14 100   2   0]
 [106 115  25   0]
 [106 187  19   1]
 [106 232  25   0]
 [106 203  25   0]
 [ 14   0  25   0]
 [ 14 119  25   0]
 [ 88 115   6   0]
 [ 12 232   6   0]
 [ 14 203   6   0]
 [ 14 100   9   0]
 [106   0  25   0]
 [106 160  25   0]
 [ 14  20  25   0]
 [103  38   2   0]
 [ 21 110  17   1]
 [106 227   6   0]
 [ 40 203   2   0]
 [ 90   8   1   0]
 [ 53 195  17   1]
 [106   8   2   0]
 [106 203   1   0]
 [ 51 195  25   0]
 [ 38   0  25   0]
 [106 100   2   0]
 [106 232   6   0]
 [106 167  17   1]
 [106 232   6   0]
 [ 14 115   2   0]
 [106 203   2   0]
 [ 14 227   6   0]
 [ 88 100   2   0]
 [106 232  25   0]
 [ 14 203  25   0]
 [ 14  13   9   0]
 [103 203   2   0]
 [106   0  25   0]]

       Location Description  Description  Primar

In [120]:
# individual_accuracy = [predictions[col, row] == y_test.to_numpy()[col, row] for col in tqdm(range(predictions.shape[0]), desc="Doing Individual Prediction: ") for row in range(predictions.shape[1])]

In [121]:
#axis_accuracy = [np.array_equal(y_test.to_numpy()[:, index], predictions[:, index]) for index in tqdm(range(predictions.shape[1]), desc="Doing Axis Prediction:")]

In [122]:
# individual_counter = sum(1 for value in individual_accuracy if value == True)
#axis_counter = sum(1 for value in axis_accuracy if value == True)

In [123]:
# print("Indivudal Accuracy: ", individual_counter / len(individual_accuracy)) 
# print("Axis Accuracy: ", axis_counter / len(axis_accuracy))
# print(predictions)
# print(y_test)