In [96]:
from importlib import reload
from itertools import chain

import numpy as np
from scipy.io import arff
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler

from evaluation import evaluator

In [97]:
# Load the Seattle crime ARFF file into a DataFrame.
path = "../datasets/"
data = arff.loadarff('%sSeattle_Crime_Data_06-23-2019-4.arff' % path)
df = pd.DataFrame(data[0])
for column in df.columns:
    # Values that arrive as bytes are decoded to str; everything else is kept as-is.
    df[column] = df[column].apply(lambda x: x.decode() if isinstance(x, bytes) else x)

# Report_Number is removed from the frame before any further processing.
df = df.drop('Report_Number', axis=1)
# Keep the dtypes as they are before the placeholder replacement below;
# the imputation step casts the imputed frames back to these dtypes.
original_dtypes = df.dtypes
# Normalise every missing-value placeholder to pd.NA.
# np.nan is used instead of np.NaN: the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.
df.replace(['NONE', '?', '', 'None', b'', np.nan, 'UNKNOWN'], pd.NA, inplace=True)
df

Unnamed: 0,Occurred_Time,Reported_Time,Crime_Subcategory,Primary_Offense_Description,Precinct,Sector,Beat,Neighborhood
0,900.0,1500.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,SOUTH,R,R3,LAKEWOOD/SEWARD PARK
1,1.0,2359.0,SEX OFFENSE-OTHER,SEXOFF-INDECENT LIBERTIES,,,,
2,1600.0,1430.0,CAR PROWL,THEFT-CARPROWL,EAST,G,G2,CENTRAL AREA/SQUIRE PARK
3,2029.0,2030.0,HOMICIDE,HOMICIDE-PREMEDITATED-WEAPON,SOUTH,S,S2,BRIGHTON/DUNLAP
4,2000.0,435.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,SOUTHWEST,W,W3,ROXHILL/WESTWOOD/ARBOR HEIGHTS
...,...,...,...,...,...,...,...,...
523585,1713.0,1713.0,FAMILY OFFENSE-NONVIOLENT,CHILD-OTHER,SOUTH,O,O3,MID BEACON HILL
523586,730.0,1721.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,EAST,C,C2,MONTLAKE/PORTAGE BAY
523587,1724.0,1724.0,ROBBERY-COMMERCIAL,ROBBERY-BUSINESS-BODYFORCE,SOUTH,S,S2,RAINIER BEACH
523588,1750.0,1904.0,THEFT-SHOPLIFT,THEFT-SHOPLIFT,NORTH,L,L2,NORTHGATE


In [98]:
# Print the value distribution of every object-typed column.
for column in df.columns:
    # Test the dtype of the column's data. `df.columns.dtype` is the dtype of the
    # column-label index, which does not depend on `column`.
    if df[column].dtype == object:
        print(column + "=" * 40)
        value_counts = df[column].value_counts()
        print(value_counts)


Occurred_Time
2200.0    13858
1800.0    13804
0.0       13420
2000.0    12836
1200.0    12598
          ...  
649.0        40
607.0        39
546.0        39
627.0        37
551.0        36
Name: count, Length: 1440, dtype: int64
Reported_Time
1300.0    824
1600.0    809
1530.0    799
1400.0    790
1230.0    787
         ... 
501.0      89
333.0      89
542.0      85
417.0      85
349.0      85
Name: count, Length: 1440, dtype: int64
Crime_Subcategory
CAR PROWL                              148263
THEFT-ALL OTHER                         54419
THEFT-SHOPLIFT                          48638
BURGLARY-RESIDENTIAL                    46843
MOTOR VEHICLE THEFT                     43529
BURGLARY-COMMERCIAL                     23531
THEFT-BUILDING                          21438
TRESPASS                                17722
NARCOTIC                                17381
AGGRAVATED ASSAULT                      15640
DUI                                     12838
ROBBERY-STREET                        

# Preprocessing

In [99]:
# Separate the prediction target from the feature columns.
target_column = 'Primary_Offense_Description'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train

Unnamed: 0,Occurred_Time,Reported_Time,Crime_Subcategory,Precinct,Sector,Beat,Neighborhood
6916,2108.0,2108.0,NARCOTIC,NORTH,N,N3,NORTHGATE
70131,2100.0,623.0,BURGLARY-RESIDENTIAL,NORTH,N,N2,BITTERLAKE
64578,2130.0,1033.0,MOTOR VEHICLE THEFT,WEST,D,D3,SLU/CASCADE
30304,2150.0,2335.0,NARCOTIC,SOUTH,S,S1,NEW HOLLY
218777,1858.0,1858.0,THEFT-ALL OTHER,SOUTH,S,S2,BRIGHTON/DUNLAP
...,...,...,...,...,...,...,...
259178,711.0,711.0,AGGRAVATED ASSAULT-DV,NORTH,J,J3,GREENWOOD
365838,500.0,822.0,CAR PROWL,NORTH,J,J3,ROOSEVELT/RAVENNA
131932,1135.0,1151.0,THEFT-SHOPLIFT,WEST,M,M3,DOWNTOWN COMMERCIAL
146867,2100.0,105.0,BURGLARY-RESIDENTIAL,NORTH,U,U1,ROOSEVELT/RAVENNA


In [100]:
# Total number of missing cells in each split, followed by the feature dtypes.
train_missing = X_train.isna().sum().sum()
test_missing = X_test.isna().sum().sum()
print("null values in train data: " + str(train_missing))
print("null values in test data: " + str(test_missing))
print(X_train.dtypes)

null values in train data: 10869
null values in test data: 2759
Occurred_Time        object
Reported_Time        object
Crime_Subcategory    object
Precinct             object
Sector               object
Beat                 object
Neighborhood         object
dtype: object


In [101]:
# Fill missing cells (pd.NA) with the most frequent value of each column.
# The imputer is fitted on the training split only and then applied to both splits.
imputer = SimpleImputer(strategy='most_frequent', missing_values=pd.NA)

imputer.fit(X_train)

# Keep only the dtypes of the feature columns. Series.drop returns a new Series and
# errors='ignore' tolerates a label that is already gone, so this cell can be re-run;
# the former `del original_dtypes[...]` raised KeyError on a second execution.
original_dtypes = original_dtypes.drop('Primary_Offense_Description', errors='ignore')

# transform() returns a plain array: rebuild each frame with its original index and
# columns, then cast back to the dtypes recorded before the placeholders were replaced.
X_train_np = imputer.transform(X_train)
X_train = pd.DataFrame(X_train_np, index=X_train.index, columns=X_train.columns)
X_train = X_train.astype(original_dtypes)

X_test_np = imputer.transform(X_test)
X_test = pd.DataFrame(X_test_np, index=X_test.index, columns=X_test.columns)
X_test = X_test.astype(original_dtypes)

In [102]:
# Re-count the missing cells in both splits after imputation.
train_missing = X_train.isna().sum().sum()
test_missing = X_test.isna().sum().sum()
print("null values in train data: " + str(train_missing))
print("null values in test data: " + str(test_missing))

null values in train data: 0
null values in test data: 0


In [103]:
# Display the training features as they stand after the imputation step.
X_train

Unnamed: 0,Occurred_Time,Reported_Time,Crime_Subcategory,Precinct,Sector,Beat,Neighborhood
6916,2108.0,2108.0,NARCOTIC,NORTH,N,N3,NORTHGATE
70131,2100.0,623.0,BURGLARY-RESIDENTIAL,NORTH,N,N2,BITTERLAKE
64578,2130.0,1033.0,MOTOR VEHICLE THEFT,WEST,D,D3,SLU/CASCADE
30304,2150.0,2335.0,NARCOTIC,SOUTH,S,S1,NEW HOLLY
218777,1858.0,1858.0,THEFT-ALL OTHER,SOUTH,S,S2,BRIGHTON/DUNLAP
...,...,...,...,...,...,...,...
259178,711.0,711.0,AGGRAVATED ASSAULT-DV,NORTH,J,J3,GREENWOOD
365838,500.0,822.0,CAR PROWL,NORTH,J,J3,ROOSEVELT/RAVENNA
131932,1135.0,1151.0,THEFT-SHOPLIFT,WEST,M,M3,DOWNTOWN COMMERCIAL
146867,2100.0,105.0,BURGLARY-RESIDENTIAL,NORTH,U,U1,ROOSEVELT/RAVENNA


In [104]:
# Names of the numeric feature columns in X_train, as a plain Python list.
numerical_columns = list(X_train.select_dtypes(include="number").columns)
numerical_columns

['Occurred_Time', 'Reported_Time']

In [105]:
# Explicit category orderings for ordinal features, keyed by column name.
# The mapping is empty here, so no column ends up in the ordinal group.
feature_mappings = {}

# Walk the columns in frame order so both lists stay aligned with each other.
ordinal_columns = []
ordinal_categories = []
for col in X_train.columns:
    if col in feature_mappings:
        ordinal_columns.append(col)
        ordinal_categories.append(feature_mappings[col])
ordinal_columns

[]

In [106]:
# Every column that is neither ordinal nor numerical is treated as categorical.
already_assigned = set(ordinal_columns) | set(numerical_columns)
categorical_columns = [col for col in X_train.columns if col not in already_assigned]

# Categorical columns with at most this many distinct values are one-hot encoded;
# the remaining ones go into the "label" group.
one_hot_encoding_limit = 5

one_hot_columns = []
label_columns = []
for col in categorical_columns:
    if X_train[col].nunique() <= one_hot_encoding_limit:
        one_hot_columns.append(col)
    else:
        label_columns.append(col)

print("one hot columns: ", one_hot_columns)
print("label columns: ", label_columns)

one hot columns:  ['Precinct']
label columns:  ['Crime_Subcategory', 'Sector', 'Beat', 'Neighborhood']


In [107]:
# Sorted list of every feature column that was assigned to a preprocessing group.
# Named `all_columns` rather than `all`, so the built-in all() is not shadowed
# for the rest of the kernel session.
all_columns = sorted(numerical_columns + ordinal_columns + one_hot_columns + label_columns)
all_columns

['Beat',
 'Crime_Subcategory',
 'Neighborhood',
 'Occurred_Time',
 'Precinct',
 'Reported_Time',
 'Sector']

In [108]:
# One encoder per column group.
ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
# Categories not seen during fit are encoded as NaN by this encoder.
label_group_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)

preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', ordinal_encoder, ordinal_columns),
        ('categorical', one_hot_encoder, one_hot_columns),
        ('label', label_group_encoder, label_columns),
    ],
    remainder='passthrough',  # columns not listed above are passed through
)

In [109]:
def _build_pipeline(scaler):
    """Chain the `preprocessor`, a most-frequent imputer for NaN values and the given scaler.

    Every pipeline built here references the same `preprocessor` object.
    """
    return Pipeline([
        ('preprocessor', preprocessor),
        ('imputer', SimpleImputer(strategy='most_frequent', missing_values=np.nan)),
        ('scaler', scaler),
    ])


pipeline = _build_pipeline(MinMaxScaler())
pipeline_standard = _build_pipeline(StandardScaler())

# The standard-scaled variants are computed first: they must be derived from the
# original X_train / X_test before those names are overwritten below.
X_train_std = pipeline_standard.fit_transform(X_train)
X_test_std = pipeline_standard.transform(X_test)

# From here on X_train / X_test hold the min-max-scaled data.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [110]:
# Map the target labels to integer class ids.
# The encoder learns its classes from the training labels only; the test labels
# are then converted with that same fitted encoder.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# Evaluation


## Neural Network

In [ ]:
# Experiment: MLPClassifier metrics as a function of the initial learning rate.
reload(evaluator)  # pick up edits made to the evaluator module since it was imported
hyperparameters = {
    'solver': 'adam',
    'activation': 'relu',
    'learning_rate': 'constant',
    # One hidden layer with 7 units. Written as a 1-tuple: `(7)` is just the int 7,
    # and every other cell in this notebook passes tuples for this parameter.
    'hidden_layer_sizes': (7,),
    'max_iter': 1500,
    'verbose': False,
}
# 35 log-spaced values between 1e-6 and 1e-1.
hyperparameters_iterator = {'learning_rate_init': np.logspace(-6, -1, num=35)}
# evaluate2_mean is a project helper; presumably it repeats each setting
# `number_of_tests` times and aggregates the metrics (confirm in evaluation/evaluator).
nn_learning_rate = evaluator.evaluate2_mean(MLPClassifier, X_train, y_train, X_test, y_test,
                                            hyperparameters=hyperparameters,
                                            hyperparameters_iterate=hyperparameters_iterator, number_of_tests=5)
evaluator.draw_diagram2_list({"seattlecrime": nn_learning_rate}, x_axis='learning_rate_init',
                             y_axis=['accuracy', 'precision', 'recall', 'f1'], logaritmic=True)

Evaluating classifier:  MLPClassifier
Hyperparameters:  {'solver': 'adam', 'activation': 'relu', 'learning_rate': 'constant', 'hidden_layer_sizes': 7, 'max_iter': 1500, 'verbose': False}


In [ ]:
# Experiment: MLPClassifier metrics as a function of the width of a single hidden
# layer, run on the standard-scaled data.
reload(evaluator)
hyperparameters = dict(
    solver='adam',
    activation='relu',
    learning_rate='constant',
    max_iter=5000,
    verbose=False,
)
# Widths 1..9 in steps of 1, then 10..18 in steps of 2, each as a 1-tuple.
layer_widths = [*range(1, 10), *range(10, 20, 2)]
hyperparameters_iterator = {'hidden_layer_sizes': [(width,) for width in layer_widths]}
nn_hidden_layer = evaluator.evaluate2_mean(
    MLPClassifier, X_train_std, y_train, X_test_std, y_test,
    hyperparameters=hyperparameters,
    hyperparameters_iterate=hyperparameters_iterator,
    number_of_tests=5,
)
evaluator.draw_diagram2_list(
    {"seattlecrime": nn_hidden_layer},
    x_axis='hidden_layer_sizes',
    y_axis=['accuracy', 'precision', 'recall', 'f1'],
    logaritmic=False,
)

In [ ]:
# Experiment: MLPClassifier metrics as a function of network depth
# (1 to 4 hidden layers of 7 units each), run on the standard-scaled data.
reload(evaluator)
hyperparameters = dict(
    solver='adam',
    activation='relu',
    learning_rate='constant',
    max_iter=5000,
    verbose=False,
)
# (7,), (7, 7), (7, 7, 7), (7, 7, 7, 7)
hyperparameters_iterator = {'hidden_layer_sizes': [(7,) * depth for depth in range(1, 5)]}
nn_hidden_layer_deepness = evaluator.evaluate2_mean(
    MLPClassifier, X_train_std, y_train, X_test_std, y_test,
    hyperparameters=hyperparameters,
    hyperparameters_iterate=hyperparameters_iterator,
    number_of_tests=5,
)
evaluator.draw_diagram2_list(
    {"seattlecrime": nn_hidden_layer_deepness},
    x_axis='hidden_layer_sizes',
    y_axis=['accuracy', 'precision', 'recall', 'f1'],
    logaritmic=False,
)

In [ ]:
# Experiment: MLPClassifier metrics and timing as a function of max_iter
# (50 up to 990 in steps of 10) for a fixed 15-15-15 network.
reload(evaluator)
hyperparameters = dict(
    solver='adam',
    activation='relu',
    learning_rate='constant',
    learning_rate_init=1e-3,
    hidden_layer_sizes=(15, 15, 15),
    verbose=False,
)
hyperparameters_iterator = dict(max_iter=range(50, 1000, 10))
nn_max_iter = evaluator.evaluate2(
    MLPClassifier, X_train, y_train, X_test, y_test,
    hyperparameters=hyperparameters,
    hyperparameters_iterate=hyperparameters_iterator,
)
evaluator.draw_diagram2_list(
    {"seattlecrime": nn_max_iter},
    x_axis='max_iter',
    y_axis=['accuracy', 'precision', 'recall', 'f1', 'time'],
    logaritmic=False,
)

In [ ]:
# Re-draw the max_iter diagram from the stored results (no re-computation).
evaluator.draw_diagram2_list({"seattlecrime": nn_max_iter}, x_axis='max_iter',
                             y_axis=['accuracy', 'precision', 'recall', 'f1', 'time'], logaritmic=False)

# Experiment: compare the MLPClassifier activation functions on a fixed 15-15-15 network.
reload(evaluator)
hyperparameters = {
    'solver': 'adam',
    'learning_rate': 'constant',
    'learning_rate_init': 1e-3,
    'hidden_layer_sizes': (15, 15, 15),
    'max_iter': 1000,
    'verbose': False,
}
hyperparameters_iterator = {'activation': ['identity', 'logistic', 'tanh', 'relu']}

# Three independent runs of the same sweep (no random_state is set, so runs can differ).
activation_runs = [
    evaluator.evaluate2(MLPClassifier, X_train, y_train, X_test, y_test,
                        hyperparameters=hyperparameters,
                        hyperparameters_iterate=hyperparameters_iterator)
    for _ in range(3)
]
act_function1, act_function2, act_function3 = activation_runs

# Series labels name the dataset actually evaluated here; the former
# "breast1"/"breats2"/"breast3" labels were left over from another notebook.
evaluator.draw_diagram2_list(
    {"seattlecrime1": act_function1, "seattlecrime2": act_function2, "seattlecrime3": act_function3},
    x_axis='activation', y_axis=['accuracy', 'precision', 'recall', 'f1', 'time'],
    logaritmic=False)

## Random Forest

In [ ]:
# Experiment: RandomForestClassifier metrics and timing as a function of n_estimators.
# (`chain` is already imported in the notebook's import cell.)
hyperparameters = {
    'criterion': 'gini',  # alternatives: entropy, log_loss
    'verbose': False,
}
# 1..24 in steps of 1, then 25..199 in steps of 2.
# Materialised as a list: a bare chain() object can be consumed only once, whereas
# every other experiment in this notebook passes a re-iterable sequence.
hyperparameters_iterator = {'n_estimators': list(chain(range(1, 25, 1), range(25, 200, 2)))}
rf_n_estimators = evaluator.evaluate2(RandomForestClassifier, X_train, y_train, X_test, y_test,
                                      hyperparameters=hyperparameters,
                                      hyperparameters_iterate=hyperparameters_iterator)
evaluator.draw_diagram2_list({"seattlecrime": rf_n_estimators}, x_axis='n_estimators',
                             y_axis=['accuracy', 'precision', 'recall', 'f1', 'time'], logaritmic=False)

In [ ]:
# Experiment: compare the RandomForestClassifier split criteria with 5 trees.
reload(evaluator)
hyperparameters = {
    'n_estimators': 5,
    'verbose': False,
}
hyperparameters_iterator = {'criterion': ['gini', 'entropy', 'log_loss']}

# Three independent runs of the same sweep (no random_state is set, so runs can differ).
criterion_runs = [
    evaluator.evaluate2(RandomForestClassifier, X_train, y_train, X_test, y_test,
                        hyperparameters=hyperparameters,
                        hyperparameters_iterate=hyperparameters_iterator)
    for _ in range(3)
]
rf_criterion, rf_criterion2, rf_criterion3 = criterion_runs

# Series labels name the dataset actually evaluated here; the former
# "breast1"/"breast2"/"breats3" labels were left over from another notebook.
evaluator.draw_diagram2_list(
    {"seattlecrime1": rf_criterion, "seattlecrime2": rf_criterion2, "seattlecrime3": rf_criterion3},
    x_axis='criterion', y_axis=['accuracy', 'precision', 'recall', 'f1', 'time'],
    logaritmic=False)

## Bayes

In [ ]:
# Experiment: compare three naive Bayes variants with default hyperparameters.
reload(evaluator)

# (display name, estimator class) pairs, unzipped into the two parallel lists
# that the evaluator call takes.
bayes_variants = [
    ('GaussianNB', GaussianNB),
    ('MultinomialNB', MultinomialNB),
    ('BernoulliNB', BernoulliNB),
]
names = [label for label, _ in bayes_variants]
classifiers = [estimator for _, estimator in bayes_variants]

bayes_results = evaluator.evaluate_classifier(
    classifiers, X_train, y_train, X_test, y_test,
    hyperparameters=[{} for _ in classifiers],  # one empty dict per classifier
    names=names,
    number_of_tests=10,
)
evaluator.draw_diagram2_list(
    {"seattlecrime": bayes_results},
    x_axis='classifier',
    y_axis=['accuracy', 'precision', 'recall', 'f1', 'time'],
    logaritmic=False,
)

## Evaluate Scalers

In [ ]:
# Experiment: same MLPClassifier configuration on min-max-scaled vs. standard-scaled data.
reload(evaluator)
hyperparameters = dict(
    solver='adam',
    activation='relu',
    learning_rate='constant',
    learning_rate_init=1e-3,
    hidden_layer_sizes=(15, 15, 15),
    max_iter=1000,
    verbose=False,
)

# Parallel lists: position 0 is the MinMaxScaler data, position 1 the StandardScaler data.
scaler_names = ['MinMaxScaler', 'StandardScaler']
train_feature_sets = [X_train, X_train_std]
test_feature_sets = [X_test, X_test_std]

scaler_result = evaluator.evaluate_scaler(
    MLPClassifier,
    X_train=train_feature_sets,
    y_train=[y_train, y_train],
    X_test=test_feature_sets,
    y_test=[y_test, y_test],
    hyperparameters=hyperparameters,
    names=scaler_names,
    number_of_tests=10,
)
evaluator.draw_diagram2_list(
    {"seattlecrime": scaler_result},
    x_axis='scaler',
    y_axis=['accuracy', 'precision', 'recall', 'f1', 'time'],
    logaritmic=False,
)

In [ ]:
import pickle

# Collect the experiment results under descriptive keys and write them to disk.
# NOTE(review): nn_hidden_layer_deepness is not part of this dict; confirm whether
# that omission is intentional.
combined_results_creditg = dict(
    # Neural network
    nn_learning_rate=nn_learning_rate,
    nn_hidden_layer=nn_hidden_layer,
    nn_max_iter=nn_max_iter,
    act_function=act_function1,
    # Random forest
    rf_n_estimators=rf_n_estimators,
    rf_criterion=rf_criterion,
    # Naive Bayes and scaler comparison
    bayes_results=bayes_results,
    scaler_result=scaler_result,
)

with open('seattle_crime_results.pkl', 'wb') as results_file:
    pickle.dump(combined_results_creditg, results_file)