In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [2]:
file_path = r'C:\Users\nhl08\OneDrive\Documents\AI02\Udemy\Forecasting Crime\cleaned_train.csv'
df = pd.read_csv(file_path)
df.shape

(20000, 22)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Latitude               20000 non-null  float64
 1   Longitude              20000 non-null  float64
 2   Time_Occurred          20000 non-null  float64
 3   Area_ID                20000 non-null  float64
 4   Reporting_District_no  20000 non-null  float64
 5   Part 1-2               20000 non-null  float64
 6   Victim_Age             20000 non-null  float64
 7   Premise_Code           20000 non-null  float64
 8   Weapon_Used_Code       20000 non-null  float64
 9   Year_Reported          20000 non-null  float64
 10  Month_Reported         20000 non-null  float64
 11  Day_Reported           20000 non-null  float64
 12  Location               20000 non-null  float64
 13  Area_Name              20000 non-null  float64
 14  Modus_Operandi         20000 non-null  float64
 15  Vi

In [9]:
X = np.array(df.iloc[:, : -1].values)
y = np.array(df.iloc[:, -1].values)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(X.shape)
print(y_encoded.shape)

(20000, 21)
(20000,)


In [11]:
from joblib import dump

dump(label_encoder, 'label_encoder.joblib')

['label_encoder.joblib']

In [10]:
print(y_encoded)

[4 4 4 ... 4 1 4]


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(16000, 21) (4000, 21) (16000,) (4000,)


In [14]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_test_pred)
    report = classification_report(y_test, y_test_pred, zero_division=1)
    conf_matrix = confusion_matrix(y_test, y_test_pred)
    
    return accuracy, report, conf_matrix

In [15]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

print(evaluate_model(lr_model, X_train, y_train, X_test, y_test))

(0.61475, '              precision    recall  f1-score   support\n\n           0       1.00      0.00      0.00        32\n           1       0.00      0.00      0.00       374\n           2       1.00      0.00      0.00       267\n           3       0.00      0.00      0.00        35\n           4       0.66      0.87      0.75      2303\n           5       0.48      0.45      0.46       989\n\n    accuracy                           0.61      4000\n   macro avg       0.52      0.22      0.20      4000\nweighted avg       0.57      0.61      0.55      4000\n', array([[   0,    0,    0,    0,   19,   13],
       [   0,    0,    0,    0,  241,  133],
       [   0,    0,    0,    2,  223,   42],
       [   0,    0,    0,    0,   26,    9],
       [   0,    2,    0,    1, 2013,  287],
       [   0,    3,    0,    2,  538,  446]], dtype=int64))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()

print(evaluate_model(dt_model, X_train, y_train, X_test, y_test))

(0.881, '              precision    recall  f1-score   support\n\n           0       0.35      0.53      0.42        32\n           1       0.68      0.67      0.68       374\n           2       0.88      0.84      0.86       267\n           3       0.12      0.17      0.14        35\n           4       0.95      0.95      0.95      2303\n           5       0.86      0.85      0.86       989\n\n    accuracy                           0.88      4000\n   macro avg       0.64      0.67      0.65      4000\nweighted avg       0.89      0.88      0.88      4000\n', array([[  17,    7,    0,    0,    0,    8],
       [   5,  252,   19,   20,   38,   40],
       [   1,   28,  223,    9,    5,    1],
       [   2,   15,    7,    6,    5,    0],
       [   1,   22,    4,   12, 2181,   83],
       [  22,   44,    1,    4,   73,  845]], dtype=int64))


In [17]:
from sklearn.ensemble import RandomForestClassifier

rfc_model = RandomForestClassifier()

print(evaluate_model(rfc_model, X_train, y_train, X_test, y_test))

(0.90125, '              precision    recall  f1-score   support\n\n           0       0.62      0.47      0.54        32\n           1       0.79      0.65      0.71       374\n           2       0.81      0.87      0.84       267\n           3       0.80      0.11      0.20        35\n           4       0.95      0.95      0.95      2303\n           5       0.86      0.93      0.89       989\n\n    accuracy                           0.90      4000\n   macro avg       0.81      0.66      0.69      4000\nweighted avg       0.90      0.90      0.90      4000\n', array([[  15,    1,    1,    0,    1,   14],
       [   1,  242,   36,    0,   36,   59],
       [   0,   17,  232,    0,   18,    0],
       [   1,   15,    4,    4,   11,    0],
       [   0,   21,    9,    1, 2193,   79],
       [   7,   11,    3,    0,   49,  919]], dtype=int64))


In [18]:
from xgboost import XGBClassifier

xgbc_model = XGBClassifier()

print(evaluate_model(xgbc_model, X_train, y_train, X_test, y_test))

(0.91525, '              precision    recall  f1-score   support\n\n           0       0.60      0.66      0.63        32\n           1       0.78      0.74      0.76       374\n           2       0.93      0.89      0.91       267\n           3       0.60      0.26      0.36        35\n           4       0.96      0.96      0.96      2303\n           5       0.87      0.91      0.89       989\n\n    accuracy                           0.92      4000\n   macro avg       0.79      0.74      0.75      4000\nweighted avg       0.91      0.92      0.91      4000\n', array([[  21,    0,    0,    0,    0,   11],
       [   2,  278,   13,    3,   20,   58],
       [   0,   27,  237,    0,    3,    0],
       [   0,   19,    3,    9,    4,    0],
       [   1,   19,    3,    3, 2214,   63],
       [  11,   15,    0,    0,   61,  902]], dtype=int64))


In [19]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=5)

print(evaluate_model(knn_model, X_train, y_train, X_test, y_test))

(0.74625, '              precision    recall  f1-score   support\n\n           0       0.11      0.06      0.08        32\n           1       0.40      0.37      0.38       374\n           2       0.54      0.56      0.55       267\n           3       0.14      0.03      0.05        35\n           4       0.84      0.89      0.86      2303\n           5       0.71      0.65      0.68       989\n\n    accuracy                           0.75      4000\n   macro avg       0.46      0.43      0.43      4000\nweighted avg       0.73      0.75      0.74      4000\n', array([[   2,    5,    0,    0,    5,   20],
       [   2,  138,   23,    1,  109,  101],
       [   2,   22,  150,    0,   68,   25],
       [   0,    3,    3,    1,   18,   10],
       [   7,   76,   57,    3, 2052,  108],
       [   6,  100,   47,    2,  192,  642]], dtype=int64))


In [20]:
import joblib

joblib.dump(xgbc_model, 'xgboost_model.joblib')

['xgboost_model.joblib']