In [8]:
import pandas as pd
import numpy as np

In [9]:
import os

# Location of the training CSV. Defaults to the original local path so the
# notebook still runs unchanged on the author's machine; set the
# CRIME_TRAIN_CSV environment variable to point at the file elsewhere.
file_path = os.environ.get(
    'CRIME_TRAIN_CSV',
    r'C:\Users\nhl08\OneDrive\Documents\AI02\Udemy\Forecasting Crime\train.csv',
)
df_train = pd.read_csv(file_path)

In [10]:
import os

# Location of the test CSV. Defaults to the original local path; set the
# CRIME_TEST_CSV environment variable to run the notebook on another machine
# without editing this cell.
file_path_test = os.environ.get(
    'CRIME_TEST_CSV',
    r'C:\Users\nhl08\OneDrive\Documents\AI02\Udemy\Forecasting Crime\test.csv',
)
df_test = pd.read_csv(file_path_test)

In [11]:
# Separate the target column from the feature columns of the training frame.
y = df_train['Crime_Category']
X = df_train.drop('Crime_Category', axis=1)

In [12]:
# Timestamp layout used by both date columns (12-hour clock with AM/PM).
date_format = '%m/%d/%Y %I:%M:%S %p'

# Parse the raw date strings of the training features into datetimes.
for date_col in ('Date_Reported', 'Date_Occurred'):
    X[date_col] = pd.to_datetime(X[date_col], format=date_format)

In [13]:
# Calendar features derived from the (already parsed) occurrence date.
occurred_dt = X['Date_Occurred'].dt
X['Year_Occurred'] = occurred_dt.year
X['Month_Occurred'] = occurred_dt.month
X['Day_Occurred'] = occurred_dt.day
X['DayOfWeek_Occurred'] = occurred_dt.dayofweek

# The hour is taken from Time_Occurred by integer-dividing by 100.
X['Hour_Occurred'] = X['Time_Occurred'].apply(lambda hhmm: int(hhmm // 100))


In [14]:
# Calendar features derived from the (already parsed) report date.
reported_dt = X['Date_Reported'].dt
X['Year_Reported'] = reported_dt.year
X['Month_Reported'] = reported_dt.month
X['Day_Reported'] = reported_dt.day
X['DayOfWeek_Reported'] = reported_dt.dayofweek

In [15]:
# Parse the test-set date strings with the same format used for the training set.
for test_date_col in ('Date_Reported', 'Date_Occurred'):
    df_test[test_date_col] = pd.to_datetime(df_test[test_date_col], format=date_format)

In [16]:
# Same occurrence-date features as built for the training frame.
test_occurred_dt = df_test['Date_Occurred'].dt
df_test['Year_Occurred'] = test_occurred_dt.year
df_test['Month_Occurred'] = test_occurred_dt.month
df_test['Day_Occurred'] = test_occurred_dt.day
df_test['DayOfWeek_Occurred'] = test_occurred_dt.dayofweek

# The hour is taken from Time_Occurred by integer-dividing by 100.
df_test['Hour_Occurred'] = df_test['Time_Occurred'].apply(lambda hhmm: int(hhmm // 100))

In [17]:
# Same report-date features as built for the training frame.
test_reported_dt = df_test['Date_Reported'].dt
df_test['Year_Reported'] = test_reported_dt.year
df_test['Month_Reported'] = test_reported_dt.month
df_test['Day_Reported'] = test_reported_dt.day
df_test['DayOfWeek_Reported'] = test_reported_dt.dayofweek

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, MultiLabelBinarizer
from sklearn.impute import SimpleImputer

In [19]:
# Encode the target categories as integer class ids. The fitted encoder is
# kept so that predictions can be mapped back to category names later.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

print(y_encoded)

[4 4 4 ... 4 1 4]


In [20]:
# Turn each whitespace-separated Modus_Operandi string into a list of tokens;
# missing values become an empty string first and therefore an empty list.
X['Modus_Operandi'] = X['Modus_Operandi'].fillna('').map(str.split)
df_test['Modus_Operandi'] = df_test['Modus_Operandi'].fillna('').map(str.split)

In [21]:
# Fit the multi-label binarizer on the training token lists and build one
# indicator column per token, aligned to X's index.
mlb = MultiLabelBinarizer()
train_mo_matrix = mlb.fit_transform(X['Modus_Operandi'])
train_mo = pd.DataFrame(train_mo_matrix, index=X.index, columns=mlb.classes_)

# Attach the indicator columns, then discard the original list column.
X = X.join(train_mo)
X = X.drop(columns=['Modus_Operandi'])

In [22]:
# Encode the test token lists with the binarizer fitted on the training set,
# so both frames end up with the same indicator columns.
test_mo_matrix = mlb.transform(df_test['Modus_Operandi'])
test_mo = pd.DataFrame(test_mo_matrix, index=df_test.index, columns=mlb.classes_)

df_test = df_test.join(test_mo)
df_test = df_test.drop(columns=['Modus_Operandi'])



In [23]:
# Columns removed from both frames before modelling (raw dates/times have
# already been expanded into separate feature columns above).
columns_to_drop = ['Location', 'Cross_Street', 'Premise_Description',
                   'Weapon_Used_Code', 'Area_Name', 'Weapon_Description',
                   'Date_Occurred', 'Date_Reported', 'Time_Occurred']

# Kept for reference: the subset of the list that is actually present in X.
existing_columns_to_drop = [col for col in columns_to_drop if col in X.columns]

# Drop per frame with errors='ignore' so that a column present in only one of
# the two frames no longer raises KeyError (previously the list was filtered
# against X.columns only and then applied unchanged to df_test).
X = X.drop(columns=columns_to_drop, errors='ignore')
df_test = df_test.drop(columns=columns_to_drop, errors='ignore')

In [24]:
# Integer-encode Status; the fitted encoder is reused for the test frame.
label_encoder_status = LabelEncoder()
X['Status'] = label_encoder_status.fit_transform(X['Status'])

# Non-positive ages are treated as invalid and replaced by the median of the
# positive ages in the training set (the same median is reused for the test set).
median_victim_age = X.loc[X['Victim_Age'] > 0, 'Victim_Age'].median()
X['Victim_Age'] = X['Victim_Age'].apply(lambda x: median_victim_age if x <= 0 else x)

# Fill missing categories with fixed defaults. Assign the result back instead
# of calling fillna(inplace=True) on a column selection: that chained in-place
# form warns on recent pandas and does not modify X under Copy-on-Write.
# NOTE(review): 'M' / 'B' are hard-coded fill values — confirm they are the intended defaults.
X['Victim_Sex'] = X['Victim_Sex'].fillna('M')
X['Victim_Descent'] = X['Victim_Descent'].fillna('B')

In [25]:
# Apply the training-set Status encoding to the test frame.
df_test['Status'] = label_encoder_status.transform(df_test['Status'])

# Replace non-positive ages with the median computed on the training set.
df_test['Victim_Age'] = df_test['Victim_Age'].apply(lambda x: median_victim_age if x <= 0 else x)

# Same fixed fill values as for the training frame. Assign the result back
# instead of calling fillna(inplace=True) on a column selection: that chained
# in-place form warns on recent pandas and does not modify df_test under
# Copy-on-Write.
df_test['Victim_Sex'] = df_test['Victim_Sex'].fillna('M')
df_test['Victim_Descent'] = df_test['Victim_Descent'].fillna('B')

In [26]:
# Hold out 25% of the labelled rows for validation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.25,
    random_state=42,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Column groups, one per preprocessing strategy.
onehot_columns = ['Victim_Sex', 'Victim_Descent', 'Status', 'Area_ID']
scaled_columns = ['Latitude', 'Longitude']
# Given as a bare string (not a list), matching what the text vectorizer is handed.
text_column = 'Status_Description'
untouched_columns = [
    'Premise_Code', 'Reporting_District_no', 'Victim_Age',
    'Year_Reported', 'Month_Reported', 'Day_Reported',
    'DayOfWeek_Reported', 'Year_Occurred', 'Month_Occurred',
    'Day_Occurred', 'DayOfWeek_Occurred', 'Hour_Occurred',
]

# Any column not listed above is also passed through unchanged via `remainder`.
preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), onehot_columns),
        ('ss', StandardScaler(), scaled_columns),
        ('tfidf', TfidfVectorizer(), text_column),
        ('passthrough', 'passthrough', untouched_columns),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Preview of the transformed feature matrix, fitted here on the full X.
transformed_data = preprocessor.fit_transform(X)
column_names = preprocessor.get_feature_names_out()
transformed_df = pd.DataFrame(transformed_data, columns=column_names)

transformed_df.head()

Unnamed: 0,Victim_Sex_F,Victim_Sex_H,Victim_Sex_M,Victim_Sex_X,Victim_Descent_A,Victim_Descent_B,Victim_Descent_C,Victim_Descent_D,Victim_Descent_F,Victim_Descent_G,...,2126,2157,2201,2204,2303,2304,3003,3004,4025,9999
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
from sklearn.pipeline import Pipeline

# Single-step pipeline wrapping the column preprocessor defined earlier.
pipeline_steps = [('preprocessor', preprocessor)]
pipeline = Pipeline(steps=pipeline_steps)

pipeline

In [29]:
# Fit the preprocessing pipeline on the training split only, then apply the
# fitted pipeline to the validation split and to the test frame.
# NOTE: X_train and X_val are rebound to their transformed versions, so this
# cell is not idempotent — re-run the train/validation split cell before
# re-running this one.
X_train = pipeline.fit_transform(X_train)
X_val = pipeline.transform(X_val)
X_test = pipeline.transform(df_test)

In [30]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV,StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

In [31]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on the held-out data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place, so the caller's object is trained after this call.
    X_train, y_train : features and labels used for fitting.
    X_test, y_test : features and labels used for scoring.

    Returns
    -------
    tuple
        ``(accuracy, report, conf_matrix)`` as produced by ``accuracy_score``,
        ``classification_report`` (called with ``zero_division=1``) and
        ``confusion_matrix`` on the held-out predictions.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    return (
        accuracy_score(y_test, predicted),
        classification_report(y_test, predicted, zero_division=1),
        confusion_matrix(y_test, predicted),
    )

In [32]:
from xgboost import XGBClassifier

# `use_label_encoder` is no longer passed: the installed XGBoost does not use
# it and only reported 'Parameters: { "use_label_encoder" } are not used.'
# The labels are already integer-encoded by `label_encoder`.
xgbc_model = XGBClassifier(eval_metric='mlogloss', random_state=42)

# Prints the (accuracy, classification report, confusion matrix) tuple for the
# validation split.
print(evaluate_model(xgbc_model, X_train, y_train, X_val, y_val))

Parameters: { "use_label_encoder" } are not used.



(0.951, '              precision    recall  f1-score   support\n\n           0       0.69      0.80      0.74        41\n           1       0.81      0.85      0.83       460\n           2       0.96      0.91      0.93       337\n           3       0.42      0.24      0.30        42\n           4       0.99      0.99      0.99      2884\n           5       0.94      0.95      0.95      1236\n\n    accuracy                           0.95      5000\n   macro avg       0.80      0.79      0.79      5000\nweighted avg       0.95      0.95      0.95      5000\n', array([[  33,    1,    0,    0,    0,    7],
       [   4,  391,    9,    9,   16,   31],
       [   1,   23,  307,    4,    1,    1],
       [   0,   23,    5,   10,    2,    2],
       [   0,   12,    0,    0, 2844,   28],
       [  10,   34,    0,    1,   21, 1170]], dtype=int64))


In [33]:
from lightgbm import LGBMClassifier

# LightGBM classifier with a fixed seed; evaluate_model fits it on the
# training split, so the model is trained once this cell has run.
lxgbc_model = LGBMClassifier(random_state=42)

lgbm_evaluation = evaluate_model(lxgbc_model, X_train, y_train, X_val, y_val)
print(lgbm_evaluation)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001952 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1565
[LightGBM] [Info] Number of data points in the train set: 15000, number of used features: 225
[LightGBM] [Info] Start training from score -4.400870
[LightGBM] [Info] Start training from score -2.409428
[LightGBM] [Info] Start training from score -2.690210
[LightGBM] [Info] Start training from score -4.695825
[LightGBM] [Info] Start training from score -0.535346
[LightGBM] [Info] Start training from score -1.446469
(0.9542, '              precision    recall  f1-score   support\n\n           0       0.74      0.63      0.68        41\n           1       0.80      0.87      0.83       460\n           2       0.97      0.91      0.94       337\n           3       0.58      0.17      0.26        42\n           4       0.99      0.99  

In [34]:
# Predict encoded class ids for the transformed test set with the LightGBM
# model, which was fitted on the training split inside evaluate_model.
y_pred_lxgbc = lxgbc_model.predict(X_test)

In [35]:
# Map the integer predictions back to the original Crime_Category labels using
# the encoder that was fitted on the training target.
y_test_pred_lgbm_original = label_encoder.inverse_transform(y_pred_lxgbc)

print(y_test_pred_lgbm_original)

['Violent Crimes' 'Property Crimes' 'Fraud and White-Collar Crimes' ...
 'Violent Crimes' 'Violent Crimes' 'Property Crimes']


In [36]:
# Build the submission table: a 1-based running ID next to each predicted
# category, then write it out without the DataFrame index.
n_predictions = len(y_test_pred_lgbm_original)
predict_numbers = np.arange(1, n_predictions + 1)

submission_columns = {
    "ID": predict_numbers,
    "Crime_Category": y_test_pred_lgbm_original,
}
data = pd.DataFrame(submission_columns)

data.to_csv("submission.csv", index=False)