#LGB

In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import category_encoders as ce

import matplotlib.pyplot as plt 
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.metrics import (classification_report, accuracy_score, roc_auc_score,
                             log_loss, matthews_corrcoef, balanced_accuracy_score)



In [2]:
# Đọc file CSV
df = pd.read_csv('Traffic_Crashes_-_Crashes.csv')

In [3]:
# Danh sách các cột cần xóa
columns_to_drop = [
    "CRASH_RECORD_ID", "CRASH_DATE_EST_I","CRASH_DATE", "LANE_CNT", "REPORT_TYPE",
    "INTERSECTION_RELATED_I", "NOT_RIGHT_OF_WAY_I", "HIT_AND_RUN_I",
    "DATE_POLICE_NOTIFIED", "SEC_CONTRIBUTORY_CAUSE", "STREET_NO",
    "STREET_DIRECTION", "STREET_NAME", "BEAT_OF_OCCURRENCE",
    "PHOTOS_TAKEN_I", "STATEMENTS_TAKEN_I", "DOORING_I", "WORK_ZONE_I",
    "WORK_ZONE_TYPE", "WORKERS_PRESENT_I", "INJURIES_TOTAL", "INJURIES_FATAL",
    "INJURIES_INCAPACITATING", "INJURIES_NON_INCAPACITATING",
    "INJURIES_REPORTED_NOT_EVIDENT", "INJURIES_NO_INDICATION", "INJURIES_UNKNOWN", "LATITUDE", "LONGITUDE", "LOCATION","MOST_SEVERE_INJURY"
]

# Xóa các cột, bỏ qua các cột không tồn tại
df = df.drop(columns=columns_to_drop, errors='ignore')

# Kiểm tra kết quả
print("Danh sách cột sau khi xóa:")
print(df.columns)

Danh sách cột sau khi xóa:
Index(['POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT',
       'CRASH_TYPE', 'DAMAGE', 'PRIM_CONTRIBUTORY_CAUSE', 'NUM_UNITS',
       'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH'],
      dtype='object')


In [4]:
# Loại bỏ các hàng null
df = df.dropna()

In [5]:

# Chuyển đổi cột CRASH_TYPE thành nhị phân
df['CRASH_TYPE'] = df['CRASH_TYPE'].apply(lambda x: 1 if x == 'INJURY AND / OR TOW DUE TO CRASH' else 0)

# Mã hóa các cột phân loại thành số
categorical_cols = ['POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 'WEATHER_CONDITION', 
                    'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'ALIGNMENT', 
                    'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'DAMAGE', 'PRIM_CONTRIBUTORY_CAUSE', 
                    'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH']

label_encoder = LabelEncoder()
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

# Tách X và y
X = df.drop('CRASH_TYPE', axis=1)
y = df['CRASH_TYPE']

# Chia dữ liệu thành tập huấn luyện và kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# === Áp dụng SMOTE để cân bằng dữ liệu ===
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# === Thiết lập mô hình LightGBM ===
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

lgb_model = LGBMClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Huấn luyện mô hình trên dữ liệu đã được SMOTE
grid_search.fit(X_train_resampled, y_train_resampled)

# Mô hình tốt nhất
best_lgb_model = grid_search.best_estimator_
print("\nBest Parameters from GridSearchCV:")
print(grid_search.best_params_)

# === Dự đoán và đánh giá ===
y_pred_lgb = best_lgb_model.predict(X_test)
y_prob_lgb = best_lgb_model.predict_proba(X_test)[:, 1]

# Classification Report
print("\nClassification Report (LightGBM):")
print(classification_report(y_test, y_pred_lgb))

# Accuracy
accuracy = accuracy_score(y_test, y_pred_lgb)
print(f"Accuracy (LightGBM): {accuracy:.2f}")

# AUC Score
auc_score = roc_auc_score(y_test, y_prob_lgb)
print(f"AUC (LightGBM): {auc_score:.2f}")

# Log Loss
log_loss_value = log_loss(y_test, y_prob_lgb)
print(f"Log Loss (LightGBM): {log_loss_value:.2f}")

# Matthews Correlation Coefficient (MCC)
mcc_value = matthews_corrcoef(y_test, y_pred_lgb)
print(f"Matthews Correlation Coefficient (MCC): {mcc_value:.2f}")

# Balanced Accuracy
balanced_accuracy_value = balanced_accuracy_score(y_test, y_pred_lgb)
print(f"Balanced Accuracy (LightGBM): {balanced_accuracy_value:.2f}")


Fitting 3 folds for each of 108 candidates, totalling 324 fits
[LightGBM] [Info] Number of positive: 408224, number of negative: 408224
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054735 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 242
[LightGBM] [Info] Number of data points in the train set: 816448, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Best Parameters from GridSearchCV:
{'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}

Classification Report (LightGBM):
              precision    recall  f1-score   support

           0       0.88      0.81      0.84    174953
           1       0.57      0.69      0.63     63534

    accuracy                           0.78    238487
   macro avg       0.73      0.75      0