In [3]:
import pandas as pd 
import numpy as np
import seaborn as sns
import category_encoders as ce
import matplotlib.pyplot as plt 
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (classification_report, accuracy_score, roc_auc_score,
                             log_loss, matthews_corrcoef, balanced_accuracy_score)
from xgboost import XGBClassifier


In [4]:
# Load the crash records from the CSV export into a DataFrame
df = pd.read_csv(filepath_or_buffer='Traffic_Crashes_-_Crashes.csv')

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,POSTED_SPEED_LIMIT,LANE_CNT,STREET_NO,BEAT_OF_OCCURRENCE,NUM_UNITS,INJURIES_TOTAL,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE
count,794956.0,199006.0,794956.0,794951.0,794956.0,793215.0,793215.0,793215.0,793215.0,793215.0,793215.0,793215.0,794956.0,794956.0,794956.0,789567.0,789567.0
mean,28.402547,13.33032,3688.085435,1242.916397,2.034919,0.189448,0.001195,0.020032,0.107025,0.061195,2.004313,0.0,13.2028,4.122344,6.787978,41.854788,-87.67345
std,6.179272,2961.623,2887.990566,705.268208,0.452475,0.565654,0.037456,0.165673,0.422095,0.318451,1.158715,0.0,5.568727,1.980411,3.405398,0.337523,0.686153
min,0.0,0.0,0.0,111.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-87.936193
25%,30.0,2.0,1248.0,714.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9.0,2.0,4.0,41.782429,-87.721709
50%,30.0,2.0,3201.0,1211.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,14.0,4.0,7.0,41.874743,-87.674094
75%,30.0,4.0,5600.0,1822.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,17.0,6.0,10.0,41.924336,-87.6333
max,99.0,1191625.0,451100.0,6100.0,18.0,21.0,4.0,10.0,21.0,15.0,61.0,0.0,23.0,7.0,12.0,42.02278,0.0


In [6]:
# Column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 794956 entries, 0 to 794955
Data columns (total 48 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   CRASH_RECORD_ID                794956 non-null  object 
 1   CRASH_DATE_EST_I               59606 non-null   object 
 2   CRASH_DATE                     794956 non-null  object 
 3   POSTED_SPEED_LIMIT             794956 non-null  int64  
 4   TRAFFIC_CONTROL_DEVICE         794956 non-null  object 
 5   DEVICE_CONDITION               794956 non-null  object 
 6   WEATHER_CONDITION              794956 non-null  object 
 7   LIGHTING_CONDITION             794956 non-null  object 
 8   FIRST_CRASH_TYPE               794956 non-null  object 
 9   TRAFFICWAY_TYPE                794956 non-null  object 
 10  LANE_CNT                       199006 non-null  float64
 11  ALIGNMENT                      794956 non-null  object 
 12  ROADWAY_SURFACE_COND          

In [7]:
# Report the frame dimensions; df.shape is (rows, columns), unpacked into the two placeholders
print('This data has {} rows and {} columns'.format(*df.shape))

This data has 794956 rows and 48 columns


In [8]:
# Count missing values per column.
# This cell draws no figure, so the former trailing plt.show() was a no-op and is removed.
print("Dataset missing values: \n", df.isna().sum())

Dataset missing values: 
 CRASH_RECORD_ID                       0
CRASH_DATE_EST_I                 735350
CRASH_DATE                            0
POSTED_SPEED_LIMIT                    0
TRAFFIC_CONTROL_DEVICE                0
DEVICE_CONDITION                      0
WEATHER_CONDITION                     0
LIGHTING_CONDITION                    0
FIRST_CRASH_TYPE                      0
TRAFFICWAY_TYPE                       0
LANE_CNT                         595950
ALIGNMENT                             0
ROADWAY_SURFACE_COND                  0
ROAD_DEFECT                           0
REPORT_TYPE                       23258
CRASH_TYPE                            0
INTERSECTION_RELATED_I           612741
NOT_RIGHT_OF_WAY_I               758229
HIT_AND_RUN_I                    546330
DAMAGE                                0
DATE_POLICE_NOTIFIED                  0
PRIM_CONTRIBUTORY_CAUSE               0
SEC_CONTRIBUTORY_CAUSE                0
STREET_NO                             0
STREET_DIRECTI

In [10]:
# Columns removed before modelling (one name per line for easier diffing)
columns_to_drop = [
    "CRASH_RECORD_ID",
    "CRASH_DATE_EST_I",
    "CRASH_DATE",
    "LANE_CNT",
    "REPORT_TYPE",
    "INTERSECTION_RELATED_I",
    "NOT_RIGHT_OF_WAY_I",
    "HIT_AND_RUN_I",
    "DATE_POLICE_NOTIFIED",
    "SEC_CONTRIBUTORY_CAUSE",
    "STREET_NO",
    "STREET_DIRECTION",
    "STREET_NAME",
    "BEAT_OF_OCCURRENCE",
    "PHOTOS_TAKEN_I",
    "STATEMENTS_TAKEN_I",
    "DOORING_I",
    "WORK_ZONE_I",
    "WORK_ZONE_TYPE",
    "WORKERS_PRESENT_I",
    "INJURIES_TOTAL",
    "INJURIES_FATAL",
    "INJURIES_INCAPACITATING",
    "INJURIES_NON_INCAPACITATING",
    "INJURIES_REPORTED_NOT_EVIDENT",
    "INJURIES_NO_INDICATION",
    "INJURIES_UNKNOWN",
    "LATITUDE",
    "LONGITUDE",
    "LOCATION",
    "MOST_SEVERE_INJURY",
]

# Drop the listed columns; names that are not present in the frame are skipped
df = df.drop(labels=columns_to_drop, axis=1, errors='ignore')

# Show the columns that remain
print("Danh sách cột sau khi xóa:", df.columns, sep="\n")


Danh sách cột sau khi xóa:
Index(['POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT',
       'CRASH_TYPE', 'DAMAGE', 'PRIM_CONTRIBUTORY_CAUSE', 'NUM_UNITS',
       'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH'],
      dtype='object')


In [11]:
# Inspect the target column: how often each class occurs and how many classes there are
value_counts = df['CRASH_TYPE'].value_counts()
unique_count = df['CRASH_TYPE'].nunique()

print(f"Số lượng giá trị khác nhau trong CRASH_TYPE: {unique_count}")
print("\nSố lần xuất hiện của từng giá trị trong CRASH_TYPE:")
print(value_counts)


Số lượng giá trị khác nhau trong CRASH_TYPE: 2

Số lần xuất hiện của từng giá trị trong CRASH_TYPE:
CRASH_TYPE
NO INJURY / DRIVE AWAY              583177
INJURY AND / OR TOW DUE TO CRASH    211779
Name: count, dtype: int64


In [13]:

# Discard every row that still contains at least one missing value
df = df.dropna(axis=0, how='any')

In [14]:


# Binarise the target: 1 = 'INJURY AND / OR TOW DUE TO CRASH', 0 = any other value.
# The conversion is guarded so the cell is idempotent: without the guard, re-running
# it would compare the already-encoded 0/1 values against the string label and
# silently turn every row into class 0.
POSITIVE_LABEL = 'INJURY AND / OR TOW DUE TO CRASH'
if not pd.api.types.is_numeric_dtype(df['CRASH_TYPE']):
    df['CRASH_TYPE'] = (df['CRASH_TYPE'] == POSITIVE_LABEL).astype(int)
assert df['CRASH_TYPE'].nunique() == 2, "CRASH_TYPE should contain both classes after binarisation"

# Integer-encode the feature columns.
# NOTE(review): the list also contains columns that are already numeric
# (POSTED_SPEED_LIMIT, CRASH_HOUR, CRASH_DAY_OF_WEEK, CRASH_MONTH); for those the
# encoder replaces each value by the rank of that value among the sorted uniques.
categorical_cols = ['POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 'WEATHER_CONDITION',
                    'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'ALIGNMENT',
                    'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'DAMAGE', 'PRIM_CONTRIBUTORY_CAUSE',
                    'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH']

# The same encoder object is refit for every column, so after the loop it only
# holds the mapping of the last column in the list.
label_encoder = LabelEncoder()
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

# Separate the features (X) from the target (y)
X = df.drop('CRASH_TYPE', axis=1)
y = df['CRASH_TYPE']

# Stratified 70/30 train/test split so both parts keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# "Balanced" class weights for the training labels.
# NOTE(review): these weights are informational only; nothing below passes them to
# the model. Class imbalance is explored through the `scale_pos_weight` grid instead.
# They are kept because other cells may read these names.
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, class_weights))

# === GridSearchCV setup for XGBoost ===
# 3 * 3 * 3 * 2 * 2 * 3 * 3 = 972 candidates, i.e. 2916 fits with cv=3.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [1, 5, 10],
    'gamma': [0, 0.1, 0.3]
}

# `use_label_encoder` is intentionally not passed: the installed XGBoost no longer
# recognises it and only emits a "Parameters: { "use_label_encoder" } are not used." warning.
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Fit every candidate on the training split
grid_search.fit(X_train, y_train)

# === Best model ===
best_xgb_model = grid_search.best_estimator_
print("\nBest Parameters from GridSearchCV:")
print(grid_search.best_params_)

# === Predict and evaluate on the held-out test split ===
y_pred_xgb = best_xgb_model.predict(X_test)               # hard class labels
y_prob_xgb = best_xgb_model.predict_proba(X_test)[:, 1]   # probability of class 1

# Classification Report
print("\nClassification Report (XGBoost):")
print(classification_report(y_test, y_pred_xgb))

# Accuracy
accuracy = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy (XGBoost): {accuracy:.2f}")

# AUC Score (computed from the class-1 probabilities)
auc_score = roc_auc_score(y_test, y_prob_xgb)
print(f"AUC (XGBoost): {auc_score:.2f}")

# Log Loss (computed from the class-1 probabilities)
log_loss_value = log_loss(y_test, y_prob_xgb)
print(f"Log Loss (XGBoost): {log_loss_value:.2f}")

# Matthews Correlation Coefficient (MCC)
mcc_value = matthews_corrcoef(y_test, y_pred_xgb)
print(f"Matthews Correlation Coefficient (MCC): {mcc_value:.2f}")

# Balanced Accuracy
balanced_accuracy_value = balanced_accuracy_score(y_test, y_pred_xgb)
print(f"Balanced Accuracy (XGBoost): {balanced_accuracy_value:.2f}")

# === End ===


Fitting 3 folds for each of 972 candidates, totalling 2916 fits


Parameters: { "use_label_encoder" } are not used.




Best Parameters from GridSearchCV:
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'scale_pos_weight': 1, 'subsample': 1.0}

Classification Report (XGBoost):
              precision    recall  f1-score   support

           0       0.84      0.92      0.88    174953
           1       0.71      0.52      0.60     63534

    accuracy                           0.82    238487
   macro avg       0.78      0.72      0.74    238487
weighted avg       0.81      0.82      0.81    238487

Accuracy (XGBoost): 0.82
AUC (XGBoost): 0.84
Log Loss (XGBoost): 0.41
Matthews Correlation Coefficient (MCC): 0.49
Balanced Accuracy (XGBoost): 0.72
