In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import category_encoders as ce

import matplotlib.pyplot as plt 
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, log_loss, matthews_corrcoef, balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight



In [2]:
# Đọc file CSV
df = pd.read_csv('Traffic_Crashes_-_Crashes.csv')

In [3]:
df.describe()

Unnamed: 0,POSTED_SPEED_LIMIT,LANE_CNT,STREET_NO,BEAT_OF_OCCURRENCE,NUM_UNITS,INJURIES_TOTAL,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE
count,794956.0,199006.0,794956.0,794951.0,794956.0,793215.0,793215.0,793215.0,793215.0,793215.0,793215.0,793215.0,794956.0,794956.0,794956.0,789567.0,789567.0
mean,28.402547,13.33032,3688.085435,1242.916397,2.034919,0.189448,0.001195,0.020032,0.107025,0.061195,2.004313,0.0,13.2028,4.122344,6.787978,41.854788,-87.67345
std,6.179272,2961.623,2887.990566,705.268208,0.452475,0.565654,0.037456,0.165673,0.422095,0.318451,1.158715,0.0,5.568727,1.980411,3.405398,0.337523,0.686153
min,0.0,0.0,0.0,111.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-87.936193
25%,30.0,2.0,1248.0,714.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9.0,2.0,4.0,41.782429,-87.721709
50%,30.0,2.0,3201.0,1211.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,14.0,4.0,7.0,41.874743,-87.674094
75%,30.0,4.0,5600.0,1822.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,17.0,6.0,10.0,41.924336,-87.6333
max,99.0,1191625.0,451100.0,6100.0,18.0,21.0,4.0,10.0,21.0,15.0,61.0,0.0,23.0,7.0,12.0,42.02278,0.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 794956 entries, 0 to 794955
Data columns (total 48 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   CRASH_RECORD_ID                794956 non-null  object 
 1   CRASH_DATE_EST_I               59606 non-null   object 
 2   CRASH_DATE                     794956 non-null  object 
 3   POSTED_SPEED_LIMIT             794956 non-null  int64  
 4   TRAFFIC_CONTROL_DEVICE         794956 non-null  object 
 5   DEVICE_CONDITION               794956 non-null  object 
 6   WEATHER_CONDITION              794956 non-null  object 
 7   LIGHTING_CONDITION             794956 non-null  object 
 8   FIRST_CRASH_TYPE               794956 non-null  object 
 9   TRAFFICWAY_TYPE                794956 non-null  object 
 10  LANE_CNT                       199006 non-null  float64
 11  ALIGNMENT                      794956 non-null  object 
 12  ROADWAY_SURFACE_COND          

In [5]:
print('This data has {} rows and {} columns'.format(df.shape[0], df.shape[1]))

This data has 794956 rows and 48 columns


In [6]:
print("Dataset missing values: \n", df.isna().sum())
plt.show()

Dataset missing values: 
 CRASH_RECORD_ID                       0
CRASH_DATE_EST_I                 735350
CRASH_DATE                            0
POSTED_SPEED_LIMIT                    0
TRAFFIC_CONTROL_DEVICE                0
DEVICE_CONDITION                      0
WEATHER_CONDITION                     0
LIGHTING_CONDITION                    0
FIRST_CRASH_TYPE                      0
TRAFFICWAY_TYPE                       0
LANE_CNT                         595950
ALIGNMENT                             0
ROADWAY_SURFACE_COND                  0
ROAD_DEFECT                           0
REPORT_TYPE                       23258
CRASH_TYPE                            0
INTERSECTION_RELATED_I           612741
NOT_RIGHT_OF_WAY_I               758229
HIT_AND_RUN_I                    546330
DAMAGE                                0
DATE_POLICE_NOTIFIED                  0
PRIM_CONTRIBUTORY_CAUSE               0
SEC_CONTRIBUTORY_CAUSE                0
STREET_NO                             0
STREET_DIRECTI

In [7]:
# Danh sách các cột cần xóa
columns_to_drop = [
    "CRASH_RECORD_ID", "CRASH_DATE_EST_I","CRASH_DATE", "LANE_CNT", "REPORT_TYPE",
    "INTERSECTION_RELATED_I", "NOT_RIGHT_OF_WAY_I", "HIT_AND_RUN_I",
    "DATE_POLICE_NOTIFIED", "SEC_CONTRIBUTORY_CAUSE", "STREET_NO",
    "STREET_DIRECTION", "STREET_NAME", "BEAT_OF_OCCURRENCE",
    "PHOTOS_TAKEN_I", "STATEMENTS_TAKEN_I", "DOORING_I", "WORK_ZONE_I",
    "WORK_ZONE_TYPE", "WORKERS_PRESENT_I", "INJURIES_TOTAL", "INJURIES_FATAL",
    "INJURIES_INCAPACITATING", "INJURIES_NON_INCAPACITATING",
    "INJURIES_REPORTED_NOT_EVIDENT", "INJURIES_NO_INDICATION", "INJURIES_UNKNOWN", "LATITUDE", "LONGITUDE", "LOCATION", "MOST_SEVERE_INJURY"
]

# Xóa các cột, bỏ qua các cột không tồn tại
df = df.drop(columns=columns_to_drop, errors='ignore')

# Kiểm tra kết quả
print("Danh sách cột sau khi xóa:")
print(df.columns)


Danh sách cột sau khi xóa:
Index(['POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT',
       'CRASH_TYPE', 'DAMAGE', 'PRIM_CONTRIBUTORY_CAUSE', 'NUM_UNITS',
       'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH'],
      dtype='object')


In [8]:
unique_count = df['CRASH_TYPE'].nunique()
print(f"Số lượng giá trị khác nhau trong CRASH_TYPE: {unique_count}")

value_counts = df['CRASH_TYPE'].value_counts()
print("\nSố lần xuất hiện của từng giá trị trong CRASH_TYPE:")
print(value_counts)


Số lượng giá trị khác nhau trong CRASH_TYPE: 2

Số lần xuất hiện của từng giá trị trong CRASH_TYPE:
CRASH_TYPE
NO INJURY / DRIVE AWAY              583177
INJURY AND / OR TOW DUE TO CRASH    211779
Name: count, dtype: int64


In [9]:

# Loại bỏ các hàng null
df = df.dropna()

In [10]:


# Biến đổi cột 'CRASH_TYPE' thành nhị phân
df['CRASH_TYPE'] = df['CRASH_TYPE'].apply(lambda x: 1 if x == 'INJURY AND / OR TOW DUE TO CRASH' else 0)

# Mã hóa các cột dạng chuỗi thành số nguyên
categorical_cols = [
    'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 'WEATHER_CONDITION',
    'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'ALIGNMENT', 'NUM_UNITS',
    'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'DAMAGE', 'PRIM_CONTRIBUTORY_CAUSE',
    'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH'
]

label_encoder = LabelEncoder()
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

# Tách dữ liệu thành đầu vào (X) và nhãn mục tiêu (y)
X = df.drop('CRASH_TYPE', axis=1)  # X: Các biến độc lập
y = df['CRASH_TYPE']              # y: Nhãn mục tiêu

# Chia dữ liệu thành tập huấn luyện và kiểm tra
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Bước 2: Thiết lập mô hình Logistic Regression
# ==============================================
# Sử dụng class_weight='balanced' để xử lý mất cân bằng dữ liệu
logreg_model = LogisticRegression(class_weight='balanced', random_state=42, solver='liblinear')

# Định nghĩa bộ tham số cho GridSearchCV
param_grid_logreg = {
    'C': [0.1, 1, 10],            # Tham số điều chỉnh mức độ regularization
    'penalty': ['l1', 'l2'],      # Loại phạt (L1 hoặc L2)
    'solver': ['liblinear']       # Solver phù hợp với các lựa chọn penalty
}

# Thiết lập GridSearchCV để tìm kiếm tham số tối ưu
grid_search_logreg = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid_logreg,
    scoring='roc_auc',           # Sử dụng AUC làm tiêu chí đánh giá
    cv=5,                        # Số lượng tập chéo
    n_jobs=-1,                   # Sử dụng toàn bộ CPU
    verbose=1                    # Hiển thị quá trình tìm kiếm
)

# Huấn luyện mô hình với tập huấn luyện
grid_search_logreg.fit(X_train, y_train)

# Bước 3: Đánh giá mô hình
# ==============================================
# Lấy mô hình tốt nhất từ GridSearchCV
best_logreg_model = grid_search_logreg.best_estimator_

# Hiển thị tham số tốt nhất
print("\nBest Parameters from GridSearchCV (Logistic Regression):")
print(grid_search_logreg.best_params_)

# Dự đoán trên tập kiểm tra
y_pred_logreg = best_logreg_model.predict(X_test)              # Nhãn dự đoán
y_prob_logreg = best_logreg_model.predict_proba(X_test)[:, 1]  # Xác suất dự đoán

# Báo cáo đánh giá mô hình
print("\nClassification Report (Logistic Regression):")
print(classification_report(y_test, y_pred_logreg))

# Tính các chỉ số đánh giá
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
auc_score_logreg = roc_auc_score(y_test, y_prob_logreg)
log_loss_logreg = log_loss(y_test, y_prob_logreg)
mcc_logreg = matthews_corrcoef(y_test, y_pred_logreg)
balanced_accuracy_logreg = balanced_accuracy_score(y_test, y_pred_logreg)

# In kết quả đánh giá
print(f"Accuracy (Logistic Regression): {accuracy_logreg:.2f}")
print(f"AUC (Logistic Regression): {auc_score_logreg:.2f}")
print(f"Log Loss (Logistic Regression): {log_loss_logreg:.2f}")
print(f"Matthews Correlation Coefficient (MCC): {mcc_logreg:.2f}")
print(f"Balanced Accuracy (Logistic Regression): {balanced_accuracy_logreg:.2f}")



Fitting 5 folds for each of 6 candidates, totalling 30 fits

Best Parameters from GridSearchCV (Logistic Regression):
{'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}

Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.84      0.66      0.74    174953
           1       0.41      0.66      0.51     63534

    accuracy                           0.66    238487
   macro avg       0.63      0.66      0.62    238487
weighted avg       0.73      0.66      0.68    238487

Accuracy (Logistic Regression): 0.66
AUC (Logistic Regression): 0.72
Log Loss (Logistic Regression): 0.62
Matthews Correlation Coefficient (MCC): 0.28
Balanced Accuracy (Logistic Regression): 0.66
