# UK Accidents 分析及建模

## 导入公共库

In [1]:
from time import time
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
plt.style.use('ggplot')
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings('ignore')

# 导入数据

## Accidents 事故数据

In [2]:
accidents=pd.read_csv('./data/uk_accident/Accidents0515.csv', index_col='Accident_Index')
print("数据条数: %d, 特征数量: %d" % accidents.shape)

数据条数: 1780653, 特征数量: 31


## Vehicles 涉及事故车辆数据

In [3]:
vehicles=pd.read_csv('./data/uk_accident/Vehicles0515.csv', error_bad_lines=False, index_col='Accident_Index', warn_bad_lines=False)
print("数据条数: %d, 特征数量: %d" % accidents.shape)

数据条数: 1780653, 特征数量: 31


## 连接上述两数据集
- 通过 **Accident_Index** 索引完成

In [4]:
uk_accident=accidents.join(vehicles, how='outer')
print("连接后数据量: %d, 特征数量: %d" % uk_accident.shape)

连接后数据量: 3144481, 特征数量: 52


# 初步机器学习

## 数据预处理

In [5]:
uk_raw_data=uk_accident.copy()

### 去除不适用的特征

In [6]:
# 删除无用列
uk_raw_data.drop(
    labels=[
        # 'Longitude',
        # 'Latitude',
        'Location_Easting_OSGR',
        'Location_Northing_OSGR',
        'Police_Force',
        'Number_of_Vehicles',
        'Number_of_Casualties',
        'Date',
        'Time',
        'Local_Authority_(District)',
        'Local_Authority_(Highway)',
        '1st_Road_Number',
        '2nd_Road_Number',
        'Did_Police_Officer_Attend_Scene_of_Accident',
        'LSOA_of_Accident_Location',
        'Vehicle_Reference',
        '1st_Point_of_Impact',
        'Sex_of_Driver',
        'Age_of_Driver',
        'Age_Band_of_Driver',
        'Was_Vehicle_Left_Hand_Drive?',
        'Journey_Purpose_of_Driver',
        'Driver_IMD_Decile',
        'Driver_Home_Area_Type',
    ],
    axis=1,
    inplace=True
)

In [7]:
uk_raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3144481 entries, 200501BS00001 to 2015984141415
Data columns (total 28 columns):
 #   Column                                   Dtype  
---  ------                                   -----  
 0   Accident_Severity                        int64  
 1   Day_of_Week                              int64  
 2   1st_Road_Class                           int64  
 3   Road_Type                                int64  
 4   Speed_limit                              int64  
 5   Junction_Detail                          int64  
 6   Junction_Control                         int64  
 7   2nd_Road_Class                           int64  
 8   Pedestrian_Crossing-Human_Control        int64  
 9   Pedestrian_Crossing-Physical_Facilities  int64  
 10  Light_Conditions                         int64  
 11  Weather_Conditions                       int64  
 12  Road_Surface_Conditions                  int64  
 13  Special_Conditions_at_Site               int64  
 14  Carri

### 去除空数据

In [8]:
# 去除 -1 数据
for col in uk_raw_data.columns:
    uk_raw_data = (uk_raw_data[uk_raw_data[col]!=-1])
# 去除 NaN 数据
uk_raw_data.dropna(inplace=True)


KeyboardInterrupt



### 预处理结果

In [None]:
print('数据量: %d, 特征量: %d' % uk_raw_data.shape)

## 切分训练集、测试集数据

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
uk_y=uk_raw_data['Accident_Severity']
uk_x=uk_raw_data.drop(labels='Accident_Severity', axis=1)

In [None]:
# 切分数据集
X_train, X_test, y_train, y_test = train_test_split(
    uk_x.values,
    uk_y.values,
    test_size=0.2,
    random_state=1
)
print('(数据量, 特征量)')
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

## 决策树 Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
decision_tree1=DecisionTreeClassifier(random_state=1)
t0=time()
# 训练决策树模型
decision_tree1.fit(X_train, y_train)
print(f'训练耗时: {time()-t0:.2f}s')
# 展示精度
acc_dt1=decision_tree1.score(X_test, y_test)
print(f'[Accuray]={acc_dt1*100:.2f}')

### 获取特征重要程度

In [None]:
importance_figure=plt.figure(figsize=(10, 18))
feat_importances = pd.DataFrame(
    {
        'feature': uk_x.columns,
        'importance': decision_tree1.feature_importances_, 
    }
)
feat_importances.sort_values(by='importance', ascending=False, inplace=True)
sns.barplot(
    data=feat_importances,
    x='importance',
    y='feature',
    orient='h',
)

In [None]:
importance_figure.savefig('feature_importance_0118.png', dpi=200, pad_inches=0.5, bbox_inches='tight')

# 筛选特征
- 根据决策树的 feat_importance 筛选部分结果执行进一步地机器学习
    - 01-18 16:52
        - Latitude
        - Longitude
        - Engine_Capacity_(CC)
        - Age_of_Driver
        - Age_of_Vehicle
        - Day_of_Week
- 查看特征相关性

## 查看相关性

In [None]:
feat_correlation=uk_x.corr()
corr_heatmap=plt.figure(figsize=(20, 10))
sns.heatmap(feat_correlation)

In [None]:
corr_heatmap.savefig('correlation-heatmap-0118.png', dpi=200, pad_inches=0.5, bbox_inches='tight')

## 筛选特征

In [None]:
# ①
uk_data=uk_raw_data.copy()
uk_data=uk_data[
    [
        'Accident_Severity',
        'Latitude',
        'Longitude',
        'Engine_Capacity_(CC)',
        'Age_of_Driver',
        'Age_of_Vehicle',
        'Day_of_Week'
    ]
]

In [None]:
# ②
uk_data=uk_raw_data.copy()
uk_data=uk_data[
    [
        'Accident_Severity',
        'Longitude',
        'Latitude',
        'Engine_Capacity_(CC)',
        'Age_of_Vehicle',
        'Day_of_Week',
        'Vehicle_Type',
        'Vehicle_Manoeuvre',
        'Junction_Detail',
        '1st_Road_Class',
        'Junction_Location',
    ]
]

In [None]:
print('数据量: %d, 特征量: %d' % uk_data.shape)

# 再次机器学习

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

## 数据处理

In [None]:
y=uk_data['Accident_Severity']
X=uk_data.drop(labels='Accident_Severity', axis=1)

In [None]:
# 切分训练集、测试集
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    test_size=0.2,
    random_state=1
)
print('(数据量, 特征量)')
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

## Logistic Regression 逻辑回归

In [None]:
from sklearn.linear_model import LogisticRegressionCV

In [None]:
lr_cv=LogisticRegressionCV(cv=3, random_state=1, multi_class='multinomial')
t0=time()
lr_cv.fit(X_train, y_train)
print(f'训练耗时: {time()-t0:.2f}s')
print(f'[Accuracy]={accuracy_score(y_test, lr_cv.predict(X_test))*100:.2f}')

## Decision Tree 决策树

### 默认参数

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
decision_tree=DecisionTreeClassifier(min_samples_leaf=16, max_features=4)
t0=time()
decision_tree.fit(X_train, y_train)
print(f'训练耗时: {time()-t0:.2f}s')
acc_decision_tree=decision_tree.score(X_test, y_test)
print(f"[Accuracy]={acc_decision_tree*100:.2f}")

### 参数搜索

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid={
    'min_samples_leaf': [1, 10, 100, 1000],
    'min_samples_split': [1, 10, 100, 1000],
    # 'max_depth': [1, 10, 100],
    # 'max_features': list(range(1, 7)),
}

In [None]:
grid_search=GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
    verbose=2
)
t0=time()
grid_search.fit(X_train, y_train)
print(f'搜索耗时: {time()-t0:.2f}s')

In [None]:
print(grid_search.best_params_)
print(grid_search.best_score_)

- 01-18 17:41 -> {'max_depth': 1, 'min_samples_leaf': 100, 'min_samples_split': 10}
acc=0.8825635788369205

- 01-18 23:36 -> {'min_samples_leaf': 1000, 'min_samples_split': 10}
acc=0.8853849598909287

In [None]:
y_pred=grid_search.best_estimator_.predict(X_test)
print(f'[Accuracy]={accuracy_score(y_test, y_pred)*100:.2f}')

## 集成学习

### 准备数据

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
uk_ensemble=uk_raw_data.copy()

In [None]:
uk_ensemble_y=uk_ensemble['Accident_Severity']
uk_ensemble_x=uk_ensemble.drop(labels='Accident_Severity', axis=1)

In [None]:
# 切分数据集
X_train, X_test, y_train, y_test = train_test_split(
    uk_ensemble_x.values,
    uk_ensemble_y.values,
    test_size=0.2,
    random_state=1,
)
print('(数据量, 特征量)')
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

### Bagging Method

In [None]:
from sklearn.ensemble import BaggingClassifier

#### DecisionTree Bagging

In [None]:
bagging_dt=BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=20
)

In [None]:
t0=time()
bagging_dt.fit(X_train, y_train)
print(f"Decision Tree Bagging 训练耗时: {time()-t0:.2f}s")

In [None]:
acc_bagging_dt=accuracy_score(y_test, bagging_dt.predict(X_test))
print(f'[Accuracy]={acc_bagging_dt*100:.2f}')

#### Logistic Regression Bagging

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
bagging_lr=BaggingClassifier(
    base_estimator=LogisticRegression(),
    n_estimators=20
)

In [None]:
t0=time()
bagging_lr.fit(X_train, y_train)
print(f"Logistic Regression Bagging 训练耗时: {time()-t0:.2f}s")

In [None]:
acc_bagging_lr=accuracy_score(y_test, bagging_lr.predict(X_test))
print(f'[Accuracy]={acc_bagging_lr*100:.2f}')

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
ada=AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(
        max_depth=10,
        min_samples_leaf=100,
        min_samples_split=10
    ),
    n_estimators=100,
    algorithm="SAMME",
    # learning_rate=1.2,
    random_state=1
)

In [None]:
t0=time()
ada.fit(X_train, y_train)
print(f'Adaboost 训练耗时: {time()-t0:.2f}s')

In [None]:
ada_test_acc=[]
for pred in ada.staged_predict(X_test):
    ada_test_acc.append(accuracy_score(y_test, pred))
print(f'number of estimator = {len(ada.estimators_)}')
print(f'average acc={(sum(ada_test_acc)/len(ada_test_acc))*100:.2f}')

In [None]:
sns.lineplot(np.array(ada_test_acc))

### GradientBoosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gb=GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=10,
    min_samples_leaf=100,
    min_samples_split=10,
    random_state=1,
)

In [None]:
t0=time()
gb.fit(X_train, y_train)
print(f'GradientBoosting 训练耗时: {time()-t0:.2f}s')

In [None]:
gb_test_acc=[]
for pred in gb.staged_predict(X_test):
    gb_test_acc.append(accuracy_score(y_test, pred))
print(f'number of estimator = {len(gb.estimators_)}')
print(f'average acc={(sum(gb_test_acc)/len(gb_test_acc))*100:.2f}')

In [None]:
sns.lineplot(np.array(gb_test_acc))

### HistGradientBoosting

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

In [None]:
hgb=HistGradientBoostingClassifier(
    max_iter=100,
    loss='categorical_crossentropy',
    learning_rate=0.6,
    max_leaf_nodes=None,
    scoring='accuracy',
    n_iter_no_change=10,
    tol=1e-6,
    verbose=1,
    random_state=1,
)

In [None]:
t0=time()
hgb.fit(X_train, y_train)
print(f'HistGradientBoosting 训练耗时: {time()-t0:.2f}s')

In [None]:
hgb_test_acc=[]
for pred in hgb.staged_predict(X_test):
    hgb_test_acc.append(accuracy_score(y_test, pred))
print(f'number of estimator = {len(gb.estimators_)}')
print(f'average acc={(sum(hgb_test_acc)/len(hgb_test_acc))*100:.2f}')
print(f'top acc={max(hgb_test_acc)*100:.2f}')

In [None]:
sns.lineplot(np.array(hgb_test_acc))

### Random Forest 随机森林

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf=RandomForestClassifier()

In [None]:
t0=time()
rf.fit(X_train, y_train)
print(f'随机森林训练耗时: {time()-t0:.2f}s')

In [None]:
print(f'[Accuracy]={accuracy_score(y_test, rf.predict(X_test))*100:.2f}')

In [None]:
# 展示特征重要程度
importance_figure=plt.figure()
feat_importances = pd.DataFrame(
    {
        'feature': X.columns,
        'importance': rf.feature_importances_, 
    }
)
feat_importances.sort_values(by='importance', ascending=False, inplace=True)
sns.barplot(
    data=feat_importances,
    x='importance',
    y='feature',
    orient='h'
)

# 神经网络

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
uk_mlp=uk_raw_data.copy()

In [None]:
uk_mlp_y=uk_ensemble['Accident_Severity']
uk_mlp_x=uk_ensemble.drop(labels='Accident_Severity', axis=1)

In [None]:
# 切分数据集
X_train, X_test, y_train, y_test = train_test_split(
    uk_mlp_x.values,
    uk_mlp_y.values,
    test_size=0.2,
    random_state=1,
)
print('(数据量, 特征量)')
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca=PCA(n_components=6)
pca.fit(X_train)
X_train_pca=pca.transform(X_train)
X_test_pca=pca.transform(X_test)

## MLP

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler=StandardScaler()
scaler.fit(X_train_pca)
X_train_scaled=scaler.transform(X_train_pca)
X_test_scaled=scaler.transform(X_test_pca)

In [None]:
X_train_scaled.shape

In [None]:
y_train_scaled.shape

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
mlp=MLPClassifier(
    hidden_layer_sizes=(100, 25, 6),
    solver='sgd',
    learning_rate_init=0.5,
    learning_rate='adaptive',
    verbose=True,
    max_iter=1000,
)

In [None]:
t0=time()
mlp.fit(X_train_scaled, y_train)
print(f'MLP 训练耗时: {time()-t0:.2f}s')

In [None]:
sns.lineplot(mlp.loss_curve_)

In [None]:
acc_mlp=accuracy_score(y_test, mlp.predict(X_test_scaled))
print(f'[Accuracy]={acc_mlp*100:.2f}')

## MLP with Selected Features

In [None]:
mlp_data=uk_raw_data.copy()
mlp_data=mlp_data[
    [
        'Accident_Severity',
        'Latitude',
        'Longitude',
        'Engine_Capacity_(CC)',
        'Age_of_Driver',
        'Age_of_Vehicle',
        'Day_of_Week'
    ]
]

In [None]:
mlp_y=mlp_data['Accident_Severity']
mlp_x=mlp_data.drop(labels='Accident_Severity', axis=1)

In [None]:
mlp_x.describe()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    mlp_x,
    mlp_y,
    test_size=0.3,
    random_state=233
)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler=StandardScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
mlp=MLPClassifier(
    hidden_layer_sizes=(10, 5, 3),
    solver='sgd',
    learning_rate_init=0.3,
    learning_rate='adaptive',
    verbose=True,
    max_iter=1000,
)

In [None]:
t0=time()
mlp.fit(X_train, y_train)
print(f'MLP 训练耗时: {time()-t0:.2f}s')

In [None]:
acc_mlp=accuracy_score(y_test, mlp.predict(X_test_scaled))
print(f'[Accuracy]={acc_mlp*100:.2f}')

In [None]:
sns.lineplot(mlp.loss_curve_)

# extra

## 数据处理

In [None]:
extra_data=uk_data.copy()

In [None]:
extra_data.info()

In [None]:
y=uk_data['Accident_Severity']
X=uk_data.drop(labels=[
    'Accident_Severity',
    'Vehicle_Manoeuvre',
    'Junction_Detail',
    '1st_Road_Class',
    'Vehicle_Type',
    'Junction_Location',
], axis=1)

In [None]:
# 切分训练集、测试集
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    test_size=0.3
)
print('(数据量, 特征量)')
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

## Decision Tree 决策树

### 默认参数

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
decision_tree=DecisionTreeClassifier(min_samples_leaf=16, max_features=4)
t0=time()
decision_tree.fit(X_train, y_train)
print(f'训练耗时: {time()-t0:.2f}s')
acc_decision_tree=decision_tree.score(X_test, y_test)
print(f"[Accuracy]={acc_decision_tree*100:.2f}")

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
for w in ['uniform', 'distance']:
    knn=KNeighborsClassifier(n_neighbors=3, weights=w)
    t0=time()
    knn.fit(X_train, y_train)
    print(f'weight {w} -> cost {time()-t0:.2f}s')
    print(f'[Accuracy]={knn.score(X_test, y_test)*100:.2f}')