In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np

from sklearn.decomposition import PCA
from collections import Counter
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import RobustScaler

from sklearn.utils import resample
from sklearn.model_selection import train_test_split, StratifiedKFold,GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_auc_score
from sklearn.tree import plot_tree
from imblearn import over_sampling

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import imbalanced_ensemble

warnings.filterwarnings("ignore")

data = pd.read_csv("M:/DataSet/usaccident/real/selectData.csv",index_col=0)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 212697 entries, 0 to 212696
Columns: 312 entries, TMC to yorkrockhillkuza29730
dtypes: float64(10), int64(302)
memory usage: 507.9 MB


In [2]:
Y = data['Severity']
X = data.drop(["Severity"], axis=1)

del data

X.shape, Y.shape

((212697, 311), (212697,))

划分数据集
=

In [3]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=10, stratify=Y)

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((170157, 311), (42540, 311), (170157,), (42540,))

PCA
=

In [4]:
pca = PCA(n_components=0.999)
pca.fit(X_train)
print(pca.explained_variance_ratio_.sum())
print(len(pca.explained_variance_))

0.9990130811837407
111


In [5]:
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

X_train.shape, X_test.shape

((170157, 111), (42540, 111))

EasyEnsemble
=

In [6]:
ee = imbalanced_ensemble.ensemble.EasyEnsembleClassifier()
ee.fit(X_train, Y_train)

pred_ee = ee.predict(X_test)
y_train_pred = ee.predict(X_train[:30000])

mat_train = confusion_matrix(Y_train[:30000],y_train_pred)
mat_ee = confusion_matrix(Y_test, pred_ee)

print(f"confusion matrix :\n{mat_train}\n")
print(f"confusion matrix :\n{mat_ee}\n")
print(classification_report(Y_train[:30000],y_train_pred))
print(classification_report(Y_test, pred_ee))

del pred_ee, y_train_pred

confusion matrix :
[[    3     4     3     2]
 [ 2362 11353  8356  2445]
 [  205   788  4070   196]
 [   26    23    11   153]]

confusion matrix :
[[   13     6     6     0]
 [ 3328 16082 11949  3500]
 [  274  1075  5680   298]
 [   44    44    18   223]]

              precision    recall  f1-score   support

           1       0.00      0.25      0.00        12
           2       0.93      0.46      0.62     24516
           3       0.33      0.77      0.46      5259
           4       0.05      0.72      0.10       213

    accuracy                           0.52     30000
   macro avg       0.33      0.55      0.30     30000
weighted avg       0.82      0.52      0.59     30000

              precision    recall  f1-score   support

           1       0.00      0.52      0.01        25
           2       0.93      0.46      0.62     34859
           3       0.32      0.78      0.45      7327
           4       0.06      0.68      0.10       329

    accuracy                       

In [7]:
ee_params = {
    'n_estimators':[30,50,70,90,100],
    'oob_score':[False,True]
}
clf = GridSearchCV(ee, param_grid=ee_params,cv=5,scoring='recall',n_jobs=-1)
clf.fit(X_test, Y_test)


ee = imbalanced_ensemble.ensemble.EasyEnsembleClassifier(**clf.best_params_)
ee.fit(X_train, Y_train)

pred_ee = ee.predict(X_test)
y_train_pred = ee.predict(X_train[:30000])

mat_train = confusion_matrix(Y_train[:30000],y_train_pred)
mat_ee = confusion_matrix(Y_test, pred_ee)

print(f"confusion matrix :\n{mat_train}\n")
print(f"confusion matrix :\n{mat_ee}\n")
print(classification_report(Y_train[:30000],y_train_pred))
print(classification_report(Y_test, pred_ee))

del pred_ee, y_train_pred

confusion matrix :
[[    6     3     1     2]
 [ 2421 10857  8686  2552]
 [  197   798  4085   179]
 [   26    21    14   152]]

confusion matrix :
[[   14     5     6     0]
 [ 3344 15427 12393  3695]
 [  267  1106  5678   276]
 [   44    36    31   218]]

              precision    recall  f1-score   support

           1       0.00      0.50      0.00        12
           2       0.93      0.44      0.60     24516
           3       0.32      0.78      0.45      5259
           4       0.05      0.71      0.10       213

    accuracy                           0.50     30000
   macro avg       0.33      0.61      0.29     30000
weighted avg       0.82      0.50      0.57     30000

              precision    recall  f1-score   support

           1       0.00      0.56      0.01        25
           2       0.93      0.44      0.60     34859
           3       0.31      0.77      0.45      7327
           4       0.05      0.66      0.10       329

    accuracy                       

In [8]:
ee_params = {
    'n_estimators':[30,50,70,100],
    'oob_score':[False,True]
}
clf = GridSearchCV(ee, param_grid=ee_params,cv=5,scoring='roc_auc',n_jobs=-1)
clf.fit(X_test, Y_test)


ee = imbalanced_ensemble.ensemble.EasyEnsembleClassifier(**clf.best_params_)
ee.fit(X_train, Y_train)

pred_ee = ee.predict(X_test)
y_train_pred = ee.predict(X_train[:30000])

mat_train = confusion_matrix(Y_train[:30000],y_train_pred)
mat_ee = confusion_matrix(Y_test, pred_ee)

print(f"confusion matrix :\n{mat_train}\n")
print(f"confusion matrix :\n{mat_ee}\n")
print(classification_report(Y_train[:30000],y_train_pred))
print(classification_report(Y_test, pred_ee))

del pred_ee, y_train_pred

confusion matrix :
[[    3     6     1     2]
 [ 2271 10881  9010  2354]
 [  207   806  4057   189]
 [   26    28     9   150]]

confusion matrix :
[[   13     5     7     0]
 [ 3257 15557 12699  3346]
 [  285  1175  5590   277]
 [   40    44    22   223]]

              precision    recall  f1-score   support

           1       0.00      0.25      0.00        12
           2       0.93      0.44      0.60     24516
           3       0.31      0.77      0.44      5259
           4       0.06      0.70      0.10       213

    accuracy                           0.50     30000
   macro avg       0.32      0.54      0.29     30000
weighted avg       0.81      0.50      0.57     30000

              precision    recall  f1-score   support

           1       0.00      0.52      0.01        25
           2       0.93      0.45      0.60     34859
           3       0.31      0.76      0.44      7327
           4       0.06      0.68      0.11       329

    accuracy                       

In [9]:
ee_params = {
    'n_estimators':[30,50,70,90,100],
    'oob_score':[False,True]
}
clf = GridSearchCV(ee, param_grid=ee_params,cv=5,scoring='precision',n_jobs=-1)
clf.fit(X_test, Y_test)


ee = imbalanced_ensemble.ensemble.EasyEnsembleClassifier(**clf.best_params_)
ee.fit(X_train, Y_train)

pred_ee = ee.predict(X_test)
y_train_pred = ee.predict(X_train[:30000])

mat_train = confusion_matrix(Y_train[:30000],y_train_pred)
mat_ee = confusion_matrix(Y_test, pred_ee)

print(f"confusion matrix :\n{mat_train}\n")
print(f"confusion matrix :\n{mat_ee}\n")
print(classification_report(Y_train[:30000],y_train_pred))
print(classification_report(Y_test, pred_ee))

del pred_ee, y_train_pred

confusion matrix :
[[    3     5     2     2]
 [ 2627 10020  9627  2242]
 [  238   680  4159   182]
 [   29    24    18   142]]

confusion matrix :
[[   13     6     6     0]
 [ 3628 14470 13539  3222]
 [  336   953  5767   271]
 [   50    49    32   198]]

              precision    recall  f1-score   support

           1       0.00      0.25      0.00        12
           2       0.93      0.41      0.57     24516
           3       0.30      0.79      0.44      5259
           4       0.06      0.67      0.10       213

    accuracy                           0.48     30000
   macro avg       0.32      0.53      0.28     30000
weighted avg       0.82      0.48      0.54     30000

              precision    recall  f1-score   support

           1       0.00      0.52      0.01        25
           2       0.93      0.42      0.57     34859
           3       0.30      0.79      0.43      7327
           4       0.05      0.60      0.10       329

    accuracy                       

In [10]:
ee_params = {
    'n_estimators':[30,50,70,90,100],
    'oob_score':[False,True]
}
clf = GridSearchCV(ee, param_grid=ee_params,cv=5,scoring='f1',n_jobs=-1)
clf.fit(X_test, Y_test)


ee = imbalanced_ensemble.ensemble.EasyEnsembleClassifier(**clf.best_params_)
ee.fit(X_train, Y_train)

pred_ee = ee.predict(X_test)
y_train_pred = ee.predict(X_train[:30000])

mat_train = confusion_matrix(Y_train[:30000],y_train_pred)
mat_ee = confusion_matrix(Y_test, pred_ee)

print(f"confusion matrix :\n{mat_train}\n")
print(f"confusion matrix :\n{mat_ee}\n")
print(classification_report(Y_train[:30000],y_train_pred))
print(classification_report(Y_test, pred_ee))

del pred_ee, y_train_pred

confusion matrix :
[[    3     4     3     2]
 [ 2396 10623  9069  2428]
 [  210   735  4152   162]
 [   30    18    15   150]]

confusion matrix :
[[   13     5     7     0]
 [ 3315 15215 12802  3527]
 [  285  1048  5744   250]
 [   51    36    29   213]]

              precision    recall  f1-score   support

           1       0.00      0.25      0.00        12
           2       0.93      0.43      0.59     24516
           3       0.31      0.79      0.45      5259
           4       0.05      0.70      0.10       213

    accuracy                           0.50     30000
   macro avg       0.33      0.54      0.29     30000
weighted avg       0.82      0.50      0.56     30000

              precision    recall  f1-score   support

           1       0.00      0.52      0.01        25
           2       0.93      0.44      0.59     34859
           3       0.31      0.78      0.44      7327
           4       0.05      0.65      0.10       329

    accuracy                       

BalanceCascade
=

In [15]:
# clf = imbalanced_ensemble.ensemble.BalanceCascadeClassifier(random_state=0)
# clf.fit(X_train, Y_train)

# pred_bc = clf.predict(X_test)
# y_train_pred = clf.predict(X_train[:30000])

# mat_train = confusion_matrix(Y_train[:30000],y_train_pred)
# mat_bc = confusion_matrix(Y_test, pred_bc)

# print(f"confusion matrix :\n{mat_train}\n")
# print(f"confusion matrix :\n{mat_bc}\n")
# print(classification_report(Y_train[:30000],y_train_pred))
# print(classification_report(Y_test, pred_bc))

# del pred_bc, y_train_pred