In [1]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
# Seed NumPy's global RNG before anything stochastic runs.
np.random.seed(1234)

# Load the prepared accident dataset (labelled and unlabelled rows together).
# NOTE(review): read_pickle executes arbitrary code on load — only use trusted files.
df = pd.read_pickle('../pkl/0416.pkl')

# Candidate resamplers for the class imbalance. Only `tl` is referenced by the
# visible cells below; `smt` and `undersample` are kept as alternatives.
tl = TomekLinks(sampling_strategy='all')
smt = SMOTETomek(sampling_strategy='all')
undersample = RandomUnderSampler(sampling_strategy='majority')

In [2]:
# Keep only the rows whose target ('음주운전') is known; these form the training set.
has_label = df['음주운전'].notna()
df = df.loc[has_label]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59589 entries, A000002 to A074483
Data columns (total 39 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   사고지역       59589 non-null  object 
 1   사망자수       59589 non-null  int64  
 2   중상자수       59589 non-null  int64  
 3   경상자수       59589 non-null  int64  
 4   부상신고자수     59589 non-null  int64  
 5   사고유형       59589 non-null  object 
 6   법규위반       59589 non-null  object 
 7   노면상태       59589 non-null  int64  
 8   도로형태       59589 non-null  object 
 9   가해운전자차종    59589 non-null  int64  
 10  가해운전자성별    59589 non-null  int64  
 11  가해운전자연령    58154 non-null  float64
 12  가해운전자상해정도  59589 non-null  int64  
 13  피해운전자차종    59589 non-null  int64  
 14  피해운전자성별    59589 non-null  int64  
 15  피해운전자연령    57661 non-null  float64
 16  피해운전자상해정도  59589 non-null  int64  
 17  음주운전       59589 non-null  float64
 18  렌터카사고      59589 non-null  float64
 19  무단횡단사고     59589 non-null  float64
 20  무면허

In [3]:
# Split the feature names by dtype: object columns are categorical, numeric
# columns are numeric. The target column is removed from the numeric list.
cat_list = list(df.select_dtypes(include=['object']).columns)
num_list = list(df.select_dtypes(include=['number']).columns.drop('음주운전'))

In [4]:
# Display the categorical (object-dtype) feature names for a visual check.
cat_list

['사고지역', '사고유형', '법규위반', '도로형태', '사고유형_대범주', '도로형태_대범주']

In [5]:
# Display the numeric feature names (target column excluded) for a visual check.
num_list

['사망자수',
 '중상자수',
 '경상자수',
 '부상신고자수',
 '노면상태',
 '가해운전자차종',
 '가해운전자성별',
 '가해운전자연령',
 '가해운전자상해정도',
 '피해운전자차종',
 '피해운전자성별',
 '피해운전자연령',
 '피해운전자상해정도',
 '렌터카사고',
 '무단횡단사고',
 '무면허',
 '어린이보호구역사고',
 '택시사고',
 '터널사고',
 '분기',
 '월',
 '시간',
 '음주운전_시간',
 '단독사고',
 '보행자사고',
 '요일_0',
 '요일_1',
 '요일_2',
 '요일_3',
 '요일_4',
 '요일_5',
 '요일_6']

In [6]:
from pycaret import classification as cls
import matplotlib.pyplot as plt
import seaborn as sns
# Seaborn theme and palette first, then the matplotlib overrides, so the
# font settings below are applied after the theme has been set.
sns.set_theme(font_scale=1, style='whitegrid')
sns.set_palette(palette='Set2', n_colors=10)

# Font used for the Korean column names in plots, and a plain minus sign
# on the axes.
# NOTE(review): 'AppleGothic' is presumably only available on macOS — confirm.
plt.rcParams.update({
    'font.family': 'AppleGothic',
    'axes.unicode_minus': False,
})

In [7]:
# Configure the PyCaret classification experiment on the labelled rows.
# Keyword arguments are grouped by purpose; every value is unchanged.
# Options that were tried and left disabled (not passed): iterative imputation
# ('rf' imputers, 3 iterations) and kernel PCA.
cls.setup(
    # data, target and resampling plan
    data=df,
    target='음주운전',
    train_size=0.7,
    fold=3,
    session_id=1234,
    # explicit feature typing
    numeric_features=num_list,
    categorical_features=cat_list,
    # scaling and row/column pruning
    normalize=True,
    normalize_method='robust',
    remove_outliers=True,
    outliers_threshold=0.05,
    remove_multicollinearity=True,
    # extra cluster-based features
    create_clusters=True,
    cluster_iter=10,
    # class imbalance handled with the TomekLinks instance `tl`
    fix_imbalance=True,
    fix_imbalance_method=tl,
)

Unnamed: 0,Description,Value
0,session_id,1234
1,Target,음주운전
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(59589, 39)"
5,Missing Values,True
6,Numeric Features,32
7,Categorical Features,6
8,Ordinal Features,False
9,High Cardinality Features,False


(None,
 TomekLinks(n_jobs=None, sampling_strategy='all'),
          사망자수  중상자수  경상자수  부상신고자수  노면상태  가해운전자차종  가해운전자성별   가해운전자연령  \
 my_id                                                                 
 A034623   0.0   0.0   0.0     0.0   0.0      0.0      0.0 -0.166667   
 A063908   0.0   0.0   0.0     0.0   0.0      0.0      0.0 -0.500000   
 A047479   0.0   1.0  -1.0     0.0   0.0     -1.0      0.0  0.000000   
 A065454   0.0   0.0   0.0     0.0   0.0     -1.0      0.0 -1.625000   
 A049010   0.0   0.0   0.0     0.0   0.0     -1.0      0.0  0.083333   
 ...       ...   ...   ...     ...   ...      ...      ...       ...   
 A012129   0.0   0.0   1.0     0.0   0.0      0.0      0.0  0.166667   
 A016113   0.0   0.0   0.0     0.0   0.0     -1.0      0.0 -1.083333   
 A021288   0.0   0.0   0.0     0.0   0.0     -1.0      0.0 -0.375000   
 A050843   0.0   0.0  -1.0     1.0   0.0     -1.0      0.0 -0.708333   
 A039202   0.0   0.0   0.0     0.0   0.0      0.0      0.0 -0.916667   
 
    

In [8]:
from sklearn.metrics import average_precision_score
# Register average precision as a custom metric (id 'apc', shown as 'APC'),
# scored on predicted probabilities rather than hard labels.
cls.add_metric(
    id='apc',
    name='APC',
    score_func=average_precision_score,
    target='pred_proba',
)

Name                                                               APC
Display Name                                                       APC
Score Function       <function average_precision_score at 0x7ff031c...
Scorer               make_scorer(average_precision_score, needs_pro...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                 True
Multiclass                                                        True
Custom                                                            True
Name: apc, dtype: object

In [9]:
# Train a CatBoost classifier with the experiment configured by cls.setup;
# the cross-validated metrics table is displayed as a side effect.
# Called through the `cls` alias used by every other cell rather than via
# `from pycaret.classification import *`, which floods the notebook namespace
# and hides where create_model comes from.
model = cls.create_model('catboost')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.9487,0.9135,0.2779,0.6272,0.3851,0.3625,0.3956,0.4805
1,0.9486,0.9099,0.2805,0.6221,0.3866,0.3638,0.3955,0.4807
2,0.9518,0.9104,0.3028,0.6875,0.4204,0.3992,0.4361,0.4998
Mean,0.9497,0.9113,0.287,0.6456,0.3974,0.3751,0.4091,0.487
Std,0.0015,0.0016,0.0112,0.0297,0.0163,0.017,0.0191,0.0091


In [10]:
# top_model = cls.compare_models(n_select=3, sort='apc')

In [11]:
# from pycaret.classification import *

In [12]:
# blended = blend_models(estimator_list = top_model)

In [13]:
# print(blended)

In [14]:
# print(top_model)

In [15]:
# cls.evaluate_model(blended)

In [16]:
# Open PyCaret's interactive evaluation widget for the single CatBoost model.
# (Evaluating `top_model` from compare_models was an earlier, now disabled, variant.)
cls.evaluate_model(model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [17]:
# Finalize the single CatBoost model created above.
# Earlier, now disabled, variants finalized `top_model` or `blended` instead.
final_model = cls.finalize_model(model)

In [18]:
# Persist the preprocessing pipeline together with the finalized model under
# the name 'automl_ver01' (presumably written as a pickle file in the working
# directory — confirm against the PyCaret version in use).
cls.save_model(final_model, 'automl_ver01')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=['사고지역', '사고유형',
                                                             '법규위반', '도로형태',
                                                             '사고유형_대범주',
                                                             '도로형태_대범주'],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=['사망자수', '중상자수',
                                                           '경상자수', '부상신고자수',
                                                           '노면상태', '가해운전자차종',
                                                           '가해운전자성별', '가해운전자연령',
                                                           '가해운전자상해정도',
                                                           '피해운전

In [19]:
# Re-read the full dataset (the in-memory `df` was already filtered to labelled
# rows) and keep only the rows whose target is missing: these are the rows to score.
te = (
    pd.read_pickle('../pkl/0416.pkl')
    .loc[lambda frame: frame['음주운전'].isna()]
)

In [20]:
# Display the unlabelled rows that will be scored (pandas truncates the view).
te

Unnamed: 0_level_0,사고지역,사망자수,중상자수,경상자수,부상신고자수,사고유형,법규위반,노면상태,도로형태,가해운전자차종,...,단독사고,보행자사고,도로형태_대범주,요일_0,요일_1,요일_2,요일_3,요일_4,요일_5,요일_6
my_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A051970,서초구,0,1,0,0,차대차 - 측면충돌,신호위반,0,교차로 - 교차로안,1,...,0,0,교차로,0,0,0,0,0,1,0
A036817,강남구,0,0,2,0,차대차 - 기타,안전거리미확보,0,단일로 - 기타,1,...,0,0,단일로,0,0,1,0,0,0,0
A021606,동작구,0,0,3,0,차대차 - 정면충돌,신호위반,0,교차로 - 교차로부근,1,...,0,0,교차로,0,1,0,0,0,0,0
A018391,강동구,0,0,2,0,차대차 - 추돌,안전운전불이행,0,단일로 - 기타,1,...,0,0,단일로,0,0,1,0,0,0,0
A002771,은평구,0,0,1,0,차대사람 - 기타,안전운전불이행,0,교차로 - 교차로안,1,...,0,1,교차로,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A043049,중랑구,0,0,1,0,차대사람 - 횡단중,보행자보호의무위반,0,교차로 - 교차로부근,1,...,0,1,교차로,0,1,0,0,0,0,0
A047882,강동구,0,1,0,0,차대차 - 측면충돌,안전운전불이행,0,단일로 - 기타,1,...,0,0,단일로,0,0,0,0,1,0,0
A057160,용산구,0,0,3,0,차대차 - 추돌,안전운전불이행,0,교차로 - 교차로부근,1,...,0,0,교차로,0,0,0,0,0,0,1
A051261,중랑구,0,0,1,0,차대사람 - 횡단중,보행자보호의무위반,0,단일로 - 기타,1,...,0,1,단일로,0,0,0,0,1,0,0


In [21]:
# Score the unlabelled rows with the finalized model. raw_score=True adds the
# per-class probability columns ('Score_0.0', 'Score_1.0') next to 'Label'.
# The metrics table printed by this call shows zeros, presumably because `te`
# has no ground-truth labels — confirm.
pred = cls.predict_model(final_model, data=te, raw_score=True)
pred.head()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC
0,CatBoost Classifier,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,사고지역,사망자수,중상자수,경상자수,부상신고자수,사고유형,법규위반,노면상태,도로형태,가해운전자차종,...,요일_0,요일_1,요일_2,요일_3,요일_4,요일_5,요일_6,Label,Score_0.0,Score_1.0
my_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A051970,서초구,0,1,0,0,차대차 - 측면충돌,신호위반,0,교차로 - 교차로안,1,...,0,0,0,0,0,1,0,0.0,0.9287,0.0713
A036817,강남구,0,0,2,0,차대차 - 기타,안전거리미확보,0,단일로 - 기타,1,...,0,0,1,0,0,0,0,0.0,0.8898,0.1102
A021606,동작구,0,0,3,0,차대차 - 정면충돌,신호위반,0,교차로 - 교차로부근,1,...,0,1,0,0,0,0,0,0.0,0.6658,0.3342
A018391,강동구,0,0,2,0,차대차 - 추돌,안전운전불이행,0,단일로 - 기타,1,...,0,0,1,0,0,0,0,0.0,0.8766,0.1234
A002771,은평구,0,0,1,0,차대사람 - 기타,안전운전불이행,0,교차로 - 교차로안,1,...,0,0,1,0,0,0,0,0.0,0.9882,0.0118


In [22]:
# pred = cls.predict_model(blended, data=te, raw_score=True)
# pred.head()

In [23]:
# Reduce the predictions to the submission layout: the row id (moved out of the
# index) plus the positive-class probability, with columns named 'my_id' and 'prob'.
pred = (
    pred[['Score_1.0']]
    .reset_index()
    .set_axis(['my_id', 'prob'], axis=1)
)
pred.head()

Unnamed: 0,my_id,prob
0,A051970,0.0713
1,A036817,0.1102
2,A021606,0.3342
3,A018391,0.1234
4,A002771,0.0118


In [24]:
# Write the submission file; index=False keeps only the 'my_id' and 'prob' columns.
pred.to_csv('../submission/submission_0416_5.csv', index=False)