In [None]:
# Mount Google Drive so the competition files are reachable from Colab
from google.colab import drive
drive.mount('/content/gdrive')

# Data directory inside the mounted Drive -> change this to match your own layout
DATA_PATH = '/content/gdrive/MyDrive/지역치안공모전/data/'

Mounted at /content/gdrive


## Import & Install

In [None]:
# 코랩 기준 필요 라이브러리 설치

!pip install haversine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting haversine
  Downloading haversine-2.7.0-py2.py3-none-any.whl (6.9 kB)
Installing collected packages: haversine
Successfully installed haversine-2.7.0


In [None]:
#Base & visualization
import pandas as pd
import random
import os
import numpy as np
import warnings
import matplotlib.pylab as plt
import seaborn as sns

#Feature engineering
import datetime
from haversine import haversine

#Sklearn module & utils
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold , KFold, train_test_split, cross_val_score, cross_validate

#Metric
from sklearn.metrics import mean_absolute_error

#Modeling
from statsmodels.tsa.arima.model import ARIMA

## Fix Seed

In [None]:
class CFG:
    """Notebook-wide configuration constants."""

    SEED = 42  # single seed shared by every random number generator below


def seed_everything(seed):
    """Seed Python's ``random`` module and NumPy's global RNG, and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to both generators; its string form is
            written to the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Fix the seed once, before any stochastic step runs.
seed_everything(CFG.SEED)

## Data Load

In [None]:
# Raw KP / NPA tables are CP949-encoded CSV files; the code book is an Excel sheet.
kp2020, kp2021, npa2020 = (
    pd.read_csv(DATA_PATH + csv_name, encoding='cp949')
    for csv_name in ('KP2020.csv', 'KP2021.csv', 'NPA2020.csv')
)
codeBook = pd.read_excel(DATA_PATH + 'codeBook_v3.xlsx')

In [None]:
# External data (Korea Meteorological Administration): one observation file per year
# plus the table describing the observation stations.
temp2020, temp2021, temp2022, temp2023 = (
    pd.read_csv(DATA_PATH + csv_name, encoding='cp949')
    for csv_name in (
        '2020년기상청관측데이터.csv',
        '2021년기상청관측데이터.csv',
        '2022년기상청관측데이터.csv',
        '2023년기상청관측데이터.csv',
    )
)
location = pd.read_csv(DATA_PATH + '관측지점정보.csv', encoding='cp949')

In [None]:
import folium
import requests
import json

In [None]:
# Administrative-district boundaries as raw GeoJSON (raw.githubusercontent.com).
# NOTE(review): the original comment says "Seoul", but the URL points at the
# admdongkor HangJeongDong file, which appears to be nationwide — confirm.
r = requests.get(
    'https://raw.githubusercontent.com/vuski/admdongkor/master/ver20230101/HangJeongDong_ver20230101.geojson',
    timeout=30,  # do not hang the notebook forever if the host is unreachable
)
r.raise_for_status()  # fail here instead of parsing an HTML error page as JSON
c = r.content
seoul_geo = json.loads(c)

In [None]:
# Base map plus one GeoJSON layer with the district outlines.
m = folium.Map(location=[37.559819, 126.963895], zoom_start=11, tiles='cartodbpositron')

district_layer = folium.GeoJson(seoul_geo, name='지역구')
district_layer.add_to(m)

m

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# NOTE(review): this cell looks like leftover tutorial code. `state_data` is not
# defined in any cell visible in this notebook and the GeoJSON path points at a
# US-states file — TODO replace with the project's own data or remove the cell.
m = folium.Map([43, -102], zoom_start=3)

folium.Choropleth(
    geo_data='../data/02._us-states.json', # file holding the boundary coordinates
    data=state_data, # a Series or DataFrame with the values to colour by
    columns=['State', 'Unemployment'], # which DataFrame columns to use (key, value)
    key_on='feature.id', # GeoJSON property matched against the key column; the 'feature.' prefix is required by folium
    fill_color='BuPu',
    fill_opacity=0.5, # fill transparency
    line_opacity=0.5, # outline transparency
    legend_name='Unemployment rate (%)' # legend title
).add_to(m)
m

In [None]:
# Rebind `m` to a fresh map; the centre is given as [latitude, longitude].
m = folium.Map([36.5184, 127.8000], zoom_start=9.5)

In [None]:
# NOTE(review): the recorded output of this cell is a TypeError. `state_data` is not
# defined in any cell visible in this notebook and the GeoJSON path points at a
# US-states file — TODO replace with the project's own data or remove the cell.
folium.Choropleth(
    geo_data='../data/02._us-states.json', # file holding the boundary coordinates
    data=state_data, # a Series or DataFrame with the values to colour by
    columns=['State', 'Unemployment'], # which DataFrame columns to use (key, value)
    key_on='feature.id', # GeoJSON property matched against the key column; the 'feature.' prefix is required by folium
    fill_color='BuPu',
    fill_opacity=0.5, # fill transparency
    line_opacity=0.5, # outline transparency
    legend_name='Unemployment rate (%)' # legend title
).add_to(m)
m

TypeError: ignored

## EDA

**NPA_CL : 경찰청 구분**  
**EVT_STAT_CD : 사건상태코드**  
**EVT_CL_CD : 사건종별코드**  
**RPTER_SEX : 성별**

In [None]:
codeBook

Unnamed: 0,No,컬럼명,컬럼 그룹,코드명,코드값
0,1,NPA_CL,경찰청 구분,본청,1
1,2,NPA_CL,경찰청 구분,서울청,8
2,3,NPA_CL,경찰청 구분,부산청,9
3,4,NPA_CL,경찰청 구분,대구청,10
4,5,NPA_CL,경찰청 구분,인천청,11
...,...,...,...,...,...
91,92,EVT_CL_CD,사건종별코드,재해재난,609
92,93,EVT_CL_CD,사건종별코드,위험동물,610
93,94,RPTER_SEX,성별,남성,1
94,95,RPTER_SEX,성별,여성,2


In [None]:
# Look up the event-type code (EVT_CL_CD) whose code name is '교통사고' (traffic accident).
codeBook.query("컬럼명 == 'EVT_CL_CD' and 코드명 == '교통사고'")

Unnamed: 0,No,컬럼명,컬럼 그룹,코드명,코드값
68,69,EVT_CL_CD,사건종별코드,교통사고,401


In [None]:
# NOTE(review): `kp2021_traffic` is first assigned further down, in the feature
# engineering cells, so this cell fails on a fresh kernel (Restart & Run All).
# TODO move this cell below that assignment.
kp2021_traffic.info()

Unnamed: 0,RECV_DEPT_NM,RECV_CPLT_DM,NPA_CL,EVT_STAT_CD,EVT_CL_CD,RPTER_SEX,HPPN_PNU_ADDR,HPPN_X,HPPN_Y,SME_EVT_YN
0,대전청,21/03/07 00:07:54.000000000,13,10,401,2.0,대전광역시 서구 괴정동(괴정동) 367-40,127.371854,36.341961,
1,대전청,21/03/07 00:15:27.000000000,13,10,401,1.0,대전광역시 유성구 상대동 471-2,127.339334,36.346899,
2,충남청,21/03/07 00:13:58.000000000,19,10,401,1.0,충청남도 천안시 동남구 목천읍 운전리(목천읍) 333-1,127.230796,36.767596,
3,대전청,21/03/07 02:17:35.000000000,13,10,401,2.0,대전광역시 서구 갈마동(갈마2동) 343-30,127.370973,36.347592,
4,충남청,21/01/03 00:12:25.000000000,19,10,401,2.0,충청남도 천안시 서북구 두정동(부성2동) 633,127.130635,36.825850,
...,...,...,...,...,...,...,...,...,...,...
164577,대전청,23/01/14 17:04:36.000000000,13,10,401,2.0,대전 동구 원동 51-1,127.433184,36.329560,
164578,대전청,23/01/15 20:01:39.000000000,13,10,401,1.0,대전 서구 괴정동 88-6,127.384284,36.334507,
164579,충남청,23/01/16 21:26:56.000000000,19,10,401,1.0,충청남도 태안군 근흥면 두야리(근흥면 ) 246-2,126.262393,36.751467,
164580,대전청,23/01/14 17:35:38.000000000,13,10,401,1.0,대전광역시 대덕구 신탄진동 (행정:신탄진동 ) 129-7,127.433785,36.450725,


## Feature engineering

In [None]:
# Stack the yearly weather tables and order them by station, then timestamp,
# with a fresh 0..n-1 index.
yearly_weather = [temp2020, temp2021, temp2022, temp2023]
temp_all = pd.concat(yearly_weather).sort_values(by=["지점", "일시"], ignore_index=True)

In [None]:
temp_all.지점.unique()

array([129, 133, 177, 232, 235, 236, 238, 239])

In [None]:
# Station ids to process (the same values `temp_all.지점.unique()` shows above).
지점_list = [129, 133, 177, 232, 235, 236, 238, 239]

# Empty frame that only fixes the column layout of the cleaned weather table.
temp_fine = pd.DataFrame(
    columns=[
        '지점', '지점명', '일시',
        '기온(°C)', '풍속(m/s)', '풍향(16방위)', '습도(%)',
        '증기압(hPa)', '이슬점온도(°C)', '현지기압(hPa)', '해면기압(hPa)',
        '전운량(10분위)', '시정(10m)', '지면온도(°C)',
    ]
)

In [None]:
# Interpolate gaps station by station (so values never bleed from one station
# into the next) and stack the results with a single concat.
# `temp_fine.iloc[0:0]` keeps only the column layout of the template, which makes
# this cell idempotent: re-running it no longer appends a second copy of every row.
temp_fine = pd.concat(
    [temp_fine.iloc[0:0]]
    + [temp_all.query(f'지점=={station}').interpolate() for station in 지점_list]
)

In [None]:
# Sejong did not record total cloud cover before 2020-03-05 -> those values are
# replaced with the readings from nearby Daejeon in the next cell.
temp_fine.isnull().sum()

지점              0
지점명             0
일시              0
기온(°C)          0
풍속(m/s)         0
풍향(16방위)        0
습도(%)           0
증기압(hPa)        0
이슬점온도(°C)       0
현지기압(hPa)       0
해면기압(hPa)       0
전운량(10분위)    1547
시정(10m)         0
지면온도(°C)        0
dtype: int64

In [None]:
# Backfill Sejong's missing total cloud cover with Daejeon's reading for the SAME
# timestamp. The previous version copied the first 1547 Daejeon values onto a
# hard-coded index range, which silently misaligns as soon as the row count, the
# station order or a single missing hour changes.
cloud_col = '전운량(10분위)'
sejong_missing = temp_fine[cloud_col].isna() & (temp_fine['지점명'] == '세종')
daejeon_cloud = (
    temp_fine.query('지점명=="대전"')
    .drop_duplicates(subset='일시')  # map() needs a unique timestamp index
    .set_index('일시')[cloud_col]
)
cloud_list = temp_fine.loc[sejong_missing, '일시'].map(daejeon_cloud).to_list()
temp_fine.loc[sejong_missing, cloud_col] = cloud_list

In [None]:
# Round total cloud cover to whole numbers (interpolation above can produce fractions).
temp_fine["전운량(10분위)"] = temp_fine["전운량(10분위)"].round()

In [None]:
temp_fine

Unnamed: 0,지점,지점명,일시,기온(°C),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),이슬점온도(°C),현지기압(hPa),해면기압(hPa),전운량(10분위),시정(10m),지면온도(°C)
0,129,서산,2020-01-01 00:00,-7.2,0.2,0.0,84.0,3.0,-9.4,1030.9,1034.2,9.0,2056.0,-1.4
1,129,서산,2020-01-01 01:00,-5.7,0.2,0.0,80.0,3.2,-8.6,1029.6,1032.9,8.0,2642.0,-0.9
2,129,서산,2020-01-01 02:00,-5.1,1.1,200.0,77.0,3.2,-8.5,1029.4,1032.7,9.0,1923.0,-0.9
3,129,서산,2020-01-01 03:00,-4.3,0.7,70.0,77.0,3.4,-7.7,1029.7,1033.0,9.0,550.0,-0.8
4,129,서산,2020-01-01 04:00,-4.1,1.1,50.0,85.0,3.8,-6.2,1029.6,1032.9,9.0,709.0,-0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213883,239,세종,2023-01-18 19:00,-0.6,1.1,320.0,41.0,2.4,-12.2,1014.8,1026.2,0.0,1688.0,-1.2
213884,239,세종,2023-01-18 20:00,-0.5,1.2,270.0,33.0,1.9,-14.8,1014.9,1026.3,0.0,2021.0,-1.9
213885,239,세종,2023-01-18 21:00,-1.6,0.0,0.0,35.0,1.9,-15.1,1014.8,1026.2,0.0,1892.0,-2.5
213886,239,세종,2023-01-18 22:00,-2.5,0.7,180.0,60.0,3.0,-9.2,1014.7,1026.2,0.0,1506.0,-3.2


In [None]:
# Keep one row per station in `지점_list`, reindex from 0, and make latitude numeric.
location = (
    location[location['지점'].isin(지점_list)]
    .drop_duplicates(subset='지점', keep='first')
    .reset_index(drop=True)
    .astype({'위도': float})
)

In [None]:
# Keep only rows whose event-type code is 401 (the code book's '교통사고' entry).
kp2020_traffic = kp2020[kp2020['EVT_CL_CD'] == 401].reset_index(drop=True)
kp2021_traffic = kp2021[kp2021['EVT_CL_CD'] == 401].reset_index(drop=True)
npa2020_traffic = npa2020[npa2020['EVT_CL_CD'] == 401].reset_index(drop=True)

In [None]:
# Work on copies so the filtered frames above stay untouched.
kp2020_traffic_02, kp2021_traffic_02, npa2020_traffic_02 = (
    frame.copy() for frame in (kp2020_traffic, kp2021_traffic, npa2020_traffic)
)

In [None]:
# Drop incidents without an X coordinate.
kp2020_traffic_02 = kp2020_traffic_02.dropna(subset=['HPPN_X']).reset_index(drop=True)
kp2021_traffic_02 = kp2021_traffic_02.dropna(subset=['HPPN_X']).reset_index(drop=True)
npa2020_traffic_02 = npa2020_traffic_02.dropna(subset=['HPPN_X']).reset_index(drop=True)
# NPA rows additionally use 0 as a placeholder coordinate. `.copy()` makes the
# filtered result an independent frame, so the column assignments in the following
# cells do not raise SettingWithCopyWarning.
npa2020_traffic_02 = npa2020_traffic_02[npa2020_traffic_02.HPPN_X != 0].copy()

In [None]:
def add_zeros(x):
    """Return ``x`` as text, left-padded with zeros to at least six characters.

    Values that are already six characters or longer are returned unchanged
    (apart from the conversion to ``str``).
    """
    width = 6
    return str(x).zfill(width)

# Build an hourly 'YYYY-MM-DD HH:00' key from the separate date and time columns.
npa2020_traffic_02['RECV_CPLT_TM'] = npa2020_traffic_02['RECV_CPLT_TM'].apply(add_zeros)

date_text = npa2020_traffic_02['RECV_CPLT_DT'].astype(str)
hour_text = npa2020_traffic_02['RECV_CPLT_TM'].astype(str).str[:2]
npa2020_traffic_02['RECV_CPLT_DM'] = (
    date_text.str[:4] + '-' + date_text.str[4:6] + '-' + date_text.str[6:8]
    + ' ' + hour_text + ':00'
)

# The source columns are no longer needed once the combined key exists.
npa2020_traffic_02 = npa2020_traffic_02.drop(
    columns=['RECV_CPLT_DT', 'RECV_CPLT_TM', 'HPPN_OLD_ADDR']
)

In [None]:
# Prefix the two-digit year with '20', parse, and truncate to an hourly
# 'YYYY-MM-DD HH:00' key.
# Caution: the column is overwritten in place, so running this cell twice
# prepends '20' to the already formatted value.
kp2020_traffic_02['RECV_CPLT_DM'] = (
    pd.to_datetime('20' + kp2020_traffic_02['RECV_CPLT_DM'])
    .dt.strftime('%Y-%m-%d %H')
    + ':00'
)

In [None]:
# Prefix the two-digit year with '20', parse, and truncate to an hourly
# 'YYYY-MM-DD HH:00' key.
# Caution: the column is overwritten in place, so running this cell twice
# prepends '20' to the already formatted value.
kp2021_traffic_02['RECV_CPLT_DM'] = (
    pd.to_datetime('20' + kp2021_traffic_02['RECV_CPLT_DM'])
    .dt.strftime('%Y-%m-%d %H')
    + ':00'
)

In [None]:
# Stack both KP years and drop the two columns listed below.
kp_all = pd.concat([kp2020_traffic_02, kp2021_traffic_02]).drop(
    columns=['RECV_DEPT_NM', 'HPPN_PNU_ADDR']
)

In [None]:
# All traffic incidents (NPA first, then KP) under one fresh 0..n-1 index.
traffic_all = pd.concat([npa2020_traffic_02, kp_all], ignore_index=True)

In [None]:
def _nearest_station_ids(incidents, stations):
    """Return, for every incident row, the id of the closest weather station.

    The previous row-wise ``apply`` + ``iterrows`` made one Python-level haversine
    call per (incident, station) pair. Here the haversine term
    ``sin²(Δlat/2) + cos(lat1)·cos(lat2)·sin²(Δlon/2)`` is computed for all pairs at
    once. Great-circle distance grows monotonically with that term, so its argmin
    is the nearest station and the Earth radius is not needed.

    Args:
        incidents: frame with ``HPPN_Y`` (latitude) and ``HPPN_X`` (longitude) in degrees.
        stations: frame with ``위도``, ``경도`` (degrees) and ``지점`` (station id).

    Returns:
        1-D array of station ids, one per incident row, in row order.
    """
    incident_lat = np.radians(incidents['HPPN_Y'].to_numpy(dtype=float))[:, None]
    incident_lon = np.radians(incidents['HPPN_X'].to_numpy(dtype=float))[:, None]
    station_lat = np.radians(stations['위도'].to_numpy(dtype=float))[None, :]
    station_lon = np.radians(stations['경도'].to_numpy(dtype=float))[None, :]

    haversine_term = (
        np.sin((station_lat - incident_lat) / 2) ** 2
        + np.cos(incident_lat) * np.cos(station_lat)
        * np.sin((station_lon - incident_lon) / 2) ** 2
    )
    # Positional lookup: does not depend on `stations` having a 0..n-1 index.
    return stations['지점'].to_numpy()[haversine_term.argmin(axis=1)]


traffic_all['지점'] = _nearest_station_ids(traffic_all, location)

In [None]:
# (station, hourly timestamp) pairs of every incident, ordered by station then time.
incident_keys = traffic_all[['지점', 'RECV_CPLT_DM']]
Last = incident_keys.sort_values(by=["지점", "RECV_CPLT_DM"], ignore_index=True)

In [None]:
#사고유무

In [None]:
# One row per (station, hour) that had at least one incident, flagged with 1.
# `drop_duplicates` returns a new frame; the previous version discarded that
# result, so every extra incident in the same station-hour duplicated the weather
# row in the merge below and inflated the number of positive samples.
Last = (
    Last.assign(사고유무=1)
    .drop_duplicates(subset=['지점', 'RECV_CPLT_DM'])
    .reset_index(drop=True)
    .rename(columns={"RECV_CPLT_DM": "일시"})
)

# Left join keeps every weather observation; `validate` makes the merge fail loudly
# if the right-hand keys are ever non-unique again.
X = pd.merge(temp_fine, Last, on=['지점', '일시'], how='left', validate='many_to_one')
X['사고유무'] = X['사고유무'].fillna(0)  # hours without a matching incident
assert len(X) == len(temp_fine), "merge must not add or drop weather rows"

In [None]:
X

Unnamed: 0,지점,지점명,일시,기온(°C),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),이슬점온도(°C),현지기압(hPa),해면기압(hPa),전운량(10분위),시정(10m),지면온도(°C),사고유무
0,129,서산,2020-01-01 00:00,-7.2,0.2,0.0,84.0,3.0,-9.4,1030.9,1034.2,9.0,2056.0,-1.4,1.0
1,129,서산,2020-01-01 01:00,-5.7,0.2,0.0,80.0,3.2,-8.6,1029.6,1032.9,8.0,2642.0,-0.9,1.0
2,129,서산,2020-01-01 01:00,-5.7,0.2,0.0,80.0,3.2,-8.6,1029.6,1032.9,8.0,2642.0,-0.9,1.0
3,129,서산,2020-01-01 01:00,-5.7,0.2,0.0,80.0,3.2,-8.6,1029.6,1032.9,8.0,2642.0,-0.9,1.0
4,129,서산,2020-01-01 02:00,-5.1,1.1,200.0,77.0,3.2,-8.5,1029.4,1032.7,9.0,1923.0,-0.9,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376389,239,세종,2023-01-18 20:00,-0.5,1.2,270.0,33.0,1.9,-14.8,1014.9,1026.3,0.0,2021.0,-1.9,1.0
376390,239,세종,2023-01-18 21:00,-1.6,0.0,0.0,35.0,1.9,-15.1,1014.8,1026.2,0.0,1892.0,-2.5,0.0
376391,239,세종,2023-01-18 22:00,-2.5,0.7,180.0,60.0,3.0,-9.2,1014.7,1026.2,0.0,1506.0,-3.2,1.0
376392,239,세종,2023-01-18 22:00,-2.5,0.7,180.0,60.0,3.0,-9.2,1014.7,1026.2,0.0,1506.0,-3.2,1.0


In [None]:
X.사고유무.value_counts()

1.0    259312
0.0    117082
Name: 사고유무, dtype: int64

In [None]:
"""
# 중복횟수
Last['중복횟수'] = Last.groupby(['지점', 'RECV_CPLT_DM']).cumcount() + 1
Last = Last.drop_duplicates(subset=['지점','RECV_CPLT_DM'], keep='last').reset_index(drop=True)
Last.rename(columns = {"RECV_CPLT_DM": "일시"}, inplace = True)
X = pd.merge(temp_fine, Last, on=['지점', '일시'], how='left')
X['중복횟수'] = X['중복횟수'].fillna(0)
"""

# 카카오API를 사용하여 좌표->주소 변환
import requests, json, pprint

def get_address(lat, lng):
    """Reverse-geocode one coordinate with Kakao's ``coord2regioncode`` endpoint.

    Args:
        lat: Latitude, sent as the ``y`` query parameter (string or number).
        lng: Longitude, sent as the ``x`` query parameter (string or number).

    Returns:
        The decoded JSON response body as a dict.

    Raises:
        RuntimeError: if the ``KAKAO_REST_API_KEY`` environment variable is not set.
        requests.HTTPError: if the API answers with an error status.
    """
    # The key is read from the environment instead of being committed with the
    # notebook. NOTE: the key that used to be hard-coded here should be revoked.
    api_key = os.environ.get("KAKAO_REST_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the KAKAO_REST_API_KEY environment variable to your Kakao REST API key"
        )

    response = requests.get(
        "https://dapi.kakao.com/v2/local/geo/coord2regioncode.json",
        params={"x": lng, "y": lat},  # let requests build and escape the query string
        headers={"Authorization": f"KakaoAK {api_key}"},
        timeout=10,  # a stalled request must not block the whole row-wise loop
    )
    response.raise_for_status()
    full_address = response.json()

    return full_address

def _region_documents(frame):
    """Reverse-geocode every row of ``frame`` once and split the two region documents.

    The previous version called the API twice per row (once to read
    ``documents[0]`` and once to read ``documents[1]``), doubling both the run time
    and the API quota used.

    Args:
        frame: incidents with ``HPPN_Y`` (latitude) and ``HPPN_X`` (longitude).

    Returns:
        Two Series of dicts indexed like ``frame``: ``documents[0]`` and
        ``documents[1]`` of each response. Judging by the variable names they are
        the legal-dong (법정) and administrative-dong (행정) entries — TODO confirm
        against the Kakao API reference.
    """
    documents = pd.Series(
        [
            get_address(f'{lat}', f'{lng}')['documents']
            for lat, lng in zip(frame.HPPN_Y, frame.HPPN_X)
        ],
        index=frame.index,
        dtype=object,
    )
    return documents.map(lambda docs: docs[0]), documents.map(lambda docs: docs[1])


kp_20_법정, kp_20_행정 = _region_documents(kp2020_traffic_02)
kp_21_법정, kp_21_행정 = _region_documents(kp2021_traffic_02)
np_20_법정, np_20_행정 = _region_documents(npa2020_traffic_02)

# Spread the four region-depth names of each lookup result into columns
# `region_<depth>_<법정|행정>` on the frame the lookup was made for.
region_lookups = [
    (kp2020_traffic_02, kp_20_법정, kp_20_행정),
    (kp2021_traffic_02, kp_21_법정, kp_21_행정),
    (npa2020_traffic_02, np_20_법정, np_20_행정),
]
for target, legal_docs, admin_docs in region_lookups:
    for label, docs in (('법정', legal_docs), ('행정', admin_docs)):
        for depth in range(1, 5):
            key = f'region_{depth}depth_name'
            target[f'region_{depth}_{label}'] = docs.apply(lambda doc, key=key: doc[key])

In [None]:
kp2020_traffic_02

Unnamed: 0,RECV_DEPT_NM,RECV_CPLT_DM,NPA_CL,EVT_STAT_CD,EVT_CL_CD,RPTER_SEX,HPPN_PNU_ADDR,HPPN_X,HPPN_Y,SME_EVT_YN,region_1_법정,region_2_법정,region_3_법정,region_4_법정,region_1_행정,region_2_행정,region_3_행정,region_4_행정
0,충남청,20/12/01 08:17:50.000000000,19,10,401,2.0,충청남도 천안시 서북구 성정동(행정:성정2동) 1259,127.13716,36.826718,,충청남도,천안시 서북구,성정동,,충청남도,천안시 서북구,성정2동,
1,대전청,20/12/01 07:08:44.000000000,13,10,401,1.0,대전광역시 중구 석교동(석교동) 85-14,127.447545,36.306724,,대전광역시,중구,석교동,,대전광역시,중구,석교동,
2,대전청,20/12/01 07:09:20.000000000,13,10,401,2.0,대전광역시 중구 석교동(행정:석교동) 88-11,127.447175,36.306694,,대전광역시,중구,석교동,,대전광역시,중구,석교동,
3,충남청,20/12/01 01:36:38.000000000,19,10,401,1.0,충청남도 서산시 대산읍 대로리(행정:대산읍) 216-3,126.41229,36.967612,,충청남도,서산시,대산읍,대로리,충청남도,서산시,대산읍,
4,대전청,20/12/01 01:39:03.000000000,13,10,401,1.0,대전광역시 동구 대동(행정:대동) 355-14,127.44739,36.324597,,대전광역시,동구,대동,,대전광역시,동구,대동,
5,충남청,20/12/01 02:23:03.000000000,19,10,401,1.0,충청남도 아산시 둔포면 송용리(행정:둔포면) 71-68,127.032318,36.92151,,충청남도,아산시,둔포면,송용리,충청남도,아산시,둔포면,
6,충남청,20/12/01 02:24:26.000000000,19,10,401,1.0,충청남도 서산시 읍내동(부춘동) 733-2,126.438445,36.779188,,충청남도,서산시,읍내동,,충청남도,서산시,부춘동,
7,충남청,20/12/01 01:37:37.000000000,19,10,401,3.0,충청남도 서산시 대산읍 대로리(대산읍) 216-3,126.412245,36.967579,,충청남도,서산시,대산읍,대로리,충청남도,서산시,대산읍,
8,충남청,20/12/01 01:38:31.000000000,19,10,401,1.0,충청남도 서산시 대산읍 대로리(행정:대산읍) 216-3,126.41229,36.967612,,충청남도,서산시,대산읍,대로리,충청남도,서산시,대산읍,
9,충남청,20/12/01 01:29:19.000000000,19,10,401,1.0,충청남도 아산시 실옥동(온양4동) 131-3,126.992778,36.794167,,충청남도,아산시,실옥동,,충청남도,아산시,온양4동,


## Modeling & Ensemble

In [None]:
# Load the sample submission file; predictions are written into it further below.
submit = pd.read_csv(DATA_PATH  + 'sample_submission.csv')

In [None]:
# Model selection -> after several experiments, ensembling several models of
# different kinds was judged to work best.
# NOTE(review): none of the classifier classes below are imported in the import
# cells visible in this notebook — TODO add the imports or remove this section.
models = [
    ('bag', BaggingClassifier(random_state=CFG.SEED)),
    ('dt', DecisionTreeClassifier(random_state=CFG.SEED)),
    ('rc', RidgeClassifier(random_state=CFG.SEED)),
    ('xgb', XGBClassifier(random_state=CFG.SEED)),
    ('lgb', LGBMClassifier(random_state=CFG.SEED)),
    ('gb', GradientBoostingClassifier(random_state=CFG.SEED)),
    ('svc', SVC(random_state=CFG.SEED)),
    ('rcc', RidgeClassifierCV()),
    ('rf', RandomForestClassifier(random_state=CFG.SEED))
]

In [None]:
# The final model is a VotingClassifier ensemble -> XGBClassifier and
# RandomForestClassifier scored best on the public leaderboard, so they get weight 2
# (positions 4 and 9 of `weights`, matching the order of `models`).
# NOTE(review): `VotingClassifier`, `train_x` and `train_y` are not imported or
# defined in the cells visible in this notebook — TODO confirm where they come from.
best_model  = VotingClassifier(models, voting='hard', weights=[1,1,1,2,1,1,1,1,2])
best_model.fit(train_x,train_y)

VotingClassifier(estimators=[('bag', BaggingClassifier(random_state=26)),
                             ('dt', DecisionTreeClassifier(random_state=26)),
                             ('rc', RidgeClassifier(random_state=26)),
                             ('xgb', XGBClassifier(random_state=26)),
                             ('lgb', LGBMClassifier(random_state=26)),
                             ('gb',
                              GradientBoostingClassifier(random_state=26)),
                             ('svc', SVC(random_state=26)),
                             ('rcc',
                              RidgeClassifierCV(alphas=array([ 0.1,  1. , 10. ]))),
                             ('rf', RandomForestClassifier(random_state=26))],
                 weights=[1, 1, 1, 2, 1, 1, 1, 1, 2])

## Submit

In [None]:
# Predict on the test set, map the encoded labels back to the original class names,
# and store them in the submission frame.
# NOTE(review): `class_le` and `test_x` are not defined in the cells visible in this
# notebook — TODO confirm where they come from.
pred = class_le.inverse_transform(best_model.predict(test_x))
submit['class'] = pred

In [None]:
submit

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,C
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


In [None]:
# submit.to_csv('Fine.csv',index=False)