# Library

In [None]:
!pip install anaconda

In [None]:
!pip install folium

In [None]:
# 최적 파라미터를 범위내에서 찾아주는 라이브러리로써 캐글에서 인기 폭발이라고 한다.
!pip install optuna

In [None]:
!pip install catboost

In [None]:
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf
#한글폰트 설치. 런타임재사용 필요

# Google_Drive 연동 및 데이터 불러오기.

In [None]:
from google.colab import drive
drive.mount('/content/drive')
#구글드라이브연동

In [None]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc

plt.style.use('seaborn')
sns.set(font_scale=1)
plt.rc('font', family='NanumBarunGothic') 
plt.rcParams['font.family'] = 'NanumGothic'
#한글폰트

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
def csv_to_parquet(csv_path, save_name):
    """Convert one CSV file into a Parquet file in the working directory.

    Args:
        csv_path: Path of the CSV file, read with ``pd.read_csv``.
        save_name: Base name of the output; the file is written to
            ``./{save_name}.parquet``.

    Returns:
        None. Prints ``save_name`` followed by ``Done.`` once the file is written.
    """
    # The frame is never bound to a name, so nothing keeps it alive after the
    # write; the explicit collection then reclaims its memory immediately.
    pd.read_csv(csv_path).to_parquet(f'./{save_name}.parquet')
    gc.collect()
    print(save_name, 'Done.')

In [None]:
csv_to_parquet('/content/drive/MyDrive/DACON_Dataset/JEJU_Traffic/open.zip (Unzipped Files)/train.csv','train')
csv_to_parquet('/content/drive/MyDrive/DACON_Dataset/JEJU_Traffic/open.zip (Unzipped Files)/test.csv','test')

# 22sec #파일경로

In [None]:
train=pd.read_parquet('train.parquet')
test=pd.read_parquet('test.parquet')

In [None]:
train.road_rating.unique()

In [None]:
train1=pd.read_parquet('train.parquet')
test1=pd.read_parquet('test.parquet')

In [None]:
train['base_hour']=train1.base_hour
test['base_hour']=test1.base_hour

In [None]:
train.head(10)

# Description
 
- 470만 1217개의 행으로 구성되어 있고, 281개의 일수에서의 요일 값과 24시간 각 시간, 도로 사용여부(0,1), 차선수는 3차선이 최대입니다. 도로등급은 106, 103, 107로 제주도에는 3가지의 등급 밖에 없는것으로 여겨집니다.


- 도로명은 61가지가 있으며, 중용구간(2개 이상의 노선이 공동으로 사용하는 구간)일 수도 있고 아닐 수도 있습니다. 연결로 코드는 0 혹은 103이 될 수 있습니다. 속도제한은 30부터 80까지 10 단위로 총 6가지가 존재합니다. 차량제한(drop)은 없지만, 하중제한은 [32400., 0., 43200., 50000.] 네 가지 값이 존재하고, 높이제한(drop)도 없습니다.


-  도로유형은 3번타입 아니면 0번타입 두가지입니다. 시작지점명, 도착지점명은 487곳이고, 시작위경도는 586 * (x,y) 시작, 도착 지점의 회전가능여부도 존재합니다.


# Column Description

        id	아이디
        base_date	날짜
        day_of_week	요일
        base_hour	시간대
        road_in_use	도로사용여부
        lane_count	차로수
        road_rating	도로등급
        multi_linked	중용구간 여부
        connect_code	연결로 코드
        maximum_speed_limit	최고속도제한
        weight_restricted	통과제한하중
        height_restricted	통과제한높이
        road_type	도로유형
        start_latitude	시작지점의 위도
        start_longitude	시작지점의 경도
        start_turn_restricted	시작 지점의 회전제한 유무
        end_latitude	도착지점의 위도
        end_longitude	도착지점의 경도
        end_turn_restricted	도착지점의 회전제한 유무
        road_name	도로명
        start_node_name	시작지점명
        end_node_name	도착지점명
        vehicle_restricted	통과제한차량
        target	평균속도(km/h)

# Numeric vs Categorical
아래 분류에는 틀린 항목이 있을 수 있음.

        Numeric : 

        base_date, base_hour, lane_count, maximum_speed_limit, weight_restricted,
        height_restricted , start_latitude, start_longitude, end_latitude, end_longitude

        Categorical :

        id, day_of_week, road_in_use, road_rating, multi_linked, connect_code,
        start_turn_restricted, road_type, end_turn_restricted, road_name,
        start_node_name, end_node_name, vehicle_restricted

        Target : target ( 평균속도)

# EDA
- 결측치는 존재하지 않습니다.

In [None]:
train.isnull().sum() # wow 제주 대신 train으로 변경할것.

-상관계수 분석하기.

In [None]:
# Pairwise correlations over the numeric columns only; string columns cannot be
# correlated and make DataFrame.corr() raise on recent pandas versions.
data = train.select_dtypes(include='number').corr()
# Figure size
fig, ax = plt.subplots( figsize=(30,30) )

# Build a triangular mask (True on the upper triangle, False on the lower one).
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
mask = np.zeros_like(data, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Draw the heatmap
sns.heatmap(data, 
            cmap = 'RdYlBu_r', 
            annot = True,   # print the actual value in every cell
            mask=mask,      # cells that should not be drawn
            linewidths=.5,  # thin lines between cells
            cbar_kws={"shrink": .5},# shrink the colour bar to half size
            vmin = -1,vmax = 1,  # colour bar range -1 ~ 1
            ax=ax
           )
plt.title('특성별 상관관계\n',fontsize=20)  
plt.show()

In [None]:
sns.histplot(data=train['maximum_speed_limit'])

In [None]:
sns.boxplot(data=train,x='target')
plt.title('target')

In [None]:
sns.countplot(data=train, x=train['day_of_week']);

In [None]:
sns.countplot(data=train, x=train['base_hour']);

In [None]:
sns.countplot(data=train, x=train['road_in_use']);

train['road_in_use'].value_counts() # 개수

print('도로를 사용하지 않을 때 평균속도의 mean',train[train['road_in_use']==0]['target'].mean())

print('도로를 사용할때 평균속도의 mean',train[train['road_in_use']==1]['target'].mean())

In [None]:
# 컬럼별 target분포 히스토그램.
def value_hist(df, col, target='target'):
  """Plot the distribution of ``target`` separately for every value of ``col``.

  For each distinct value found in ``df[col]`` the row count is printed and a
  figure with two panels is shown: a histogram of ``target`` on the left and a
  box plot of ``target`` on the right.

  Args:
    df: DataFrame holding both ``col`` and ``target``.
    col: Name of the column whose distinct values define the groups.
    target: Name of the column whose distribution is plotted.

  Returns:
    None. Figures are displayed with ``plt.show()``.
  """
  for value in df[col].unique():

    cond_df = df.loc[df[col]==value]

    print(f'{value} 데이터 갯수 : {cond_df.shape[0]}')

    fig, ax = plt.subplots(ncols=2, figsize=(13, 6))

    # Plot the column named by `target` rather than a hard-coded 'target'.
    sns.histplot(data=cond_df, x=target, ax=ax[0])
    ax[0].set_title(f'{col}의 {value} 히스토그램')
    # Ticks every 20 units up to the overall maximum of the full frame, so all
    # groups share the same tick positions. The step belongs inside range();
    # the second positional argument of set_xticks is not a step.
    ax[0].set_xticks(range(0, int(df[target].max()+1), 20))

    sns.boxplot(data=cond_df, x=target, ax=ax[1])
    ax[1].set_title(f'{col}의 {value} Box Plot')
    
    plt.show()
    # Release the figure; a column with many distinct values creates many figures.
    plt.close(fig)

In [None]:
train.columns

In [None]:
train.road_name.unique()

In [None]:
value_hist(train,'day_of_week') #요일 별 target 히스토그램.

In [None]:
value_hist(train,'base_hour')

In [None]:
value_hist(train,'road_rating')

In [None]:
value_hist(train,'lane_count')

In [None]:
value_hist(train,'end_turn_restricted')

In [None]:
value_hist(train,'start_turn_restricted')

In [None]:
train_road103=train[train.road_rating==103]
value_hist(train_road103,'base_hour')

- 시간대별 target 분포도

In [None]:
import mglearn
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
#sns.histplot(train.target.loc[train[train['base_hour']<=6].index])

X=train[train['base_hour']<=6][['start_latitude','start_longitude','target']]

kmeans = KMeans(n_clusters=5)
kmeans.fit(X[['target']])
print(silhouette_score(X,kmeans.labels_))
mglearn.discrete_scatter(X['start_longitude'],X['start_latitude'],kmeans.labels_,markers='o',s=3,)
plt.title('before 6am target_count')

In [None]:
# Target distribution for hours in (6, 12]. The target is read straight from
# `train` because no separate `y` variable exists at this point of the notebook.
sns.histplot(train.loc[(train['base_hour']>6) &(train['base_hour']<=12), 'target'])
plt.title('before 6am to 12pm target_count')

In [None]:
# Target distribution for hours in (12, 18]. The target is read straight from
# `train` because no separate `y` variable exists at this point of the notebook.
sns.histplot(train.loc[(train['base_hour']>12) &(train['base_hour']<=18), 'target'])
plt.title('before 12pm to 18pm target_count')

In [None]:
# Target distribution for hours in (18, 24]. The target is read straight from
# `train` because no separate `y` variable exists at this point of the notebook.
sns.histplot(train.loc[(train['base_hour']>18) &(train['base_hour']<=24), 'target'])
plt.title('before 18pm to 24pm target_count')

In [None]:
train.base_date.describe()

In [None]:
test.base_date.describe()

In [None]:
value_hist(train,'base_date')

In [None]:
# base_date is still in its raw YYYYMMDD form at this point, so parse it before
# using the .dt accessor (parsing an already-converted datetime column is a no-op).
days=pd.to_datetime(train['base_date'],format='%Y%m%d').dt.day_of_week
print(days)

In [None]:
sns.displot(train.distance)

In [None]:
train[train.road_name=='-']

#시작위경도를 train의 target값 별 분포를 통해 군집 개수를 정하기.

In [None]:
!pip install mglearn

In [None]:
pip install --upgrade joblib==1.1.0

In [None]:
#import mglearn
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# One row per (start point, end point, road_rating) combination with its median target.
temp=train.groupby(['start_latitude','start_longitude','end_latitude','end_longitude','road_rating'])[['target']].median().reset_index()

# Cluster the per-segment table, not the raw rows: the cluster labels are stored
# on `temp` afterwards, so there must be exactly one label per row of `temp`.
# This also keeps silhouette_score (pairwise distances) tractable.
X=temp[['start_latitude','start_longitude','end_latitude','end_longitude','road_rating','target']]

kmeans = KMeans(n_clusters=5,random_state=5)
kmeans.fit(X)
print(silhouette_score(X,kmeans.labels_))
#mglearn.discrete_scatter(temp['start_longitude'],temp['start_latitude'],kmeans.labels_,markers='o',s=3,)
#mglearn.discrete_scatter(kmeans.cluster_centers_[:,0],kmeans.cluster_centers_[:,1],[0,1,2,3,4],markers='^',markeredgewidth=2)
plt.legend()
plt.xlabel('latitude')
plt.ylabel('longitude')


In [None]:
temp.shape

In [None]:
temp['c_label']=kmeans.labels_

In [None]:
temp[temp['c_label']==2]['target'].mean()

In [None]:
train['c_label']=0

In [None]:
test['c_label']=0

In [None]:
test.c_label.nunique()

In [None]:
temp.road_type.unique()

In [None]:
train.maximum_speed_limit

In [None]:
train.start_latitude.nunique()

In [None]:
temp1=test.groupby(['start_latitude','start_longitude','end_latitude','end_longitude','road_name','road_type','road_rating','distance']).size().reset_index(name='freq')
temp1

temp1.to_csv('/content/drive/MyDrive/DACON_Dataset/JEJU_Traffic/open.zip (Unzipped Files)/test_val.csv')

In [None]:
temp.c_label.nunique()

In [None]:
# Number of test rows per distinct start coordinate.
temp1=test.groupby(['start_latitude','start_longitude']).size().reset_index(name='freq')
# `freq` is a column, not a method: select it instead of calling it.
temp1['freq']



In [None]:
import folium

jeju_map = folium.Map(location=[33.2434317486804,126.422025484805], zoom_start=6)

df_cities = pd.DataFrame({'시작위도':temp.start_latitude, '시작경도':temp.start_longitude,'도착위도':temp.end_latitude, '도착경도':temp.end_longitude,'도로rating' : temp.road_rating,'도로등급': temp.road_type})

df_cities

In [None]:
for i in range(len(df_cities)):
    folium.Marker([df_cities.iloc[i][0], df_cities.iloc[i][1]], popup=[df_cities.index[i],df_cities.iloc[i][4]],icon=folium.Icon(color='blue')).add_to(jeju_map)
    folium.Marker([df_cities.iloc[i][2], df_cities.iloc[i][3]], popup=[df_cities.index[i],df_cities.iloc[i][4]],icon=folium.Icon(color='red')).add_to(jeju_map)


In [None]:
jeju_map.save('jeju.html')

In [None]:
train.connect_code.unique()

#필요없는 열 삭제.

In [None]:
print('train최소',train.base_date.min())
print('train최대',train.base_date.max())

print('test최소',test.base_date.min())
print('test최대',test.base_date.max())


In [None]:
train

In [None]:
train['base_date']= pd.to_datetime(train['base_date'],format='%Y%m%d')
#datetime 형식으로 변경 train

In [None]:
test['base_date']= pd.to_datetime(test['base_date'],format='%Y%m%d')
#datetime 형식으로 변경 test

In [None]:
train['year']=train['base_date'].dt.year
train['month']=train['base_date'].dt.month
train['day']=train['base_date'].dt.day
# 날짜를 연 월 일로 분리.
#train=train.drop(['base_date'],axis=1)
# base_date컬럼삭제.


In [None]:
test['year']=test['base_date'].dt.year
test['month']=test['base_date'].dt.month
test['day']=test['base_date'].dt.day
# test도 동일하게 적용.
#test=test.drop(['base_date'],axis=1)


In [None]:
train=train.drop(['base_date'],axis=1)

In [None]:
test=test.drop(['base_date'],axis=1)

In [None]:
test.head()

# 시작위경도와 도착위경도의 유클리디안 거리를 구해 새로운 컬럼으로 설정하기.

In [None]:
import math

def euc_dist(df):
  """Add a ``distance`` column holding the start-to-end Euclidean distance.

  The distance is computed on the raw coordinate values of the four
  latitude/longitude columns. The column is written onto ``df`` itself and
  nothing is returned.
  """
  lat_gap = df['end_latitude'] - df['start_latitude']
  lon_gap = df['end_longitude'] - df['start_longitude']
  df['distance'] = (lat_gap**2 + lon_gap**2) ** (1/2)

euc_dist(train)
euc_dist(test)



In [None]:
train['peak_c']=0
test['peak_c']=0

In [None]:
train[train.base_hour.between(5,7)].base_hour.unique()

In [None]:
# Time-of-day bucket stored in `peak_c`; rows matching none of the masks keep
# the value they already have:
#   1 -> hours 5-6, and hours 18-23 plus hour 0
#   2 -> hours 1-4
# Values are written through .loc with a boolean mask, because chained indexing
# (frame['peak_c'][idx] = ...) may write to a temporary copy instead of the frame.
# The evening window wraps past midnight, which between(18, 0) cannot express
# (left bound > right bound matches nothing), so it is spelled out explicitly.
for frame in (train, test):
  hour = frame['base_hour']
  frame.loc[hour.between(5, 6), 'peak_c'] = 1
  frame.loc[(hour >= 18) | (hour == 0), 'peak_c'] = 1
  frame.loc[hour.between(1, 4), 'peak_c'] = 2

In [None]:
test.peak_c.unique()

# 휴일 및 공휴일 컬럼생성하기.

In [None]:
t21_off = ['2021-09-20','2021-09-21','2021-09-22','2022-01-01','2022-01-29','2022-01-30','2022-01-31','2022-02-01',
'2022-02-02','2022-03-01','2022-03-09','2022-05-05','2022-05-09','2022-06-01',
'2022-06-06']

test_t22_off = ['2022-08-06','2022-08-07','2022-08-13','2022-08-14','2022-08-15'
,'2022-08-20','2022-08-21','2022-08-27']

train['off_day']=0
test['off_day']=0

In [None]:
# Set off_day = 1 on every row whose (year, month, day) matches one of the
# listed holiday dates; train and test each use their own date list.
for frame, holiday_dates in ((train, t21_off), (test, test_t22_off)):
  for stamp in pd.to_datetime(holiday_dates, format='%Y-%m-%d'):
    on_holiday = (
        (frame['year']==stamp.year)
        & (frame['month']==stamp.month)
        & (frame['day']==stamp.day)
    )
    frame.loc[on_holiday,'off_day'] = 1





   

# 도로 만들기

In [None]:
train.end_latitude.nunique()

In [None]:
temp=train.groupby(['start_latitude','start_longitude','end_latitude','end_longitude']).size().reset_index(name='freq')



In [None]:
temp['avg_lat']= (temp['start_latitude']+temp['end_latitude'])/2.0
temp['avg_long']= (temp['start_longitude']+temp['end_longitude'])/2.0


In [None]:
temp

In [None]:
train[train['target']<10]

In [None]:
train['c_label']=0
test['c_label']=0

In [None]:
temp

In [None]:
#train['c_label'][train[train['weight_restricted']==32400.].index]=1


# Copy each road segment's label from `temp` onto the matching rows of train and
# test. A single left merge on the four coordinates replaces one full-frame scan
# per segment, and the result is written with .loc instead of chained indexing
# (which may silently write to a temporary copy).
segment_keys = ['start_latitude','start_longitude','end_latitude','end_longitude']
if 'c_label' not in temp.columns:
  raise KeyError("temp has no 'c_label' column; assign the segment labels to temp before mapping them onto train/test")

# keep='last' mirrors the former row-by-row loop, where a later row of temp
# overwrote an earlier row with the same coordinates.
segment_labels = temp[segment_keys + ['c_label']].drop_duplicates(subset=segment_keys, keep='last')

for frame in (train, test):
  matched = frame[segment_keys].merge(segment_labels, on=segment_keys, how='left')['c_label'].to_numpy()
  found = pd.notna(matched)
  # Rows whose segment is absent from temp keep their current label.
  frame.loc[found, 'c_label'] = matched[found].astype(temp['c_label'].dtype)
  


In [None]:
train.c_label.nunique()

In [None]:
train.road_num.max()

# Preprocessing

# Labeling

In [None]:
# Label-encode the listed columns: each encoder is fitted on train only and
# then applied to both train and test.
cate_val = ['day_of_week','start_turn_restricted','end_turn_restricted']
for i in cate_val :
  enc = LabelEncoder()
  enc=enc.fit(train[i])
  train[i]=enc.transform(train[i])
  # Values that occur only in test are appended to the fitted classes so that
  # transform() below does not fail on labels unseen during fit.
  # NOTE(review): appended labels end up after the sorted classes - confirm the
  # installed scikit-learn version does not rely on classes_ being sorted.
  for label in np.unique(test[i]):
        if label not in enc.classes_: 
            enc.classes_ = np.append(enc.classes_, label)
  test[i]=enc.transform(test[i]) # beware of data leakage
# label-encode the columns whose values are Korean text

In [None]:
train.dtypes


In [None]:
train=train.drop(['base_hour'],axis=1)
test=test.drop(['base_hour'],axis=1)

----------------------------------------------------

# Try 1 Day를 살리고, 요일컬럼을 삭제.

In [None]:
train=train.drop(['day_of_week'],axis=1)
test=test.drop(['day_of_week'],axis=1)

# Try 2 도착 위경도 데이터만을 이용하여 예측
근거 : 시작위도 경도 값은 도착 위경도 값이랑 상관관계가 높아 과적합의 가능성?

In [None]:
train=train.drop(['start_latitude','start_longitude'],axis=1)
test=test.drop(['start_latitude','start_longitude'],axis=1)

In [None]:
from sklearn.preprocessing import QuantileTransformer, StandardScaler

min_max = ['end_latitude','end_longitude']
train_num = train[min_max].copy()

test_num = test[min_max].copy()


scaler=QuantileTransformer(n_quantiles=900)

scaler.fit(train_num)

train_num[min_max]=scaler.transform(train_num[min_max])



test_num[min_max]=scaler.transform(test_num[min_max]) # leakage 주의


train[min_max] = train_num[min_max]
test[min_max] = test_num[min_max]  

# 데이터 분리.

In [None]:
from sklearn.model_selection import train_test_split
y=train['target']
# Split off the target.
train=train.drop(['id','vehicle_restricted','height_restricted','road_name', 'start_node_name', 'end_node_name','road_in_use'],axis=1)
# Drop what training does not need: string columns, the id and all-zero columns.
# (The target column itself is not in the drop list on the line above.)
#x_train , x_valid , y_train, y_valid = train_test_split(train,y,test_size=0.2,random_state=5)
# The target column is mixed in, so the last 20% is arbitrarily used as a validation set. -> Because the goal is to predict the test set, there is no way to check generalization performance.
# Once the model's generalization performance (hyperparameters) is validated, the full set should be used for training.


In [None]:
train=train.drop('target',axis=1)

In [None]:
train=train.drop(['day'],axis=1)

In [None]:
train.shape

In [None]:
#train=train.drop(['year'],axis=1)

In [None]:
test.day.nunique()

In [None]:
# test셋에서도 동일하게 하기.
test=test.drop(['id','vehicle_restricted','height_restricted','road_name','road_in_use', 'start_node_name', 'end_node_name'],axis=1)
#test=test.drop(['year'],axis=1)
test.shape

In [None]:
test=test.drop(['day'],axis=1)

In [None]:
#test=test.drop(['year'],axis=1)

In [None]:
test

---------------------------------------------------------

# Outlier -> IQR # 적용했을 때 성능이 낮아짐.

In [None]:
def draw_line(plt,col):
  """Overlay vertical lines for the mean and quartiles of ``train[col]``.

  Args:
    plt: Object to draw on; it must provide ``axvline`` and ``legend``. Note
      that inside this function the name shadows the usual pyplot alias.
    col: Name of a column of the notebook-level ``train`` frame.

  Returns:
    None. Four lines (mean, 25%, 50%, 75%) and a legend are added to ``plt``.
  """
  # Summarise the column once instead of describing the whole frame once per
  # statistic.
  stats = train[col].describe()

  for stat, colour in (('mean','red'),('25%','blue'),('50%','green'),('75%','purple')):
    plt.axvline(stats[stat],color=colour)

  plt.legend(['Mean','25%','50%','75%'])

pp=sns.histplot(train['start_latitude'])
draw_line(pp,'start_latitude')

In [None]:
train.head(4)

In [None]:
# base_hour와 target과의 상관관계가 낮아서, 각 시간별 분포에서 outlier를 삭제.
def outlier_proc(train,col):
  """Drop rows whose ``target`` is an IQR outlier within their ``col`` group.

  For every distinct value of ``col`` the 25th/75th percentiles of ``target``
  are computed on the rows of that group, and the fences
  ``q1 - 1.5*iqr`` / ``q3 + 1.5*iqr`` are printed. Rows above the upper fence
  are always removed; rows below the lower fence are removed only when that
  fence is positive.

  Args:
    train: DataFrame containing ``col`` and a ``target`` column. It is not
      modified; a filtered copy is returned.
    col: Name of the grouping column.

  Returns:
    A new DataFrame without the outlier rows (rows are dropped by index label).
  """
  print('outlier잡기전 : ',train.shape)

  # Index labels of all outliers are collected first and removed with a single
  # drop at the end; dropping inside the loop would copy the full frame once
  # (or twice) per group.
  outlier_index = []
  for i in train[col].unique():
    df = train[train[col]==i]
    q1,q3 = np.percentile(df['target'],[25,75])

    print(i,'시',q1, q3)
    iqr = q3 -q1
    low=(q1-(iqr*1.5))
    high=(q3+(iqr*1.5))
    print('저점',low)
    print('고점',high)

    is_outlier = df['target']>high
    if low > 0 :
      # The lower fence is only applied when it is positive.
      is_outlier = is_outlier | (df['target']<low)
    outlier_index.extend(df.index[is_outlier.to_numpy()])

  train=train.drop(index=outlier_index)
  print('outlier잡은후 : ',train.shape)
  return train

train=outlier_proc(train,'base_hour')
    
''''condition1 = df['target']<(q1-(iqr*1.5))
  
print('lower_bound : ', condition1.shape[0])

  condition2 = df['target']>(q3+(iqr*1.5))
  print('upper_bound : ',condition2.shape[0])
  print(condition2.shape[0])
  print(condition1.index)

 정규분포일때만 가능.'''

In [None]:
train=outlier_proc(train,'road_rating')
train=outlier_proc(train,'lane_count')

In [None]:
train.shape

# 위,경도 컬럼 Scaling

In [None]:
from sklearn.preprocessing import QuantileTransformer, StandardScaler,MinMaxScaler

min_max = ['start_latitude','start_longitude','end_latitude','end_longitude']
train_num = train[min_max].copy()

test_num = test[min_max].copy()


scaler=QuantileTransformer(n_quantiles=800)


scaler.fit(train_num)

train_num[min_max]=scaler.transform(train_num[min_max])


# leakage 주의

test_num[min_max]=scaler.transform(test_num[min_max])

train[min_max] = train_num[min_max]
test[min_max] = test_num[min_max]   

In [None]:
train.head(4)

# 하중제한 라벨링

In [None]:
train['weight_restricted'].unique()

In [None]:
# Recode the load limits to small integer codes; any other value (e.g. 0) is
# left as it is. replace() returns a new column that is assigned back, which
# avoids chained indexing (frame[col][idx] = ...) that may write to a temporary copy.
weight_codes = {32400.: 1, 43200.: 2, 50000.: 3}

train['weight_restricted']=train['weight_restricted'].replace(weight_codes).astype('int')

# Label test with the same codes that were derived from train's categories.
test['weight_restricted']=test['weight_restricted'].replace(weight_codes).astype('int')

In [None]:
test['weight_restricted'].unique()

In [None]:
train.shape

In [None]:
# 최대 속도제한 float -> int형으로 변환
train.maximum_speed_limit = train['maximum_speed_limit'].astype('int')

In [None]:
test.maximum_speed_limit = test['maximum_speed_limit'].astype('int')

In [None]:
train.info()

# road_rating 라벨링 ( 클러스터링 적용한 c_label 적용시 실행하지 말것.)

In [None]:
train.dtypes

In [None]:
train.loc[train['road_rating']==103,'road_rating'] = 1
train.loc[train['road_rating']==106,'road_rating'] = 2
train.loc[train['road_rating']==107,'road_rating'] = 3
test.loc[test['road_rating']==103,'road_rating'] = 1
test.loc[test['road_rating']==106,'road_rating'] = 2
test.loc[test['road_rating']==107,'road_rating'] = 3

In [None]:
train.road_type.unique()

#XGBOOST전처리

In [None]:
train.columns

In [None]:
# day_of_week base_hour lane_count road_rating maximum_speed_limit road_type start_turn_restricted c_label 
# year month

In [None]:
train

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
cat_features = ['road_rating','road_type','start_turn_restricted','c_label','year','month']
for i in cat_features:
  train[i]=train[i].astype('str')
  test[i]=test[i].astype('str')
  

In [None]:
ct = ColumnTransformer([('onehot',OneHotEncoder(sparse=False,handle_unknown = 'ignore'),cat_features)])
pipe = Pipeline([('scaling',ct)])
train_transformed=pipe.fit_transform(train[cat_features])

train_transformed

In [None]:
train_transformed.shape

#XGBOOST

In [None]:

import xgboost as xgb

In [None]:
dtrain = xgb.DMatrix(data=x_train, label = y_train)
dtest = xgb.DMatrix(data=x_valid, label=y_valid)

In [None]:
# Booster parameters for xgb.train.
# - 'reg:squarederror' is the current name of the squared-error objective
#   ('reg:linear' is its deprecated alias).
# - The GPU ordinal parameter is spelled 'gpu_id' (underscore, not hyphen).
# - Early stopping is not a booster parameter, so it has no entry here; it is
#   configured through the early_stopping_rounds argument of xgb.train().
params = {'max_depth' : 7,
         'eta' : 0.18, 
         'objective' : 'reg:squarederror',
         'eval_metric' : 'mae',
          'gpu_id':0,'tree_method' : 'gpu_hist',
          'gamma':0.3
          }

num_rounds = 15000

In [None]:
wlist = [(dtrain, 'train'), (dtest,'eval')]
# Pass the hyperparameters and the early-stopping setting to train().
# Early stopping monitors the last entry of `evals` (the 'eval' set) and stops
# once its metric has not improved for 100 consecutive rounds.
xgb_model = xgb.train(params = params, dtrain=dtrain,verbose_eval=100, num_boost_round=num_rounds, evals=wlist,
                      early_stopping_rounds=100)

# Catboost

In [None]:
import catboost as cb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# Fold_Stacking ( 시도x)

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# 개별 기반 모델에서 최종 메타 모델이 사용할 학습 및 테스트용 데이터를 생성하기 위한 함수
def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds):
    """Build the out-of-fold features a stacking meta model is trained on.

    The training data is split into ``n_folds`` consecutive folds. For each
    fold, ``model`` is re-fitted on the remaining folds, predicts the held-out
    fold (filling that part of the train-side output) and predicts the whole
    test set (filling one column of the test-side buffer).

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train_n: Training features (pandas object, indexed positionally).
        y_train_n: Training target aligned row-by-row with ``X_train_n``.
        X_test_n: Test features passed to ``model.predict``.
        n_folds: Number of KFold splits.

    Returns:
        Tuple ``(train_fold_pred, test_pred_mean)``: an array of shape
        ``(len(X_train_n), 1)`` with the out-of-fold predictions, and an array
        of shape ``(len(X_test_n), 1)`` with the per-fold test predictions
        averaged.
    """
    # Create the KFold splitter with the requested number of folds.
    kf = KFold(n_splits=n_folds, shuffle=False,random_state=None)
    
    # Buffers for the data the meta model will use later.
    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))
    print(model.__class__.__name__,' model 시작')
    
    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        # Extract this fold's fit / held-out subsets from the training data.
        print('\t 폴드 세트: ',folder_counter+1,' 시작')
        # KFold yields row positions, so select with .iloc; .loc would look the
        # numbers up as index labels and pick wrong rows (or raise) whenever the
        # index is not 0..n-1, e.g. after rows were dropped.
        X_tr = X_train_n.iloc[train_index]
        y_tr = y_train_n.iloc[train_index]
        X_te = X_train_n.iloc[valid_index]
        
        # Fit the base model on the training part of this fold.
        model.fit(X_tr, y_tr)
        # Predict the held-out part and store it at the matching row positions.
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1, 1)
        # Predict the original test data with the model fitted on this fold.
        test_pred[:, folder_counter] = model.predict(X_test_n)
        
    # Average the per-fold test predictions into a single test feature.
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1,1)
    
    # train_fold_pred is the meta model's training data, test_pred_mean its test data.
    return train_fold_pred, test_pred_mean
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
one_hot = ['month','day_of_week','road_type','road_rating','weight_restricted','lane_count','road_rating','maximum_speed_limit']
c_train = train.copy()
c_train[one_hot] = c_train[one_hot].astype('str') # adaboosting과 randomforest는 cat_features를 알려줄 수 없어서, str처리 혹은 one_hot을 해야한다.
rf_clf = RandomForestRegressor(n_estimators=100)
ada_clf = AdaBoostRegressor(n_estimators=100)
cat_clf = cb.CatBoostRegressor(iterations=1000,learning_rate=.87,l2_leaf_reg=0.5,cat_features=one_hot)


rf_train, rf_test = get_stacking_base_datasets(rf_clf, c_train, y, test, 4) # 4는 폴드수
ada_train, ada_test = get_stacking_base_datasets(ada_clf, c_train, y, test, 4)
cat_train, cat_test = get_stacking_base_datasets(cat_clf, train, y, test, 4)

In [None]:
a=train[train['weight_restricted']==1].index
train.loc[a]

In [None]:

Stack_final_X_train = np.concatenate((knn_train, rf_train, dt_train, ada_train), axis=1)
Stack_final_X_test = np.concatenate((knn_test, rf_test, dt_test, ada_test), axis=1)
print('원본 학습 피처 데이터 shape:', X_train.shape, '원본 테스트 피처 shape:',X_test.shape)
print('스태킹 학습 피처 데이터 shape:',Stack_final_X_train.shape,
     '스태킹 테스트 피처 데이터 shape:',Stack_final_X_test.shape)

In [None]:
import catboost as cb
from sklearn.metrics import mean_absolute_error 

#GridSearch (시도 x)

In [None]:
train.columns

In [None]:
from sklearn.model_selection import StratifiedKFold
one_hot = ['month','day_of_week','road_type','road_rating','weight_restricted','lane_count','road_rating','maximum_speed_limit']
model = cb.CatBoostRegressor(loss_function='MAE',one_hot_max_size=10,metric_period=50)
#cb_model.fit(X_train, Y_train, cat_features = cat_features)
grid = {'iterations' : [50000],
    'learning_rate': [0.3]
        ,'l2_leaf_reg': [0.5],
        'depth':[8]}
# stratified 적용 x
#skf=StratifiedKFold(n_splits=4,random_state=None,shuffle=False)
grid_search_result = model.grid_search(grid, 
                                       X=bf_train, 
                                       y=y, 
                                       plot=True,cv=4)

https://velog.io/@dohy426/Cat-Boost-Regressor-Optuna

In [None]:
print(grid_search_result.best_params_)
print(grid_search_result.best_score_)

em = grid_search_result.best_estimator_
pred = em.predict(bf_test)

sample_submission = pd.read_csv('/content/drive/MyDrive/DACON_Dataset/JEJU_Traffic/open.zip (Unzipped Files)/sample_submission.csv')
sample_submission['target']=pred
sample_submission.to_csv('/content/drive/MyDrive/DACON_Dataset/JEJU_Traffic/open.zip (Unzipped Files)/submission_14.csv',index=False)

# BF (x_train, x_valid) 적용. -> 이걸로 진행.

- 그리드서치를 적용 하지 않아, 검증셋과 학습셋 성능차이가 있다.

In [None]:
bf_train.dtypes

In [None]:
cat_type = ['day_of_week','start_turn_restricted','peak_c']
for i in cat_type:
  train[i]=train[i].astype('str')
  test[i]=test[i].astype('str')

# Optuna를 통한 최적 파라미터 탐색 ( 시도 x)
- trial은 50이하로 해야 적당하다고 한다.

In [None]:
import optuna
from optuna import Trial
from optuna.samplers import TPESampler

In [None]:
bf_train = train.copy()
bf_test = test.copy()
bf_train = bf_train.drop(['multi_linked','connect_code','end_turn_restricted'],axis=1) # road_type
bf_test = bf_test.drop(['multi_linked','connect_code','end_turn_restricted'],axis=1) # road_type

In [None]:
 x_train , x_valid , y_train, y_valid = train_test_split(bf_train,y,test_size=0.2,random_state=10)

In [None]:
# 데이터 분리에서 x_train , x_valid , y_train , y_valid 
#분리해놓은 것을 통해 최적 파라미터를 탐색한다.

# random sampler
sampler = TPESampler(seed=10)

# define function
def objective(trial):
    """Optuna objective: fit one CatBoost model and return its validation MAE.

    Hyperparameters are sampled from ``trial``; the model is fitted on the
    notebook-level ``x_train``/``y_train`` with ``x_valid``/``y_valid`` as the
    evaluation set.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean absolute error of the fitted model on ``x_valid``.
    """
    # All float ranges use suggest_float, the replacement for the deprecated
    # suggest_uniform / suggest_loguniform (log=True gives the log-uniform case).
    cbrm_param = {
        'iterations':2000,
        #'od_wait':trial.suggest_int('od_wait', 500, 2300),
        'learning_rate' : trial.suggest_float('learning_rate',0.1, 0.4),
        'reg_lambda': trial.suggest_float('reg_lambda',1e-5,100),
        #'subsample': trial.suggest_float('subsample',0,1),
        'random_strength': trial.suggest_float('random_strength',10,50),
        'depth': trial.suggest_int('depth',5, 12),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',1,15),
        'bagging_temperature' :trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
        'colsample_bylevel':trial.suggest_float('colsample_bylevel', 0.4, 1.0),
        
    }

    # Generate model
    cat_features=['day_of_week','road_rating','month','year','road_type','start_turn_restricted','c_label']
    model_cbrm = cb.CatBoostRegressor(**cbrm_param,one_hot_max_size=6,metric_period=100,cat_features=cat_features)
    model_cbrm = model_cbrm.fit(x_train, y_train, eval_set=[(x_valid, y_valid)])

    # Evaluation metric; swap in a different one here if desired.
    MAE = mean_absolute_error(y_valid, model_cbrm.predict(x_valid))
    return MAE

optuna_cbrm = optuna.create_study(direction='minimize', sampler=sampler)
optuna_cbrm.optimize(objective, n_trials=10)


In [None]:
# Best trial of the study: its objective value and the sampled parameters.
cbrm_trial = optuna_cbrm.best_trial
cbrm_trial_params = cbrm_trial.params
# A trial stores its objective value in `.value`; `best_value` belongs to the study.
print('Best Trial: score {},\nparams {}'.format(cbrm_trial.value, cbrm_trial_params))

In [None]:
train.dtypes

In [None]:
train.info()
#obj : road_rating, road_type , road_num

# Cast the same columns to the same dtypes in train and test.
train.road_rating=train.road_rating.astype('str')
train.road_type=train.road_type.astype('int')
train.c_label=train.c_label.astype('str')
train.day_of_week = train.day_of_week.astype('str')
train.start_turn_restricted = train.start_turn_restricted.astype('str')
train.month = train.month.astype('int')
train.year = train.year.astype('int')
#test

test.road_rating=test.road_rating.astype('str')
test.road_type=test.road_type.astype('int')
test.c_label=test.c_label.astype('str')
test.day_of_week = test.day_of_week.astype('str')
test.start_turn_restricted = test.start_turn_restricted.astype('str')
# year/month must be cast on test here (train was already cast above);
# otherwise test keeps a different dtype than train for these columns.
test.year = test.year.astype('int')
test.month = test.month.astype('int')

In [None]:
bf_train.info()

In [None]:
train.head()

# K-Fold 적용해보기.

In [None]:
skf = KFold(n_splits=4, shuffle=True, random_state=10)
folds = []
for train_idx, valid_idx in skf.split(train, y):
  folds.append((train_idx,valid_idx))

In [None]:
import random

from catboost.core import CatBoostRegressor
random.seed(10)
cat_models={}

# Column positions of the non-numeric features. The frames are handed to
# CatBoost as plain arrays, so categorical features are identified by position;
# deriving the positions from the dtypes keeps them in sync with the columns.
cat_features = [pos for pos, dtype in enumerate(train.dtypes) if not pd.api.types.is_numeric_dtype(dtype)]

for fold in range(4):
  print(f'===================================={fold+1}============================================')
  train_idx, valid_idx = folds[fold]
  # `train` holds the features only and the target lives in `y`; both are
  # selected by position because the folds contain row positions.
  X_train = train.iloc[train_idx].values 
  X_valid = train.iloc[valid_idx].values
  y_train = y.iloc[train_idx].values
  y_valid = y.iloc[valid_idx].values

  # A trial exposes its sampled hyperparameters as `.params`.
  cat = CatBoostRegressor(**cbrm_trial.params)
  cat.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_valid,y_valid)],
          early_stopping_rounds=35,cat_features=cat_features,
          verbose=100)
  cat_models[fold] = cat

In [None]:
x_train , x_valid , y_train, y_valid = train_test_split(bf_train,y,test_size=0.2,random_state=5)

In [None]:
x_train.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)

x_valid.reset_index(inplace=True, drop=True)
y_valid.reset_index(inplace=True, drop=True)

In [None]:
bf_test.info()

In [None]:
bf_test.info()

In [None]:
# Cast both frames with one shared mapping so that every column has the same
# dtype in train and test (a model fitted on one dtype cannot reliably score
# the same column delivered as another dtype).
shared_types = {'day_of_week' : 'int', 'road_rating' : 'str','month' : 'int','year':'int','road_type':'str','start_turn_restricted':'str','c_label':'str'}
bf_train = bf_train.astype(shared_types)
bf_test = bf_test.astype(shared_types)

In [None]:
bf_train['base_hour']=bf_train['base_hour'].astype('str')
bf_test['base_hour']=bf_test['base_hour'].astype('str')

In [None]:
bf_train['maximum_speed_limit']=bf_train['maximum_speed_limit'].astype('int')
bf_test['maximum_speed_limit']=bf_test['maximum_speed_limit'].astype('int')

In [None]:
bf_train = bf_train.astype({'day_of_week' : 'int', 'road_rating' : 'int','road_type':'int','start_turn_restricted':'str','c_label':'str','weight_restricted':'str'})
bf_test = bf_test.astype({'day_of_week' : 'int', 'road_rating' : 'int','road_type':'int','start_turn_restricted':'str','c_label':'str','weight_restricted':'str'})

In [None]:
bf_test.dtypes

In [None]:
bf_train.dtypes

In [None]:
catt=['road_type','road_rating','maximum_speed_limit']

for i in catt :
  train[i]=train[i].astype('str')
  test[i]=test[i].astype('str')

In [None]:
# day 까지 포함해서 모델링!! -> 기각

#one_hot = ['month','day_of_week','road_type','weight_restricted','road_rating','maximum_speed_limit']

cat_features=['day_of_week','start_turn_restricted','peak_c','road_type','road_rating','maximum_speed_limit']
train_dataset=cb.Pool(data=x_train,label=y_train,cat_features=cat_features)
valid_dataset=cb.Pool(data=x_valid,label=y_valid,cat_features=cat_features)
# task_type = 'GPU'로 하면 처리속도가 엄청 향상되는데, CPU로 작업했을 때랑 성능이 다르다.... 왜 그런지 모르겠다.
# 파라미터 : 반복횟수, 학습률, 손실함수, l2규제, 화면표시 몇개당 나오게 할지.

# k-fold 적용해보기.
model = cb.CatBoostRegressor(iterations=60000,learning_rate=0.25,loss_function='MAE',
                             l2_leaf_reg=0.5,metric_period = 100,one_hot_max_size=12,depth=8,cat_features=cat_features)

#valid_set : 3.028->model = cb.CatBoostRegressor(iterations=50000,learning_rate=0.3,loss_function='MAE', test_score : 3.14몇.
                            # l2_leaf_reg=0.5,metric_period = 100,depth=8,one_hot_max_size=25)
model.fit(train_dataset,eval_set = valid_dataset) # valid_set score도 같이 표시됨.

pred=model.predict(x_valid) 

print(mean_absolute_error(y_valid,pred))



learn: 3.0512080 -> valid score : 3.119756768711377
-> test_score : 3.20712

learn: 2.9782695 -> valid score : 3.0890392776
-> test_score : 3.1793

learn: 2.9182695 -> valid score : 3.0709392776
-> test_score : 3.1717


In [None]:
bf_test.off_day.value_counts()

In [None]:
# 예측 및 제출 파일 제출.
sub_pred=model.predict(bf_test)

print(sub_pred)

In [None]:
sample_submission = pd.read_csv('/content/drive/MyDrive/DACON_Dataset/JEJU_Traffic/open.zip (Unzipped Files)/sample_submission.csv')

In [None]:
sample_submission['target']=sub_pred

In [None]:
sample_submission.to_csv('/content/drive/MyDrive/DACON_Dataset/JEJU_Traffic/open.zip (Unzipped Files)/submission_last.csv',index=False)

# 파일명 변경하기.

# Feature_Importance

In [None]:
# 특성 중요도 뽑기.
def plot_feature_importance(importance,names,model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Args:
        importance: Sequence of importance values, one per feature.
        names: Sequence of feature names aligned with ``importance``.
        model_type: Label prepended to the chart title.

    Returns:
        None. The chart is drawn on a new 10x8 figure.
    """
    ranked = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(10,8))

    sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'])

    plt.title(model_type + ' Feature Importance')
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature Names')

In [None]:
plot_feature_importance(model.get_feature_importance(),x_valid.columns,'CATBOOST')