## Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)
import warnings;warnings.filterwarnings(action='ignore')

## Read data

In [2]:
X_train = pd.read_csv('../data/train.csv')
y_train = X_train['TARGET'] ; X_train.drop(['TARGET'], axis=1, inplace=True)
X_test = pd.read_csv('../data/test.csv')

In [3]:
print('학습데이터 수:', X_train.shape)
print('평가데이터 수:', X_test.shape)

학습데이터 수: (252289, 18)
평가데이터 수: (79786, 18)


In [4]:
'''
데이터 설명
- sessionID : 세션 ID
- userID : 사용자 ID
- TARGET : 세션에서 발생한 총 조회수
- browser : 사용된 브라우저
- OS : 사용된 기기의 운영체제
- device : 사용된 기기
- new : 첫 방문 여부 (0: 첫 방문 아님, 1: 첫 방문)
- quality : 세션의 질 (거래 성사를 기준으로 측정된 값, 범위: 1~100)
- duration : 총 세션 시간 (단위: 초)
- bounced : 이탈 여부 (0: 이탈하지 않음, 1: 이탈함)
- transaction : 세션 내에서 발생의 거래의 수
- transaction_revenue : 총 거래 수익
- continent : 세션이 발생한 대륙
- subcontinent : 세션이 발생한 하위 대륙
- country : 세션이 발생한 국가
- traffic_source : 트래픽이 발생한 소스
- traffic_medium : 트래픽 소스의 매체
- keyword : 트래픽 소스의 키워드, 일반적으로 traffic_medium이 organic, cpc인 경우에 설정
- referral_path : traffic_medium이 referral인 경우 설정되는 경로
'''
display(X_train.head())

Unnamed: 0,sessionID,userID,browser,OS,device,new,quality,duration,bounced,transaction,transaction_revenue,continent,subcontinent,country,traffic_source,traffic_medium,keyword,referral_path
0,SESSION_000000,USER_000000,Chrome,Macintosh,desktop,0,45.0,839.0,0,0.0,0.0,Americas,Northern America,United States,google,organic,Category8,
1,SESSION_000001,USER_000001,Chrome,Windows,desktop,1,1.0,39.0,0,0.0,0.0,Europe,Western Europe,Germany,google,organic,Category8,
2,SESSION_000002,USER_000002,Samsung Internet,Android,mobile,1,1.0,0.0,1,0.0,0.0,Asia,Southeast Asia,Malaysia,(direct),(none),,
3,SESSION_000003,USER_000003,Chrome,Macintosh,desktop,1,1.0,0.0,1,0.0,0.0,Americas,Northern America,United States,Partners,affiliate,,
4,SESSION_000004,USER_000004,Chrome,iOS,mobile,0,1.0,0.0,1,0.0,0.0,Americas,Northern America,United States,groups.google.com,referral,,Category6_Path_0000


### 결측치

In [10]:
X_train.traffic_medium.unique()

array(['organic', '(none)', 'affiliate', 'referral', 'cpc', 'cpm',
       '(not set)'], dtype=object)

In [13]:
X_train.query('traffic_medium == "(not set)"').traffic_source.unique()

array(['Partners', 'google'], dtype=object)

In [5]:
# keyword와 referral_path에 결측치가 있다.
X_train.isna().sum()

sessionID                   0
userID                      0
browser                     0
OS                          0
device                      0
new                         0
quality                     0
duration                    0
bounced                     0
transaction                 0
transaction_revenue         0
continent                   0
subcontinent                0
country                     0
traffic_source              0
traffic_medium              0
keyword                137675
referral_path          161107
dtype: int64

In [9]:
X_train.traffic_source.unique()

array(['google', '(direct)', 'Partners', 'groups.google.com',
       'youtube.com', 'google.com', 'bing', 'google.co.jp',
       'analytics.google.com', 'baidu', 'quora.com',
       'googleads.g.doubleclick.net', 'pinterest.com', 'mail.google.com',
       'dealspotr.com', 'qiita.com', 'l.facebook.com', 'reddit.com',
       'sites.google.com', 'blog.golang.org', 'support.google.com',
       'yahoo', 'dfa', 't.co', 'lunametrics.com', 'docs.google.com',
       'adwords.google.com', 's0.2mdn.net', 'm.baidu.com',
       'm.facebook.com', 'int.search.tb.ask.com', 'google.com.br',
       'google.es', 'arstechnica.com', 'msn.com', 'tw.search.yahoo.com',
       'facebook.com', 'google.de', 'hangouts.google.com',
       'duckduckgo.com', 'r.search.aol.com', 'my.yahoo.com',
       'google.co.za', 'au.search.yahoo.com', 'in.search.yahoo.com',
       'outlook.live.com', 'productforums.google.com', 'ask',
       'google.co.id', 'google.co.uk', 'myactivity.google.com',
       'google.com.ua', 'google

In [6]:
X_train['keyword'] = X_train['keyword'].str.split('_', expand=True)[0]
X_test['keyword'] = X_test['keyword'].str.split('_', expand=True)[0]

X_train['keyword'].fillna(X_train['keyword'].mode()[0], inplace=True)
X_test['keyword'].fillna(X_test['keyword'].mode()[0], inplace=True)

In [6]:
X_train['referral_path'] = X_train['referral_path'].str.split('_', expand=True)[0]
X_test['referral_path'] = X_test['referral_path'].str.split('_', expand=True)[0]

X_train['referral_path'].fillna(X_train['referral_path'].mode()[0], inplace=True)
X_test['referral_path'].fillna(X_test['referral_path'].mode()[0], inplace=True)

In [7]:
# 불필요한 열 제거
X_train.drop(['keyword'], axis=1, inplace=True)
X_test.drop(['keyword'], axis=1, inplace=True)

In [8]:
# Category type으로 변경
categorical_features = ['browser', 'OS', 'device', 'new', 'bounced', 'continent', 'subcontinent',
                        'country', 'traffic_source', 'traffic_medium','referral_path']
                        
for i in categorical_features:
    X_train[i] = X_train[i].astype('category')
    X_test[i] = X_test[i].astype('category')

### 변수 제거하기

In [None]:
# ID열 제거하기
X_train.drop(['sessionID','userID'], axis=1, inplace=True)
X_test.drop(['sessionID','userID'], axis=1, inplace=True)

## HyperParameter Tunning

In [9]:
model = CatBoostRegressor(cat_features = categorical_features, random_state=2024, verbose=False)

## Kfold

In [10]:
KF = KFold(n_splits = 3, shuffle = True, random_state = 2024)

cat_pred = np.zeros(X_test.shape[0])
rmse_list = []
for tr_idx, val_idx in KF.split(X_train, y_train):
    tr_x, tr_y = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    val_x, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]
    
    model.fit(tr_x, tr_y)
    pred = model.predict(val_x)
    rmse = mean_squared_error(val_y, pred)**0.5
    rmse_list.append(rmse)
    
    sub_pred = np.array(model.predict(X_test)) / 3  # averaging
    cat_pred += sub_pred
print(f'{model.__class__.__name__}의 10fold 평균 RMSE는 {np.mean(rmse_list)}')

CatBoostRegressor의 10fold 평균 RMSE는 2.7332039875781162


In [11]:
# 0보다 작은 값을 0으로 보정한다.
cat_pred = [0 if i < 0 else i for i in cat_pred]

In [12]:
# feature importance
model.get_feature_importance()

array([ 1.94840673,  3.35521277,  1.29113915,  4.30298655, 41.66174928,
       28.25875626,  0.40794133,  0.66779683,  1.82488301,  2.77130112,
        6.87719499,  1.21923291,  1.5348814 ,  2.56468709,  1.31383057])

## Save data

In [13]:
sample = pd.read_csv('data/submission/sample_submission.csv')
sample['TARGET'] = cat_pred

In [14]:
date = str(pd.Timestamp.now())[:16].replace('-','').replace(' ','_').replace(':','')
filename = f'./data/submission/{date}'
sample.to_csv(f'{filename}.csv', index=False)
print(f'{filename} is saved.')

./data/submission/20240221_0002 is saved.
