## Import

In [1]:
import pandas as pd
import numpy as np ; np.random.seed(2024)
import matplotlib.pyplot as plt
import seaborn as sns
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

from sklearn.preprocessing import MinMaxScaler
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

## Read data

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
print('학습데이터 수:', train.shape)
print('평가데이터 수:', test.shape)

학습데이터 수: (252289, 19)
평가데이터 수: (79786, 18)


In [3]:
'''
데이터 설명
- sessionID : 세션 ID
- userID : 사용자 ID
- TARGET : 세션에서 발생한 총 조회수
- browser : 사용된 브라우저
- OS : 사용된 기기의 운영체제
- device : 사용된 기기
- new : 첫 방문 여부 (0: 첫 방문 아님, 1: 첫 방문)
- quality : 세션의 질 (거래 성사를 기준으로 측정된 값, 범위: 1~100)
- duration : 총 세션 시간 (단위: 초)
- bounced : 이탈 여부 (0: 이탈하지 않음, 1: 이탈함)
- transaction : 세션 내에서 발생의 거래의 수
- transaction_revenue : 총 거래 수익
- continent : 세션이 발생한 대륙
- subcontinent : 세션이 발생한 하위 대륙
- country : 세션이 발생한 국가
- traffic_source : 트래픽이 발생한 소스
- traffic_medium : 트래픽 소스의 매체
- keyword : 트래픽 소스의 키워드, 일반적으로 traffic_medium이 organic, cpc인 경우에 설정
- referral_path : traffic_medium이 referral인 경우 설정되는 경로
'''
display(train.head())

Unnamed: 0,sessionID,userID,TARGET,browser,OS,device,new,quality,duration,bounced,transaction,transaction_revenue,continent,subcontinent,country,traffic_source,traffic_medium,keyword,referral_path
0,SESSION_000000,USER_000000,17.0,Chrome,Macintosh,desktop,0,45.0,839.0,0,0.0,0.0,Americas,Northern America,United States,google,organic,Category8,
1,SESSION_000001,USER_000001,3.0,Chrome,Windows,desktop,1,1.0,39.0,0,0.0,0.0,Europe,Western Europe,Germany,google,organic,Category8,
2,SESSION_000002,USER_000002,1.0,Samsung Internet,Android,mobile,1,1.0,0.0,1,0.0,0.0,Asia,Southeast Asia,Malaysia,(direct),(none),,
3,SESSION_000003,USER_000003,1.0,Chrome,Macintosh,desktop,1,1.0,0.0,1,0.0,0.0,Americas,Northern America,United States,Partners,affiliate,,
4,SESSION_000004,USER_000004,1.0,Chrome,iOS,mobile,0,1.0,0.0,1,0.0,0.0,Americas,Northern America,United States,groups.google.com,referral,,Category6_Path_0000


## <font color='forestgreen'>Data Cleansing

### 결측치
  - keyword, referral_path에는 traffic_medium값에 따른 결측치가 존재한다.
  - traffic_medium 외 OS, continent, subcontinent, country에도 (not set)으로 결측치가 존재한다.

In [4]:
train.isna().sum()

sessionID                   0
userID                      0
TARGET                      0
browser                     0
OS                          0
device                      0
new                         0
quality                     0
duration                    0
bounced                     0
transaction                 0
transaction_revenue         0
continent                   0
subcontinent                0
country                     0
traffic_source              0
traffic_medium              0
keyword                137675
referral_path          161107
dtype: int64

In [5]:
# keyword, referral_path의 값을 결정하는 traffic_medium의 빈도를 살펴보니 결측치로 판단할 수 있는 (none), (not set)이 있다.
train.traffic_medium.value_counts()

organic      107370
referral      70047
(none)        59022
cpc            9978
affiliate      5365
cpm             501
(not set)         6
Name: traffic_medium, dtype: int64

In [6]:
# (none)은 직접 트래픽이라 트래픽 매체가 없는 것이고 (not set)은 결측치로 봐야한다.
print('(none)의 트래픽 소스:', train.query('traffic_medium == "(none)"').traffic_source.unique())
print('(not set)의 트래픽 소스:', train.query('traffic_medium == "(not set)"').traffic_source.unique())

(none)의 트래픽 소스: ['(direct)']
(not set)의 트래픽 소스: ['Partners' 'google']


In [7]:
# (not set)으로 결측치가 표현된 다른 변수가 있는지 확인한 결과 OS, continent, subcontinent, country에서도 나타난다.
train.applymap(lambda x: x=="(not set)").sum()

sessionID                 0
userID                    0
TARGET                    0
browser                   0
OS                     2592
device                    0
new                       0
quality                   0
duration                  0
bounced                   0
transaction               0
transaction_revenue       0
continent               336
subcontinent            336
country                 336
traffic_source            0
traffic_medium            6
keyword                   0
referral_path             0
dtype: int64

In [8]:
# (not set)은 모두 결측치로 변경한다.
# # test 데이터의 경우 browser에도 결측치가 있다.
# # 대체할 수가 없음. 제거하거나 not set 그대로 두거나.
# train = train.replace('(not set)', np.nan)
# test = test.replace('(not set)', np.nan)

# display(train.isna().sum())
# display(test.isna().sum())

sessionID                   0
userID                      0
TARGET                      0
browser                     0
OS                       2592
device                      0
new                         0
quality                     0
duration                    0
bounced                     0
transaction                 0
transaction_revenue         0
continent                 336
subcontinent              336
country                   336
traffic_source              0
traffic_medium              6
keyword                137675
referral_path          161107
dtype: int64

sessionID                  0
userID                     0
browser                    2
OS                       939
device                     0
new                        0
quality                    0
duration                   0
bounced                    0
transaction                0
transaction_revenue        0
continent                 97
subcontinent              97
country                   97
traffic_source             0
traffic_medium             6
keyword                43070
referral_path          53891
dtype: int64

In [8]:
# traffic_medium 중 광고로 접속한 cpc, cpm은 ad(광고)로 변경한다.
train['traffic_medium'] = train['traffic_medium'].replace('cpc','ad').replace('cpm','ad')
test['traffic_medium'] = test['traffic_medium'].replace('cpc','ad').replace('cpm','ad')

# (not set)은 결측치로 학습데이터 내 트래픽 소스의 최빈값으로 채운다.
mode = train.groupby('traffic_source')['traffic_medium'].agg(lambda x: x.mode()[0])
ease = train.query('traffic_medium == "(not set)"')['traffic_source'].map(mode)
train.loc[ease.index, 'traffic_medium'] = ease.values

ease = test.query('traffic_medium == "(not set)"')['traffic_source'].map(mode)
test.loc[ease.index, 'traffic_medium'] = ease.values

### 이상치

In [None]:
# 주요 변수들간 pairplot을 그려본다.
plt.figure(figsize=(20,20))
sns.pairplot(train[['TARGET','new','quality','duration','bounced','transaction','transaction_revenue']], color='forestgreen')
plt.show()

In [None]:
# 위의 pairplot을 통해 bounced가 1이면 TARGET은 1, duration, transaction, transaction_revenue은 0임을 알아냈다.
train.query('bounced==1')[['TARGET','duration','transaction','transaction_revenue']].drop_duplicates()

In [9]:
# transaction의 최댓값, TARGET의 최댓값은 이상치로 보인다.
outlier = [train.transaction.sort_values().index[-1], train.TARGET.sort_values().index[-1]]
train.drop(outlier, inplace=True)
print('학습데이터 크기:', train.shape)

학습데이터 크기: (252287, 19)


### 변수 정리

In [10]:
# 불필요한 변수는 제거한다.
train.drop(['keyword','referral_path'], axis=1, inplace=True)
test.drop(['keyword','referral_path'], axis=1, inplace=True)

In [11]:
X_train['keyword'] = X_train['keyword'].str.split('_', expand=True)[0]
X_test['keyword'] = X_test['keyword'].str.split('_', expand=True)[0]

X_train['keyword'].fillna(X_train['keyword'].mode()[0], inplace=True)
X_test['keyword'].fillna(X_test['keyword'].mode()[0], inplace=True)

NameError: name 'X_train' is not defined

In [None]:
X_train['referral_path'] = X_train['referral_path'].str.split('_', expand=True)[0]
X_test['referral_path'] = X_test['referral_path'].str.split('_', expand=True)[0]

X_train['referral_path'].fillna(X_train['referral_path'].mode()[0], inplace=True)
X_test['referral_path'].fillna(X_test['referral_path'].mode()[0], inplace=True)

## <font color='forestgreen'> Feature Engineering

### Feature Generation

In [11]:
train['QD'] = train['quality'] / (train['duration']+1)
test['QD'] = test['quality'] / (test['duration']+1)

### Encoding

In [14]:
# 학습데이터에 한하여 Target encoding 한다.
category = ['device', 'traffic_source', 'traffic_medium']
                        
for i in category:
    rate = train.groupby(i)['TARGET'].mean()
    train[i] = train[i].map(rate)
    test[i] = test[i].map(rate)
    test[i].fillna(train[i].mean(), inplace=True)

### Feature Transformation

In [15]:
numeric = ['quality', 'duration','transaction', 'transaction_revenue', 'QD', 'TQ', 'rate_revenue']

scaler = MinMaxScaler()
train[numeric] = scaler.fit_transform(train[numeric])
test[numeric] = scaler.transform(test[numeric])

### Feature Selection

In [16]:
# ID열을 제거한다.
train.drop(['sessionID'], axis=1, inplace=True)
test.drop(['sessionID'], axis=1, inplace=True)

In [43]:
unuse = ["TQ","rate_revenue",'transaction','continent']

X_train, y_train = train.drop(['TARGET']+unuse, axis=1), train['TARGET'].astype('i')
X_test = test.drop(unuse, axis=1)

In [None]:
t

In [44]:
# 2.5614

tr_x, val_x, tr_y, val_y = train_test_split(X_train, y_train, test_size=0.3, random_state=2024)

model = CatBoostRegressor(iterations=500, cat_features = ['userID','new', 'bounced','browser', 'OS','continent', 'subcontinent', 'country'], 
                          objective='Poisson', eval_metric='RMSE',
                          random_state=2024, verbose=False)
model.fit(tr_x, tr_y)
print('RMSE:', mean_squared_error(model.predict(val_x), val_y)**0.5)

ValueError: 'continent' is not in list

In [None]:
# Catboost를 base로 삼고 importance가 낮은 feature는 제거한다.
importance = pd.DataFrame({'feature': model.feature_names_,
                           'importance':model.feature_importances_})
importance

In [41]:
ease = pd.concat([test, pd.Series(model.predict(X_test), index=test.index)], axis=1)

In [42]:
ease.query('bounced==1').iloc[:,-1].round().value_counts()

1.0    39633
2.0        5
Name: 0, dtype: int64

## <font color='forestgreen'> HyperParameter Tunning

## <font color='forestgreen'> Predict & Save data

In [None]:
KF = KFold(n_splits = 4, shuffle = True, random_state = 2024)

cat_pred = np.zeros(X_test.shape[0])
rmse_list = []
for tr_idx, val_idx in KF.split(X_train, y_train):
    tr_x, tr_y = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    val_x, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]
    
    model.fit(tr_x, tr_y)
    pred = model.predict(val_x)
    rmse = mean_squared_error(val_y, pred)**0.5
    rmse_list.append(rmse)
    
    sub_pred = np.array(model.predict(X_test)) / 4  # averaging
    cat_pred += sub_pred
print(f'{model.__class__.__name__}의 4 fold 평균 RMSE는 {np.mean(rmse_list)}')

In [None]:
sample = pd.read_csv('data/submission/sample_submission.csv')
sample['TARGET'] = [0 if i < 0 else i for i in pred]

In [None]:
date = str(pd.Timestamp.now())[:16].replace('-','').replace(' ','_').replace(':','')
filename = f'./data/submission/{date}'
sample.to_csv(f'{filename}.csv', index=False)
print(f'{filename} is saved.')