## Import

In [52]:
import pandas as pd
import numpy as np ; np.random.seed(2024)
import warnings;warnings.filterwarnings(action='ignore')

# Feature Engineering
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder

# Modeling
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor, BayesianRidge
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

## Read data

In [53]:
# Load the train / test splits and report the shape of each frame.
train = pd.read_csv('data/stayed_train.csv')
test = pd.read_csv('data/stayed_test.csv')
for caption, frame in (('학습데이터 수:', train), ('평가데이터 수:', test)):
    print(caption, frame.shape)

학습데이터 수: (127739, 20)
평가데이터 수: (40148, 19)


In [54]:
# Data dictionary. The bare string literal below (Korean) is kept as written;
# English summary of its contents:
# - sessionID           : session ID
# - userID              : user ID
# - TARGET              : total number of views that occurred in the session
# - browser             : browser used
# - OS                  : operating system of the device used
# - device              : device used
# - new                 : first-visit flag (0: not a first visit, 1: first visit)
# - quality             : session quality (measured against transaction completion, range 1~100)
# - duration            : total session time (seconds)
# - bounced             : bounce flag (0: did not bounce, 1: bounced)
# - transaction         : number of transactions that occurred in the session
# - transaction_revenue : total transaction revenue
# - continent           : continent where the session occurred
# - subcontinent        : sub-continent where the session occurred
# - country             : country where the session occurred
# - traffic_source      : source the traffic came from
# - traffic_medium      : medium of the traffic source
# - keyword             : keyword of the traffic source, usually set when traffic_medium is organic or cpc
# - referral_path       : path that is set when traffic_medium is referral
# NOTE(review): the recorded preview shows keyword1/keyword2 and referral_path1/referral_path2
# columns rather than single keyword/referral_path columns.
'''
데이터 설명
- sessionID : 세션 ID
- userID : 사용자 ID
- TARGET : 세션에서 발생한 총 조회수
- browser : 사용된 브라우저
- OS : 사용된 기기의 운영체제
- device : 사용된 기기
- new : 첫 방문 여부 (0: 첫 방문 아님, 1: 첫 방문)
- quality : 세션의 질 (거래 성사를 기준으로 측정된 값, 범위: 1~100)
- duration : 총 세션 시간 (단위: 초)
- bounced : 이탈 여부 (0: 이탈하지 않음, 1: 이탈함)
- transaction : 세션 내에서 발생의 거래의 수
- transaction_revenue : 총 거래 수익
- continent : 세션이 발생한 대륙
- subcontinent : 세션이 발생한 하위 대륙
- country : 세션이 발생한 국가
- traffic_source : 트래픽이 발생한 소스
- traffic_medium : 트래픽 소스의 매체
- keyword : 트래픽 소스의 키워드, 일반적으로 traffic_medium이 organic, cpc인 경우에 설정
- referral_path : traffic_medium이 referral인 경우 설정되는 경로
'''
# Preview the first rows of the training frame.
display(train.head())

Unnamed: 0,sessionID,userID,TARGET,browser,OS,device,new,quality,duration,transaction,transaction_revenue,continent,subcontinent,country,traffic_source,traffic_medium,keyword1,keyword2,referral_path1,referral_path2
0,SESSION_000000,USER_000000,17.0,Chrome,Macintosh,desktop,0,45.0,839.0,0.0,0.0,Americas,Northern America,United States,google,organic,Category8,(not set),(not set),(not set)
1,SESSION_000001,USER_000001,3.0,Chrome,Windows,desktop,1,1.0,39.0,0.0,0.0,Europe,Western Europe,Germany,google,organic,Category8,(not set),(not set),(not set)
2,SESSION_000007,USER_000007,5.0,Chrome,Macintosh,desktop,1,1.0,64.0,0.0,0.0,Europe,Western Europe,Germany,google,organic,Category8,(not set),(not set),(not set)
3,SESSION_000008,USER_000008,5.0,Firefox,Linux,desktop,1,1.0,60.0,0.0,0.0,Americas,South America,Brazil,youtube.com,referral,(not set),(not set),Category5,0002
4,SESSION_000009,USER_000009,3.0,Chrome,Macintosh,desktop,1,2.0,579.0,0.0,0.0,Americas,Northern America,United States,google,ad,Category1,000,(not set),(not set)


## <font color='forestgreen'> Data Cleansing</font>

In [55]:
# Browser values are mixed with device information, which produces many distinct
# categories; regroup them based on what appears in the training data.
# "(not set)", which is regarded as a missing value, is grouped with the unclear browsers.
unclear_browser = ["Mozilla Compatible Agent", "MRCHROME", "+Simple Browser",
                   'SeaMonkey', 'osee2unifiedRelease', 'YE', 'Browser', 'starmaker', '(not set)']


def _group_browser(name):
    """Map a raw browser string to its family, or to 'BROWSER' when it is unclear."""
    # Substring checks run in this fixed order: Safari, then Opera, then Amazon.
    for family in ('Safari', 'Opera', 'Amazon'):
        if family in name:
            return family
    if name in unclear_browser:
        return 'BROWSER'
    return name


train['browser'] = train['browser'].apply(_group_browser)
test['browser'] = test['browser'].apply(_group_browser)

In [56]:
# OS values are regrouped for the same reason as browser (noisy, overly fine categories).
def _group_os(name):
    """Map a raw OS string to 'Windows', 'Nintendo', the catch-all 'OS', or itself."""
    if 'Windows' in name:
        return 'Windows'
    if 'Nintendo' in name:
        return 'Nintendo'
    if name in ('OS/2', '(not set)'):
        return 'OS'
    return name


train['OS'] = train['OS'].apply(_group_os)
test['OS'] = test['OS'].apply(_group_os)

In [57]:
# Collapse archipelago / island regions into "Islands" and derive a direction feature.
island = ['Caribbean', 'Micronesian Region', 'Polynesia', 'Melanesia']

for frame in (train, test):
    region = frame['subcontinent']
    frame['subcontinent'] = region.mask(region.isin(island), 'Islands')

# In continent, subcontinent, country and direction, missing values are written as "(not set)".
# direction is the first space-separated token of subcontinent; "(not set)" splits
# into "(not", which is restored to the full placeholder.
for frame in (train, test):
    first_token = frame['subcontinent'].str.split(' ', expand=True)[0]
    frame['direction'] = first_token.replace('(not', '(not set)')

In [58]:
# Rebuild traffic_source with the domain suffix removed.
# "google" sources mix analytics, mail, groups, adwords & ads, optimize, docs, earth, etc.,
# so anything containing "google" is folded into a single "google" value.

# Literal ".com" / ".net" / ".org". The dot is escaped so it no longer matches an arbitrary
# character, and regex=True is passed explicitly because the default of
# Series.str.replace differs between pandas versions.
_DOMAIN_SUFFIX = r'\.(?:com|net|org)'

# Sites whose sub-domains (".yahoo", ".facebook", ...) are folded into the bare site name.
# The order matches the original if/else chain.
_SUBDOMAIN_SITES = ('yahoo', 'facebook', 'youtube', 'pinterest', 'edu', 'ask', 'reddit')


def _group_traffic_source(source):
    """Map a suffix-stripped traffic source to a coarse site name."""
    for site in _SUBDOMAIN_SITES:
        if f'.{site}' in source:
            return site
    return 'google' if 'google' in source else source


train['traffic_source'] = (train['traffic_source']
                           .str.replace(_DOMAIN_SUFFIX, '', regex=True)
                           .apply(_group_traffic_source))

test['traffic_source'] = (test['traffic_source']
                          .str.replace(_DOMAIN_SUFFIX, '', regex=True)
                          .apply(_group_traffic_source))

In [59]:
# Derive referral_path3 by regrouping referral_path1 categories into named sources.
youtube = ['Category2', 'Category4', 'Category5', 'Category7', 'Category8', 'Category13']


def _group_referral(path):
    """Map a referral_path1 value to 'youtube', 'google', '(direct)', or itself."""
    if path in youtube:
        return 'youtube'
    if path == "Category9":
        return 'google'
    if path == "Category10":
        return '(direct)'
    return path


train['referral_path3'] = train['referral_path1'].apply(_group_referral)

test['referral_path3'] = test['referral_path1'].apply(_group_referral)

## <font color='forestgreen'> Feature Engineering</font>

### Feature Generation

In [60]:
# Generated feature for the stayed data: QD = quality x duration.
for frame in (train, test):
    frame['QD'] = frame['quality'] * frame['duration']

In [61]:
# Generated feature for the stayed data: QN = quality x new (first-visit flag).
for frame in (train, test):
    frame['QN'] = frame['quality'] * frame['new']

In [62]:
# Generated feature for the stayed data: DN = duration x new (first-visit flag).
for frame in (train, test):
    frame['DN'] = frame['duration'] * frame['new']

### Encoding
- traffic_source, traffic_medium을 세트로 labeling
- os, device

In [63]:
# Replace every remaining missing value with the "(not set)" placeholder, in place.
for frame in (train, test):
    frame.fillna('(not set)', inplace=True)

In [64]:
# Frequency-encode keyword1 and referral_path1: each category is replaced by the
# number of training rows that carry it (group sizes are computed on train only).
replace_count = ['keyword','referral_path']

for i in replace_count:
    count = train.groupby(f'{i}1')[f'{i}2'].size()
    train[f'{i}1'] = train[f'{i}1'].map(count)
    # Categories that never occur in train map to NaN. Fall back to the (truncated)
    # mean count and assign the result back; the original called fillna() without
    # keeping its return value, so the NaNs were never actually filled.
    # The cast keeps the same integer dtype as the train column; these columns are
    # later passed to CatBoost as cat_features.
    test[f'{i}1'] = (test[f'{i}1']
                     .map(count)
                     .fillna(int(count.mean()))
                     .astype('int64'))

In [65]:
# Target encoding: each category is replaced by the mean TARGET of that category,
# with the means computed from the training data only.
# (keyword, referral_path)
# NOTE(review): the means use every training row and are computed before the later
# train/validation split, so hold-out scores downstream may be optimistic.
target = ['browser', 'device', 'traffic_source', 'traffic_medium','direction','keyword2', 'referral_path2', 'referral_path3']

for i in target:
    rate = train.groupby(i)['TARGET'].mean()
    train[i] = train[i].map(rate)
    # Categories unseen in train become NaN after map(); fill them with the mean of the
    # encoded train column. The result is assigned back instead of using a chained
    # `test[i].fillna(..., inplace=True)`, which is not guaranteed to modify `test`
    # under pandas copy-on-write.
    test[i] = test[i].map(rate).fillna(train[i].mean())

In [66]:
# Label-encode the remaining categorical columns with an encoder fitted on train.
str_col = ['continent', 'new', 'OS', 'subcontinent', 'country']
for col in str_col:
    encoder = LabelEncoder().fit(train[col])
    train[col] = encoder.transform(train[col])

    # Labels that only occur in test are appended to the fitted classes so that
    # transform() does not reject them; they receive codes after the train labels.
    for label in np.unique(test[col]):
        if label in encoder.classes_:
            continue
        encoder.classes_ = np.append(encoder.classes_, label)
    test[col] = encoder.transform(test[col])

### Feature Transformation

In [67]:
# Try a square-root transformation of duration.
for frame in (train, test):
    frame['duration'] = frame['duration'].pow(0.5)

In [68]:
# Try a log transformation of quality.
for frame in (train, test):
    frame['quality'] = np.log(frame['quality'])

In [47]:
# Min-max scale the numeric columns; the scaler is fitted on train and reused for test.
# NOTE(review): this cell's execution count (In [47]) is lower than its neighbours',
# and the recorded train.head() output further down shows unscaled values, so it does
# not appear to have been run in the recorded session.
numeric = ['quality', 'duration', 'transaction', 'transaction_revenue', 'QD', 'QN','DN']

scaler = MinMaxScaler().fit(train[numeric])
for frame in (train, test):
    frame[numeric] = scaler.transform(frame[numeric])

### Feature Selection

In [69]:
# Keep the test session IDs for the submission file, then remove the ID column.
# An explicit copy is taken because a TARGET column is assigned to pred_test later;
# assigning to a bare column slice triggers pandas' SettingWithCopyWarning.
pred_test = test[['sessionID']].copy()

# Drop the sessionID column from both frames (in place).
for frame in (train, test):
    frame.drop(['sessionID'], axis=1, inplace=True)

In [70]:
# Preview the engineered training frame.
train.head(5)

Unnamed: 0,userID,TARGET,browser,OS,device,new,quality,duration,transaction,transaction_revenue,...,traffic_medium,keyword1,keyword2,referral_path1,referral_path2,direction,referral_path3,QD,QN,DN
0,USER_000000,17.0,6.577421,6,6.480987,0,3.806662,28.965497,0.0,0.0,...,6.52449,53551,6.095371,79590,6.681714,6.576706,6.469242,37755.0,0.0,0.0
1,USER_000001,3.0,6.577421,12,6.480987,1,0.0,6.244998,0.0,0.0,...,6.52449,53551,6.095371,79590,6.681714,5.127151,6.469242,39.0,1.0,39.0
2,USER_000007,5.0,6.577421,6,6.480987,1,0.0,8.0,0.0,0.0,...,6.52449,53551,6.095371,79590,6.681714,5.127151,6.469242,64.0,1.0,64.0
3,USER_000008,5.0,5.848352,5,6.480987,1,0.0,7.745967,0.0,0.0,...,4.45094,70397,6.095371,1985,3.303922,5.808008,3.780194,60.0,1.0,60.0
4,USER_000009,3.0,6.577421,6,6.480987,1,0.693147,24.062419,0.0,0.0,...,6.598886,906,7.883721,79590,6.681714,6.576706,6.469242,1158.0,2.0,579.0


In [78]:
# Hold-out evaluation of LightGBM (70/30 split) with 'transaction' also excluded.
unuse = ['userID','new','keyword2','transaction']

y_train = train['TARGET'].astype('i')
X_train = train.drop(['TARGET']+unuse, axis=1)
X_test = test.drop(unuse, axis=1)
tr_x, val_x, tr_y, val_y = train_test_split(X_train, y_train, test_size=0.3, random_state=2024)

# NOTE(review): eval_metric is passed to the constructor here; confirm against the
# LightGBM scikit-learn API whether it has any effect outside fit().
model = LGBMRegressor(n_estimators=600,  num_leaves=48,learning_rate=0.01,
                          eval_metric='RMSE',
                          random_state=2024)
model.fit(tr_x, tr_y)

# Report RMSE on the fitting split first, then on the held-out split.
for caption, features, actual in (('RMSE(train):', tr_x, tr_y), ('RMSE:', val_x, val_y)):
    print(caption, mean_squared_error(model.predict(features), actual)**0.5)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1755
[LightGBM] [Info] Number of data points in the train set: 89417, number of used features: 19
[LightGBM] [Info] Start training from score 6.105550
RMSE(train): 3.262890530864312
RMSE: 3.6983442479720083


In [93]:
# Hold-out evaluation of CatBoost (70/30 split, Poisson objective).
unuse = ['userID','new','keyword2']

y_train = train['TARGET'].astype('i')
X_train = train.drop(['TARGET']+unuse, axis=1)
X_test = test.drop(unuse, axis=1)
tr_x, val_x, tr_y, val_y = train_test_split(X_train, y_train, test_size=0.3, random_state=2024)

# Columns handed to CatBoost as categorical features.
model = CatBoostRegressor(iterations=500, cat_features = ['continent', 'OS', 'subcontinent', 'country','keyword1','referral_path1'], 
                          objective='Poisson', eval_metric='RMSE',
                          random_state=2024, verbose=False)
model.fit(tr_x, tr_y)

# Report RMSE on the fitting split first, then on the held-out split.
for caption, features, actual in (('RMSE(train):', tr_x, tr_y), ('RMSE:', val_x, val_y)):
    print(caption, mean_squared_error(model.predict(features), actual)**0.5)

RMSE(train): 3.3187519346696193
RMSE: 3.6986404761724585


In [79]:
# Using CatBoost as the base model, features with low importance are candidates for removal.
# NOTE(review): this cell ran as In [79], right after the LightGBM cell In [78], and the
# recorded importances are integers, so `model` was probably LightGBM at that point.
importance = pd.DataFrame(dict(feature=X_train.columns,
                               importance=model.feature_importances_))
importance.sort_values('importance')

Unnamed: 0,feature,importance
12,referral_path1,329
9,traffic_source,567
10,traffic_medium,664
6,continent,717
14,direction,727
0,browser,796
2,device,865
15,referral_path3,915
11,keyword1,949
5,transaction_revenue,1049


In [94]:
# Using CatBoost as the base model, features with low importance are candidates for removal.
# The table is sorted ascending, so the least important features come first.
importance = pd.DataFrame(dict(feature=X_train.columns,
                               importance=model.feature_importances_))
importance.sort_values('importance')

Unnamed: 0,feature,importance
5,transaction,0.068875
6,transaction_revenue,0.167912
9,country,0.219844
16,referral_path3,0.532891
2,device,0.778378
12,keyword1,0.79276
11,traffic_medium,0.817248
0,browser,0.941492
10,traffic_source,0.960373
13,referral_path1,1.09101


## <font color='forestgreen'> Hyperparameter Tuning</font>

In [80]:
# Refit LightGBM on the full training set with the hold-out configuration.
# NOTE(review): `unuse` is whatever list an earlier cell assigned last.
y_train = train['TARGET'].astype('i')
X_train = train.drop(['TARGET']+unuse, axis=1)
X_test = test.drop(unuse, axis=1)

model = LGBMRegressor(n_estimators=600,  num_leaves=48,learning_rate=0.01,
                          eval_metric='RMSE',
                          random_state=2024)
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002968 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1769
[LightGBM] [Info] Number of data points in the train set: 127739, number of used features: 19
[LightGBM] [Info] Start training from score 6.116260


LGBMRegressor(eval_metric='RMSE', learning_rate=0.01, n_estimators=600,
              num_leaves=48, random_state=2024)

In [99]:
# Refit CatBoost on the full training set (1000 iterations, default objective).
# NOTE(review): `unuse` is whatever list an earlier cell assigned last.
y_train = train['TARGET'].astype('i')
X_train = train.drop(['TARGET']+unuse, axis=1)
X_test = test.drop(unuse, axis=1)

model = CatBoostRegressor(iterations=1000, 
                          cat_features = ['continent', 'OS', 'subcontinent', 'country','keyword1','referral_path1'], 
                          eval_metric='RMSE',
                          random_state=2024, verbose=False)
model.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x198a1f8fd90>

In [100]:
# Attach the current model's test-set predictions to the saved session IDs.
pred_test = pred_test.assign(TARGET=model.predict(X_test))

## <font color='forestgreen'> Modeling</font>
- Hyperparameter tuning은 수동으로 한다.

In [None]:
# NOTE(review): placeholder cell, never executed (In [None]). `lgbm` is not defined in the
# visible notebook, so running it raises NameError; presumably reserved for the LightGBM model.
lgbm

In [None]:
# NOTE(review): placeholder cell, never executed (In [None]). `cat` is not defined in the
# visible notebook, so running it raises NameError; presumably reserved for the CatBoost model.
cat

In [None]:
# NOTE(review): placeholder cell, never executed (In [None]). `knn` is not defined in the
# visible notebook, so running it raises NameError; presumably reserved for a KNeighborsRegressor.
knn

In [None]:
# NOTE(review): placeholder cell, never executed (In [None]). `dt` is not defined in the
# visible notebook, so running it raises NameError; presumably reserved for a DecisionTreeRegressor.
dt

In [None]:
# NOTE(review): placeholder cell, never executed (In [None]). `rf` is not defined in the
# visible notebook, so running it raises NameError; presumably reserved for a RandomForestRegressor.
rf

In [None]:
# NOTE(review): placeholder cell, never executed (In [None]). `br` is not defined in the
# visible notebook, so running it raises NameError; presumably reserved for a BayesianRidge model.
br

In [None]:
# NOTE(review): placeholder cell, never executed (In [None]). `sgd` is not defined in the
# visible notebook, so running it raises NameError; presumably reserved for an SGDRegressor.
sgd

#### Ensemble

In [None]:
# Ensemble of three base regressors combined through VotingRegressor.
# NOTE(review): r1, r2 and r3 are not defined above this cell in the visible notebook
# (they are only assigned in a scratch cell further down); define them before running.
model = VotingRegressor([('lr', r1), ('rf', r2), ('r3', r3)])

# Fit on the training split only. X_train contains the val_x rows, so fitting on it
# would leak the validation data and make the "RMSE" lines below meaningless.
model.fit(tr_x, tr_y)
print('RMSE(train):',mean_squared_error(model.predict(tr_x), tr_y)**0.5)
print('RMSE:', mean_squared_error(model.predict(val_x), val_y)**0.5)
# Same validation score after rounding predictions to whole numbers.
print('(round)RMSE:', mean_squared_error(model.predict(val_x).round(), val_y)**0.5)

In [None]:
# Ensemble manually by varying the blend ratios directly.

In [None]:
# Scratch example of VotingRegressor on a toy data set (it appears to be pasted from
# the scikit-learn documentation). The interpreter prompts and the pasted output line
# were removed so the cell is valid Python, and LinearRegression is reached through
# the `linear_model` module imported here because the bare name is not imported.
from sklearn import linear_model

clf = linear_model.BayesianRidge()
r1 = linear_model.LinearRegression()
r2 = RandomForestRegressor(n_estimators=10, random_state=1)
r3 = KNeighborsRegressor()

# Toy data: the second feature is the square of the first.
X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]])
y = np.array([2, 6, 12, 20, 30, 42])

er = VotingRegressor([('lr', r1), ('rf', r2), ('r3', r3)])
print(er.fit(X, y).predict(X))
# Output recorded with the pasted example:
# [ 6.8...  8.4... 12.5... 17.8... 26...  34...]

## <font color='forestgreen'> Save data</font>

In [98]:
# Align the predictions to the sample-submission session order.
# Sessions with no row in pred_test get TARGET = 1 through fillna(1).
submission_ids = pd.read_csv('data/submission/sample_submission.csv')[['sessionID']]
sample = pd.merge(submission_ids, pred_test, how='left', on='sessionID').fillna(1)
sample

Unnamed: 0,sessionID,TARGET
0,SESSION_252289,31.367976
1,SESSION_252290,1.000000
2,SESSION_252291,2.572025
3,SESSION_252292,4.389687
4,SESSION_252293,12.018117
...,...,...
79781,SESSION_332070,1.000000
79782,SESSION_332071,1.000000
79783,SESSION_332072,2.797769
79784,SESSION_332073,4.681385


In [101]:
# Rebuild the submission frame from the latest predictions in pred_test.
# Sessions with no row in pred_test get TARGET = 1 through fillna(1).
submission_ids = pd.read_csv('data/submission/sample_submission.csv')[['sessionID']]
sample = pd.merge(submission_ids, pred_test, how='left', on='sessionID').fillna(1)
sample

Unnamed: 0,sessionID,TARGET
0,SESSION_252289,30.283483
1,SESSION_252290,1.000000
2,SESSION_252291,2.379057
3,SESSION_252292,4.307487
4,SESSION_252293,12.640332
...,...,...
79781,SESSION_332070,1.000000
79782,SESSION_332071,1.000000
79783,SESSION_332072,2.800221
79784,SESSION_332073,4.731131


In [83]:
# LGBM
date = str(pd.Timestamp.now())[:16].replace('-','').replace(' ','_').replace(':','')
filename = f'./data/submission/{date}'
sample.to_csv(f'{filename}.csv', index=False)
print(f'{filename} is saved.')

./data/submission/20240302_2149 is saved.


In [102]:
# Cat
date = str(pd.Timestamp.now())[:16].replace('-','').replace(' ','_').replace(':','')
filename = f'./data/submission/{date}'
sample.to_csv(f'{filename}.csv', index=False)
print(f'{filename} is saved.')

./data/submission/20240302_2157 is saved.
