## Import

In [1]:
import pandas as pd
import numpy as np ; np.random.seed(2024)
import warnings;warnings.filterwarnings(action='ignore')

# Feature Engineering
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder

# Modeling
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor, BayesianRidge
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

## Read data

In [2]:
train = pd.read_csv('data/stayed_train.csv')
test = pd.read_csv('data/stayed_test.csv')
print('학습데이터 수:', train.shape)
print('평가데이터 수:', test.shape)

학습데이터 수: (127739, 20)
평가데이터 수: (40148, 19)


In [3]:
'''
데이터 설명
- sessionID : 세션 ID
- userID : 사용자 ID
- TARGET : 세션에서 발생한 총 조회수
- browser : 사용된 브라우저
- OS : 사용된 기기의 운영체제
- device : 사용된 기기
- new : 첫 방문 여부 (0: 첫 방문 아님, 1: 첫 방문)
- quality : 세션의 질 (거래 성사를 기준으로 측정된 값, 범위: 1~100)
- duration : 총 세션 시간 (단위: 초)
- bounced : 이탈 여부 (0: 이탈하지 않음, 1: 이탈함)
- transaction : 세션 내에서 발생의 거래의 수
- transaction_revenue : 총 거래 수익
- continent : 세션이 발생한 대륙
- subcontinent : 세션이 발생한 하위 대륙
- country : 세션이 발생한 국가
- traffic_source : 트래픽이 발생한 소스
- traffic_medium : 트래픽 소스의 매체
- keyword : 트래픽 소스의 키워드, 일반적으로 traffic_medium이 organic, cpc인 경우에 설정
- referral_path : traffic_medium이 referral인 경우 설정되는 경로
'''
display(train.head())

Unnamed: 0,sessionID,userID,TARGET,browser,OS,device,new,quality,duration,transaction,transaction_revenue,continent,subcontinent,country,traffic_source,traffic_medium,keyword1,keyword2,referral_path1,referral_path2
0,SESSION_000000,USER_000000,17.0,Chrome,Macintosh,desktop,0,45.0,839.0,0.0,0.0,Americas,Northern America,United States,google,organic,Category8,(not set),(not set),(not set)
1,SESSION_000001,USER_000001,3.0,Chrome,Windows,desktop,1,1.0,39.0,0.0,0.0,Europe,Western Europe,Germany,google,organic,Category8,(not set),(not set),(not set)
2,SESSION_000007,USER_000007,5.0,Chrome,Macintosh,desktop,1,1.0,64.0,0.0,0.0,Europe,Western Europe,Germany,google,organic,Category8,(not set),(not set),(not set)
3,SESSION_000008,USER_000008,5.0,Firefox,Linux,desktop,1,1.0,60.0,0.0,0.0,Americas,South America,Brazil,youtube.com,referral,(not set),(not set),Category5,0002
4,SESSION_000009,USER_000009,3.0,Chrome,Macintosh,desktop,1,2.0,579.0,0.0,0.0,Americas,Northern America,United States,google,ad,Category1,000,(not set),(not set)


## <font color='forestgreen'> Data Cleansing

In [4]:
# browser 내 device 정보가 섞여 범주값이 다양하다. 학습데이터를 기반으로 재구성한다.
# 결측치로 생각하는 (not set)도 포함한다.
unclear_browser = ["Mozilla Compatible Agent", "MRCHROME", "+Simple Browser",
                   'SeaMonkey', 'osee2unifiedRelease', 'YE', 'Browser', 'starmaker', '(not set)']

train['browser'] = train['browser'].apply(lambda x: 'Safari' if 'Safari' in x else
                                                     ('Opera' if 'Opera' in x else
                                                     ('Amazon' if 'Amazon' in x else
                                                     ('BROWSER' if x in unclear_browser else x))))
test['browser'] = test['browser'].apply(lambda x: 'Safari' if 'Safari' in x else
                                                     ('Opera' if 'Opera' in x else
                                                     ('Amazon' if 'Amazon' in x else
                                                     ('BROWSER' if x in unclear_browser else x))))

In [5]:
# 같은 이유로 OS도 재구성한다.
train['OS'] = train['OS'].apply(lambda x: 'Windows' if 'Windows' in x else
                                           ('Nintendo' if 'Nintendo' in x else                                
                                           ('OS' if x in ['OS/2','(not set)'] else x)))
test['OS'] = test['OS'].apply(lambda x: 'Windows' if 'Windows' in x else
                                           ('Nintendo' if 'Nintendo' in x else                                
                                           ('OS' if x in ['OS/2','(not set)'] else x)))

In [6]:
# 제도, 섬을 Islands로 변경하고 방향을 의미하는 변수를 생성한다.
island = ['Caribbean', 'Micronesian Region', 'Polynesia', 'Melanesia']
train['subcontinent'] = train['subcontinent'].apply(lambda x: 'Islands' if x in island else x)
test['subcontinent'] = test['subcontinent'].apply(lambda x: 'Islands' if x in island else x)

# continent, subcontinent, country, direction에서 결측치는 (not set)으로 나타낸다.
train['direction'] = train['subcontinent'].str.split(' ', expand=True)[0].replace('(not', '(not set)')
test['direction'] = test['subcontinent'].str.split(' ', expand=True)[0].replace('(not', '(not set)')

In [7]:
# traffic_source를 지역 도메인을 제외해 재구성한다.
# google엔 analytics, mail, groups, adwords & ads, optimize, docs, earth 등이 섞여있다.
train['traffic_source'] = train.traffic_source.str.replace('.com|.net|.org', '')\
                          .apply(lambda x: 'yahoo' if '.yahoo' in x else
                                            ('facebook' if '.facebook' in x else
                                            ('youtube' if '.youtube' in x else
                                            ('pinterest' if '.pinterest' in x else
                                            ('edu' if '.edu' in x else
                                            ('ask' if '.ask' in x else
                                            ('reddit' if '.reddit' in x else
                                            ('google' if 'google' in x else x))))))))

test['traffic_source'] = test.traffic_source.str.replace('.com|.net|.org', '')\
                         .apply(lambda x: 'yahoo' if '.yahoo' in x else
                                            ('facebook' if '.facebook' in x else
                                            ('youtube' if '.youtube' in x else
                                            ('pinterest' if '.pinterest' in x else
                                            ('edu' if '.edu' in x else
                                            ('ask' if '.ask' in x else
                                            ('reddit' if '.reddit' in x else
                                            ('google' if 'google' in x else x))))))))

In [8]:
youtube = ['Category2', 'Category4', 'Category5', 'Category7', 'Category8', 'Category13']
train['referral_path3'] = train['referral_path1'].apply(lambda x: 'youtube' if x in youtube else
                                                                 ('google' if x == "Category9" else
                                                                 ('(direct)' if x == "Category10" else x)))

test['referral_path3'] = test['referral_path1'].apply(lambda x: 'youtube' if x in youtube else
                                                                 ('google' if x == "Category9" else
                                                                 ('(direct)' if x == "Category10" else x)))

## <font color='forestgreen'> Feature Engineering

### Feature Generation

In [9]:
# stayed 데이터는 변수를 생성한다.
train['QD'] = train['quality'] * train['duration']
test['QD'] = test['quality'] * test['duration']

In [10]:
# stayed 데이터는 변수를 생성한다.
train['QN'] = train['quality'] * train['new']
test['QN'] = test['quality'] * test['new']

In [11]:
# stayed 데이터는 변수를 생성한다.
train['DN'] = train['duration'] * train['new']
test['DN'] = test['duration'] * test['new']

### Encoding
- traffic_source, traffic_medium을 세트로 labeling
- os, device

In [12]:
train.fillna('(not set)', inplace=True)
test.fillna('(not set)', inplace=True)

In [13]:
# keyword1, referral_path1은 각 2의 빈도로 채운다.
replace_count = ['keyword','referral_path']

for i in replace_count:
    count = train.groupby(f'{i}1')[f'{i}2'].size()
    train[f'{i}1'] = train[f'{i}1'].map(count)
    test[f'{i}1'] = test[f'{i}1'].map(count)
    test[f'{i}1'].fillna(int(count.mean()))

In [14]:
# 학습데이터에 한하여 Target encoding 한다.
# keyword, referral_path
target = ['browser', 'device', 'traffic_source', 'traffic_medium','direction','keyword2', 'referral_path2', 'referral_path3']
                        
for i in target:
    rate = train.groupby(i)['TARGET'].mean()
    train[i] = train[i].map(rate)
    test[i] = test[i].map(rate)
    test[i].fillna(train[i].mean(), inplace=True)

In [15]:
str_col = ['continent', 'new', 'OS', 'subcontinent', 'country']
for i in str_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])

### Feature Transformation

In [16]:
# sqrt transformation을 시도한다.
train['duration'] = train['duration']**0.5
test['duration'] = test['duration']**0.5

In [17]:
# log transformation을 시도한다.
train['quality'] = np.log(train['quality'])
test['quality'] = np.log(test['quality'])

In [18]:
# Scaler
numeric = ['quality', 'duration', 'transaction', 'transaction_revenue', 'QD', 'QN','DN']

scaler = MinMaxScaler()
train[numeric] = scaler.fit_transform(train[numeric])
test[numeric] = scaler.transform(test[numeric])

### Feature Selection

In [19]:
# ID열을 제거한다.
pred_test= test[['sessionID']]

# sessionID 변수를 제거한다.
train.drop(['sessionID'], axis=1, inplace=True)
test.drop(['sessionID'], axis=1, inplace=True)

In [20]:
train.head()

Unnamed: 0,userID,TARGET,browser,OS,device,new,quality,duration,transaction,transaction_revenue,...,traffic_medium,keyword1,keyword2,referral_path1,referral_path2,direction,referral_path3,QD,QN,DN
0,USER_000000,17.0,6.577421,6,6.480987,0,0.830249,0.273869,0.0,0.0,...,6.52449,53551,6.095371,79590,6.681714,6.576706,6.469242,0.044165,0.0,0.0
1,USER_000001,3.0,6.577421,12,6.480987,1,0.0,0.059047,0.0,0.0,...,6.52449,53551,6.095371,79590,6.681714,5.127151,6.469242,4.6e-05,0.01087,0.00377
2,USER_000007,5.0,6.577421,6,6.480987,1,0.0,0.07564,0.0,0.0,...,6.52449,53551,6.095371,79590,6.681714,5.127151,6.469242,7.5e-05,0.01087,0.006186
3,USER_000008,5.0,5.848352,5,6.480987,1,0.0,0.073238,0.0,0.0,...,4.45094,70397,6.095371,1985,3.303922,5.808008,3.780194,7e-05,0.01087,0.005799
4,USER_000009,3.0,6.577421,6,6.480987,1,0.151178,0.227511,0.0,0.0,...,6.598886,906,7.883721,79590,6.681714,6.576706,6.469242,0.001355,0.021739,0.055964


In [34]:
unuse = ['userID','keyword2']

X_train, y_train = train.drop(['TARGET']+unuse, axis=1), train['TARGET'].astype('i')
X_test = test.drop(unuse, axis=1)
tr_x, val_x, tr_y, val_y = train_test_split(X_train, y_train, test_size=0.3, random_state=2024)
model_BR = BayesianRidge(n_iter=500, compute_score=True)
model_BR.fit(tr_x, tr_y)
print('RMSE(train):',mean_squared_error(model_BR.predict(tr_x), tr_y)**0.5)
print('RMSE:', mean_squared_error(model_BR.predict(val_x), val_y)**0.5)

RMSE(train): 4.368978466011257
RMSE: 4.439142446309799


In [41]:
# 심각한 과적합
unuse = ['userID','new']

X_train, y_train = train.drop(['TARGET']+unuse, axis=1), train['TARGET'].astype('i')
X_test = test.drop(unuse, axis=1)
tr_x, val_x, tr_y, val_y = train_test_split(X_train, y_train, test_size=0.3, random_state=2024)
model_RF = RandomForestRegressor(n_estimators=50, random_state=2024)
model_RF.fit(tr_x, tr_y)
print('RMSE(train):',mean_squared_error(model_RF.predict(tr_x), tr_y)**0.5)
print('RMSE:', mean_squared_error(model_RF.predict(val_x), val_y)**0.5)

RMSE(train): 1.4857809714573673
RMSE: 3.8550106464801646


In [40]:
pd.DataFrame([X_train.columns, model_RF.feature_importances_]).T

Unnamed: 0,0,1
0,browser,0.015103
1,OS,0.02159
2,device,0.012533
3,quality,0.101658
4,duration,0.058943
5,transaction,0.003527
6,transaction_revenue,0.01081
7,continent,0.011233
8,subcontinent,0.011375
9,country,0.021357


In [42]:
unuse = ['userID']

X_train, y_train = train.drop(['TARGET']+unuse, axis=1), train['TARGET'].astype('i')
X_test = test.drop(unuse, axis=1)
tr_x, val_x, tr_y, val_y = train_test_split(X_train, y_train, test_size=0.3, random_state=2024)
model_KNN = KNeighborsRegressor()
model_KNN.fit(tr_x, tr_y)
print('RMSE(train):',mean_squared_error(model_KNN.predict(tr_x), tr_y)**0.5)
print('RMSE:', mean_squared_error(model_KNN.predict(val_x), val_y)**0.5)

RMSE(train): 3.7170245662627566
RMSE: 4.69505212221902


In [70]:
unuse = ['userID','transaction_revenue','keyword2','referral_path1','new']

X_train, y_train = train.drop(['TARGET']+unuse, axis=1), train['TARGET'].astype('i')
X_test = test.drop(unuse, axis=1)
tr_x, val_x, tr_y, val_y = train_test_split(X_train, y_train, test_size=0.3, random_state=2024)
model_DT = DecisionTreeRegressor(max_depth=7,max_leaf_nodes=100,random_state=2024)
model_DT.fit(tr_x, tr_y)
print('RMSE(train):',mean_squared_error(model_DT.predict(tr_x), tr_y)**0.5)
print('RMSE:', mean_squared_error(model_DT.predict(val_x), val_y)**0.5)

RMSE(train): 3.7501849742224285
RMSE: 4.183629289815745


In [63]:
pd.DataFrame([X_train.columns, model_DT.feature_importances_]).T

Unnamed: 0,0,1
0,browser,0.002831
1,OS,0.002854
2,device,0.009787
3,new,0.000657
4,quality,0.029413
5,duration,0.020391
6,transaction,0.002213
7,transaction_revenue,0.0
8,continent,0.003371
9,subcontinent,0.004124


## <font color='forestgreen'> HyperParameter Tunning

In [62]:
X_train, y_train = train.drop(['TARGET']+unuse, axis=1), train['TARGET'].astype('i')
X_test = test.drop(unuse, axis=1)
model_BR = BayesianRidge(n_iter=500, compute_score=True)
model_BR.fit(X_train, y_train)
BR_pred = model_BR.predict(X_test)

In [71]:
unuse = ['userID','transaction_revenue','keyword2','referral_path1','new']

X_train, y_train = train.drop(['TARGET']+unuse, axis=1), train['TARGET'].astype('i')
X_test = test.drop(unuse, axis=1)
model_DT = DecisionTreeRegressor(max_depth=7,max_leaf_nodes=100,random_state=2024)
model_DT.fit(X_train, y_train)
DT_pred = model_DT.predict(X_test)

## <font color='forestgreen'> Modeling
- HyperParameter Tunning은 수동으로 한다.

#### Ensemble

In [72]:
# sample data를 부른다.
sample = pd.read_csv('data/submission/sample_submission.csv')

In [73]:
# score가 높은 예측값을 부른다.
cat = pd.read_csv('data/submission/20240302_2157.csv')
lgbm = pd.read_csv('data/submission/20240227_1641.csv')

In [76]:
pred_test['TARGET'] = BR_pred
BR_pred = sample[['sessionID']].merge(pred_test, on='sessionID', how='left').fillna(1)
BR_pred

Unnamed: 0,sessionID,TARGET
0,SESSION_252289,21.990758
1,SESSION_252290,1.000000
2,SESSION_252291,3.788595
3,SESSION_252292,3.795592
4,SESSION_252293,9.100934
...,...,...
79781,SESSION_332070,1.000000
79782,SESSION_332071,1.000000
79783,SESSION_332072,5.353645
79784,SESSION_332073,4.085855


In [77]:
pred_test['TARGET'] = DT_pred
DT_pred = sample[['sessionID']].merge(pred_test, on='sessionID', how='left').fillna(1)
DT_pred

Unnamed: 0,sessionID,TARGET
0,SESSION_252289,21.352258
1,SESSION_252290,1.000000
2,SESSION_252291,2.605878
3,SESSION_252292,4.630029
4,SESSION_252293,8.480337
...,...,...
79781,SESSION_332070,1.000000
79782,SESSION_332071,1.000000
79783,SESSION_332072,2.524621
79784,SESSION_332073,4.630029


In [78]:
cat

Unnamed: 0,sessionID,TARGET
0,SESSION_252289,30.283483
1,SESSION_252290,1.000000
2,SESSION_252291,2.379057
3,SESSION_252292,4.307487
4,SESSION_252293,12.640332
...,...,...
79781,SESSION_332070,1.000000
79782,SESSION_332071,1.000000
79783,SESSION_332072,2.800221
79784,SESSION_332073,4.731131


In [4]:
lgbm

Unnamed: 0,sessionID,TARGET
0,SESSION_252289,26.615789
1,SESSION_252290,1.000000
2,SESSION_252291,2.729949
3,SESSION_252292,4.265513
4,SESSION_252293,10.715498
...,...,...
79781,SESSION_332070,1.000000
79782,SESSION_332071,1.000000
79783,SESSION_332072,2.758734
79784,SESSION_332073,4.745194


## Ensemble

In [88]:
# public: 
sample['TARGET'] = cat['TARGET']*0.45+lgbm['TARGET']*0.45+BR_pred['TARGET']*0.05+DT_pred['TARGET']*0.05

In [101]:
# public: 
sample['TARGET'] = cat['TARGET']*0.8+BR_pred['TARGET']*0.1+DT_pred['TARGET']*0.1

In [103]:
# public: 
sample['TARGET'] = cat['TARGET']*0.6+lgbm['TARGET']*0.4

In [105]:
mean_squared_error(sample['TARGET'], lgbm['TARGET'])**0.5

0.4489317907272911

In [99]:
mean_squared_error(lgbm['TARGET'], cat['TARGET'])**0.5

0.7482196512121518

In [104]:
sample

Unnamed: 0,sessionID,TARGET
0,SESSION_252289,28.816405
1,SESSION_252290,1.000000
2,SESSION_252291,2.519413
3,SESSION_252292,4.290697
4,SESSION_252293,11.870398
...,...,...
79781,SESSION_332070,1.000000
79782,SESSION_332071,1.000000
79783,SESSION_332072,2.783626
79784,SESSION_332073,4.736756


## Save data

In [102]:
date = str(pd.Timestamp.now())[:16].replace('-','').replace(' ','_').replace(':','')
filename = f'./data/submission/{date}'
sample.to_csv(f'{filename}.csv', index=False)
print(f'{filename} is saved.')

./data/submission/20240303_2320 is saved.


In [None]:
model = VotingRegressor([('lr', r1), ('rf', r2), ('r3', r3)])
model.fit(X_train, y_train)
print('RMSE(train):',mean_squared_error(model.predict(tr_x), tr_y)**0.5)
print('RMSE:', mean_squared_error(model.predict(val_x), val_y)**0.5)
print('(round)RMSE:', mean_squared_error(model.predict(val_x).round(), val_y)**0.5)

In [None]:
# 직접 비율을 달리하며 Ensemble한다.

In [None]:
from sklearn import linear_model
>>> clf = linear_model.BayesianRidge()
>>> r1 = LinearRegression()
>>> r2 = RandomForestRegressor(n_estimators=10, random_state=1)
>>> r3 = KNeighborsRegressor()
>>> X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]])
>>> y = np.array([2, 6, 12, 20, 30, 42])
>>> er = VotingRegressor([('lr', r1), ('rf', r2), ('r3', r3)])
>>> print(er.fit(X, y).predict(X))
[ 6.8...  8.4... 12.5... 17.8... 26...  34...]

## <font color='forestgreen'> Save data

In [98]:
sample = pd.read_csv('data/submission/sample_submission.csv')
sample = sample[['sessionID']].merge(pred_test, on='sessionID', how='left').fillna(1)
sample

Unnamed: 0,sessionID,TARGET
0,SESSION_252289,31.367976
1,SESSION_252290,1.000000
2,SESSION_252291,2.572025
3,SESSION_252292,4.389687
4,SESSION_252293,12.018117
...,...,...
79781,SESSION_332070,1.000000
79782,SESSION_332071,1.000000
79783,SESSION_332072,2.797769
79784,SESSION_332073,4.681385


In [101]:
sample = pd.read_csv('data/submission/sample_submission.csv')
sample = sample[['sessionID']].merge(pred_test, on='sessionID', how='left').fillna(1)
sample

Unnamed: 0,sessionID,TARGET
0,SESSION_252289,30.283483
1,SESSION_252290,1.000000
2,SESSION_252291,2.379057
3,SESSION_252292,4.307487
4,SESSION_252293,12.640332
...,...,...
79781,SESSION_332070,1.000000
79782,SESSION_332071,1.000000
79783,SESSION_332072,2.800221
79784,SESSION_332073,4.731131


In [83]:
# LGBM
date = str(pd.Timestamp.now())[:16].replace('-','').replace(' ','_').replace(':','')
filename = f'./data/submission/{date}'
sample.to_csv(f'{filename}.csv', index=False)
print(f'{filename} is saved.')

./data/submission/20240302_2149 is saved.


In [102]:
# Cat
date = str(pd.Timestamp.now())[:16].replace('-','').replace(' ','_').replace(':','')
filename = f'./data/submission/{date}'
sample.to_csv(f'{filename}.csv', index=False)
print(f'{filename} is saved.')

./data/submission/20240302_2157 is saved.
