## Import

In [52]:
import pandas as pd
import numpy as np ; np.random.seed(2024)
import warnings;warnings.filterwarnings(action='ignore')

# Feature Engineering
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder

# Modeling
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## Read data

In [53]:
# Load the training and evaluation splits.
train = pd.read_csv('data/stayed_train.csv')
test = pd.read_csv('data/stayed_test.csv')

# Report the (rows, columns) shape of each split.
for caption, frame in (('학습데이터 수:', train), ('평가데이터 수:', test)):
    print(caption, frame.shape)

학습데이터 수: (127739, 20)
평가데이터 수: (40148, 19)


In [54]:
# Preview the first five training rows as a quick sanity check of the columns.
display(train.head(n=5))

Unnamed: 0,sessionID,userID,TARGET,browser,OS,device,new,quality,duration,transaction,transaction_revenue,continent,subcontinent,country,traffic_source,traffic_medium,keyword1,keyword2,referral_path1,referral_path2
0,SESSION_000000,USER_000000,17.0,Chrome,Macintosh,desktop,0,45.0,839.0,0.0,0.0,Americas,Northern America,United States,google,organic,Category8,(not set),(not set),(not set)
1,SESSION_000001,USER_000001,3.0,Chrome,Windows,desktop,1,1.0,39.0,0.0,0.0,Europe,Western Europe,Germany,google,organic,Category8,(not set),(not set),(not set)
2,SESSION_000007,USER_000007,5.0,Chrome,Macintosh,desktop,1,1.0,64.0,0.0,0.0,Europe,Western Europe,Germany,google,organic,Category8,(not set),(not set),(not set)
3,SESSION_000008,USER_000008,5.0,Firefox,Linux,desktop,1,1.0,60.0,0.0,0.0,Americas,South America,Brazil,youtube.com,referral,(not set),(not set),Category5,0002
4,SESSION_000009,USER_000009,3.0,Chrome,Macintosh,desktop,1,2.0,579.0,0.0,0.0,Americas,Northern America,United States,google,ad,Category1,000,(not set),(not set)


### Categorical Feature Encoding

In [64]:
# Frequency encoding: replace every keyword1 / referral_path1 category with the
# number of training rows that belong to it.
replace_count = ['keyword','referral_path']

for i in replace_count:
    # Group sizes are computed on the training data only, so the evaluation
    # split never influences the encoding.
    count = train.groupby(f'{i}1')[f'{i}2'].size()
    train[f'{i}1'] = train[f'{i}1'].map(count)
    # Categories that never occur in training map to NaN; fall back to the
    # (integer-truncated) mean group size. fillna() returns a new Series, so the
    # result has to be assigned back -- previously it was discarded and the
    # NaNs stayed in the evaluation frame.
    test[f'{i}1'] = test[f'{i}1'].map(count).fillna(int(count.mean()))

In [65]:
# Target (mean) encoding: each category is replaced by the mean TARGET observed
# for that category in the training data; the same mapping is applied to the
# evaluation split.
# NOTE(review): 'direction' and 'referral_path3' are not among the columns shown
# when the raw CSV is previewed -- presumably they are created by cells that are
# not part of this view; confirm they exist before this cell runs.
target = ['browser', 'device', 'traffic_source', 'traffic_medium','direction','keyword2', 'referral_path2', 'referral_path3']

for i in target:
    rate = train.groupby(i)['TARGET'].mean()
    train[i] = train[i].map(rate)
    # Categories unseen in training become NaN after mapping; fill them with
    # the mean of the already-encoded training column.
    # The result is assigned back instead of calling fillna(inplace=True) on
    # test[i]: an in-place call on a column selection is chained assignment,
    # which pandas warns about and which has no effect under Copy-on-Write.
    test[i] = test[i].map(rate).fillna(train[i].mean())

In [66]:
# Integer-encode the remaining categorical columns with one LabelEncoder per
# column, fitted on the training split.
str_col = ['continent', 'new', 'OS', 'subcontinent', 'country']
for i in str_col:
    le = LabelEncoder().fit(train[i])
    train[i] = le.transform(train[i])

    # Labels that only occur in the evaluation split are appended to the
    # encoder's vocabulary so transform() does not reject them; they receive
    # codes after all training labels.
    # NOTE(review): appended labels leave classes_ unsorted -- presumably fine
    # for string columns, but verify for numeric ones such as 'new'.
    unseen = [label for label in np.unique(test[i]) if label not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    test[i] = le.transform(test[i])

### Feature Transformation
Transformation을 달리 해 두 Feature Set을 만든다.

In [None]:
# Feature set 1: square-root transform of both 'duration' and 'quality'.
train_1 = train.copy()
test_1 = test.copy()

for frame in (train_1, test_1):
    for col in ('duration', 'quality'):
        frame[col] = frame[col] ** 0.5

In [None]:
# Feature set 2: square-root transform of 'duration', natural-log transform of
# 'quality'.
train_2 = train.copy()
test_2 = test.copy()

for frame in (train_2, test_2):
    frame['duration'] = frame['duration'] ** 0.5
    # NOTE(review): np.log yields -inf for 0 and NaN for negative values --
    # presumably 'quality' is strictly positive; confirm against the data.
    frame['quality'] = np.log(frame['quality'])

## Save data

In [15]:
# Write feature set 1 to disk without the DataFrame index.
# NOTE(review): the raw CSVs are read from 'data/' while these files go to
# '../data/' -- confirm the intended working directory.
for frame, path in (
    (train_1, '../data/feature_train_1.csv'),
    (test_1, '../data/feature_test_1.csv'),
):
    frame.to_csv(path, index=False)

In [15]:
# Write feature set 2 to disk without the DataFrame index.
# NOTE(review): the raw CSVs are read from 'data/' while these files go to
# '../data/' -- confirm the intended working directory.
for frame, path in (
    (train_2, '../data/feature_train_2.csv'),
    (test_2, '../data/feature_test_2.csv'),
):
    frame.to_csv(path, index=False)