## Import

In [52]:
import pandas as pd
import numpy as np ; np.random.seed(2024)
import warnings;warnings.filterwarnings(action='ignore')

# Modeling
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## Read data

In [53]:
train = pd.read_csv('../data/feature_train_1.csv')
test = pd.read_csv('../data/feature_test_1.csv')
print('학습데이터 수:', train.shape)
print('평가데이터 수:', test.shape)

학습데이터 수: (127739, 20)
평가데이터 수: (40148, 19)


In [54]:
display(train.head())

Unnamed: 0,sessionID,userID,TARGET,browser,OS,device,new,quality,duration,transaction,transaction_revenue,continent,subcontinent,country,traffic_source,traffic_medium,keyword1,keyword2,referral_path1,referral_path2
0,SESSION_000000,USER_000000,17.0,Chrome,Macintosh,desktop,0,45.0,839.0,0.0,0.0,Americas,Northern America,United States,google,organic,Category8,(not set),(not set),(not set)
1,SESSION_000001,USER_000001,3.0,Chrome,Windows,desktop,1,1.0,39.0,0.0,0.0,Europe,Western Europe,Germany,google,organic,Category8,(not set),(not set),(not set)
2,SESSION_000007,USER_000007,5.0,Chrome,Macintosh,desktop,1,1.0,64.0,0.0,0.0,Europe,Western Europe,Germany,google,organic,Category8,(not set),(not set),(not set)
3,SESSION_000008,USER_000008,5.0,Firefox,Linux,desktop,1,1.0,60.0,0.0,0.0,Americas,South America,Brazil,youtube.com,referral,(not set),(not set),Category5,0002
4,SESSION_000009,USER_000009,3.0,Chrome,Macintosh,desktop,1,2.0,579.0,0.0,0.0,Americas,Northern America,United States,google,ad,Category1,000,(not set),(not set)


### Feature Selection

In [69]:
# ID열을 제거한다.
pred_test= test[['sessionID']]

# sessionID 변수를 제거한다.
train.drop(['sessionID'], axis=1, inplace=True)
test.drop(['sessionID'], axis=1, inplace=True)

In [70]:
train.head()

Unnamed: 0,userID,TARGET,browser,OS,device,new,quality,duration,transaction,transaction_revenue,...,traffic_medium,keyword1,keyword2,referral_path1,referral_path2,direction,referral_path3,QD,QN,DN
0,USER_000000,17.0,6.577421,6,6.480987,0,3.806662,28.965497,0.0,0.0,...,6.52449,53551,6.095371,79590,6.681714,6.576706,6.469242,37755.0,0.0,0.0
1,USER_000001,3.0,6.577421,12,6.480987,1,0.0,6.244998,0.0,0.0,...,6.52449,53551,6.095371,79590,6.681714,5.127151,6.469242,39.0,1.0,39.0
2,USER_000007,5.0,6.577421,6,6.480987,1,0.0,8.0,0.0,0.0,...,6.52449,53551,6.095371,79590,6.681714,5.127151,6.469242,64.0,1.0,64.0
3,USER_000008,5.0,5.848352,5,6.480987,1,0.0,7.745967,0.0,0.0,...,4.45094,70397,6.095371,1985,3.303922,5.808008,3.780194,60.0,1.0,60.0
4,USER_000009,3.0,6.577421,6,6.480987,1,0.693147,24.062419,0.0,0.0,...,6.598886,906,7.883721,79590,6.681714,6.576706,6.469242,1158.0,2.0,579.0


In [93]:
unuse = ['transaction','userID']

X_train, y_train = train.drop(['TARGET']+unuse, axis=1), train['TARGET'].astype('i')
X_test = test.drop(unuse, axis=1)
tr_x, val_x, tr_y, val_y = train_test_split(X_train, y_train, test_size=0.3, random_state=2024)
model = CatBoostRegressor(iterations=500, cat_features = ['continent', 'OS', 'subcontinent', 'country','keyword1','referral_path1'], 
                          objective='Poisson', eval_metric='RMSE',
                          random_state=2024, verbose=False)
model.fit(tr_x, tr_y)
print('RMSE(train):',mean_squared_error(model.predict(tr_x), tr_y)**0.5)
print('RMSE:', mean_squared_error(model.predict(val_x), val_y)**0.5)

RMSE(train): 3.3187519346696193
RMSE: 3.6986404761724585


In [44]:
# Catboost를 base로 삼고 importance가 낮은 feature는 제거한다.
importance = pd.DataFrame({'feature': stayed_model.feature_names_,
                           'importance':stayed_model.feature_importances_})
importance

Unnamed: 0,feature,importance
0,browser,1.578933
1,OS,3.410198
2,device,1.056636
3,new,4.484673
4,quality,46.001285
5,duration,18.226717
6,transaction_revenue,2.258117
7,continent,2.821869
8,subcontinent,8.013138
9,country,0.323489


## Save data

In [15]:
train.to_csv('../data/feature_train_1.csv', index=False)
test.to_csv('../data/feature_test_1.csv', index=False)