## << 문제 정의 >>

글로벌 쇼핑몰의 클릭 로그 데이터를 분석하여, 어떤 유저가 어떤 아이템을 클릭하는지 확인하고 싶습니다.
주어진 데이터는 해당 쇼핑몰의 2024년 4월 1일부터 2024년 5월 7일까지의 클릭 로그 데이터입니다.

2024년 4월 1일부터 4월 30일까지의 데이터를 학습하여, 그 이후 일주일간 각 아이템을 클릭하는지 안 하는지를 예측하는 문제를 풀어보세요.

자세한 대회 관련 사항은 아래 대회 페이지를 참조하세요.

[Competition Page]

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
import warnings
warnings.filterwarnings('ignore')

# from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# from sklearn.model_selection import GridSearchCV
# from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

def seed_everything(seed: int = 42):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's global generator with
    ``seed`` and stores the same value, as a string, in the
    ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # always pin the seed to 42

In [None]:
# Read the training set, the test set and the submission template, in that order.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
# Last expression of the cell: display the three shapes.
train.shape, test.shape, submission.shape

((48199, 13), (7695, 12), (7695, 2))

In [None]:
# Count the missing values in each column of the training frame.
train.isnull().sum()

Unnamed: 0,0
click_id,0
date,0
order,0
country,0
session ID,0
page 1 (main category),31
page 2 (clothing model),2135
colour,0
location,0
model photography,0


In [None]:
# train['country'].value_counts()

In [None]:
# session_count: number of training rows that share this row's session ID.
train['session_count'] = train.groupby('session ID')['session ID'].transform('count')

# order_ratio: order / session_count * 4, truncated to an integer.
# Presumably `order` is the click's position inside its session, which would
# make this a coarse "how far into the session" bucket - confirm against the
# data description.
# The cast is done on the whole column at once instead of calling int() on
# every row in a Python loop; astype truncates toward zero exactly like int().
train['order_ratio'] = (train['order'] / train['session_count'] * 4).astype('int64')

In [None]:
# Remove the rows whose main category is missing.
# DataFrame.dropna returns a new frame and leaves the original untouched unless
# inplace=True is passed, so the result has to be assigned back; without the
# assignment the call has no effect. The index is reset so that it stays
# contiguous after the rows are removed.
train = train.dropna(subset=['page 1 (main category)']).reset_index(drop=True)

# Short-named copies of two raw columns; the long-named originals are listed
# in `dropcol` and removed before modelling.
train['mdp'] = train['model photography']
train['categ'] = train['page 1 (main category)']

# postop: 1 when location <= 3, otherwise 0.
train['postop'] = (train['location'] <= 3).astype('int64')
# posl: location modulo 3.
train['posl'] = train['location'] % 3

In [None]:
# Disabled experiment: fill the missing values of the two page columns with
# each column's mode.
# train['page 1 (main category)'] = train['page 1 (main category)'].fillna(train['page 1 (main category)'].mode().values[0])
# train['page 2 (clothing model)'] = train['page 2 (clothing model)'].fillna(train['page 2 (clothing model)'].mode().values[0])
# Print the column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48199 entries, 0 to 48198
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   click_id                 48199 non-null  int64  
 1   date                     48199 non-null  object 
 2   order                    48199 non-null  int64  
 3   country                  48199 non-null  int64  
 4   session ID               48199 non-null  int64  
 5   page 1 (main category)   48168 non-null  float64
 6   page 2 (clothing model)  46064 non-null  object 
 7   colour                   48199 non-null  int64  
 8   location                 48199 non-null  int64  
 9   model photography        48199 non-null  int64  
 10  page                     48199 non-null  int64  
 11  price                    48199 non-null  float64
 12  Clicked                  48199 non-null  int64  
 13  session_count            48199 non-null  int64  
 14  order_ratio           

In [None]:
# train['page 2 (clothing model)'] = pd.factorize(train['page 2 (clothing model)'])[0]
# train

In [None]:
# Columns excluded from the model input. The same list is applied to the test
# frame later on, so it must not contain the target column.
dropcol = [
    'click_id',
    'date',
    'session ID',
    'page 1 (main category)',
    'page 2 (clothing model)',
    'location',
    'model photography',
]

# target value
y = train['Clicked']
# feature vector: every remaining column except the target
X = train.drop(columns=dropcol + ['Clicked'])

# Hold out 20% of the rows for validation, with a fixed random_state.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print(*(part.shape for part in (X_train, X_val, y_train, y_val)))

# Last expression of the cell: display the feature frame.
X

(38559, 11) (9640, 11) (38559,) (9640,)


Unnamed: 0,order,country,colour,page,price,session_count,order_ratio,mdp,categ,postop,posl
0,1,29,1,1,39.0,9,0,1,1.0,0,2
1,2,29,1,1,40.0,9,0,1,1.0,0,0
2,3,29,10,1,46.0,9,1,1,2.0,1,2
3,4,29,6,1,30.0,9,1,2,2.0,0,0
4,5,29,4,1,41.0,9,2,2,2.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
48194,38,29,6,2,39.0,41,3,2,2.0,0,2
48195,39,29,2,2,46.0,41,3,1,2.0,0,0
48196,40,29,4,1,49.0,41,3,1,4.0,1,2
48197,41,29,12,5,49.0,41,4,2,4.0,1,1


In [None]:
# One label per feature column (11 entries); presumably in the column order
# of X - confirm.
# NOTE(review): XGBoost documents this option as `feature_types` (plural) with
# codes such as 'q' (quantitative) and 'c' (categorical), used together with
# enable_categorical=True. The keyword `feature_type` and the labels
# 'numerical' / 'categorial' used here look unrecognised, so the model is
# probably treating every column as numeric - confirm against the XGBoost
# docs before relying on categorical handling.
ftype = ['numerical', 'categorial', 'categorial', 'numerical', 'numerical', 'numerical', 'numerical', 'categorial', 'categorial', 'categorial', 'categorial']

# Alternative hyper-parameter set (lower learning rate, more trees, explicit
# regularisation terms), currently disabled.
# xgbc = XGBClassifier(random_state = 42,
#                      learning_rate = 0.01,
#                      colsample_bytree = 0.5,
#                      colsample_bylevel = 0.5,
#                      reg_lambda = 7,
#                      reg_alpha = 5,
#                      n_estimators = 1100,
#                      max_depth = 6,
#                      subsample = 0.5,
#                      gamma = 1,
#                      min_child_weight = 6,
#                      feature_type = ftype)

# Active configuration: 200 trees of depth 6 at learning rate 0.08, with 70%
# column sampling per tree and per level and no row subsampling.
xgbc = XGBClassifier(random_state = 42,
                     learning_rate = 0.08,
                     colsample_bytree = 0.7,
                     colsample_bylevel = 0.7,
                     max_depth = 6,
                     n_estimators = 200,
                     feature_type = ftype,
                     subsample = 1,
                     min_child_weight = 5
                     )

# Fit on the training split only; X_val / y_val are kept for evaluation.
xgbc.fit(X_train, y_train)

In [None]:
# Predict on the split the model was fitted on and on the held-out split.
train_pred = xgbc.predict(X_train)
val_pred = xgbc.predict(X_val)

# Compute both accuracies first, then report them to four decimal places.
train_acc = accuracy_score(y_train, train_pred)
val_acc = accuracy_score(y_val, val_pred)

print("Train ACC : %.4f" % train_acc)
print("Val ACC : %.4f" % val_acc)

Train ACC : 0.7016
Val ACC : 0.6821


In [None]:
# session_count: number of test rows that share this row's session ID.
test['session_count'] = test.groupby('session ID')['session ID'].transform('count')

# order_ratio: order / session_count * 4, truncated to an integer - the same
# formula that is applied to the training frame.
# The cast is done on the whole column at once instead of calling int() on
# every row in a Python loop; astype truncates toward zero exactly like int().
test['order_ratio'] = (test['order'] / test['session_count'] * 4).astype('int64')

In [None]:
# Build X_test, then run the prediction below.
# Short-named copies of two raw columns; the long-named originals are listed
# in `dropcol` and removed before prediction.
test['mdp'] = test['model photography']
test['categ'] = test['page 1 (main category)']

# postop: 1 when location <= 3, otherwise 0; posl: location modulo 3.
# Both are computed on the whole column at once rather than row by row.
test['postop'] = (test['location'] <= 3).astype('int64')
test['posl'] = test['location'] % 3

# Drop the same columns that were dropped from the training features.
X_test = test.drop(columns=dropcol)
# X_test['page 2 (clothing model)'] = pd.factorize(X_test['page 2 (clothing model)'])[0]

output = xgbc.predict(X_test)
assert len(output) == 7695  # sanity check

In [None]:
# Build the submission DataFrame, then run the code below.
# Reload the sample submission, overwrite its 'Clicked' column with the
# predictions, and write the result to submission.csv without the index.
submission = pd.read_csv('sample_submission.csv')
submission['Clicked'] = output
submission.to_csv('submission.csv', index=False)