## << 문제 정의 >>

글로벌 쇼핑몰의 클릭 로그 데이터를 분석하여, 어떤 유저가 어떤 아이템을 클릭하는지 확인하고 싶습니다.
주어진 데이터는 해당 쇼핑몰의 2024년 4월 1일부터 2024년 5월 7일까지의 클릭 로그 데이터입니다.

2024년 4월 1일부터 4월 30일까지의 데이터를 학습하여, 그 이후 일주일간(2024년 5월 1일~5월 7일) 각 아이템을 클릭하는지 안 하는지를 예측하는 문제를 풀어보세요.

자세한 대회 관련 사항은 아래 대회 페이지를 참조하세요.

[Competition Page]

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
import warnings
warnings.filterwarnings('ignore')

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

def seed_everything(seed: int = 42):
    """Seed the Python and NumPy global random generators.

    Also exports ``PYTHONHASHSEED`` with the same value. Python reads that
    variable at interpreter start-up, so the export influences processes
    launched afterwards rather than hashing in the running interpreter.

    Args:
        seed: Seed value applied to every generator (defaults to 42).
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)

# Fix every random seed up front; the seed must always be 42.
seed_everything(42)

# Load the training set, the test set and the submission template.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)


In [4]:
# Feature engineering for the training frame.
from sklearn.metrics import classification_report, f1_score

# The click identifier and the country column are not used as features.
train = train.drop(columns=['click_id', 'country'])

# Bucket the price into four ordinal bands:
# (-inf, 20] -> 0, (20, 100] -> 1, (100, 1000] -> 2, (1000, inf) -> 3.
bins = [-float('inf'), 20, 100, 1000, float('inf')]
labels = [0, 1, 2, 3]
price_band = pd.cut(train['price'], bins=bins, labels=labels, include_lowest=True)
train['price_category'] = price_band.astype(int)

# Derive the day of week (Monday=0 ... Sunday=6) and a weekend flag from the
# click date; the raw date itself is not kept as a feature.
parsed_dates = pd.to_datetime(train['date'])
train['weekday'] = parsed_dates.dt.weekday
train['is_weekend'] = train['weekday'].map(lambda day: int(day >= 5))
train = train.drop(columns='date')


def fill_page1_from_page2(row):
    """Return the main category for a row, imputing it from the model code.

    When 'page 1 (main category)' is missing and 'page 2 (clothing model)'
    holds a non-empty value, the first character of the model code
    (case-insensitive) selects the category: A -> 1.0, B -> 2.0, C -> 3.0,
    P -> 4.0. In every other case the existing 'page 1 (main category)'
    value is returned unchanged (it may still be missing).

    Args:
        row: Mapping-like record exposing both page columns by name.

    Returns:
        The existing or inferred main category value.
    """
    main_category = row['page 1 (main category)']
    model_code = row['page 2 (clothing model)']

    # Nothing to impute when the category is present or the code is unusable.
    if pd.notna(main_category) or pd.isna(model_code) or model_code == '':
        return main_category

    prefix_to_category = {'A': 1.0, 'B': 2.0, 'C': 3.0, 'P': 4.0}
    return prefix_to_category.get(model_code[0].upper(), main_category)

# Recover missing main categories from the clothing-model code prefix.
train['page 1 (main category)'] = train.apply(fill_page1_from_page2, axis=1)

# Ordinal-encode the main category using the fixed order 1.0, 2.0, 3.0, 4.0.
ordinal_enc = OrdinalEncoder(categories=[[1.0, 2.0, 3.0, 4.0]])
train['page_1_encoded'] = ordinal_enc.fit_transform(train[['page 1 (main category)']])

# Treat both NaN and empty-string clothing models as the literal 'missing'.
train['page 2 (clothing model)'] = (
    train['page 2 (clothing model)'].fillna('missing').replace('', 'missing')
)

# Target-encode the clothing model with its mean click rate.
# NOTE(review): this mean is computed over the whole training frame, before
# the train/validation split done later in the notebook, so validation
# labels contribute to their own encoded feature.
target_enc = train.groupby('page 2 (clothing model)')['Clicked'].mean()
train['page_2_encoded'] = train['page 2 (clothing model)'].map(target_enc)


# Fallbacks used to fill page_2_encoded where the mapping yields no value.
def _most_frequent_model(models):
    """Return the first mode of *models*, or 'missing' when there is none."""
    modes = models.mode()
    return 'missing' if modes.empty else modes[0]


# 1. Most frequent clothing model within each main category.
mode_map = train.groupby('page 1 (main category)')['page 2 (clothing model)'].agg(_most_frequent_model)
# 2. Encoded value belonging to each of those most frequent models.
page_2_encoded_map = train.groupby('page 2 (clothing model)')['page_2_encoded'].mean()
mode_encoded_map = mode_map.map(page_2_encoded_map)
# 3. Last-resort default: the encoding of 'missing' if that key exists,
#    otherwise the overall click rate.
if 'missing' in page_2_encoded_map:
    default_value = page_2_encoded_map['missing']
else:
    default_value = train['Clicked'].mean()
# 4. 결측치 채우기 함수
def fill_page_2_encoded(row):
    """Return the row's page_2_encoded value, filling it when missing.

    A present value is returned unchanged. A missing value is replaced by
    the entry of the module-level ``mode_encoded_map`` for the row's
    'page 1 (main category)', falling back to ``default_value`` when that
    category has no entry.

    Args:
        row: Mapping-like record with 'page_2_encoded' and
            'page 1 (main category)' fields.

    Returns:
        The existing or substituted encoded value.
    """
    encoded = row['page_2_encoded']
    if not pd.isna(encoded):
        return encoded
    return mode_encoded_map.get(row['page 1 (main category)'], default_value)

# Fill any remaining page_2_encoded gaps row by row.
train['page_2_encoded'] = train.apply(fill_page_2_encoded, axis=1)

# Collapse location into two groups: values up to 3 -> 1, larger values -> 2.
bins = [-float('inf'), 3, float('inf')]
labels = [1, 2]
train['location'] = pd.cut(train['location'], bins=bins, labels=labels, include_lowest=True).astype(int)

# Bucket page into four bands: <=0 -> 0, (0, 3] -> 1, (3, 5] -> 2, >5 -> 3.
bins = [-float('inf'), 0, 3, 5, float('inf')]
labels = [0, 1, 2, 3]
train['page'] = pd.cut(train['page'], bins=bins, labels=labels, include_lowest=True).astype(int)

# Bucket order into three bands: <=2 -> 0, (2, 6] -> 1, >6 -> 2.
bins = [-float('inf'), 2, 6, float('inf')]
labels = [0, 1, 2]
train['order'] = pd.cut(train['order'], bins=bins, labels=labels, include_lowest=True).astype(int)

# The session identifier is not used as a feature.
train = train.drop(['session ID'], axis=1)


# One-hot encode 'model photography'; categories unseen at fit time are
# encoded as all zeros when the encoder is reused later.
onehot_enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
onehot_cols = onehot_enc.fit_transform(train[['model photography']])
onehot_col_names = onehot_enc.get_feature_names_out(['model photography'])
# Build the one-hot frame on train's own index: concat(axis=1) aligns rows by
# index label, so a fresh RangeIndex would misalign rows (and introduce NaN)
# as soon as train's index is not the default 0..n-1.
onehot_df = pd.DataFrame(onehot_cols, columns=onehot_col_names, index=train.index)
train = pd.concat([train, onehot_df], axis=1)

# Drop the raw columns that have been replaced by engineered features.
train = train.drop(['price', 'page 1 (main category)', 'page 2 (clothing model)', 'model photography'], axis=1)


In [5]:
# Separate the label column from the feature matrix.
y = train['Clicked']
X = train.drop(columns='Clicked')

# Hold out 20% of the rows for validation (fixed seed for a repeatable split).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.1-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.8/231.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.1 colorlog-6.9.0 optuna-4.2.1


In [7]:
import optuna

def objective(trial):
    """Optuna objective: validation accuracy of one XGBoost configuration.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on the
    module-level ``X_train`` / ``y_train`` and scores its predictions on
    ``X_val`` / ``y_val``.

    Args:
        trial: Optuna trial used to suggest the hyperparameter values.

    Returns:
        Accuracy of the fitted model on the validation split.
    """
    # Hyperparameters explored by the search.
    search_space = {
        'max_depth': trial.suggest_int('max_depth', 4, 40),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }
    # Settings held constant across all trials.
    fixed_settings = {'objective': 'binary:logistic', 'random_state': 42}

    classifier = XGBClassifier(**search_space, **fixed_settings)
    classifier.fit(X_train, y_train)

    return accuracy_score(y_val, classifier.predict(X_val))

# Create the Optuna study and run the search (maximizing validation accuracy).
# The sampler is seeded explicitly: Optuna's sampler keeps its own random
# state, which the global seeds set by seed_everything() do not control, so
# without this the suggested parameters differ from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

print("최적의 파라미터:", study.best_params)
print("최고 정확도:", study.best_value)

# Refit a classifier with the best hyperparameters found by the search.
best_params = study.best_params
XGb = XGBClassifier(**best_params, objective='binary:logistic', random_state=42)
XGb.fit(X_train, y_train)

train_pred = XGb.predict(X_train)
val_pred = XGb.predict(X_val)


# Evaluate the fitted model on the training and validation splits.
from sklearn.metrics import accuracy_score

print("Train ACC : %.4f" % accuracy_score(y_train, train_pred))
print("Val ACC : %.4f" % accuracy_score(y_val, val_pred))

[I 2025-03-26 13:41:26,913] A new study created in memory with name: no-name-788afd08-777b-400f-a153-6e34bda49468
[I 2025-03-26 13:41:39,716] Trial 0 finished with value: 0.649896265560166 and parameters: {'max_depth': 39, 'learning_rate': 0.09928189340299635, 'n_estimators': 115, 'subsample': 0.7765630756052073, 'colsample_bytree': 0.6697668951144122}. Best is trial 0 with value: 0.649896265560166.
[I 2025-03-26 13:41:50,472] Trial 1 finished with value: 0.6684647302904564 and parameters: {'max_depth': 6, 'learning_rate': 0.24728322899573799, 'n_estimators': 251, 'subsample': 0.5440678429850238, 'colsample_bytree': 0.7188657540018195}. Best is trial 1 with value: 0.6684647302904564.
[I 2025-03-26 13:42:06,660] Trial 2 finished with value: 0.645746887966805 and parameters: {'max_depth': 22, 'learning_rate': 0.1963079932775404, 'n_estimators': 228, 'subsample': 0.9725472420315964, 'colsample_bytree': 0.6024094449141035}. Best is trial 1 with value: 0.6684647302904564.
[I 2025-03-26 13:4

최적의 파라미터: {'max_depth': 4, 'learning_rate': 0.12523623932550687, 'n_estimators': 67, 'subsample': 0.8545191210364028, 'colsample_bytree': 0.974846567452876}
최고 정확도: 0.6838174273858921
Train ACC : 0.6864
Val ACC : 0.6838


In [8]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# The click identifier and the country column are not used as features.
test = test.drop(columns=['click_id', 'country'])

# Bucket the price into the same four ordinal bands used for training:
# (-inf, 20] -> 0, (20, 100] -> 1, (100, 1000] -> 2, (1000, inf) -> 3.
bins = [-float('inf'), 20, 100, 1000, float('inf')]
labels = [0, 1, 2, 3]
price_band = pd.cut(test['price'], bins=bins, labels=labels, include_lowest=True)
test['price_category'] = price_band.astype(int)

# Derive the day of week (Monday=0 ... Sunday=6) and a weekend flag from the
# click date; the raw date itself is not kept as a feature.
parsed_dates = pd.to_datetime(test['date'])
test['weekday'] = parsed_dates.dt.weekday
test['is_weekend'] = test['weekday'].map(lambda day: int(day >= 5))
test = test.drop(columns='date')

# page 1 (main category) 결측치 채우기
def fill_page1_from_page2(row):
    """Return the main category for a row, imputing it from the model code.

    When 'page 1 (main category)' is missing and 'page 2 (clothing model)'
    holds a non-empty value, the first character of the model code
    (case-insensitive) selects the category: A -> 1.0, B -> 2.0, C -> 3.0,
    P -> 4.0. In every other case the existing 'page 1 (main category)'
    value is returned unchanged (it may still be missing).

    Args:
        row: Mapping-like record exposing both page columns by name.

    Returns:
        The existing or inferred main category value.
    """
    current_category = row['page 1 (main category)']
    clothing_model = row['page 2 (clothing model)']

    category_is_missing = pd.isna(current_category)
    if not category_is_missing:
        return current_category
    if pd.isna(clothing_model) or clothing_model == '':
        return current_category

    category_by_prefix = {'A': 1.0, 'B': 2.0, 'C': 3.0, 'P': 4.0}
    leading_letter = clothing_model[0].upper()
    if leading_letter in category_by_prefix:
        return category_by_prefix[leading_letter]
    return current_category

# Recover missing main categories from the clothing-model code prefix.
test['page 1 (main category)'] = test.apply(fill_page1_from_page2, axis=1)

# Encode the main category with the ordinal encoder fitted on the training data.
main_category_frame = test[['page 1 (main category)']]
test['page_1_encoded'] = ordinal_enc.transform(main_category_frame)

# Treat NaN and empty-string clothing models as 'missing', then apply the
# target encoding computed on the training data.
test['page 2 (clothing model)'] = (
    test['page 2 (clothing model)'].fillna('missing').replace('', 'missing')
)
test['page_2_encoded'] = test['page 2 (clothing model)'].map(target_enc)

# page_2_encoded 결측치 채우기 (train에서 계산된 mode_encoded_map과 default_value 사용)
def fill_page_2_encoded(row):
    """Return the row's page_2_encoded value, filling it when missing.

    A present value is returned unchanged. A missing value is replaced by
    the entry of the module-level ``mode_encoded_map`` for the row's
    'page 1 (main category)', falling back to ``default_value`` when that
    category has no entry.

    Args:
        row: Mapping-like record with 'page_2_encoded' and
            'page 1 (main category)' fields.

    Returns:
        The existing or substituted encoded value.
    """
    current_encoding = row['page_2_encoded']
    needs_fill = pd.isna(current_encoding)
    if needs_fill:
        category = row['page 1 (main category)']
        return mode_encoded_map.get(category, default_value)
    return current_encoding

# Fill page_2_encoded for clothing models that have no target encoding,
# using the per-category fallback and default derived from the training data.
test['page_2_encoded'] = test.apply(fill_page_2_encoded, axis=1)

# The session identifier is not used as a feature.
test = test.drop(['session ID'], axis=1)

# Collapse location into two groups: values up to 3 -> 1, larger values -> 2.
bins = [-float('inf'), 3, float('inf')]
labels = [1, 2]
test['location'] = pd.cut(test['location'], bins=bins, labels=labels, include_lowest=True).astype(int)

# Bucket page into four bands: <=0 -> 0, (0, 3] -> 1, (3, 5] -> 2, >5 -> 3.
bins = [-float('inf'), 0, 3, 5, float('inf')]
labels = [0, 1, 2, 3]
test['page'] = pd.cut(test['page'], bins=bins, labels=labels, include_lowest=True).astype(int)

# Bucket order into three bands: <=2 -> 0, (2, 6] -> 1, >6 -> 2.
bins = [-float('inf'), 2, 6, float('inf')]
labels = [0, 1, 2]
test['order'] = pd.cut(test['order'], bins=bins, labels=labels, include_lowest=True).astype(int)

# One-hot encode 'model photography' with the encoder fitted on the training data.
onehot_cols = onehot_enc.transform(test[['model photography']])
onehot_col_names = onehot_enc.get_feature_names_out(['model photography'])
# Build the one-hot frame on test's own index: concat(axis=1) aligns rows by
# index label, so a fresh RangeIndex would misalign rows (and introduce NaN)
# as soon as test's index is not the default 0..n-1.
onehot_df = pd.DataFrame(onehot_cols, columns=onehot_col_names, index=test.index)
test = pd.concat([test, onehot_df], axis=1)

# Drop the raw columns that have been replaced by engineered features.
test = test.drop(['price', 'page 1 (main category)', 'page 2 (clothing model)', 'model photography'], axis=1)

In [9]:
# Predict on the fully preprocessed test frame.
X_test = test
output = XGb.predict(X_test)
# Hard-coded expected number of predictions (presumably the row count of
# test.csv; verify against the competition data).
assert len(output) == 7695  # sanity check

# Write the predictions into the sample submission template and save it.
submission = pd.read_csv('sample_submission.csv').assign(Clicked=output)
submission.to_csv('submission.csv', index=False)