### Import

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import plotly.express as px

import category_encoders as ce
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Fix the seed values so that repeated runs produce the same results.
def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both the stdlib ``random`` module and NumPy's global generator
    with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # pin the seed to 42

In [None]:
# Load the provided train and test data.
# Only the first 15,000,000 rows of train.csv are read (nrows cap).
train = pd.read_csv('train.csv', nrows=15_000_000)
test = pd.read_csv('test.csv')

### EDA 1 : Sparse and Dense

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

### EDA 2 : Imbalance

In [None]:
# Class balance of the target: fraction of rows for each 'Click' value.
# value_counts() orders its result by frequency, not by class value, so the
# order is pinned to [0, 1] here. Without this the hard-coded bar labels below
# could be attached to the wrong class, or differ in length from the values if
# one class is absent from the loaded rows.
# NOTE(review): assumes 'Click' only takes the values 0 and 1, as the labels
# below imply — confirm against the dataset.
click = train['Click'].value_counts(normalize=True).reindex([0, 1], fill_value=0.0)

# NOTE(review): the y values are fractions in [0, 1] although the axis label
# says 'Percentage'; left unchanged here.
click_figure = px.bar(click,
             x=['Not Clicked : 0', 'Clicked : 1'],
             y=click.values.tolist(),
             labels={'x': 'Value', 'y': 'Percentage'},
             width = 450,
             height = 500
            )

# Show the chart
click_figure.show()

### Data Preprocessing 1 : Select x, y

In [None]:
# Separate the target from the features; 'ID' is dropped from both feature frames.
train_y = train['Click']
train_x = train.drop(['ID', 'Click'], axis=1)

test_x = test.drop(['ID'], axis=1)

### Data Preprocessing 2 : Fill NaN

In [None]:
# Replace missing values with 0 in every column that has NaNs in the training
# features, and apply the same fill to the matching test column.
#
# The filled column is assigned back instead of calling
# `train_x[col].fillna(0, inplace=True)`: an in-place call on a column selection
# is chained assignment, which warns on recent pandas and does not update the
# parent frame under Copy-on-Write.
#
# NOTE(review): a test column is only filled when the same training column has
# NaNs (unchanged from the original logic) — confirm that test-only NaNs cannot
# occur, or fill test_x unconditionally.
for col in tqdm(train_x.columns):
    if train_x[col].isnull().any():
        train_x[col] = train_x[col].fillna(0)
        test_x[col] = test_x[col].fillna(0)

### Data Preprocessing 3 : Count Encoding

In [None]:
# Columns to encode: every feature column whose dtype compares equal to "object".
encoding_target = [name for name, dtype in train_x.dtypes.items() if dtype == "object"]

# Fit the count encoder on the training features (train_y is passed along to fit),
# then apply the same fitted encoder to both the train and test features.
enc = ce.CountEncoder(cols=encoding_target)
enc = enc.fit(train_x, train_y)
X_train_encoded, X_test_encoded = enc.transform(train_x), enc.transform(test_x)

### Model Setting

In [None]:
# Base estimators for the ensemble, all created with their default settings.
ada_boosting, random_forest, gradient_boosting = (
    AdaBoostClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
)

In [None]:
# Define the VotingClassifier that combines the three base estimators
# using 'soft' voting.
voting_clf_soft = VotingClassifier(
    voting='soft',
    estimators=[
        ('ada', ada_boosting),
        ('rf', random_forest),
        ('gb', gradient_boosting),
    ],
)

In [None]:
# Define the VotingClassifier that combines the same three base estimators
# using 'hard' voting.
voting_clf_hard = VotingClassifier(
    voting='hard',
    estimators=[
        ('ada', ada_boosting),
        ('rf', random_forest),
        ('gb', gradient_boosting),
    ],
)

### Model Train and Inference

In [None]:
# Train the soft-voting ensemble on the encoded training features and the target.
voting_clf_soft.fit(X=X_train_encoded, y=train_y)

In [None]:
# voting_clf_hard.fit(X_train_encoded, train_y)  -- intentionally not executed

In [None]:
# Predict class probabilities for the encoded test features, then show the
# classifier's class labels followed by the probability array.
pred_soft = voting_clf_soft.predict_proba(X_test_encoded)
display(voting_clf_soft.classes_, pred_soft)

### Submission

In [None]:
# Load the sample submission file and show it as the cell output.
sample_submission = pd.read_csv(filepath_or_buffer='sample_submission.csv')
sample_submission

In [None]:
# Store the second probability column (index 1) of the predictions as 'Click'.
# NOTE(review): this presumes column 1 corresponds to the positive class —
# verify against the displayed `voting_clf_soft.classes_`.
sample_submission = sample_submission.assign(Click=pred_soft[:, 1])
sample_submission

In [None]:
# Write the submission to CSV without the DataFrame index column.
sample_submission.to_csv(path_or_buf='baseline_submission_soft7.csv', index=False)