### Import

In [14]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import duckdb

import plotly.express as px

import category_encoders as ce
from sklearn.ensemble import AdaBoostClassifier

In [15]:
# 동일한 결과 보장을 위해 Seed값을 고정합니다
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed를 42로 고정

In [34]:
train_path = '/Users/leekwanghui/Downloads/open-4/train.csv'

con = duckdb.connect()

df = con.query(f"""(SELECT *
                        FROM read_csv_auto('{train_path}')
                        WHERE Click = 0
                        ORDER BY random()
                        LIMIT 30000)
                        UNION ALL
                        (SELECT *
                        FROM read_csv_auto('{train_path}')
                        WHERE Click = 1
                        ORDER BY random()
                        LIMIT 30000)""").df()


con.close()

df.head()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,ID,Click,F01,F02,F03,F04,F05,F06,F07,F08,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,TRAIN_21140362,0,NWJHOLF,HCCBXQY,KVMAIVS,48.0,FKNTJIU,277,FFQDCNA,VAWXMCR,...,WWWEXSL,GTISJWW,5944.0,0.0,TFCIGHU,IRUDRFB,6.0,ODMETKK,0.0,JNAGMUQ
1,TRAIN_23852031,0,KJYBRUL,KCRRSKA,,2.0,EXQALLT,1,SALCDDS,FTPHMPQ,...,UVUMPZU,GTISJWW,11653.0,0.0,WCXAPAA,IRUDRFB,5.0,HFOQMFF,0.0,OLDNZBK
2,TRAIN_09278441,0,BIBZAHM,RZPPOZI,UKXCUGQ,1.0,LWZCVHD,1121,XPPYGSM,FTPHMPQ,...,IOJKKVX,GTISJWW,30172.0,0.0,XEWNKRK,IRUDRFB,,CTQCTSA,0.0,DOFQKJN
3,TRAIN_03765377,0,JCDXFYU,PILDDJU,IAGJDOH,9.0,LFPUEOV,2,PQZBVMG,OFKQGTY,...,UWZSZTZ,KHZNEZF,548.0,2.0,QMOULXS,IRUDRFB,4.0,LLWQGWG,0.0,ZDXPYZS
4,TRAIN_05261449,0,IIHVFTE,CPNINFY,IAGJDOH,425.0,BDBOJIO,2,OUJPSEV,FTPHMPQ,...,NZGEZLW,GTISJWW,243.0,0.0,QWFSOQZ,IRUDRFB,15.0,RMIWYEY,0.0,QHGYBWW


In [19]:
test = pd.read_csv('/Users/leekwanghui/Downloads/open-4/test.csv')

### EDA 1 : Sparse and Dense

In [31]:
train.head()

AttributeError: 'str' object has no attribute 'head'

### EDA 2 : Imbalance

In [36]:
click = train['Click'].value_counts(normalize=True)

click_figure = px.bar(click,
             x=['Not Clicked : 0', 'Clicked : 1'],
             y=click.values.tolist(),
             labels={'x': 'Value', 'y': 'Percentage'},
             width = 450,
             height = 500
            )

# 그래프 표시
click_figure.show()

TypeError: string indices must be integers, not 'str'

### Data Preprocessing 1 : Select x, y

In [33]:
train_x = train.drop(columns=['ID', 'Click'])
train_y = train['Click']

test_x = test.drop(columns=['ID'])

AttributeError: 'str' object has no attribute 'drop'

### Data Preprocessing 2 : Fill NaN

In [24]:
for col in tqdm(train_x.columns):
    if train_x[col].isnull().sum() != 0:
        train_x[col].fillna(0, inplace=True)
        test_x[col].fillna(0, inplace=True)

NameError: name 'train_x' is not defined

### Data Preprocessing 3 : Count Encoding

In [23]:
encoding_target = list(train_x.dtypes[train_x.dtypes == "object"].index)

enc = ce.CountEncoder(cols = encoding_target).fit(train_x, train_y)
X_train_encoded = enc.transform(train_x)
X_test_encoded = enc.transform(test_x)

NameError: name 'train_x' is not defined

### Model Setting

In [20]:
model = AdaBoostClassifier()

### Model Train and Inference

In [22]:
model.fit(X_train_encoded, train_y)

NameError: name 'X_train_encoded' is not defined

In [11]:
pred = model.predict_proba(X_test_encoded)
display(model.classes_)
display(pred)

array([0, 1])

array([[0.5045921 , 0.4954079 ],
       [0.50813471, 0.49186529],
       [0.50709193, 0.49290807],
       ...,
       [0.50982663, 0.49017337],
       [0.50648163, 0.49351837],
       [0.50587473, 0.49412527]])

### Submission

In [12]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission

Unnamed: 0,ID,Click
0,TEST_0000000,0
1,TEST_0000001,0
2,TEST_0000002,0
3,TEST_0000003,0
4,TEST_0000004,0
...,...,...
4538536,TEST_4538536,0
4538537,TEST_4538537,0
4538538,TEST_4538538,0
4538539,TEST_4538539,0


In [13]:
sample_submission['Click'] = pred[:,1]
sample_submission

Unnamed: 0,ID,Click
0,TEST_0000000,0.495408
1,TEST_0000001,0.491865
2,TEST_0000002,0.492908
3,TEST_0000003,0.493060
4,TEST_0000004,0.498137
...,...,...
4538536,TEST_4538536,0.492723
4538537,TEST_4538537,0.496791
4538538,TEST_4538538,0.490173
4538539,TEST_4538539,0.493518


In [14]:
sample_submission.to_csv('baseline_submission.csv', index=False)