In [1]:
import pandas as pd
from pycaret.classification import *

# Explicit imports are kept after the wildcard import so it cannot shadow them.
from time import time
from typing import Dict

# Competition tables: training rows, test rows and the submission template.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sub = pd.read_csv('data/sample_submission.csv')

# Attribute code tables. Each CSV is read with its first column as the index,
# transposed, and converted to a nested dict keyed by that index.
d_code, h_code, l_code = (
    pd.read_csv(code_path, index_col=0).T.to_dict()
    for code_path in (
        'data/속성_D_코드.csv',
        'data/속성_H_코드.csv',
        'data/속성_L_코드.csv',
    )
)

# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)
train['target'] = train['target'].astype('category')

# Columns removed from both frames (the frames are modified in place).
dropped_columns = [
    'id',
    'person_rn',
    'person_prefer_f',
    'person_prefer_g',
    'contents_rn',
    'contents_open_dt',
]
for data in (train, test):
    data.drop(columns=dropped_columns, inplace=True)

In [2]:
# Display (rows, columns) of both frames after the column drop.
(train.shape, test.shape)

((501951, 29), (46404, 28))

In [3]:
def _expand_code_levels(
    df: pd.DataFrame,
    source_cols: list,
    code_table: Dict[int, Dict[str, int]],
    level_keys: list,
) -> pd.DataFrame:
    """Append one column per hierarchy level for each source column, then drop the sources.

    Args:
        df: Frame to extend. New columns are written onto this object.
        source_cols: Names of the columns holding raw codes.
        code_table: Mapping ``code -> {level column name -> level value}``.
        level_keys: Ordered ``(suffix, level column name)`` pairs; each new column
            is named ``f"{source_col}_{suffix}"``.

    Returns:
        A new frame with the level columns appended and ``source_cols`` removed.

    Raises:
        KeyError: If a source column contains a code absent from ``code_table``,
            or a table row lacks one of the requested level column names.
    """
    # Flatten the nested table into one {code: value} lookup per level, once per
    # table, so each column is filled by a single vectorised Series.map call.
    level_lookups = [
        (suffix, {code: levels[key] for code, levels in code_table.items()})
        for suffix, key in level_keys
    ]
    known_codes = list(code_table)

    for source_col in source_cols:
        codes = df[source_col]
        # Series.map would silently yield NaN for unknown codes; fail loudly
        # instead, as a direct dictionary lookup would.
        unknown = codes[~codes.isin(known_codes)]
        if not unknown.empty:
            raise KeyError(
                f"column {source_col!r} contains codes missing from the code table: "
                f"{unknown.unique()[:5].tolist()}"
            )
        for suffix, lookup in level_lookups:
            df[f'{source_col}_{suffix}'] = codes.map(lookup)

    return df.drop(source_cols, axis=1)


def add_code(
    df: pd.DataFrame,
    d_code: Dict[int, Dict[str, int]],
    h_code: Dict[int, Dict[str, int]],
    l_code: Dict[int, Dict[str, int]],
) -> pd.DataFrame:
    """Replace raw D/H/L attribute codes with their hierarchy-level codes.

    Every D, H and L code column is expanded into one column per hierarchy level
    found in the matching code table, and the raw code column is dropped.
    Column suffixes: ``_n`` (세분류), ``_s`` (소분류), ``_m`` (중분류), ``_l`` (대분류).
    New columns are appended in the order D, H, L.

    Args:
        df: Input frame; it is copied and never modified.
        d_code: Attribute D table, ``code -> {level column name -> value}``.
        h_code: Attribute H table (대분류 and 중분류 levels are read).
        l_code: Attribute L table.

    Returns:
        A new frame with the expanded level columns.

    Raises:
        KeyError: If a code present in ``df`` is missing from its code table.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # D Code
    df = _expand_code_levels(
        df,
        ['person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3', 'contents_attribute_d'],
        d_code,
        [
            ('n', '속성 D 세분류코드'),
            ('s', '속성 D 소분류코드'),
            ('m', '속성 D 중분류코드'),
            ('l', '속성 D 대분류코드'),
        ],
    )

    # H Code
    df = _expand_code_levels(
        df,
        ['person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3', 'contents_attribute_h'],
        h_code,
        [
            ('l', '속성 H 대분류코드'),
            ('m', '속성 H 중분류코드'),
        ],
    )

    # L Code
    df = _expand_code_levels(
        df,
        ['contents_attribute_l'],
        l_code,
        [
            ('n', '속성 L 세분류코드'),
            ('s', '속성 L 소분류코드'),
            ('m', '속성 L 중분류코드'),
            ('l', '속성 L 대분류코드'),
        ],
    )

    return df

# Expand the attribute codes of both frames with the same three lookup tables.
train, test = (
    add_code(frame, d_code, h_code, l_code)
    for frame in (train, test)
)

# Columns whose name contains 'match' are collected as categorical; every other
# column except the label is collected as numeric.
cat_features = [col for col in train.columns if 'match' in col]
num_features = [
    col for col in train.columns
    if col not in cat_features + ['target']
]

# The lookup tables are not referenced again below; unbind them.
del d_code, h_code, l_code

In [4]:
cell_start_time = time()

# Every column except the label is declared numeric for PyCaret.
# NOTE(review): this includes the 'match' columns collected in cat_features — confirm intended.
feature_columns = [col for col in train.columns if col != 'target']

param_dict = dict(
    data=train,
    target='target',
    train_size=0.80,
    numeric_features=feature_columns,
    data_split_shuffle=True,
    fold=10,
    fold_shuffle=True,
    n_jobs=-1,
    verbose=True,
    silent=True,
    session_id=42,
)
# Only unbinds the name: param_dict['data'] still references the frame.
del train

exp = setup(**param_dict)
cell_end_time = time()
# The reported time covers setup() only; create_model() below is not included.
print("CELL RUN TIME : ", cell_end_time - cell_start_time)

lgb = create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6148,0.6603,0.6662,0.6038,0.6335,0.2297,0.2309
1,0.6162,0.6616,0.6658,0.6054,0.6342,0.2325,0.2337
2,0.613,0.66,0.6641,0.6022,0.6316,0.2261,0.2273
3,0.6127,0.6586,0.6638,0.6019,0.6313,0.2254,0.2266
4,0.6146,0.6643,0.6694,0.6029,0.6344,0.2292,0.2306
5,0.608,0.6527,0.6615,0.5972,0.6277,0.2161,0.2173
6,0.6126,0.6587,0.6667,0.6013,0.6323,0.2253,0.2267
7,0.6147,0.6596,0.6695,0.603,0.6345,0.2294,0.2308
8,0.6138,0.6608,0.6684,0.6022,0.6336,0.2277,0.2291
9,0.6134,0.661,0.6678,0.602,0.6332,0.2269,0.2283


In [6]:
# Tune the LightGBM model, optimising F1 with the Optuna search library.
t_lgb = tune_model(
    lgb,
    optimize='F1',
    search_library='optuna',
    early_stopping='Median',
    choose_better=True,
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6227,0.6721,0.6717,0.6115,0.6402,0.2456,0.2468
1,0.624,0.6728,0.6727,0.6127,0.6413,0.248,0.2492
2,0.6221,0.6709,0.674,0.6103,0.6406,0.2443,0.2456
3,0.62,0.6693,0.6698,0.6087,0.6378,0.24,0.2412
4,0.6241,0.6756,0.6769,0.6119,0.6428,0.2483,0.2498
5,0.6192,0.666,0.6722,0.6074,0.6382,0.2384,0.2398
6,0.6246,0.6708,0.6755,0.6128,0.6426,0.2493,0.2507
7,0.6225,0.671,0.676,0.6103,0.6415,0.2451,0.2465
8,0.6251,0.6725,0.6745,0.6135,0.6425,0.2502,0.2514
9,0.6209,0.6715,0.6735,0.609,0.6396,0.2418,0.2431


In [7]:
# Ensemble the tuned model, comparing on F1.
bag_lgb = ensemble_model(
    t_lgb,
    optimize='F1',
    choose_better=True,
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.624,0.6718,0.6728,0.6126,0.6413,0.248,0.2492
1,0.624,0.6728,0.671,0.613,0.6407,0.2481,0.2492
2,0.6223,0.6704,0.6732,0.6107,0.6404,0.2447,0.246
3,0.6217,0.6695,0.6707,0.6105,0.6392,0.2435,0.2447
4,0.6232,0.6758,0.6756,0.6112,0.6418,0.2465,0.2479
5,0.619,0.6655,0.6681,0.608,0.6367,0.2381,0.2393
6,0.6216,0.6696,0.6725,0.61,0.6397,0.2433,0.2446
7,0.6225,0.671,0.6727,0.611,0.6404,0.2452,0.2464
8,0.6247,0.6715,0.6739,0.6132,0.6421,0.2495,0.2507
9,0.6219,0.6721,0.6725,0.6103,0.6399,0.2438,0.2451


In [8]:
# Same ensemble call as for bag_lgb, with probability_threshold set to 0.6.
bag_lgb_2 = ensemble_model(
    t_lgb,
    optimize='F1',
    choose_better=True,
    probability_threshold=0.6,
)

IntProgress(value=0, description='Processing: ', max=6)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC


In [None]:
# Persist the three fitted models under distinct names.
# NOTE(review): the recorded output of the cell that assigns bag_lgb_2 is an empty
# score table — confirm that cell completed before relying on the third save.
save_model(t_lgb, 'tuned_lgb')
save_model(bag_lgb, 'bagged_lgb')
save_model(bag_lgb_2, 'bagged_lgb_0.6')

In [None]:
import os

# WARNING: powers the machine off as soon as this cell runs.
# The flags follow the Windows `shutdown` syntax (-s: shut down, -t 0: no delay).
shutdown_command = 'shutdown -s -t 0'
os.system(shutdown_command)