In [1]:
# The star import comes first so that the explicit imports below take
# precedence over any same-named objects it exports.
from pycaret.classification import *

from time import time
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd

# Competition files: training set, test set and the sample submission template.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv')
)

# Code tables for attributes D, H and L (one CSV per attribute).
code_d, code_h, code_l = (
    pd.read_csv(csv_path)
    for csv_path in ('data/속성_D_코드.csv', 'data/속성_H_코드.csv', 'data/속성_L_코드.csv')
)

# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
# Assign English column names to the three code tables.
# The first column of each table is the code value that merge_codes joins on.
code_d.columns = [
    "attribute_d",
    "attribute_d_d",
    "attribute_d_s",
    "attribute_d_m",
    "attribute_d_l",
]
code_h.columns = [
    "attribute_h",
    "attribute_h_p",
    "attribute_h_l",
]
code_l.columns = [
    "attribute_l",
    "attribute_l_d",
    "attribute_l_s",
    "attribute_l_m",
    "attribute_l_l",
]

In [3]:
def merge_codes(df:pd.DataFrame,df_code:pd.DataFrame,col:str)->pd.DataFrame:
    """Left-join a code lookup table onto ``df`` using ``col`` as the key.

    The first column of ``df_code`` is treated as the join key and is renamed
    to ``col``; every other column is renamed to ``f"{col}_{original_name}"``
    so that the same lookup table can be merged several times under different
    key columns without name clashes.

    Args:
        df: Frame that contains the key column ``col``.
        df_code: Lookup table whose first column holds the code values.
        col: Name of the key column in ``df``.

    Returns:
        A new frame with the prefixed lookup columns appended. Neither input
        frame is modified. Rows of ``df`` without a match get missing values
        in the appended columns (left join).
    """
    # Build the new labels positionally and apply them through the public API.
    # Writing into ``columns.values`` mutates the Index's backing array behind
    # pandas' back and has no effect at all when ``.values`` returns a copy.
    new_labels = [col] + [f"{col}_{name}" for name in df_code.columns[1:]]
    lookup = df_code.set_axis(new_labels, axis=1)
    return pd.merge(df, lookup, how="left", on=col)

In [4]:
def preprocess_data(
                    df:pd.DataFrame,is_train:bool = True, cols_merge:Optional[List[Tuple[str,pd.DataFrame]]] = None  , cols_equi:Optional[List[Tuple[str,str]]]= None ,
                    cols_drop:Optional[List[str]] = None
                    )->Tuple[pd.DataFrame,Optional[np.ndarray]]:
    """Turn a raw frame into model features (and labels for training data).

    Steps, in order:
      1. If ``is_train``, split off the ``"target"`` column as the label array.
      2. Merge each lookup table in ``cols_merge`` via ``merge_codes``.
      3. Cast boolean columns to integers.
      4. For each pair in ``cols_equi`` add a 0/1 column named
         ``f"{col1}_{col2}"`` that is 1 where the two columns are equal.
      5. Drop the columns listed in ``cols_drop``.

    Args:
        df: Raw input frame; it is copied, never modified.
        is_train: Whether ``df`` contains a ``"target"`` column to extract.
        cols_merge: ``(key column, lookup table)`` pairs. ``None`` means no
            merges.
        cols_equi: ``(column, column)`` pairs to compare. ``None`` means no
            comparisons.
        cols_drop: Columns to remove at the end. ``None`` selects
            ``["id", "person_prefer_f", "person_prefer_g", "contents_open_dt"]``.

    Returns:
        ``(features, labels)`` where ``labels`` is ``None`` when ``is_train``
        is false.

    Raises:
        KeyError: If ``"target"`` (when ``is_train``), a compared column or a
            column in ``cols_drop`` is missing from the frame.
    """
    # None sentinels instead of list defaults: default list objects are shared
    # between all calls of the function.
    if cols_merge is None:
        cols_merge = []
    if cols_equi is None:
        cols_equi = []
    if cols_drop is None:
        cols_drop = ["id","person_prefer_f","person_prefer_g" ,"contents_open_dt"]

    # Work on a copy because the bool cast below assigns into the frame.
    df = df.copy()

    y_data = None
    if is_train:
        y_data = df["target"].to_numpy()
        df = df.drop(columns="target")

    for col, df_code in cols_merge:
        df = merge_codes(df,df_code,col)

    bool_cols = df.select_dtypes(bool).columns.tolist()
    if bool_cols:
        df[bool_cols] = df[bool_cols].astype(int)

    # Equality flags are built before dropping so that columns listed in
    # cols_drop can still be used as comparison inputs.
    for col1, col2 in cols_equi:
        df[f"{col1}_{col2}"] = (df[col1] == df[col2] ).astype(int)

    df = df.drop(columns=cols_drop)
    return (df , y_data)

In [5]:
# (key column, code table) pairs: each key column is joined with its code
# table so the small / middle / large category codes become extra columns.
cols_merge = [
    *(
        (key_col, code_d)
        for key_col in (
            "person_prefer_d_1",
            "person_prefer_d_2",
            "person_prefer_d_3",
            "contents_attribute_d",
        )
    ),
    *(
        (key_col, code_h)
        for key_col in (
            "person_prefer_h_1",
            "person_prefer_h_2",
            "person_prefer_h_3",
            "contents_attribute_h",
        )
    ),
    ("contents_attribute_l", code_l),
]

# Column pairs for which a 0/1 "same code" flag is created, comparing a
# person attribute with the matching contents attribute.
# NOTE(review): person_prefer_d_1 has no entries here; presumably the existing
# d_*_match_yn columns already cover it - confirm.
cols_equi = [
    # raw C and E codes
    ("contents_attribute_c", "person_prefer_c"),
    ("contents_attribute_e", "person_prefer_e"),
    # attribute D, 2nd preference
    ("person_prefer_d_2_attribute_d_s", "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_2_attribute_d_m", "contents_attribute_d_attribute_d_m"),
    ("person_prefer_d_2_attribute_d_l", "contents_attribute_d_attribute_d_l"),
    # attribute D, 3rd preference
    ("person_prefer_d_3_attribute_d_s", "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_3_attribute_d_m", "contents_attribute_d_attribute_d_m"),
    ("person_prefer_d_3_attribute_d_l", "contents_attribute_d_attribute_d_l"),
    # attribute H, all three preferences
    ("person_prefer_h_1_attribute_h_p", "contents_attribute_h_attribute_h_p"),
    ("person_prefer_h_2_attribute_h_p", "contents_attribute_h_attribute_h_p"),
    ("person_prefer_h_3_attribute_h_p", "contents_attribute_h_attribute_h_p"),
]

# Columns that are not needed for training.
cols_drop = [
    "id",
    "person_prefer_f",
    "person_prefer_g",
    "contents_open_dt",
    "contents_rn",
]

In [6]:
# Apply the same merge / equality / drop configuration to both data sets.
x_train, y_train = preprocess_data(
    train,
    cols_merge=cols_merge,
    cols_equi=cols_equi,
    cols_drop=cols_drop,
)
x_test, _ = preprocess_data(
    test,
    is_train=False,
    cols_merge=cols_merge,
    cols_equi=cols_equi,
    cols_drop=cols_drop,
)

# Append the label as the last column; setup() below takes features and
# target from a single frame.
x_train.insert(x_train.shape[1], 'target', y_train)

# The raw frames are no longer referenced after this point.
del train, test

x_train.shape, y_train.shape, x_test.shape

((501951, 69), (501951,), (46404, 68))

In [7]:
# Columns with at most this many distinct values are treated as categorical.
MAX_CATEGORICAL_LEVELS = 10

# Distinct-value count per feature. The target is excluded up front so the
# split does not depend on how many distinct values the target itself has
# (list.remove('target') raises ValueError when it has more than the limit).
feature_cardinality = x_train.nunique().drop('target')

cat_features = feature_cardinality.index[feature_cardinality <= MAX_CATEGORICAL_LEVELS].tolist()

# Every remaining feature is numeric; a set keeps the membership test cheap.
cat_lookup = set(cat_features)
num_features = [name for name in feature_cardinality.index if name not in cat_lookup]

In [8]:
# Keyword arguments forwarded to pycaret's setup().
param_dict = {
    # data and target column
    'data': x_train,
    'target': 'target',
    # train / hold-out split
    'train_size': 0.80,
    # explicit feature typing computed in the previous cell
    'categorical_features': cat_features,
    'numeric_features': num_features,
    'data_split_shuffle': True,
    # cross-validation
    'fold': 5,
    'fold_strategy': 'kfold',
    'fold_shuffle': True,
    # runtime behaviour
    'n_jobs': -1,
    'verbose': True,
    'silent': True,
    'session_id': 42
}

# Time the experiment initialisation.
cell_start_time = time()
exp = setup(**param_dict)
cell_end_time = time()
print("CELL RUN TIME : ", cell_end_time - cell_start_time)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,target
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(501951, 69)"
5,Missing Values,False
6,Numeric Features,40
7,Categorical Features,28
8,Ordinal Features,False
9,High Cardinality Features,False


CELL RUN TIME :  61.2105598449707


In [9]:
# Train a CatBoost classifier with the experiment's cross-validation settings,
# then persist it under the name 'cat'.
cat = create_model(estimator='catboost')
save_model(model=cat, model_name='cat')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6205,0.6698,0.6607,0.6114,0.6351,0.241,0.2418
1,0.6205,0.6689,0.6602,0.6106,0.6344,0.2412,0.242
2,0.6182,0.6662,0.6558,0.6101,0.6321,0.2364,0.2371
3,0.62,0.6692,0.6587,0.6108,0.6338,0.2401,0.2409
4,0.6232,0.6696,0.6622,0.6141,0.6373,0.2463,0.2471
Mean,0.6205,0.6688,0.6595,0.6114,0.6345,0.241,0.2418
SD,0.0016,0.0013,0.0022,0.0014,0.0017,0.0032,0.0032


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=['d_l_match_yn',
                                                             'd_m_match_yn',
                                                             'd_s_match_yn',
                                                             'h_l_match_yn',
                                                             'h_m_match_yn',
                                                             'h_s_match_yn',
                                                             'person_attribute_a',
                                                             'person_attribute_a_1',
                                                             'person_attribute_b',
                                                             'person_prefer_c',
                                                             'contents_attribute_i',
                                                             'contents_

In [None]:
# Hyper-parameter search with Optuna, optimising F1; choose_better=True is
# passed so the search result is compared with the input model.
t_cat = tune_model(
    estimator=cat,
    optimize='F1',
    search_library='optuna',
    early_stopping='Median',
    choose_better=True,
)
save_model(model=t_cat, model_name='tuned_cat')

In [None]:
# Ensemble the tuned model, again selecting on F1, and save the result.
bag_cat = ensemble_model(
    estimator=t_cat,
    choose_better=True,
    optimize='F1',
)
save_model(model=bag_cat, model_name='bagged_cat')

In [None]:
# Same ensemble as the previous cell but with a probability threshold of 0.6.
bag_cat_2 = ensemble_model(
    estimator=t_cat,
    choose_better=True,
    optimize='F1',
    probability_threshold=0.6,
)
save_model(model=bag_cat_2, model_name='bagged_cat_0.6')