In [14]:
# %rm -rf phase2_module
# !git clone https://github.com/LgDNet/phase2_module.git
# !cd phase2_module && make copy

Cloning into 'phase2_module'...
remote: Enumerating objects: 287, done.[K
remote: Counting objects: 100% (287/287), done.[K
remote: Compressing objects: 100% (194/194), done.[K
remote: Total 287 (delta 129), reused 224 (delta 69), pack-reused 0[K
Receiving objects: 100% (287/287), 1.16 MiB | 3.97 MiB/s, done.
Resolving deltas: 100% (129/129), done.
sh ./scripts/copy_data
transfering data train, test


In [87]:
from phase2_module.src.utils.dataset import Data
from phase2_module.src.models.classification.models import Model
from phase2_module.src.base import BasePiepline
# from phase2_module.src.hyper_parameters.params_optimization import xgboost_opt, set_params_optimization_data
from phase2_module.src.utils.set_seed import seed_everything
from phase2_module.src.utils.top_score_instance import check_the_score

from phase2_module.src.utils.manage_pkl_files import PickleManager as pkl_manager
from phase2_module.src import preprocess
# Call the project's seeding helper first so it runs before the model
# container below is built (per the existing note it defaults to seed 42).
seed_everything()  # NOTE: default 42
# Project model container; an estimator is taken from it later via
# `_model.decision_tree`.
_model = Model()

In [88]:
import pandas as pd
import numpy as np
import re, pickle
import warnings
from tqdm import tqdm
# Install a blanket "ignore" filter so no warning is printed from here on.
warnings.filterwarnings("ignore")


def set_up(mode=None):
    """Set the pandas ``display.max_columns`` option.

    Args:
        mode: Value stored in ``display.max_columns``. The default ``None``
            tells pandas not to truncate the number of displayed columns.
    """
    pd.options.display.max_columns = mode


set_up()

# Preprocessing

In [89]:
# Short aliases for the preprocessing steps exposed by the project's
# `preprocess` package. They are passed as the "instance" entries of
# `runtime_instance` further down.
# NOTE(review): whether these are classes or ready-made objects is not visible
# from this notebook - confirm against `phase2_module.src.preprocess`.
basic = preprocess.Basic
country = preprocess.Country
customer = preprocess.Customer
inquiry = preprocess.Inquiry
product_category = preprocess.ProductCategory

# Main

In [95]:
# Load the pickled metadata through the project's pickle manager.
# NOTE(review): the code below calls `pkls.get(<name>)`, so the result is
# presumably dict-like and keyed by step name - confirm in PickleManager.
pkls = pkl_manager.map(pkl_manager.loads, pkl_manager.metadata_directory)

# Keyword arguments unpacked into `preprocess.runner(...)`:
#   "Train" / "Test" : the two datasets taken from the project `Data` object.
#   "Instances"      : ordered list of preprocessing steps. Each entry names the
#                      step ("instance"), the method names to run ("method")
#                      and its parameters ("params": `False` or a `pkls` entry).
# NOTE(review): `pkls.get(...)` may yield a missing-key default rather than
# raising; how the runner treats that is not visible here.
runtime_instance = {
    "Train": Data.train,
    "Test": Data.test,
    "Instances":[
        {"instance": basic, "method": ['drop_duplicated'], "params": False},
        {"instance": country, "method": ['apply'], "params": pkls.get("country")},
        {"instance": inquiry, "method": ["apply"], "params": False},
        {"instance": product_category, "method": ["apply"], "params": pkls.get("product_category")},
        {"instance": customer, "method": ["apply"], "params": pkls.get("customer")},
    ]
}

## preprocessing

In [96]:
# Run the preprocessing configuration once per dataset. The first argument
# matches the "Train" / "Test" keys of `runtime_instance`.
# NOTE(review): presumably it selects which dataset the runner processes -
# confirm in `preprocess.runner`.
df_train = preprocess.runner('Train',**runtime_instance)
df_test = preprocess.runner('Test',**runtime_instance)

## encoding

In [97]:
from sklearn.preprocessing import LabelEncoder
class Encode:
    """Label-encode categorical columns, sharing fitted encoders via ``encoding_zip``.

    Fitted encoders are kept in the notebook-level ``encoding_zip['label']``
    list, one per column and in column order. The first call (empty list) fits
    new encoders; every later call re-uses the stored ones. Reset the list to
    ``[]`` before fitting again, e.g. when re-running cells on a live kernel.
    """

    def label_encoder(self, df, columns):
        """Replace each column in ``columns`` with integer codes, in place.

        Args:
            df: DataFrame to encode. It is modified in place and also returned.
            columns: Column names to encode. Encoders are matched to columns by
                position, so the order must be the same on the fitting call and
                on every later call.

        Returns:
            The same ``df`` object with the listed columns encoded.

        Raises:
            ValueError: If an encoder cannot be fitted on one of the columns.
            IndexError: If more columns are passed than encoders were fitted.
        """
        encoders = encoding_zip['label']

        if not encoders:  # fit mode (train)
            # Fit everything before touching `df` or the registry, so a failure
            # never leaves a half-encoded frame or an unfitted encoder behind.
            fitted = []
            for col in columns:
                encoder = LabelEncoder()
                try:
                    codes = encoder.fit_transform(df[col])
                except Exception as e:
                    raise ValueError(
                        f"label encoding failed for column {col!r}: {e}"
                    ) from e
                fitted.append((col, encoder, codes))

            for col, encoder, codes in fitted:
                # Plain column assignment replaces the column, so it takes the
                # integer dtype of the codes instead of keeping the old dtype.
                df[col] = codes
                # store the encoder
                encoders.append(encoder)
            return df

        # transform mode (test)
        if len(columns) > len(encoders):
            raise IndexError(
                f"{len(columns)} columns were passed but only {len(encoders)} "
                "fitted encoders are stored in encoding_zip['label']"
            )

        for encoder, col in zip(encoders, columns):
            known = set(encoder.classes_)
            # Labels never seen while fitting get fresh codes appended after
            # the known ones, in sorted order.
            unseen = [
                label for label in np.sort(df[col].unique()) if label not in known
            ]
            if unseen:
                # NOTE(review): the extended `classes_` is no longer globally
                # sorted. Confirm `LabelEncoder.transform` accepts that for the
                # dtypes encoded here (string labels in this notebook).
                encoder.classes_ = np.append(encoder.classes_, unseen)
            df[col] = encoder.transform(df[col])

        return df

In [98]:
encode = Encode()
# Shared registry read and filled by `Encode.label_encoder`. The "label" list
# must be empty for the first (fitting) call, because the method fits only when
# the list is empty and re-uses the stored encoders otherwise.
# "onehot" is not referenced by the code visible in this notebook.
encoding_zip = {"label":[], "onehot":[]}

In [99]:
# Columns to label-encode, in two groups; both lists are concatenated into
# `label_encode_columns` in the next cell.
# `exisiting_columns`: columns encoded under their original names.
# NOTE(review): two of the commented-out entries below end without a trailing
# comma ('customer_position' and "customer_country"). Add one if they are
# re-enabled, otherwise the adjacent string literals are concatenated.
exisiting_columns = ['business_area','business_subarea','business_unit',
#                         'customer_type','customer_job','customer_position'
#                         'inquiry_type','expected_timeline',
                        'enterprise',
#                         'response_corporate',"customer_country"
#                         'product_subcategory','product_category',
                       ] 
# `preprocess_columns`: columns grouped by the preprocessing step named in each
# trailing comment.
preprocess_columns = ['category_1','category_2', # product_category
                      'new_inquiry_type','new_expected_timeline', # inquiry
                     'country','continent', # country
                     'customer_type2','job_function','seniority_level' # customer
                     ]

In [100]:
# Full, ordered list of columns to label-encode. The order matters because
# `Encode.label_encoder` matches stored encoders to columns by position.
label_encode_columns = exisiting_columns + preprocess_columns

In [101]:
# Fill missing values
def fillna(df):
    """Fill missing values in place: ``'ETC'`` for text columns, ``0`` otherwise.

    Args:
        df: DataFrame to clean. Columns without missing values are untouched.

    Returns:
        The same ``df`` object, with missing values filled.
    """
    for col in df.columns:
        series = df[col]
        if not series.isna().any():
            continue
        # Treat both object columns and pandas string-dtype columns as text.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        # Assign the filled column back instead of calling
        # `df[col].fillna(..., inplace=True)`: the chained in-place form works
        # on an intermediate object and is not guaranteed to update `df`
        # (it does nothing under pandas Copy-on-Write).
        df[col] = series.fillna('ETC' if is_text else 0)
    return df
# Fill the missing values of both frames. `fillna` returns the frame it was
# given, so the names are simply re-bound to the same objects.
df_train = fillna(df_train)
df_test = fillna(df_test)

In [103]:
# `label_encoder` fits when `encoding_zip['label']` is empty and re-uses the
# stored encoders otherwise, so the train frame must be encoded first and the
# test frame second.
df_train = encode.label_encoder(df_train, label_encode_columns)
df_test = encode.label_encoder(df_test, label_encode_columns)

# Modeling

## drop column
모델링 전에 column을 drop하는 편이 다루기 쉽다고 판단하였음.

In [104]:
# Columns removed from both frames before modelling, grouped by the
# preprocessing step named in each trailing comment.
drop_col = [
    'id_strategic_ver', 'it_strategic_ver',
    'customer_country.1', 'response_corporate', "customer_country",  # country
    'customer_type', 'customer_job', 'customer_position',  # customer
    'inquiry_type', 'expected_timeline',  # inquiry
    'product_subcategory', 'product_category', 'product_modelname',  # product_category
    'category_3', 'cate_is_nan', 'mapped',  # product_category
    # 'historical_existing_cnt',
]

# Drop in place, train frame first and test frame second.
for frame in (df_train, df_test):
    frame.drop(columns=drop_col, inplace=True)

## train

In [106]:
# Take the decision-tree estimator from the project model container.
# NOTE(review): whether each attribute access returns a fresh estimator or a
# shared one is not visible here - confirm in the project `Model` class.
model = _model.decision_tree

In [107]:
# Target column first, then every remaining column as the feature matrix.
Y = df_train['is_converted']
X = df_train.drop(columns=["is_converted"])

In [111]:
# Replace the index of both objects with a fresh 0..n-1 range, in place,
# discarding the previous index.
for part in (X, Y):
    part.reset_index(drop=True, inplace=True)

In [112]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified cross-validation with a fixed shuffle seed.
stratkfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Per-fold scores: `result` holds validation scores, `train_result` train scores.
result = {"f1":[], "precision":[], "recall":[]}
train_result = {"f1":[], "precision":[], "recall":[]}

# k-fold
# `split` is a generator, so tqdm needs the fold count to show a real bar.
for train_idx, test_idx in tqdm(stratkfold.split(X, Y), total=stratkfold.get_n_splits()):
    # `split` yields positional indices, so select by position on both X and Y.
    # `Y[train_idx]` would be a label lookup and is only correct while the
    # index is a default 0..n-1 range.
    x_train, x_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = Y.iloc[train_idx], Y.iloc[test_idx]

    # train the model
    # NOTE(review): the same `model` object is refitted on every fold, so after
    # the loop it holds the fit from the last fold only. The prediction cell
    # below uses that object. Confirm whether a final fit on all of X / Y is
    # intended before predicting.
    model.fit(x_train, y_train)

    train_pred = model.predict(x_train)
    predict = model.predict(x_test)

    # Score the train and validation predictions with the same helper.
    for scores, y_pred, y_true in (
        (train_result, train_pred, y_train),
        (result, predict, y_test),
    ):
        for name, score in check_the_score(y_pred, y_true).items():
            scores[name].append(score)

# Report mean and standard deviation over the folds for each metric.
for title, scores in (
    ('----[K-Fold Train Score]-----', train_result),
    ('----[K-Fold Validation Score]-----', result),
):
    print(title)
    for name, score_list in scores.items():
        print(f'{name} score : {np.mean(score_list):.4f} / STD: (+/- {np.std(score_list):.4f})')

5it [00:02,  1.71it/s]

----[K-Fold Train Score]-----
f1 score : 0.9979 / STD: (+/- 0.0003)
precision score : 0.9961 / STD: (+/- 0.0007)
recall score : 0.9997 / STD: (+/- 0.0003)
----[K-Fold Validation Score]-----
f1 score : 0.7719 / STD: (+/- 0.0054)
precision score : 0.7818 / STD: (+/- 0.0120)
recall score : 0.7625 / STD: (+/- 0.0085)





## test & submission

In [113]:
# Feature frame for prediction: the test frame without its id and target columns.
test_drop = df_test.drop(columns=['id', 'is_converted'])

In [114]:
# Predict, then map the raw output to booleans: 0 -> False, anything else -> True.
pred = np.where(model.predict(test_drop) != 0, True, False)

In [122]:
# Overwrite the target column of the test frame with the boolean predictions.
df_test['is_converted'] = pred

In [123]:
# Write the whole test frame, predictions included, to the working directory
# without the index column.
df_test.to_csv("submission.csv",index = False)