In [15]:
# %rm -rf phase2_module
# !git clone https://github.com/LgDNet/phase2_module.git
# !cd phase2_module && make copy

In [57]:
from phase2_module.src.utils.dataset import Data
from phase2_module.src.models.classification.models import Model
from phase2_module.src.base import BasePiepline
# from phase2_module.src.hyper_parameters.params_optimization import xgboost_opt, set_params_optimization_data
from phase2_module.src.utils.set_seed import seed_everything
from phase2_module.src.utils.top_score_instance import check_the_score

from phase2_module.src.utils.manage_pkl_files import PickleManager as pkl_manager
from phase2_module.src import preprocess
seed_everything()  # NOTE: default 42
_model = Model()

In [58]:
import pandas as pd
import numpy as np
import re, pickle
import warnings
from tqdm import tqdm
warnings.filterwarnings(action='ignore')
def set_up(mode= None):
    pd.set_option('display.max_columns', mode)
set_up()

# Preprocessing

In [59]:
basic = preprocess.Basic
country = preprocess.Country
customer = preprocess.Customer
inquiry = preprocess.Inquiry
product_category = preprocess.ProductCategory
convert_ratio = preprocess.Convert_ratio

# Main

In [60]:
pkls = pkl_manager.map(pkl_manager.loads, pkl_manager.metadata_directory)

runtime_instance = {
    "Train": Data.train,
    "Test": Data.test,
    "Instances":[
        {"instance": basic, "method": ['drop_duplicated'], "params": False},
        {"instance": country, "method": ['apply'], "params": pkls.get("country")},
        {"instance": inquiry, "method": ["apply"], "params": False},
        {"instance": product_category, "method": ["apply"], "params": {**pkls.get("product_category"), **pkls.get("product_subcategory")}},
        {"instance": customer, "method": ["apply"], "params": pkls.get("customer")},
        {"instance": convert_ratio, "method": ["lead_owner","customer_idx",'lead_owner_customer_idx_means'], "params": {**pkls.get("lead_owner"), **pkls.get("customer_idx")}}
    ]
}

## preprocessing

In [61]:
df_train = preprocess.runner('Train',**runtime_instance)
df_test = preprocess.runner('Test',**runtime_instance)

In [62]:
customer_idx_counts = df_train['customer_idx'].value_counts()
bins = [0, 2, 10, 100, 1000, 10000]
df_counts = pd.cut(customer_idx_counts, bins=bins, labels=False, right=False)
df_train['customer_idx_group'] = df_train['customer_idx'].map(df_counts)

customer_idx_counts = df_test['customer_idx'].value_counts()
bins = [0, 2, 10, 100, 1000, 10000]
df_counts = pd.cut(customer_idx_counts, bins=bins, labels=False, right=False)
df_test['customer_idx_group'] = df_test['customer_idx'].map(df_counts)

## encoding

In [66]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
class Encode:
    def __init__(self):
        pass
    
    def label_encoder(self,df,columns):
        if not encoding_zip['label']:
            for col in columns: # train
                encoder = LabelEncoder()
                try:
                    df.loc[:,col] = encoder.fit_transform(df[col])
                except Exception as e:
                    print(e)
                    print(col)
                # 인코더 저장
                encoding_zip['label'].append(encoder)
        else: # test
            for idx, col in enumerate(columns):
                encoder = encoding_zip['label'][idx]
                
                for label in np.sort(df[col].unique()):
                    if label not in encoder.classes_:
                        encoder.classes_ = np.append(encoder.classes_,label)
                df.loc[:,col] = encoder.transform(df[col])
        return df
    
    def convert_dataframe(self,df, onehot_arr, category_list, col_):
        col_list = np.concatenate([i.flatten() for i in category_list])    
        onehot_df = pd.DataFrame(onehot_arr, columns=[f"OneHot_{col_name}"\
                                          for col_name in col_list])
        df = pd.concat([df,onehot_df],axis = 1)
        df.drop(col_,axis = 1, inplace = True)
        return df
    
    def onehot_encoder(self,df,columns):
        if not encoding_zip['onehot']:
            encoder = OneHotEncoder(sparse=False)
            try:
                onehot_arr = encoder.fit_transform(df[columns])
                df = self.convert_dataframe(df,onehot_arr,encoder.categories_,columns)
            except Exception as e:
                print(e, col)
            # 인코더 저장
            encoding_zip['onehot'].append(encoder)
        else:
            encoder = encoding_zip['onehot'][0]
            onehot_arr = encoder.transform(df[columns])
            df = self.convert_dataframe(df,onehot_arr,encoder.categories_,columns)
        return df

In [67]:
encode = Encode()
encoding_zip = {"label":[], "onehot":[]}

In [68]:
exisiting_columns = [
                     'business_area','business_subarea','business_unit',
                        'enterprise',
#                         'response_corporate',"customer_country", # country
#                         'customer_type','customer_job','customer_position', #customer
#                         'inquiry_type','expected_timeline', # inquiry
#                         'product_subcategory','product_category','product_modelname' # product_category
                       ] 
preprocess_columns = [
                     'country','continent', # country
                     'customer_type2','job_function','seniority_level', # customer
                     'new_expected_timeline', 'new_inquiry_type',# inquiry
                     'category_1','category_2','category_3', # product_category
#                      'test2'
                     ]

In [69]:
label_encode_columns = exisiting_columns + preprocess_columns

In [70]:
# 결측치 채우기
def fillna(df):
    for col in df.columns:
        if df[col].isna().sum():
            if df[col].dtypes =='O':
                df[col].fillna('Space', inplace = True)
            else:
                df[col].fillna(0,inplace = True)
    return df
df_train = fillna(df_train)
df_test = fillna(df_test)

In [71]:
# one-hot encoding
# onehot_encode_columns = ['new_inquiry_type']
# df_train = encode.onehot_encoder(df_train,onehot_encode_columns)
# df_test = encode.onehot_encoder(df_test,onehot_encode_columns)

NameError: name 'onehot_encode_columns' is not defined

In [None]:
# label encoding
df_train = encode.label_encoder(df_train,label_encode_columns)
df_test = encode.label_encoder(df_test,label_encode_columns)

# Modeling

## drop column
모델링 전이 column을 drop하면서 다루기가 쉽다고 판단하였음

In [14]:
drop_col = [
            'id_strategic_ver','it_strategic_ver',
            'customer_country.1',
    
            'response_corporate',"customer_country", # country
            'customer_type','customer_job','customer_position', # customer
            'inquiry_type', 'expected_timeline', #inquiry
            'product_subcategory','product_category','product_modelname', # product_category
#             'cate_is_nan', # product_category
    
#             'com_reg_ver_win_rate','idit_strategic_ver','ver_cus',
#             'ver_pro','ver_win_rate_x','ver_win_ratio_per_bu',
            'customer_idx',
            'customer_idx_converted_ratio', # ratio
            'idit_strategic_ver','ver_pro', # test
            ]
df_train.drop(drop_col,axis = 1, inplace = True)

df_test.drop(drop_col,axis = 1, inplace = True)

## Scaler

In [15]:
import numpy as np
# log 변환
df_train['lead_desc_length'] = np.log1p(df_train['lead_desc_length'])
df_test['lead_desc_length'] = np.log1p(df_test['lead_desc_length'])

In [16]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_train[['lead_owner_converted_ratio','lead_owner_customer_idx_mean']] = scaler.fit_transform(df_train[['lead_owner_converted_ratio','lead_owner_customer_idx_mean']])
df_test[['lead_owner_converted_ratio','lead_owner_customer_idx_mean']] = scaler.transform(df_test[['lead_owner_converted_ratio','lead_owner_customer_idx_mean']])
# df_test['lead_desc_length'] = scale.transform(df_test['lead_desc_length'].values.reshape(-1, 1))

## train

In [17]:
model = _model.decision_tree

In [154]:
# dict_ = {'criterion': 'entropy', 'max_depth': 14, 'min_samples_split': 17, 'min_samples_leaf': 4, 'random_state': 42}

In [18]:
dict_ = {'criterion': 'log_loss', 'max_depth': 20, 'min_samples_split': 7, 'min_samples_leaf': 2, 'random_state': 42}

In [19]:
model.set_params(**dict_)

In [20]:
X = df_train.drop(["is_converted"],axis = 1)
Y = df_train['is_converted']

In [21]:
from sklearn.model_selection import StratifiedKFold
stratkfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
result = {"f1":[], "precision":[], "recall":[],'roc_auc_score':[]}
train_result = {"f1":[], "precision":[], "recall":[],'roc_auc_score':[]}
models = [] # 모델을 저장할 리스트

# k-fold
for train_idx, test_idx in tqdm(stratkfold.split(X,Y)):
    x_train, x_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = Y[train_idx], Y[test_idx]

    # 모델 훈련
    model.fit(x_train, y_train)
    models.append(model) # 훈련된 모델을 리스트에 추가
    train_pred = model.predict(x_train)
    predict = model.predict(x_test)

    score_result = check_the_score(train_pred, y_train)
    for name, score in score_result.items():
        train_result[name].append(score)

    score_result = check_the_score(predict, y_test)
    for name, score in score_result.items():
        result[name].append(score)

print('----[K-Fold Train Score]-----')
for name, score_list in train_result.items():
    print(f'{name} score : {np.mean(score_list):.4f} / STD: (+/- {np.std(score_list):.4f})')

print('----[K-Fold Validation Score]-----')
for name, score_list in result.items():
    print(f'{name} score : {np.mean(score_list):.4f} / STD: (+/- {np.std(score_list):.4f})')

5it [00:01,  3.78it/s]

----[K-Fold Train Score]-----
f1 score : 0.9353 / STD: (+/- 0.0025)
precision score : 0.9107 / STD: (+/- 0.0034)
recall score : 0.9613 / STD: (+/- 0.0057)
roc_auc_score score : 0.9766 / STD: (+/- 0.0028)
----[K-Fold Validation Score]-----
f1 score : 0.8164 / STD: (+/- 0.0047)
precision score : 0.7981 / STD: (+/- 0.0030)
recall score : 0.8356 / STD: (+/- 0.0091)
roc_auc_score score : 0.9087 / STD: (+/- 0.0046)





In [None]:
# customer_idx 없을 때
----[K-Fold Train Score]-----
f1 score : 0.8873 / STD: (+/- 0.0056)
precision score : 0.8529 / STD: (+/- 0.0046)
recall score : 0.9247 / STD: (+/- 0.0096)
roc_auc_score score : 0.9557 / STD: (+/- 0.0049)
----[K-Fold Validation Score]-----
f1 score : 0.6427 / STD: (+/- 0.0162)
precision score : 0.6208 / STD: (+/- 0.0200)
recall score : 0.6665 / STD: (+/- 0.0139)
roc_auc_score score : 0.8162 / STD: (+/- 0.0076)
# 있고 범주화 할 때
----[K-Fold Train Score]-----
f1 score : 0.9353 / STD: (+/- 0.0025)
precision score : 0.9107 / STD: (+/- 0.0034)
recall score : 0.9613 / STD: (+/- 0.0057)
roc_auc_score score : 0.9766 / STD: (+/- 0.0028)
----[K-Fold Validation Score]-----
f1 score : 0.8165 / STD: (+/- 0.0042)
precision score : 0.7987 / STD: (+/- 0.0030)
recall score : 0.8351 / STD: (+/- 0.0089)
roc_auc_score score : 0.9085 / STD: (+/- 0.0044)

In [22]:
feature_importances = model.feature_importances_
feature_names = x_train.columns
feature_importance_dict = dict(zip(feature_names, feature_importances))
sorted_importance_dict = {k: v for k, v in sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True)}
for feature_name, importance in sorted_importance_dict.items():
    print(f"{feature_name}: {importance}")

lead_owner_converted_ratio: 0.46197724754689223
historical_existing_cnt: 0.18956320807548532
customer_idx_group: 0.1050286291456926
lead_desc_length: 0.03618786410715335
customer_type2: 0.027376297886678242
lead_owner_customer_idx_mean: 0.023367490925963474
business_subarea: 0.020690323174166858
lead_owner: 0.015898539762671127
job_function: 0.015177007724198915
enterprise: 0.011936021766097215
seniority_level: 0.011136260497499328
country: 0.01098848356238286
category_3: 0.010007004918218222
com_reg_ver_win_rate: 0.008355111564064235
customer_interest: 0.007845818405747017
bant_submit: 0.007327869077312415
category_2: 0.0064443994749919025
new_expected_timeline: 0.005003023412746248
business_area: 0.004523153304656697
business_unit: 0.004397153554186661
ver_win_rate_x: 0.004257659003341533
continent: 0.004018362132046825
ver_win_ratio_per_bu: 0.0033383156871584717
category_1: 0.002432031694874875
ver_cus: 0.0018816615994445495
new_inquiry_type: 0.0008410619963286626


## test & submission

In [53]:
test_drop = df_test.drop(['id','is_converted'],axis = 1)

In [54]:
# pred = model.predict(test_drop)
# pred = np.where(pred == 0, False, True)

In [31]:
# 
final_predictions = np.array([model.predict(test_drop) for model in models])
final_prediction = np.round(final_predictions.mean(axis=0))
pred = np.where(final_prediction == 0, False, True)

True

In [51]:
sum(pred)

900

In [55]:
pred1 = pred  #pred1 903 | 986

In [56]:
su = 0
for i,j in zip(pred1,pred2):
    if i != j:
        su +=1
su

0

In [238]:
df_test['is_converted'] = pred
df_test.to_csv("submission.csv",index = False)