In [None]:
# %rm -rf phase2_module
# !git clone https://github.com/LgDNet/phase2_module.git
# !cd phase2_module && make copy

In [None]:
from phase2_module.src.utils.dataset import Data
from phase2_module.src.models.classification.models import Model
from phase2_module.src.base import BasePiepline
# from phase2_module.src.hyper_parameters.params_optimization import xgboost_opt, set_params_optimization_data
from phase2_module.src.utils.set_seed import seed_everything
from phase2_module.src.utils.top_score_instance import check_the_score

from phase2_module.src.utils.manage_pkl_files import PickleManager as pkl_manager
from phase2_module.src import preprocess
# Seed the run through the project helper before any estimator is created.
seed_everything()  # NOTE: default 42
# Project Model instance; the train section takes its estimator from it
# (e.g. `_model.decision_tree`).
_model = Model()

In [None]:
import pandas as pd
import numpy as np
import re, pickle
import warnings
from tqdm import tqdm
# Silence every warning for the rest of the notebook.
warnings.filterwarnings('ignore')


def set_up(mode=None):
    """Set pandas' ``display.max_columns`` option to ``mode``.

    With the default ``None`` pandas applies no column limit when a frame
    is displayed.
    """
    pd.options.display.max_columns = mode


set_up()

# Preprocessing

In [None]:
# Short aliases for the project's preprocessing steps; they are wired into
# `runtime_instance` in the Main section.
basic, country, customer = preprocess.Basic, preprocess.Country, preprocess.Customer
inquiry, product_category = preprocess.Inquiry, preprocess.ProductCategory
convert_ratio, customer2 = preprocess.Convert_ratio, preprocess.Customer2

# Main

In [None]:
# Load whatever PickleManager finds under its metadata directory
# (presumably the lookup tables passed as step params below - TODO confirm).
pkls = pkl_manager.map(pkl_manager.loads, pkl_manager.metadata_directory)

# Steps that need two lookup tables receive them merged into a single dict.
product_params = {**pkls.get("product_category"), **pkls.get("product_subcategory")}
ratio_params = {**pkls.get("lead_owner"), **pkls.get("customer_idx")}

# Pipeline description consumed by preprocess.runner: the two data splits plus
# the ordered list of steps (instance, method names to call, params or False).
runtime_instance = dict(
    Train=Data.train,
    Test=Data.test,
    Instances=[
        dict(instance=basic, method=['drop_duplicated'], params=False),
        dict(instance=country, method=['apply'], params=pkls.get("country")),
        dict(instance=inquiry, method=["apply"], params=False),
        dict(
            instance=product_category,
            method=["product_categories", 'fill_odds'],
            params=product_params,
        ),
        dict(instance=customer, method=["apply"], params=pkls.get("customer")),
        dict(
            instance=convert_ratio,
            method=["lead_owner", "customer_idx", 'lead_owner_customer_idx_means'],
            params=ratio_params,
        ),
        dict(
            instance=basic,
            method=['cusotmer_idx_categorization', 'customer_idx_merge_enterprise'],
            params=False,
        ),
        dict(instance=customer2, method=['seniority_level'], params=pkls.get("customer2")),
    ],
)

## preprocessing

In [None]:
# Run the configured pipeline once per split ('Train' first, then 'Test').
df_train, df_test = (
    preprocess.runner(split, **runtime_instance) for split in ('Train', 'Test')
)

In [None]:
# Fold the 'russia & cis' continent into 'asia & pacific' (training frame only).
is_russia_cis = df_train['continent'] == 'russia & cis'
df_train.loc[is_russia_cis, 'continent'] = 'asia & pacific'
# df_test contains no Russia rows, so it needs no change.

In [None]:
# Remove the rows whose business_unit is 'CM', then renumber the index from 0.
# Both steps mutate df_train in place.
cm_labels = df_train.index[df_train['business_unit'] == 'CM']
df_train.drop(index=cm_labels, inplace=True)
df_train.reset_index(drop=True, inplace=True)

In [None]:
# # The prev column does not have much effect either. Maybe drop it and use the historical column as-is..?

# cond = df_train['historical_existing_cnt']> 0 
# df_train.loc[cond,"prev"] = 1
# df_train['prev'] = df_train['prev'].fillna(0)

# cond = df_test['historical_existing_cnt']> 0 
# df_test.loc[cond,"prev"] = 1
# df_test['prev'] = df_test['prev'].fillna(0)

In [None]:
# # lead_owner_converted_ratio 구간화, 별로 안좋은 것 같다.

# # bin = [0.0001,10.0001,20.0001,30.0001,40.0001,50.0001,60.0001,70.0001,80.0001,90.0001,100.00,100.001]
# bin = [0.0001,10.0001,30.0001,50.0001,70.0001,90.0001,100.00,100.001]
# df_train.loc[:,'lead_owner_converted_ratio'] = pd.cut(df_train['lead_owner_converted_ratio'], bins=bin, right = False,labels= False)
# df_train['lead_owner_converted_ratio'] = df_train['lead_owner_converted_ratio']+1
# df_train['lead_owner_converted_ratio'].fillna(0.0,axis = 0, inplace  = True)
# # bin = [0.0001,10.0001,20.0001,30.0001,40.0001,50.0001,60.0001,70.0001,80.0001,90.0001,100.00,100.001]
# bin = [0.0001,10.0001,30.0001,50.0001,70.0001,90.0001,100.00,100.001]
# df_test.loc[:,'lead_owner_converted_ratio'] = pd.cut(df_test['lead_owner_converted_ratio'], bins=bin, right = False,labels= False)
# df_test['lead_owner_converted_ratio'] = df_test['lead_owner_converted_ratio']+1
# df_test['lead_owner_converted_ratio'].fillna(0.0,axis = 0, inplace  = True)

In [None]:
# Hand-made binary feature: 1.0 when new_expected_timeline is exactly
# 'Less than 3 Months', 0.0 otherwise (missing timelines also map to 0.0).
# The author's note says it seemed fairly useful (unverified).
#
# The flag is written with one column assignment per frame. The previous
# `df[col].fillna(0, inplace=True)` acted on a temporary Series, which leaves
# the frame untouched under pandas Copy-on-Write (FutureWarning in pandas 2.2,
# default behaviour in pandas 3).
for frame in (df_train, df_test):
    cond = frame['new_expected_timeline'] == 'Less than 3 Months'
    frame['less_timeline'] = cond.astype(float)

## encoding

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
class Encode:
    """Label / one-hot encoding helpers that share fitted encoders.

    Both encoders consult the module-level ``encoding_zip`` dict
    (``{"label": [...], "onehot": [...]}``). When the relevant list is empty
    the call is treated as the training pass: encoders are fitted and stored.
    Otherwise the stored encoders are reused to transform the frame, so the
    training frame must be encoded before the test frame.
    """

    def __init__(self):
        pass

    def label_encoder(self, df, columns):
        """Label-encode ``columns`` of ``df`` in place and return ``df``.

        Training pass (``encoding_zip['label']`` empty): one ``LabelEncoder``
        is fitted per column and appended to ``encoding_zip['label']`` in
        column order. A failure on a column is printed and the column is left
        unchanged; the encoder is still stored so positions stay aligned with
        ``columns``.

        Later passes: the encoder stored at the same position is reused.
        Labels it has not seen are appended to ``encoder.classes_`` first, so
        they receive new codes instead of raising.
        """
        if not encoding_zip['label']:
            for col in columns:  # train
                encoder = LabelEncoder()
                try:
                    # Plain column assignment replaces the column, so the
                    # result has the integer dtype of the codes.
                    # `df.loc[:, col] = ...` writes into the existing object
                    # column on pandas 2.x and keeps dtype object.
                    df[col] = encoder.fit_transform(df[col])
                except Exception as e:
                    # Best effort: report and continue with the next column.
                    print(e)
                    print(col)
                # Store the encoder (position == index of `col` in `columns`).
                encoding_zip['label'].append(encoder)
        else:  # test
            for idx, col in enumerate(columns):
                encoder = encoding_zip['label'][idx]

                # Extend the known classes with labels only present in this frame.
                for label in np.sort(df[col].unique()):
                    if label not in encoder.classes_:
                        encoder.classes_ = np.append(encoder.classes_, label)
                df[col] = encoder.transform(df[col])
        return df

    def convert_dataframe(self, df, onehot_arr, category_list, col_):
        """Return a new frame with ``col_`` replaced by its one-hot columns.

        ``onehot_arr`` holds one row per row of ``df``; ``category_list`` is
        the encoder's ``categories_``. The new columns are named
        ``OneHot_<category>`` and appended after the existing columns.
        """
        col_list = np.concatenate([i.flatten() for i in category_list])
        onehot_df = pd.DataFrame(
            onehot_arr,
            columns=[f"OneHot_{col_name}" for col_name in col_list],
            # Reuse df's index: with a fresh RangeIndex, concat(axis=1) would
            # align on labels and produce NaN-padded extra rows whenever df's
            # index is not 0..n-1.
            index=df.index,
        )
        return pd.concat([df, onehot_df], axis=1).drop(columns=col_)

    @staticmethod
    def _new_onehot_encoder():
        """Build a dense-output OneHotEncoder across scikit-learn versions."""
        try:
            # scikit-learn >= 1.2 spelling; `sparse` was removed in 1.4.
            return OneHotEncoder(sparse_output=False)
        except TypeError:
            return OneHotEncoder(sparse=False)

    def onehot_encoder(self, df, columns):
        """One-hot encode ``columns`` of ``df`` and return the resulting frame.

        Training pass (``encoding_zip['onehot']`` empty): a single encoder is
        fitted on all ``columns`` and stored. A failure is printed and ``df``
        is returned unchanged. Later passes reuse the stored encoder.
        """
        if not encoding_zip['onehot']:
            encoder = self._new_onehot_encoder()
            try:
                onehot_arr = encoder.fit_transform(df[columns])
                df = self.convert_dataframe(df, onehot_arr, encoder.categories_, columns)
            except Exception as e:
                # The handler used to reference an undefined name `col`, which
                # raised NameError and hid the real error.
                print(e, columns)
            # Store the encoder.
            encoding_zip['onehot'].append(encoder)
        else:
            encoder = encoding_zip['onehot'][0]
            onehot_arr = encoder.transform(df[columns])
            df = self.convert_dataframe(df, onehot_arr, encoder.categories_, columns)
        return df

In [None]:
encode = Encode()
# Encoder store read by the Encode methods. Both lists start empty, so the
# first label_encoder / onehot_encoder call fits and stores encoders and later
# calls reuse them. Re-running this cell resets the store.
encoding_zip = {"label":[], "onehot":[]}

In [None]:
# Columns that are label-encoded under their original names.
exisiting_columns = [
    'business_area', 'business_subarea', 'business_unit',
    'enterprise',
    # 'response_corporate', "customer_country",  # country
    # 'customer_type', 'customer_job', 'customer_position',  # customer
    # 'inquiry_type', 'expected_timeline',  # inquiry
    # 'product_subcategory', 'product_category', 'product_modelname',  # product_category
]
# Columns presumably created by the preprocessing steps (grouped per step).
preprocess_columns = [
    'country', 'continent',  # country
    'customer_type2', 'job_function', 'seniority_level',  # customer
    'new_expected_timeline', 'new_inquiry_type',  # inquiry
    'category_2', 'category_3', 'category_1',  # product_category
    # 'test2'
]

# Full list handed to Encode.label_encoder; order fixes the encoder positions.
label_encode_columns = [*exisiting_columns, *preprocess_columns]

In [None]:
# Fill missing values
def fillna(df):
    """Fill missing values of ``df`` in place and return the same frame.

    Text columns (object or pandas string dtype) get the placeholder
    ``'Space'``; every other column gets ``0``. Columns without missing
    values are left untouched.

    Each column is reassigned on the frame. The earlier
    ``df[col].fillna(..., inplace=True)`` modified a temporary Series, which
    does not reach the frame under pandas Copy-on-Write.
    """
    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            continue
        is_text = column.dtype == object or pd.api.types.is_string_dtype(column)
        df[col] = column.fillna('Space' if is_text else 0)
    return df
# fillna returns the frame it was given, so these rebinds keep the same objects.
df_train = fillna(df_train)
df_test = fillna(df_test)

In [None]:
# # one-hot encoding
# onehot_encode_columns = ['category_1']
# df_train = encode.onehot_encoder(df_train,onehot_encode_columns)
# df_test = encode.onehot_encoder(df_test,onehot_encode_columns)

In [None]:
# label encoding
# Order matters: the first call finds encoding_zip['label'] empty and fits one
# encoder per column on df_train; the second call reuses those encoders to
# transform df_test.
df_train = encode.label_encoder(df_train,label_encode_columns)
df_test = encode.label_encoder(df_test,label_encode_columns)

# Modeling

## drop column
모델링 전에 column을 drop하는 것이 다루기 쉽다고 판단하였음

In [None]:
# Columns removed from both frames before modeling. Commented-out entries are
# columns that are currently kept.
drop_col = [
    'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver', 'ver_pro',
    'customer_country.1',
    'response_corporate', "customer_country",  # country
    'customer_type', 'customer_job', 'customer_position',  # customer
    'inquiry_type', 'expected_timeline',  # inquiry
    'product_subcategory', 'product_category', 'product_modelname',  # product_category
    # 'cate_is_nan',  # product_category

    # 'com_reg_ver_win_rate', 'idit_strategic_ver', 'ver_cus',
    # 'ver_pro', 'ver_win_rate_x', 'ver_win_ratio_per_bu',
    'customer_idx',
    'customer_idx_converted_ratio',  # ratio
    'new_expected_timeline',
    # 'business_unit',
    # 'historical_existing_cnt',
    # 'lead_owner', 'lead_owner_converted_ratio', 'lead_owner_customer_idx_mean',  # lead_owner
]
for frame in (df_train, df_test):
    frame.drop(columns=drop_col, inplace=True)

## Scaler

In [None]:
import numpy as np
# Log transform: replace lead_desc_length with log(1 + x) in both frames.
for frame in (df_train, df_test):
    frame['lead_desc_length'] = np.log1p(frame['lead_desc_length'])

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the two ratio features. The scaler is fitted on the training
# frame only and then applied unchanged to the test frame.
ratio_features = ['lead_owner_converted_ratio', 'lead_owner_customer_idx_mean']
scaler = StandardScaler()
df_train[ratio_features] = scaler.fit_transform(df_train[ratio_features])
df_test[ratio_features] = scaler.transform(df_test[ratio_features])

## train

In [None]:
# Hyper-parameters applied to the decision-tree estimator taken from the
# project Model instance.
dict_ = dict(
    criterion='log_loss',
    max_depth=20,
    min_samples_split=7,
    min_samples_leaf=2,
    random_state=42,
)
model = _model.decision_tree
model.set_params(**dict_)

In [None]:
# model = _model.xgboost
# model.set_params(random_state =42)

In [None]:
# Target column, and the feature matrix made of every remaining column.
Y = df_train['is_converted']
X = df_train.drop(columns=['is_converted'])

In [None]:
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold cross-validation of `model` on (X, Y).
stratkfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
result = {"f1":[], "precision":[], "recall":[],'roc_auc_score':[]}        # validation scores per fold
train_result = {"f1":[], "precision":[], "recall":[],'roc_auc_score':[]}  # training scores per fold
models = []  # one independently fitted estimator per fold

# k-fold
for train_idx, test_idx in tqdm(stratkfold.split(X, Y), total=stratkfold.get_n_splits()):
    # split() yields positional indices, so both X and Y are sliced with .iloc.
    # Plain Y[train_idx] is label-based and only worked on a 0..n-1 index.
    x_train, x_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = Y.iloc[train_idx], Y.iloc[test_idx]

    # Fit a fresh copy per fold. Appending `model` itself stored the same
    # object every time, so every list entry was the last fold's fit.
    fold_model = clone(model)
    fold_model.fit(x_train, y_train)
    models.append(fold_model)
    train_pred = fold_model.predict(x_train)
    predict = fold_model.predict(x_test)

    score_result = check_the_score(train_pred, y_train)
    for name, score in score_result.items():
        train_result[name].append(score)

    score_result = check_the_score(predict, y_test)
    for name, score in score_result.items():
        result[name].append(score)

# Later cells read `model` (feature importances, test prediction). Keep it
# bound to the estimator fitted on the last fold, as it was before.
model = models[-1]

for title, scores in (
    ('----[K-Fold Train Score]-----', train_result),
    ('----[K-Fold Validation Score]-----', result),
):
    print(title)
    for name, score_list in scores.items():
        print(f'{name} score : {np.mean(score_list):.4f} / STD: (+/- {np.std(score_list):.4f})')

In [None]:
# Print every feature with the fitted model's importance, highest first.
feature_importances = model.feature_importances_
feature_names = x_train.columns
feature_importance_dict = {
    name: weight for name, weight in zip(feature_names, feature_importances)
}
sorted_importance_dict = dict(
    sorted(feature_importance_dict.items(), key=lambda pair: pair[1], reverse=True)
)
for feature_name, importance in sorted_importance_dict.items():
    print(f"{feature_name}: {importance}")

## test & submission

In [None]:
# Test features: everything except the row id and the (to-be-predicted) target.
test_drop = df_test.drop(columns=['id', 'is_converted'])

In [None]:
# Predict the test set and turn the labels into booleans
# (0 -> False, anything else -> True).
raw_pred = model.predict(test_drop)
pred = np.where(raw_pred != 0, True, False)

In [None]:
# final_predictions = np.array([model.predict(test_drop) for model in models])
# final_prediction = np.round(final_predictions.mean(axis=0))
# pred = np.where(final_prediction == 0, False, True)

In [None]:
# Summing the boolean predictions counts the rows predicted True.
sum(pred)

In [None]:
# Keep the current predictions under the name used by the comparison and
# ensemble cells below. The trailing numbers are presumably the positive counts
# of each run - TODO confirm.
# NOTE(review): pred1 and pred3 are not assigned in any visible cell; they seem
# to come from earlier manual runs, so the cells below fail on a fresh kernel.
pred2 = pred  #pred1 1000 | pred2 919 | pred3 820

In [None]:
# Number of positions where pred2 and pred1 disagree.
# NOTE(review): pred1 is not assigned in any visible cell - confirm where it
# comes from before relying on Restart & Run All.
su = sum(1 for i, j in zip(pred2, pred1) if i != j)
su

In [None]:
# Number of rows in the test frame (presumably shown as a reference for the
# prediction counts above).
len(df_test)

In [None]:
# Stack the three prediction vectors as columns and round the row-wise mean,
# i.e. a majority vote across the three runs.
# NOTE(review): combine_ is not read by any later visible cell (the submission
# cell writes `pred`), and pred1 / pred3 are not assigned in any visible cell.
combine = np.hstack([run.reshape(-1, 1) for run in (pred1, pred2, pred3)])
combine_ = np.round(combine.mean(axis=1))

In [None]:
# Write the boolean predictions into the test frame and export the whole frame
# as the submission file.
pred = np.where(pred != 0, True, False)
df_test['is_converted'] = pred
df_test.to_csv("submission.csv", index=False)