In [47]:
# !git clone https://github.com/LgDNet/phase2_module.git

In [48]:
# !cd phase2_module && make copy

In [350]:
from phase2_module.data.load import Data
from phase2_module.src.models.classification.models import Model
from phase2_module.src.base import BasePiepline
# from phase2_module.src.hyper_parameters.params_optimization import xgboost_opt, set_params_optimization_data
from phase2_module.src.utils.set_seed import seed_everything
from phase2_module.src.utils.top_score_instance import check_the_score
# Presumably seeds the random number generators used downstream (project helper) — confirm in phase2_module.
seed_everything()  # NOTE: default 42
# Project model container; its `decision_tree` attribute is picked as the estimator in the modeling section.
_model = Model()

In [351]:
import pandas as pd
import numpy as np
import re, pickle
import warnings
from tqdm import tqdm
warnings.filterwarnings(action='ignore')

# Preprocessing

In [352]:
class Inquiry:
    """Normalise the free-text `inquiry_type` and `expected_timeline` columns.

    `_all` adds two columns to the frame it receives:

    * `new_inquiry_type`     - one of the canonical inquiry labels, or "Other".
    * `new_expected_timeline` - one of `self.categories`, or "Unknown".

    All methods that take a DataFrame modify it in place and also return it.
    """

    def __init__(self):
        self.inquiry_type = {}

        consulation = {
            "Quotation": "Quotation or Purchase Consultation",  # quotation or purchase consultation
            "Sales": "Quotation or Purchase Consultation",
            "Product": "Quotation or Purchase Consultation",
            "Purchase": "Quotation or Purchase Consultation",
            "Event": "Quotation or Purchase Consultation",
            "Partner": "Quotation or Purchase Consultation",
        }

        demo = {
            "Demo": "Request a Demo",  # request a demo
        }

        oem_odm = {
            "Oem": "OEM/ODM Request",
            "Odm": "OEM/ODM Request",
        }

        technic = {
            "Usage": "Usage or Technical Consultation",  # usage or technical consultation
            "Technical": "Usage or Technical Consultation",
            "Trainings": "Usage or Technical Consultation",
            "Services": "Usage or Technical Consultation",
            "Suggestions": "Usage or Technical Consultation",
        }

        distributorship = {
            "Distributorship": "Request for Distributorship",  # request to a distributor
        }

        others = {
            "Other": "Other",
            "Etc": "Other",
        }

        self.inquiry_category = [consulation, demo, oem_odm, technic, distributorship, others]

        # Flatten the per-category tables into one keyword -> label lookup.
        for category in self.inquiry_category:
            self.inquiry_type.update(category)

        # Case-insensitive keyword patterns: anchored at the start, and anywhere in the text.
        # Pattern order follows the insertion order of `self.inquiry_type`; the first hit wins.
        self.start_patterns = [re.compile(f"^{key}", re.IGNORECASE) for key in self.inquiry_type]
        self.exists_patterns = [re.compile(key, re.IGNORECASE) for key in self.inquiry_type]

        self.expected_timeline_dict = {
            "Follow": "Follow up",
            "Already": "Follow up",
            "Respond": "No Response",
            "Response": "No Response",
            "Required": "No Response",
            "Requirement": "No Response",
            "Budget": "Budget Issue",
            "Interest": "Not Interest",
        }
        self.expected_timeline_exists_patterns = [
            re.compile(key, re.IGNORECASE) for key in self.expected_timeline_dict
        ]

        # Labels that survive the final filter in `_all`; everything else becomes "Unknown".
        # The last two entries are the exact spellings returned by `less_value_categorial`
        # and `more_value_categorial`. Without them the case-sensitive `isin` check in
        # `_all` silently turned those freshly mapped values into "Unknown".
        self.categories = [
            'Less than 3 Months',
            '3 months ~ 6 months',
            'more than a year',
            '9 months ~ 1 year',
            '6 months ~ 9 months',
            'Follow up',
            '3 Months ~ 6 Months',
            'No Response',
            'Budget Issue',
            'Not Interest',
            "Space",
            '6 Months ~ 9 Months',
            'More than a year',
        ]

    def fill(self, df):
        """Replace missing `inquiry_type` values with "-" and return the frame."""
        # Assign the result back: calling fillna(inplace=True) on `df[col]` is chained
        # assignment, which pandas deprecates and which does nothing under Copy-on-Write.
        df["inquiry_type"] = df["inquiry_type"].fillna("-")
        return df

    def new_inquiry_type(self, old_inquiry_type):
        """Map a raw inquiry string to a canonical label by the keyword it starts with.

        Returns "Other" when the stripped text starts with none of the known keywords.
        """
        text = old_inquiry_type.strip()
        for pattern in self.start_patterns:
            find = pattern.search(text)
            if find:
                return self.inquiry_type.get(find.group().capitalize())
        return "Other"

    def retry_unknown_value_mapping(self, new_inquiry_type, old_inquiry_type):
        """Second pass for rows labelled "Other": look for a keyword anywhere in the text.

        Returns the label of the first matching keyword, "-" when nothing matches, and
        `new_inquiry_type` unchanged when it was not "Other" to begin with.
        """
        if new_inquiry_type != "Other":
            return new_inquiry_type

        text = old_inquiry_type.strip()
        for pattern in self.exists_patterns:
            find = pattern.search(text)
            if find:
                return self.inquiry_type.get(find.group().capitalize())
        return "-"

    def convert_timeline_in_tilda(self, timeline):
        """Normalise "a~b" ranges to "a ~ b", replacing underscores with spaces.

        Strings without "~" are returned untouched. Only the first two "~"-separated
        parts are kept.
        """
        if "~" in timeline:
            timeline = timeline.replace("_", " ")
            split_timeline = timeline.split("~")
            return split_timeline[0].strip() + " ~ " + split_timeline[1].strip()
        return timeline

    def less_value_categorial(self, timeline):
        """Bucket "less ... <N>" phrases by the first 1-2 digit number after "less".

        N <= 3 -> "Less than 3 Months", N <= 6 -> "3 Months ~ 6 Months",
        N <= 9 -> "6 Months ~ 9 Months". Anything else (no lower-case "less", no number,
        or N > 9) is returned unchanged.
        """
        scope = (
            (3, "Less than 3 Months"),
            (6, "3 Months ~ 6 Months"),
            (9, "6 Months ~ 9 Months"),
        )

        if "less" not in timeline:
            return timeline

        find = re.search(r'\d{1,2}', timeline[timeline.index("less"):])
        if find is None:
            return timeline

        numeric_value = int(find.group())
        for upper_bound, label in scope:
            if numeric_value <= upper_bound:
                return label
        return timeline

    def more_value_categorial(self, timeline):
        """Bucket "more ... <N>" phrases by the first 1-2 digit number after "more".

        N >= 9 -> "More than a year", N >= 6 -> "6 Months ~ 9 Months",
        N >= 3 -> "3 Months ~ 6 Months", otherwise "Less than 3 Months".
        Strings without a lower-case "more" followed by a number are returned with
        underscores replaced by spaces.
        """
        # Checked from the largest threshold down; the first one that fits wins.
        scope = (
            (9, "More than a year"),
            (6, "6 Months ~ 9 Months"),
            (3, "3 Months ~ 6 Months"),
            (0, "Less than 3 Months"),
        )

        find = None
        if "more" in timeline:
            find = re.search(r'\d{1,2}', timeline[timeline.index("more"):])

        if find:
            numeric_value = int(find.group())
            for lower_bound, label in scope:
                if numeric_value >= lower_bound:
                    return label
        return timeline.replace("_", " ")

    def timeline_retry_unknown_value_mapping(self, new_expected_timeline):
        """Map free-text timeline remarks to a status label via keyword search.

        Returns the label of the first keyword from `expected_timeline_dict` found in
        the stripped text, or the input unchanged when none is found.
        """
        text = new_expected_timeline.strip()
        for pattern in self.expected_timeline_exists_patterns:
            find = pattern.search(text)
            if find:
                return self.expected_timeline_dict.get(find.group().capitalize())

        return new_expected_timeline

    def _all(self, df, module_list=None):
        """Run the full inquiry/timeline normalisation on `df`.

        `module_list` is accepted so the signature matches the other preprocessing
        classes; it is not used here.
        """
        df = self.fill(df)
        df["new_inquiry_type"] = df["inquiry_type"].apply(self.new_inquiry_type)
        # Pairwise iteration instead of df.apply(axis=1): same result, and it also
        # behaves on an empty frame.
        df["new_inquiry_type"] = [
            self.retry_unknown_value_mapping(new_value, old_value)
            for new_value, old_value in zip(df["new_inquiry_type"], df["inquiry_type"])
        ]

        # Rows that matched no keyword in either pass fall back to "Other".
        df.loc[df["new_inquiry_type"] == "-", "new_inquiry_type"] = "Other"
        df["expected_timeline"] = df["expected_timeline"].fillna("Space")

        timeline = df["expected_timeline"].apply(self.convert_timeline_in_tilda)
        timeline = timeline.apply(self.less_value_categorial)
        timeline = timeline.apply(self.more_value_categorial)
        timeline = timeline.apply(self.timeline_retry_unknown_value_mapping)

        # Anything that is still not a known category is collapsed into "Unknown".
        df["new_expected_timeline"] = timeline.where(timeline.isin(self.categories), 'Unknown')

        return df

In [353]:
class Country:
    """Derive `country` and `city` columns from the `customer_country` text.

    Parameters
    ----------
    pkl : mapping
        Lookup passed to `Series.map` to translate `response_corporate` into a
        country name. It is used as the fallback when no country can be extracted.

    All methods modify the given DataFrame in place and also return it. The helper
    column `response_corporate2` is left in the frame.
    """

    def __init__(self, pkl):
        # Extracted values that are not real country names: either re-mapped to the
        # proper country or to 'none' (which later triggers the corporate fallback).
        self.area = {'anguilla':'united kingdom',
                'br':'none',
                'a':'none',
                'nd':'none',
                'ny':'none',
                'rj':'none',
                'us':'united states',
                'ca':'none',
                'kerela':'india',
                'pune':'india',
                'colombia - cartagena':'colombia',
                'country':'none'}
        self.pkl = pkl

    def country(self, df):
        """Create the `country` column.

        Also lower-cases and strips `customer_country` in place and adds
        `response_corporate2` (the mapped, lower-cased corporate country).
        """
        # Map the responsible corporate office to its country.
        df['response_corporate2'] = df['response_corporate'].map(self.pkl)

        df['customer_country'] = df['customer_country'].str.lower().str.strip()
        df['response_corporate2'] = df['response_corporate2'].str.lower().str.strip()

        # Keep only the text after the last "/" as the country candidate.
        df['country'] = df['customer_country'].str.extract(r'/([^/]+)$')[0].str.strip()

        # Candidates containing a digit are not countries; missing ones are unknown too.
        df['country'] = df['country'].apply(lambda x: 'none' if re.search(r'\d', str(x)) else x)
        # Assign back instead of fillna(inplace=True) on `df[col]`: the chained in-place
        # form is deprecated and has no effect under pandas Copy-on-Write.
        df['country'] = df['country'].fillna('none')

        # Repair the known odd values.
        for raw_value, fixed_value in self.area.items():
            df.loc[df['country'] == raw_value, 'country'] = fixed_value

        # Fall back to the corporate office's country where none was found.
        cond = df['country'] == 'none'
        df.loc[cond, 'country'] = df.loc[cond, 'response_corporate2']
        return df

    def city(self, df):
        """Create the `city` column from the second-to-last "/"-separated part.

        Empty or missing parts become 'others'.
        """
        df['city'] = df['customer_country'].str.split('/').str[-2].str.strip()

        # Empty city -> 'others'.
        df.loc[df['city'] == '', 'city'] = 'others'

        # Missing city (fewer than two parts, or missing input) -> 'others'.
        df['city'] = df['city'].fillna('others')
        return df

    def _all(self, df, module_list):
        """Apply the named methods of this class to `df`, in order.

        `module_list` is a method name or a list of method names (e.g.
        ['country', 'city']). Raises ValueError when it is empty or None.
        """
        if not module_list:
            raise ValueError("Not used modules")

        if not isinstance(module_list, list):
            module_list = [module_list]

        for module in module_list:
            df = getattr(self, module)(df)

        return df

In [354]:
class ProductCategory:
    """Score each row by which product fields are missing (`customer_interest`)."""

    # Fields inspected, in the order used by the keys of `_LABELS`.
    _FIELDS = ("product_modelname", "product_subcategory", "product_category")

    # (modelname missing, subcategory missing, category missing) -> label.
    _LABELS = {
        (True, True, True): 8,
        (True, True, False): 7,
        (True, False, True): 6,
        (False, True, True): 5,
        (True, False, False): 4,
        (False, True, False): 3,
        (False, False, True): 2,
        (False, False, False): 1,
    }

    def __init__(self):
        pass

    def label_rows(self, df):
        """Return the 1-8 label for one row.

        `df` is a single row (anything indexable by the three product field names).
        A field counts as missing when `pd.isna` says so. The previous identity test
        `value is np.nan` missed None, pd.NA and NaN objects other than the `np.nan`
        singleton (e.g. `float('nan')` or `np.float64('nan')`).
        """
        missing = tuple(bool(pd.isna(df[field])) for field in self._FIELDS)
        return self._LABELS[missing]

    def _all(self, df, module_list: list = None):
        """Add the `customer_interest` column to `df` (in place) and return it.

        `module_list` is accepted for signature compatibility with the other
        preprocessing classes and is not used.
        """
        df['customer_interest'] = df.apply(lambda row: self.label_rows(row), axis=1)
        return df

# Main

In [355]:
# Load the lookup that is handed to `Country` as its `pkl` argument (used there to map `response_corporate`).
# NOTE(review): pickle.load can execute arbitrary code while loading — only use this file from a trusted checkout.
with open('./phase2_module/preprocessing_gy/response_corporate.pkl','rb') as f:
    res = pickle.load(f)

In [356]:
# Pipeline description consumed by `main`:
#   "Train" / "Test" - the input data for each run,
#   "Instances"      - preprocessing steps, applied in list order.
# Each step gives the class to instantiate ("instance"), the value passed to that class's
# `_all` as its method list ("method"), and a constructor argument ("params"; a falsy
# value means the class is constructed without arguments).
runtime_instance = {
    "Train": Data.train,
    "Test": Data.submission,
    "Instances":[    # steps run top to bottom
        {"instance": Country, "method": ['country','city'], "params": res},
        {"instance": Inquiry, "method": ["_all"], "params": False},
        {"instance": ProductCategory, "method": ["_all"], "params": False},
]
}

def main(data, **kwargs):
    """Run every configured preprocessing step on one dataset.

    Parameters
    ----------
    data : str
        Key in `kwargs` that holds the input data (e.g. "Train" or "Test").
    **kwargs
        Must contain "Instances": a list of dicts with the keys "instance" (class),
        "method" (forwarded to the instance's `_all`) and "params" (constructor
        argument; when falsy the class is constructed with no arguments).

    Returns
    -------
    The data after it has been passed through each step's `_all`, in list order.
    """
    df = kwargs.get(data)
    for step in kwargs["Instances"]:
        step_class = step.get("instance")
        ctor_arg = step["params"]
        worker = step_class(ctor_arg) if ctor_arg else step_class()
        df = worker._all(df, step.get("method"))
    return df

In [357]:
# Run the preprocessing steps listed in runtime_instance["Instances"] on the train and the test data.
df_train = main('Train',**runtime_instance)
df_test = main('Test',**runtime_instance)

# Modeling

In [358]:
from sklearn.preprocessing import LabelEncoder

def label_encoding(series: pd.Series) -> pd.Series:
    """Encode a categorical series as integer codes.

    Every element is first converted to a string; the distinct strings are sorted and
    numbered 0, 1, 2, ... in that order, and each element is replaced by its number.
    """
    as_text = series.astype(str)
    code_of = {label: code for code, label in enumerate(sorted(as_text.unique()))}
    return as_text.map(code_of)


# Columns to label-encode
label_columns = [
    "customer_country", "business_subarea", "business_area", "business_unit",
    "customer_type", "enterprise", "customer_job", "inquiry_type",
    "product_category", "product_subcategory", "product_modelname",
    "customer_position", "response_corporate", "expected_timeline",
    "country", "city", "new_inquiry_type", "new_expected_timeline",
]

# Encode train and test together so both share one code table per column,
# then write the codes back to the frame each row came from.
n_train = len(df_train)
df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])
    encoded = df_all[col]
    df_train[col] = encoded.iloc[:n_train]
    df_test[col] = encoded.iloc[n_train:]

# Columns removed before modelling: the fixed list plus the two raw columns that
# were replaced by their `new_*` counterparts.
drop_columns = [
    'com_reg_ver_win_rate',
    'id_strategic_ver',
    'idit_strategic_ver',
    'it_strategic_ver',
    'ver_cus',
    'ver_pro',
    'ver_win_rate_x',
    'customer_country.1',
    'ver_win_ratio_per_bu',
]
drop_new = ['inquiry_type', 'expected_timeline']
drop_columns.extend(drop_new)

# Drop the columns, then fill every remaining missing value with 0.
df_train = df_train.drop(columns=drop_columns).fillna(0)
df_test = df_test.drop(columns=drop_columns).fillna(0)

In [359]:
# Keep only the columns used for modelling (features plus `is_converted`); the test frame additionally keeps `id`.
df_train = df_train[['customer_idx','lead_owner','response_corporate','lead_desc_length','customer_country','customer_type','business_area','product_category','business_unit',"new_inquiry_type","new_expected_timeline",'customer_interest','is_converted']]
df_test = df_test[['id','customer_idx','lead_owner','response_corporate','lead_desc_length','customer_country','customer_type','business_area','product_category','business_unit',"new_inquiry_type","new_expected_timeline",'customer_interest','is_converted']]

In [360]:
# Estimator used below: the `decision_tree` attribute of the project `Model` container
# (presumably a decision-tree classifier — confirm in phase2_module).
model = _model.decision_tree

In [361]:
# Split the training frame into the feature matrix X and the target Y (`is_converted`).
X = df_train.drop(["is_converted"],axis = 1)
Y = df_train['is_converted']

In [362]:
X

Unnamed: 0,customer_idx,lead_owner,response_corporate,lead_desc_length,customer_country,customer_type,business_area,product_category,business_unit,new_inquiry_type,new_expected_timeline,customer_interest
0,32160,0,33,62,8765,10,0,180,0,2,6,7
1,23122,1,33,96,8389,10,0,180,0,2,6,7
2,1755,2,21,56,6290,10,0,238,0,2,6,7
3,4919,3,21,44,3144,10,0,318,0,2,6,7
4,17126,4,21,97,5566,29,0,180,0,2,6,7
...,...,...,...,...,...,...,...,...,...,...,...,...
59294,33747,694,34,200,9867,9,8,318,0,2,1,7
59295,35420,39,7,70,3261,29,8,81,0,1,3,7
59296,19249,125,35,34,8462,29,8,238,0,1,6,7
59297,40327,134,35,377,9431,33,8,81,0,2,11,7


In [363]:
from sklearn.model_selection import StratifiedKFold
# 5-fold stratified cross-validation of `model`; scores are collected per fold for the
# training split and the held-out split.
stratkfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
result = {"f1":[], "precision":[], "recall":[]}
train_result = {"f1":[], "precision":[], "recall":[]}
# k-fold
for train_idx, test_idx in tqdm(stratkfold.split(X,Y)):
    # `split` yields positional indices, so both X and Y are sliced with .iloc.
    # Plain Y[train_idx] is a label lookup and is only right for a default 0..n-1 index.
    x_train, x_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = Y.iloc[train_idx], Y.iloc[test_idx]

    # Train the model on this fold's training split
    model.fit(x_train, y_train)

    train_pred = model.predict(x_train)
    predict = model.predict(x_test)

    # check_the_score returns a name -> score mapping; its keys are used to index the
    # result dicts above. Argument order (prediction first) is kept as in the original
    # call — NOTE(review): confirm it matches the helper's signature.
    for collected, score_result in (
        (train_result, check_the_score(train_pred, y_train)),
        (result, check_the_score(predict, y_test)),
    ):
        for name, score in score_result.items():
            collected[name].append(score)

# Mean and standard deviation of every metric across the folds.
for title, collected in (
    ('----[K-Fold Train Score]-----', train_result),
    ('----[K-Fold Validation Score]-----', result),
):
    print(title)
    for name, score_list in collected.items():
        print(f'{name} score : {np.mean(score_list):.4f} / STD: (+/- {np.std(score_list):.4f})')

5it [00:02,  2.43it/s]

----[K-Fold Train Score]-----
f1 score : 0.9989 / STD: (+/- 0.0001)
precision score : 0.9982 / STD: (+/- 0.0001)
recall score : 0.9996 / STD: (+/- 0.0001)
----[K-Fold Validation Score]-----
f1 score : 0.8819 / STD: (+/- 0.0061)
precision score : 0.8848 / STD: (+/- 0.0081)
recall score : 0.8791 / STD: (+/- 0.0053)





In [367]:
# Test feature matrix: the test frame without its `id` and `is_converted` columns.
s = df_test.drop(['id','is_converted'],axis = 1)

In [368]:
# NOTE(review): `model` was last fitted inside the K-fold loop, i.e. on the training split of
# the final fold only — it is never refitted on all of X/Y before predicting here.
pred = model.predict(s)

In [369]:
# Store the predictions in the test frame's `is_converted` column.
df_test['is_converted'] = pred

In [371]:
# Write the test frame (all kept columns, including the features) to submission.csv without the index.
df_test.to_csv("submission.csv",index = False)