In [107]:
# %rm -rf phase2_module
# !git clone https://github.com/LgDNet/phase2_module.git
# !cd phase2_module && make copy

In [134]:
from phase2_module.data.load import Data
from phase2_module.src.models.classification.models import Model
from phase2_module.src.base import BasePiepline
# from phase2_module.src.hyper_parameters.params_optimization import xgboost_opt, set_params_optimization_data
from phase2_module.src.utils.set_seed import seed_everything
from phase2_module.src.utils.top_score_instance import check_the_score
# Fix the random seeds up front so later cells are repeatable.
seed_everything()  # NOTE: default 42
# Project model container; a later cell reads its `xgboost` attribute.
_model = Model()

In [135]:
import pandas as pd
import numpy as np
import re, pickle
import warnings
from tqdm import tqdm
# Silences every Python warning for the rest of the session. This also hides
# pandas deprecation and chained-assignment warnings raised by the cells below.
warnings.filterwarnings(action='ignore')

# Preprocessing

In [136]:
class Basic:
    """Generic, column-agnostic cleaning steps for a DataFrame.

    Every step is a method that takes a DataFrame and returns a DataFrame;
    ``_all`` runs a list of steps selected by method name.
    """

    def __init__(self):
        pass

    def drop_columns(self, df):
        """Placeholder step: no columns are dropped yet.

        Returns ``df`` unchanged so that naming this step in ``_all`` does not
        replace the frame with ``None`` and break the following steps.
        """
        return df

    def drop_duplicated(self, df):
        """Return ``df`` without fully duplicated rows, re-indexed from 0."""
        return df.drop_duplicates().reset_index(drop=True)

    def _all(self, df, module_list):
        """Apply the named steps to ``df`` in order and return the result.

        Args:
            df: DataFrame to process.
            module_list: a method name, or a list of method names, defined on
                this class.

        Raises:
            ValueError: if ``module_list`` is empty or None.
            AttributeError: if a name does not match a method of this class.
        """
        if not module_list:
            raise ValueError("Not used modules")

        # Accept a single step name as a convenience.
        if not isinstance(module_list, list):
            module_list = [module_list]

        for module in module_list:
            df = getattr(self, module)(df)

        return df

In [137]:
class Inquiry:
    """Normalizes the free-text ``inquiry_type`` and ``expected_timeline`` columns.

    ``_all`` adds two columns, ``new_inquiry_type`` and ``new_expected_timeline``,
    holding canonical labels derived by case-insensitive keyword matching.
    """

    def __init__(self):
        consulation = {
            "Quotation": "Quotation or Purchase Consultation",  # quotation or purchase consultation
            "Sales": "Quotation or Purchase Consultation",
            "Product": "Quotation or Purchase Consultation",
            "Purchase": "Quotation or Purchase Consultation",
            "Event": "Quotation or Purchase Consultation",
            "Partner": "Quotation or Purchase Consultation",
        }

        demo = {
            "Demo": "Request a Demo",  # request a demo
        }

        oem_odm = {
            "Oem": "OEM/ODM Request",
            "Odm": "OEM/ODM Request",
        }

        technic = {
            "Usage": "Usage or Technical Consultation",  # usage or technical consultation
            "Technical": "Usage or Technical Consultation",
            "Trainings": "Usage or Technical Consultation",
            "Services": "Usage or Technical Consultation",
            "Suggestions": "Usage or Technical Consultation",
        }

        distributorship = {
            "Distributorship": "Request for Distributorship",  # request to a distributor
        }

        others = {
            "Other": "Other",
            "Etc": "Other",
        }

        self.inquiry_category = [consulation, demo, oem_odm, technic, distributorship, others]

        # Flat keyword -> canonical label table. Insertion order is the
        # matching priority used by the pattern lists below.
        self.inquiry_type = {}
        for category in self.inquiry_category:
            self.inquiry_type.update(category)

        # Keys are capitalized words; lookups use `match.capitalize()` so a
        # case-insensitive hit maps back onto its key.
        self.start_patterns = [re.compile(f"(?i)^{i}") for i in self.inquiry_type.keys()]
        self.exists_patterns = [re.compile(f"(?i){i}") for i in self.inquiry_type.keys()]

        self.expected_timeline_dict = {
            "Follow": "Follow up",
            "Already": "Follow up",
            "Respond": "No Response",
            "Response": "No Response",
            "Required": "No Response",
            "Requirement": "No Response",
            "Budget": "Budget Issue",
            "Interest": "Not Interest",
        }
        self.expected_timeline_exists_patterns = [re.compile(f"(?i){i}") for i in self.expected_timeline_dict.keys()]

        # Whitelist applied at the end of `_all`; anything else becomes 'Unknown'.
        # NOTE(review): `less_value_categorial` / `more_value_categorial` can return
        # "6 Months ~ 9 Months" and "More than a year", which are not in this list
        # as spelled (only lowercase variants are), so those results end up as
        # 'Unknown'. Confirm whether that is intended.
        self.categories = ['Less than 3 Months', '3 months ~ 6 months', 'more than a year', '9 months ~ 1 year', '6 months ~ 9 months', 'Follow up', '3 Months ~ 6 Months', 'No Response', 'Budget Issue', 'Not Interest', "Space"]

    def fill(self, df):
        """Replace missing ``inquiry_type`` values with ``"-"`` and return ``df``."""
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form does not update `df` under pandas Copy-on-Write.
        df["inquiry_type"] = df["inquiry_type"].fillna("-")
        return df

    def new_inquiry_type(self, old_inquiry_type):
        """Map text that starts with a known keyword to its label, else ``"Other"``."""
        text = old_inquiry_type.strip()
        for pattern in self.start_patterns:
            find = pattern.search(text)
            if find:
                return self.inquiry_type.get(find.group().capitalize())
        return "Other"

    def retry_unknown_value_mapping(self, new_inquiry_type, old_inquiry_type):
        """Second pass for rows labelled ``"Other"``: match a keyword anywhere.

        Returns the matched label, ``"-"`` when no keyword occurs in the text,
        or ``new_inquiry_type`` unchanged when it was not ``"Other"``.
        """
        if new_inquiry_type != "Other":
            return new_inquiry_type

        text = old_inquiry_type.strip()
        for pattern in self.exists_patterns:
            find = pattern.search(text)
            if find:
                return self.inquiry_type.get(find.group().capitalize())
        return "-"

    def convert_timeline_in_tilda(self, timeline):
        """Normalize ``"a~b"`` ranges to ``"a ~ b"`` (underscores become spaces).

        Only the first two ``~``-separated parts are kept. Text without ``~`` is
        returned unchanged.
        """
        if "~" in timeline:
            timeline = timeline.replace("_", " ")
            split_timeline = timeline.split("~")
            return split_timeline[0].strip() + " ~ " + split_timeline[1].strip()
        return timeline

    def less_value_categorial(self, timeline):
        """Bucket ``"less ... <N>"`` phrases by the first number after ``less``.

        The match on ``less`` is case-sensitive. Returns ``timeline`` unchanged
        when there is no ``less``, no number, or the number exceeds 9.
        """
        find = None
        scope = {
            "3": "Less than 3 Months",
            "6": "3 Months ~ 6 Months",
            "9": "6 Months ~ 9 Months",
        }

        if "less" in timeline:
            # Extract the number that follows "less".
            less_index = timeline.index("less")
            find = re.search(r'\d{1,2}', timeline[less_index:])
        if find:
            numeric_value = int(find.group())
            # Upper bounds are checked in ascending order; the first that fits wins.
            for k, v in scope.items():
                if numeric_value <= int(k):
                    return v
        return timeline

    def more_value_categorial(self, timeline):
        """Bucket ``"more ... <N>"`` phrases by the first number after ``more``.

        The match on ``more`` is case-sensitive. When nothing matches, returns
        ``timeline`` with underscores replaced by spaces.
        """
        find = None
        scope = {
            "9": "More than a year",
            "6": "6 Months ~ 9 Months",
            "3": "3 Months ~ 6 Months",
            "0": "Less than 3 Months",
        }

        if "more" in timeline:
            # Extract the number that follows "more".
            more_index = timeline.index("more")
            find = re.search(r'\d{1,2}', timeline[more_index:])

        if find:
            numeric_value = int(find.group())
            # Lower bounds are checked in descending order; the first that fits wins.
            for k, v in scope.items():
                if numeric_value >= int(k):
                    return v
        return timeline.replace("_", " ")

    def timeline_retry_unknown_value_mapping(self, new_expected_timeline):
        """Map timelines containing a status keyword to that status label.

        Keywords come from ``expected_timeline_dict``; text without a keyword is
        returned unchanged.
        """
        text = new_expected_timeline.strip()
        for pattern in self.expected_timeline_exists_patterns:
            find = pattern.search(text)
            if find:
                return self.expected_timeline_dict.get(find.group().capitalize())

        return new_expected_timeline

    def _all(self, df, module_list=None):
        """Add ``new_inquiry_type`` and ``new_expected_timeline`` to ``df``.

        Also fills missing ``inquiry_type`` with ``"-"`` and missing
        ``expected_timeline`` with ``"Space"`` on ``df`` itself.
        ``module_list`` is accepted for interface parity with the other
        preprocessing classes and is ignored.
        """
        df = self.fill(df)
        df["new_inquiry_type"] = df["inquiry_type"].apply(self.new_inquiry_type)
        df["new_inquiry_type"] = df.apply(lambda row: self.retry_unknown_value_mapping(row["new_inquiry_type"], row["inquiry_type"]), axis=1)

        # "-" marks rows the second pass could not classify; fold them into "Other".
        df.loc[df["new_inquiry_type"] == "-", "new_inquiry_type"] = "Other"
        df["expected_timeline"] = df["expected_timeline"].fillna("Space")

        # The order of these passes matters: each one consumes the previous output.
        df["new_expected_timeline"] = df["expected_timeline"].apply(self.convert_timeline_in_tilda)
        df["new_expected_timeline"] = df["new_expected_timeline"].apply(self.less_value_categorial)
        df["new_expected_timeline"] = df["new_expected_timeline"].apply(self.more_value_categorial)
        df["new_expected_timeline"] = df["new_expected_timeline"].apply(self.timeline_retry_unknown_value_mapping)

        # Anything outside the whitelist is collapsed to 'Unknown'.
        df["new_expected_timeline"] = df['new_expected_timeline'].where(df['new_expected_timeline'].isin(self.categories), 'Unknown')

        return df

In [138]:
class Country:
    """Derives ``country`` and ``city`` columns from ``customer_country``.

    Args:
        pkl: mapping applied to ``response_corporate`` with ``Series.map``; its
            values are used as the fallback country (presumably corporate code
            -> country name; verify against the pickle that is loaded).
    """

    def __init__(self, pkl):
        # Manual corrections for extracted "country" tokens. 'none' marks a
        # value that should be replaced by the corporate fallback.
        self.area = {'anguilla':'united kingdom',
                'br':'none',
                'a':'none',
                'nd':'none',
                'ny':'none',
                'rj':'none',
                'us':'united states',
                'ca':'none',
                'kerela':'india',
                'pune':'india',
                'colombia - cartagena':'colombia',
                'country':'none'}
        self.pkl = pkl

    def country(self, df):
        """Create the ``country`` column (and helper ``response_corporate2``).

        ``customer_country`` is lower-cased and stripped in place. The country
        is the text after the last ``/``. Tokens containing a digit, missing
        values and entries mapped to 'none' by ``self.area`` fall back to the
        mapped corporate value.
        """
        # Map the responsible corporate entity.
        df['response_corporate2'] = df['response_corporate'].map(self.pkl)

        df['customer_country'] = df['customer_country'].str.lower().str.strip()
        df['response_corporate2'] = df['response_corporate2'].str.lower().str.strip()

        # Keep only the last '/'-separated segment as the country.
        df['country'] = df['customer_country'].str.extract(r'/([^/]+)$')[0]
        df['country'] = df['country'].str.strip()

        # Tokens containing a digit become 'none'; then fill missing values.
        df['country'] = df['country'].apply(lambda x: 'none' if re.search(r'\d', str(x)) else x)
        # Assign back rather than `fillna(..., inplace=True)` on a column
        # selection, which does not update `df` under pandas Copy-on-Write.
        df['country'] = df['country'].fillna('none')

        # Apply the manual corrections. A boolean mask is used so the write is
        # correct even when the index has duplicate labels.
        for raw, fixed in self.area.items():
            df.loc[df['country'] == raw, 'country'] = fixed

        # Fill every remaining 'none' from the corporate mapping.
        cond = df['country'] == 'none'
        df.loc[cond, 'country'] = df.loc[cond, 'response_corporate2']
        return df

    def city(self, df):
        """Create the ``city`` column from the second-to-last ``/`` segment.

        Empty or missing segments become ``'others'``.
        """
        # Extract only the city segment.
        df['city'] = df['customer_country'].str.split('/').str[-2]
        df['city'] = df['city'].str.strip()

        # Empty segment -> 'others'.
        df.loc[df['city'] == '', 'city'] = 'others'

        # Missing segment -> 'others'.
        df['city'] = df['city'].fillna('others')

        return df

    def _all(self, df, module_list):
        """Apply the named methods to ``df`` in order and return the result.

        Args:
            df: DataFrame to process.
            module_list: a method name, or a list of method names, of this class.

        Raises:
            ValueError: if ``module_list`` is empty or None.
        """
        if not module_list:
            raise ValueError("Not used modules")

        if not isinstance(module_list, list):
            module_list = [module_list]

        for module in module_list:
            df = getattr(self, module)(df)

        return df

In [139]:
class ProductCategory:
    """Cleans ``product_category`` and derives a three-level category hierarchy.

    ``_all`` adds ``customer_interest``, ``mapped``, ``category_1``,
    ``category_2``, ``category_3`` and ``cate_is_nan`` and rewrites
    ``product_category`` with the normalized label.
    """

    # Labels with fewer rows than this are folded into 'others'.
    RARE_CATEGORY_MIN_COUNT = 6

    def __init__(self):
        # Exact-match replacements applied first (typos, translations, model codes).
        self.replacement_dict = {
            'interactive signage': 'interactive digital board',    'education createboard': 'interactive digital board',
            '互動式顯示屏': 'interactive digital board',    'multi inverter': 'multi-split',
            'multi split' : 'multi-split',    'video wall': 'video wall signage',
            'videowall_rmk': 'video wall signage',    '43us660h0sd.awz': 'hotel tv',   'residential air conditioner': 'rac',
            'aire acondicionado residencial': 'rac',    'ar condicionado residencial' : 'rac',
            'all lg vrf systems': 'vrf',    'ur640': 'ur640s',    'idb': 'interactive digital board',
            'id': 'interactive digital board',    'others': 'etc.',
            'other': 'etc.',    'lainnya': 'etc.',
            'otros': 'etc.',    'sales inquiry': 'etc.',    'commercial tv,tv': 'commercial tv',
            'monitor signage,commercial tv': 'commercial tv',    'onequick series': 'one:quick',
            'lg one:quick': 'one:quick',    'led 顯示屏': 'led signage',
            '32lq621cbsb.awz': 'smart tv signage',    'monitor signage,commercial tv,monior/monitor tv': 'monitor signage,monior/monitor tv',
            'monior/monitor tv,tv': 'monitor signage,monior/monitor tv',    'monitor signage,tv': 'monitor signage,monior/monitor tv',
            'monior/monitor tv,pc': 'monitor signage,monior/monitor tv',    'monitor signage,pc': 'monitor signage,monior/monitor tv',
            'isıtma': 'heating',    'calefacción': 'heating',    'ogrzewanie (pompy ciepła)' : 'heating',
            'standalone': 'commercial tv',    'system ac': 'single-split',  
            'điều hòa trung tâm multi': 'single-split',
            'sac' : 'single-split',    'fhd series': 'standard signage',
            'tv signage': 'standard signage',    'oled 顯示屏': 'oled signage',
            'מזגנים למקום מגורים': 'rac',    'htv': 'commercial tv',
            'led': 'led signage',   
            'vrf,multi-split': 'multi-split',   
            'soğutucu': 'single-split',    '標準顯示屏': 'led signage',    'monitor signage,monior/monitor tv' : 'monitor',
            'teto ou cassete inverter' : 'single-split',    'one quick:flex' : 'one:quick',
            'تكييف وتبريد' : 'single-split',    'comercial tv' : 'commercial tv',    'تكييفات' : 'single-split',    'مبرد (تشيلر)' : 'single-split',
            'חימום' : 'heating',    'ฯลฯ' : 'etc.',    'vb.' : 'etc.',    'videwall' : 'video wall signage',    'videowall signage' : 'video wall signage',
            'hospitality' : 'hotel tv',    'signage' : 'etc.',    'aquecimento' : 'heating',    'laec015' : 'one:quick',    'climatiseur résidentiel' : 'single-split',
            'khác' : 'etc.',    '醫院電視' : 'hospital tv',    '酒店電視' : 'hotel tv',    'เครื่องปรับอากาศเผื่อที่อยู่อาศัย' : 'single-split',
            'điều hòa gia dụng' : 'single-split',    'ac rumah' : 'single-split',    'điều hòa cục bộ' : 'single-split',
            'آخر' : 'etc.',    'bu50nst' : 'projector',    'window facing display' : 'outdoor_led',    '軟體' : 'software solution',
            'حلول التدفئة' : 'heating',    '高亮度顯示屏' : 'curvable_oled',    '特別顯示屏' : 'special signage',
            'פיצול מרובה' : 'multi-split',    'אחר' : 'etc.',    'split tunggal' : 'single-split',
            'radiology displays' : 'medical display',    'inne' : 'inne'
        }

        # Substring filter: the FIRST key contained in a value wins, so the
        # order of the entries is very important. Repeated keys ('system ac',
        # 'single') were removed; they had identical values, so the effective
        # order and mapping are unchanged.
        self.filter1 = {
            'interactive' : 'interactive digital board',
            'vrf' : 'vrf','multi-split' : 'multi-split', 'single' : 'single-split', 'air conditioner' : 'single-split',
            'video wall' : 'video wall signage', 'chiller' : 'chiller',    'hotel tv' : 'hospitality_tv', 'pro:centric' : 'hospitality_tv',
            'hospital tv' : 'hospitality_tv',    'smart tv signage' : 'hospitality_tv',    'smart' : 'hospitality_tv',    'standard' : 'standard signage',
            'outros' : 'heating', 'brightness' : 'high brightness signage',
            'one:quick' : 'one:quick',    
            'one quick' : 'one:quick',    
            'medical' : 'medical display',    
            'system ac' : 'single-split',
            'commercial tv' : 'commercial tv',    
            'multi v' : 'multi v',    'monitor' : 'monitor',    'projector' : 'projector',
            'centric' : 'pro:centric',    'cloud' : 'cloud device',    'ess' : 'ess',    'energy' : 'ess',    'robot' : 'robot',
            'pc' : 'pc',    'tv' : 'tv',    'air' : 'single-split',    'ur640s' : 'standard signage',    'autre' : 'etc.',
            '.bwz' : 'hospitality_tv',    'laec' : 'ledallinone',    'vm5' : 'video wall signage',    'vl5' : 'video wall signage',
            'svh7' : 'video wall signage',    '28mq780' : 'ultrawide monitor',    'gsc' : 'outdoor_led',    'out_door' : 'outdoor_led',
            'care solution' : 'signage care solution',
            'rac' : 'single-split',
            'magnit' : 'led signage',    '49xf' : 'high brightness signage',    'stretch' : 'high brightness signage',    ' pol' : 'hospitality_tv',
            'us660' : 'hospitality_tv',    'hoteleria' : 'hospitality_tv',    'lsca' : 'indoor led',    '55tc3d' : 'interactive digital board',
            'uh' : 'standard signage',    'washing' : 'washer',    '110' : 'standard signage', 'etc' : 'others',
        }

        # Numeric code -> top-level category name.
        self.cate_num_dict = {
            0 : 'commercial_display',    1 : 'hvac',    2 : 'it_products',    3 : 'commercial_laundry',    4 : 'robot',  5: 'others', -1 : 'non_serviced',
        }

        # Normalized product label -> top-level category code.
        # NOTE(review): this table keys 'lcd signage' while the two tables
        # below key 'lcd_signage'; confirm which spelling the data uses.
        self.cate_dict  = {
            'interactive digital board' : 0, 'vrf' : 1, 'multi-split' : 1, 'hospitality_tv' : 0,
            'video wall signage': 0, 'led signage' : 0, 'single-split' : 1,
            'oled signage' : 0, 'chiller' : 1, 'standard signage' : 0,
            'medical display' : 2,    'one:quick' : 0,    'heating' : 1,    'high brightness signage' : 0,
            'ventilation' : 0,    'control' : -1,    'aircare' : -1,    'software solution' : 0,
            'special signage' : 0, 'webos' : 0,    'pc' : 2, 'projector' : 2, 'commercial display' : 0, 'outros' : 1,
            'signage care solution' : 0,    'multi v' : 1, 'cloud device' : 2,
            'medical displays' : 2,    'laptop' : 2,    'a thermodynamic water heater' : 1,
            'monitor' : 2,    'virtual production' :  0,    'ogrzewanie (pompy ciepła)' : 1,
            'commercial tv' : 0,    'digital signage' : 0,    'ess': 1,    'ledallinone' : 0,    'ultrawide monitor' : 2,    'tv' : 0,
            'washing machine' : 3,    'dryer' : 3,    'robots' : 4,    'error' : -1,
            '' : -1,    'signage' : 0,    'transparent oled' : 0,    'ultra stretch' : 0,
            'outdoor_led' : 0,    'lcd signage' : 0,    'curvable_oled' : 0,    'indoor led' : 0,
            'others' : 5,
        }

        # Normalized product label -> second-level category.
        self.subcate_dict = {
            'interactive digital board' : 'digital_signage','standard signage' : 'digital_signage', 'one:quick' : 'digital_signage',     'special signage' : 'digital_signage',
            'multi-split' : 'commercial_solutions', 'a thermodynamic water heater' : 'residential_solutions', 'heating': 'residential_solutions',  
            'video wall signage' : 'digital_signage','high brightness signage' : 'digital_signage',
            'led signage' : 'led_signage', 'indoor led' : 'led_signage',     'ledallinone' : 'led_signage', 'virtual production' : 'led_signage',
            'vrf' : 'commercial_solutions',  'single-split' : 'commercial_solutions', 'multi v' : 'commercial_solutions', 
            'ventilation' : 'commercial_solutions', 'chiller' : 'commercial_solutions',
            'oled signage' : 'oled_signage',
            'medical display' : 'medical_display',   
            'control' : 'non_serviced',
            'software solution' : 'software', 
            'hospitality_tv' : 'commercial_tv', 
            'webos' : 'software',    'projector' : 'projector',
            'outros' : 'residential_solutions',  
            'signage care solution' : 'software',
            'cloud device' : 'cloud_device',
            'medical displays' : 'medical displays',
            'laptop' : 'laptop',

            'monitor' : 'monitor',
            'commercial tv' : 'commercial_tv', 'tv' : 'commercial_tv',
            'digital signage' : 'digital_signage',
            'ess' : 'ess',
            'ultrawide monitor' : 'monitor',
            'washing machine' : 'washer',
            'dryer' : 'dryer',  
            'transparent oled' : 'oled_signage',
            'ultra stretch' : 'digital_signage',   
            'outdoor_led' : 'led_signage',
            'lcd_signage' : 'digital_signage',   
            # Was 'oled_sigange'; aligned with the other OLED rows in this table.
            'curvable_oled' : 'oled_signage',  

        }

        # Normalized product label -> third-level category.
        self.subsubcate_dict = {
            'interactive digital board' : 'interactive',    'video wall signage' : 'video_wall',
            # Every other table spells this label 'single-split'; the old
            # 'single split' key is kept so existing lookups still work.
            'single-split' : 'single_split',    'single split' : 'single_split',    'chiller' : 'chiller',
            'standard signage' : 'standard',    'one:quick' : 'one:quick',
            'ventilation' : 'ventilation_solution(erv)',    'control' : 'non_serviced',
            'software solution' : 'supersign_software',    'special signage' : 'special',
            'hospitality_tv' : 'hospitality_tv',
            'webos' : 'webos',
            'signage care solution' : 'signage_care_solution',  
            'virtual production' : 'virtual_production',    'multi v' : 'vrf_system',
            'vrf' : 'vrf_system',    'outros' : 'air_to_water_heat_pumps',
            'laptop' : 'gram',    'ogrzewanie (pompy ciepła)' : 'air_to_water_heat_pumps',
            'ledallinone' : 'all_in_one',    'ultrawide monitor' : 'ultrawide',
            'heating': 'air_to_water_heat_pumps',    'a thermodynamic water heater' : 'water_heater',
            'transparent oled' : 'transparent_oled',    'ultra stretch' : 'ultra stretch',
            'outdoor_led' : 'outdoor_led',    'lcd_signage' : 'lcd_signage',
            'curvable_oled' : 'curvable_oled',    'high brightness signage' : 'high_brightness_signage',
            'indoor led' : 'indoor_led', 'multi-split' : 'multi_split', 'monitor' : 'monitor'
        }

    def label_rows(self, df):
        """Score one row by which product fields are missing (1 = none, 8 = all).

        ``df`` is a single row (as passed by ``DataFrame.apply(axis=1)``).
        ``pd.isna`` is used instead of ``is np.nan`` so that ``None`` and NaN
        values that are not the ``np.nan`` singleton are also counted as missing.
        """
        missing = (
            bool(pd.isna(df['product_modelname'])),
            bool(pd.isna(df['product_subcategory'])),
            bool(pd.isna(df['product_category'])),
        )
        # Keys are (modelname missing, subcategory missing, category missing).
        codes = {
            (True, True, True): 8,
            (True, True, False): 7,
            (True, False, True): 6,
            (False, True, True): 5,
            (True, False, False): 4,
            (False, True, False): 3,
            (False, False, True): 2,
            (False, False, False): 1,
        }
        return codes[missing]

    @staticmethod
    def _split_combined(df, combined, parts):
        """Replace rows labelled ``combined`` by one copy per label in ``parts``.

        The copies are appended after the untouched rows, in ``parts`` order.
        """
        mask = df['product_category'] == combined
        if not mask.any():
            return df
        copies = [df[mask].assign(product_category=part) for part in parts]
        return pd.concat([df[~mask]] + copies)

    def _all(self, df, module_list: list):
        """Normalize ``product_category`` and add the derived category columns.

        ``module_list`` is accepted for interface parity with the other
        preprocessing classes and is ignored.

        NOTE(review): rows whose category is 'commercial tv,projector' or
        'washing machine,dryer' are replaced by two rows each, so the returned
        frame can have more rows than the input. Confirm this is acceptable
        for every frame passed in.
        """
        # Score missingness before the nulls are filled below.
        df['customer_interest'] = df.apply(self.label_rows, axis=1)
        product_cols = ['product_modelname', 'product_subcategory', 'product_category']
        df[product_cols] = df[product_cols].fillna('Unknown')

        df['product_category'] = df['product_category'].str.lower().str.strip()

        # Split combined labels into one row per product, dropping the original row.
        df = self._split_combined(df, 'commercial tv,projector', ['projector', 'commercial tv'])
        df = self._split_combined(df, 'washing machine,dryer', ['washing machine', 'dryer'])
        # The split leaves duplicate and missing index labels, which break
        # label-based indexing downstream; renumber the rows.
        df = df.reset_index(drop=True)

        df['product_category'] = df['product_category'].replace(self.replacement_dict).str.replace('solar,', '', regex=False)

        # First filter key contained in the value wins; otherwise keep the value.
        df['mapped'] = df['product_category'].apply(lambda x: next((v for k, v in self.filter1.items() if k in x), x))

        # Fold labels with too few rows into 'others'.
        category_counts = df['mapped'].value_counts()
        categories_to_replace = category_counts[category_counts < self.RARE_CATEGORY_MIN_COUNT].index.tolist()
        df['product_category'] = df['mapped'].apply(lambda x: 'others' if x in categories_to_replace else x)

        df['category_1'] = df['product_category'].map(self.cate_dict)
        df['category_1'] = df['category_1'].map(self.cate_num_dict)
        df['category_2'] = df['product_category'].map(self.subcate_dict)
        df['category_3'] = df['product_category'].map(self.subsubcate_dict)
        # True when any hierarchy level could not be resolved.
        df['cate_is_nan'] = df[['category_1', 'category_2', 'category_3']].isna().any(axis=1)

        return df

# Main

## preprocessing

In [140]:
# Load the pickled object that is later passed to `Country` as its `pkl` argument.
# NOTE(review): `pickle.load` executes arbitrary code from the file; only load
# this pickle from a trusted checkout of the repository.
with open('./phase2_module/preprocessing_gy/response_corporate.pkl','rb') as f:
    res = pickle.load(f)

In [141]:
# Pipeline configuration consumed by `main`:
#   "Train" / "Test" - the input frames, selected by name.
#   "Instances"      - ordered preprocessing stages. Each stage gives the class
#                      to instantiate ("instance"), the value passed to that
#                      class's `_all` as its module list ("method"), and the
#                      constructor argument ("params"; False means "no argument").
runtime_instance = {
    "Train": Data.train,
    "Test": Data.test,
    "Instances":[    
        {"instance": Basic, "method": ['drop_duplicated'], "params": False},
        {"instance": Country, "method": ['country','city'], "params": res},
        {"instance": Inquiry, "method": ["_all"], "params": False},
        {"instance": ProductCategory, "method": ["_all"], "params": False},
]
}

def main(data, **kwargs):
    """Run every configured preprocessing stage on one dataset.

    Args:
        data: name of the dataset entry in ``kwargs`` (e.g. ``"Train"``).
        **kwargs: pipeline configuration. ``kwargs[data]`` is the input
            DataFrame and ``kwargs["Instances"]`` is an ordered list of stage
            dicts with keys ``"instance"`` (class), ``"method"`` (passed to
            the stage's ``_all``) and ``"params"`` (constructor argument; a
            falsy value means the class is built without arguments).

    Returns:
        The DataFrame produced by the last stage.

    Raises:
        KeyError: if ``data`` is not a configured dataset.
    """
    source = kwargs.get(data)
    if source is None:
        raise KeyError(f"dataset {data!r} is not configured")

    # Work on a copy: stages assign columns in place, and the configured frame
    # is shared state that must survive re-running this cell.
    df = source.copy()

    for stage in kwargs["Instances"]:
        factory = stage.get("instance")
        if param := stage["params"]:
            step = factory(param)
        else:
            step = factory()

        df = step._all(df, stage.get("method"))
    return df

In [142]:
# Run the same stage list over both datasets. Each call builds fresh stage
# instances, so no stage object is shared between train and test.
df_train = main('Train',**runtime_instance)
df_test = main('Test',**runtime_instance)

In [None]:
# product 다 버리고 category_3도 버려
# 'mapped','product_category','product_subcategory','product_modelname',
# 'customer_country.1','category_3','cate_is_nan'

## encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
class Encode:
    """Label-encodes columns, fitting on the first call and reusing the encoders after.

    The fitted encoders are stored in the module-level ``encoding_zip['label']``
    list, so ``encoding_zip`` must exist before ``label_encoder`` is called.
    """

    def __init__(self):
        pass

    def label_encoder(self, df, columns):
        """Encode ``columns`` of ``df`` in place and return ``df``.

        When ``encoding_zip['label']`` is empty, a new ``LabelEncoder`` is
        fitted per column and appended to it (train pass). Otherwise the stored
        encoders are reused by position (test pass), so ``columns`` must be
        given in the same order as on the fitting call.

        Raises:
            ValueError: raised by ``LabelEncoder.transform`` when a column
                holds a label that was not seen while fitting.
        """
        fitted = encoding_zip['label']
        if not fitted:
            for col in columns:  # train
                encoder = LabelEncoder()
                # Replace the column rather than `df.loc[:, col] = ...`: the
                # `.loc` form writes into the existing object-dtype column, so
                # the codes would stay object-typed instead of integer.
                df[col] = encoder.fit_transform(df[col])
                # Keep the encoder for the later transform-only pass.
                fitted.append(encoder)
        else:  # test
            for idx, col in enumerate(columns):
                encoder = fitted[idx]
                df[col] = encoder.transform(df[col])
        return df

In [None]:
encode = Encode()
# Shared store of fitted encoders read by `Encode.label_encoder`. Re-running
# this cell empties it, which makes the next `label_encoder` call fit again.
encoding_zip = {"label":[], "onehot":[]}

In [None]:
# "customer_country", "country", "city"
# NOTE: this full candidate list is overwritten by the assignment at the
# bottom of the cell; only the two derived columns are actually encoded.
label_encode_columns = ["business_subarea", "business_area", "business_unit",
                        "customer_type", "enterprise", "customer_job", "inquiry_type", "product_category",
                        "product_subcategory", "product_modelname", "customer_position", "response_corporate",
                        "expected_timeline", "new_inquiry_type", "new_expected_timeline"
                        ]
label_encode_columns = ["new_inquiry_type", "new_expected_timeline"]

In [105]:
# Train must be encoded first: that call fits the encoders the test call reuses.
# (The result was previously bound to the misspelled name `df_trian`.)
df_train = encode.label_encoder(df_train,label_encode_columns)
df_test = encode.label_encoder(df_test,label_encode_columns)

ValueError: y contains previously unseen labels: 2

In [106]:
# Display the source training frame held by `Data` (not `df_train`).
Data.train

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.00,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.00,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.00,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.00,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.00,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59294,1.00,/Sląskie/Poland,AS,,33747,End Customer,SMB,,,,...,LGEPL,3 months ~ 6 months,0,0,0.000026,0.028777,public facility,Others,694,False
59295,0.75,/Bogotá DC /Colombia,AS,0.040000,35420,Specifier/ Influencer,Enterprise,,,,...,LGECB,9 months ~ 1 year,0,0,0.000026,0.028777,public facility,,39,False
59296,0.75,/Pisco/Peru,AS,0.040000,19249,Specifier/ Influencer,Enterprise,,,,...,LGEPR,less than 3 months,0,0,0.000026,0.028777,public facility,,125,False
59297,1.00,/santa cruz bolivia/Peru,AS,0.040000,40327,,Enterprise,,,,...,LGEPR,more than a year,0,0,0.000026,0.028777,public facility,,134,False


# Modeling

## drop column
모델링 전에 column을 drop하는 것이 다루기 쉽다고 판단하였음

In [98]:
# Existing columns to remove before modeling.
drop_columns = [
    'com_reg_ver_win_rate', 'id_strategic_ver',
    'idit_strategic_ver', 'it_strategic_ver',
    'ver_cus', 'ver_pro', 'ver_win_rate_x',
    'customer_country.1', 'ver_win_ratio_per_bu',
]
# Raw columns that now have derived replacements (,'customer_country' is kept).
drop_new = ['inquiry_type', 'expected_timeline']
drop_columns.extend(drop_new)

df_train = df_train.drop(columns=drop_columns)
df_test = df_test.drop(columns=drop_columns)

# Fill the remaining missing values with 0.
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

In [15]:
# Columns fed to the model (target included). The test frame additionally
# keeps 'id' as its first column.
model_columns = [
    'customer_idx', 'lead_owner', 'response_corporate', 'lead_desc_length',
    'customer_country', 'customer_type', 'business_area', 'product_category',
    'business_unit', "new_inquiry_type", "new_expected_timeline",
    'customer_interest', 'is_converted',
]
df_train = df_train[model_columns]
df_test = df_test[['id'] + model_columns]

## train

In [16]:
# Estimator used for cross-validation and for the final prediction below.
model = _model.xgboost

In [17]:
# Separate the target column from the feature matrix.
Y = df_train['is_converted']
X = df_train.drop(columns=["is_converted"])

In [18]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

In [19]:
from sklearn.model_selection import StratifiedKFold
stratkfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Per-fold scores. The keys presumably match the names returned by
# `check_the_score` (verify against that helper).
result = {"f1":[], "precision":[], "recall":[]}
train_result = {"f1":[], "precision":[], "recall":[]}

# k-fold
for train_idx, test_idx in tqdm(stratkfold.split(X,Y)):
    # `split` yields positional indices, so both X and Y must be sliced with
    # `.iloc`. `Y[train_idx]` looks rows up by index label, which picks the
    # wrong rows or raises when the index is not 0..n-1.
    x_train, x_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = Y.iloc[train_idx], Y.iloc[test_idx]

    # Train the model on this fold.
    model.fit(x_train, y_train)

    train_pred = model.predict(x_train)
    predict = model.predict(x_test)

    score_result = check_the_score(train_pred, y_train)
    for name, score in score_result.items():
        train_result[name].append(score)

    score_result = check_the_score(predict, y_test)
    for name, score in score_result.items():
        result[name].append(score)

# NOTE: after the loop `model` is left fitted on the last fold's training split.
print('----[K-Fold Train Score]-----')
for name, score_list in train_result.items():
    print(f'{name} score : {np.mean(score_list):.4f} / STD: (+/- {np.std(score_list):.4f})')

print('----[K-Fold Validation Score]-----')
for name, score_list in result.items():
    print(f'{name} score : {np.mean(score_list):.4f} / STD: (+/- {np.std(score_list):.4f})')

5it [00:01,  2.57it/s]

----[K-Fold Train Score]-----
f1 score : 0.9272 / STD: (+/- 0.0031)
precision score : 0.8832 / STD: (+/- 0.0052)
recall score : 0.9759 / STD: (+/- 0.0025)
----[K-Fold Validation Score]-----
f1 score : 0.8048 / STD: (+/- 0.0080)
precision score : 0.7305 / STD: (+/- 0.0111)
recall score : 0.8959 / STD: (+/- 0.0063)





## test & submission

In [41]:
# Test features: drop the row id and the target column that is about to be predicted.
s = df_test.drop(['id','is_converted'],axis = 1)

In [42]:
# Uses `model` in whatever state the cross-validation cell left it.
pred = model.predict(s)

In [46]:
# Overwrite the target column of the test frame with the predictions.
df_test['is_converted'] = pred

In [371]:
# Write the test frame, predictions included, without the index column.
df_test.to_csv("submission.csv",index = False)