In [49]:
import pandas as pd
import numpy as np
import random 
import re 
import os 
import warnings
warnings.filterwarnings("ignore")
import category_encoders as ce 
from IPython.core.interactiveshell import InteractiveShell

from sklearn.metrics import (    accuracy_score,    confusion_matrix,f1_score,precision_score, recall_score,roc_auc_score)
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold,cross_val_score, StratifiedShuffleSplit

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

class CFG:
    user_seed = 42
    target = 'is_converted'
    
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG.user_seed)


import imblearn
from pycaret.classification import *
from imblearn.over_sampling import RandomOverSampler



from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
from sklearn.preprocessing import FunctionTransformer, StandardScaler, LabelEncoder

In [50]:
train = pd.read_csv("train.csv")
test = pd.read_csv("submission.csv").drop(['id','is_converted'], axis =1) # 테스트 데이터(제출파일의 데이터)

In [51]:
categorical = [feature for feature in train.columns if train[feature].dtype=='O']
numeric = [f for f in train.columns if f not in categorical and f != 'is_converted']

### Make Feature

In [52]:
nation_corp = {
    'Austria': ['LGEAG'],    'Czech Republic': ['LGECZ'],    'France': ['LGEFS'],    'Germany': ['LGEDG'],    'Greece': ['LGEHS'],    'Hungary': ['LGEMK'],    'Italy': ['LGEIS'],    'Netherlands': ['LGESC', 'LGEEH', 'LGEBN'],    'Poland': ['LGEWR', 'LGEPL', 'LGEMA'],    'Portugal': ['LGEPT','LGEBT'],
    'EUs': ['LGEEB'],    'Romania': ['LGERO'],    'Spain': ['LGEES'],    'Sweden': ['LGENO', 'LGESW'],    'United Kingdom': ['LGEUK'],      'Kazakhstan': ['LGEAK'],    'Russia': ['LGERM', 'LGERI', 'LGERA'],
    'Ukraine': ['LGEUR'],    'Latvia': ['LGELV','LGELA'],    'Algeria': ['LGEAS'],
    'Egypt': ['LGEEG'],    'Jordan': ['LGELF'],    'Kenya': ['LGESK','LGEEF'],    'Morocco': ['LGEMC'],
    'Saudi Arabia': ['LGESJ'],    'Iran':['LGEIR'],     'Israel':['LGEYK'],     'The Republic of South Africa': ['LGESA'],
    'Tunisia': ['LGETU'],    'U.A.E': ['LGEOT', 'LGEDF', 'LGEGF', 'LGEME', 'LGEAF'],    'Nigeria': ['LGEAO', 'LGENI'],
    'Turkey': ['LGETK', 'LGEAT'],    'Australia': ['LGEAP'],
    'China': ['LGEQA', 'LGETL', 'LGECH', 'LGEYT', 'LGETR', 'LGETA', 'LGESY', 'LGESH', 'LGEQH', 'LGEQD', 'LGEPN', 'LGEND', 'LGEKS', 'LGEHZ', 'LGEHN', 'LGEHK'],
    'India': ['LGEIL'],    'Indonesia': ['LGEIN'],    'Japan': ['LGEJP'],    'Malaysia': ['LGEML'],    'Philippines': ['LGEPH'],
    'Singapore': ['LGESL'],    'Taiwan': ['LGETT'],    'Korea' :['LGEKR'],    'Thailand': ['LGETH'],    'Vietnam': ['LGEVN','LGEVH'],
     'Canada': ['LGECI'],    'Mexico': ['LGERS', 'LGEMX', 'LGEMS', 'LGEMM'],    'United States': ['LGEMR', 'LGEUS', 'LGEMU', 'LGEAI'],
    'Argentina': ['LGEAG','LGEAR'],    'Brazil': ['LGEBR','LGESP'],    'Chile': ['LGECL'],    'Colombia': ['LGEVZ', 'LGECB'],
    'Panama': ['Guatemala', 'LGEPS'],    'Peru': ['LGEPR']}
continent_nation={
    'Europe':['EUs','Austria', 'Czech Republic' ,'France' ,'Germany', 'Greece' ,'Hungary', 'Italy', 'Netherlands' ,'Poland' ,'Portugal' ,'Romania', 'Spain' ,'Sweden','United Kingdom'], 
    'Russia and CIS':['Kazakhstan','Russia', 'Ukraine', 'Latvia'],     'Africa and MiddleEast': ['Israel','Iran','Algeria', 'Egypt', 'Jordan', 'Kenya', 'Morocco','Saudi Arabia','The Republic of South Africa','Tunisia', 'U.A.E', 'Nigeria', 'Turkey'], 
    'Asia':['Korea','Australia','China','India','Indonesia','Japan','Malaysia','Philippines','Singapore','Taiwan','Thailand','Vietnam'], 
    'NorthAmerica' : ['Canada','Mexico','United States'],    'SouthAmerica' :['Argentina','Brazil','Chile','Colombia','Panama','Peru']
    
}
hemisphere = {
    'Northern': ['EUs', 'Austria', 'Czech Republic', 'France', 'Germany', 'Greece', 'Hungary', 'Italy', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Spain', 'Sweden', 'United Kingdom', 'Kazakhstan', 'Russia', 'Ukraine', 'Latvia', 'Israel', 'Iran', 'Jordan', 'Morocco', 'Saudi Arabia', 'Tunisia', 'Turkey', 'Korea', 'China', 'Japan', 'Taiwan', 'Canada', 'United States', 'Mexico', 'Panama'],
    'Southern': ['Algeria', 'Egypt', 'Kenya', 'The Republic of South Africa', 'U.A.E', 'Nigeria', 'Australia', 'India', 'Indonesia', 'Malaysia', 'Philippines', 'Singapore', 'Thailand', 'Vietnam', 'Argentina', 'Brazil', 'Chile', 'Colombia', 'Peru']
}
mapping_dict = {
#     "Toi muon tim hieu thong tin ky thuat, gia ca cua sp de su dung": "Product Information",
#     "tôi cần tham khảo giá và giải pháp từ LG": "Quotation or Purchase Consultation",
#     "Vui lòng báo giá giúp mình sản phẩm đo thân nhiệt Xin cảm ơn": "Request for quotation or purchase",
#     "LED Signage": "Product Information",
#     "Standalone": "Product Information",
#     "for school": "Other",
#     "Not specified": "Other",
#     "Intégrateur historique du George V": "Other",
#     "Solicito apoyo para realizar cotizacion de los dispositivos que ofrecen en la solución One Quick:": "Quotation or Purchase Consultation",
#     "Pantallas Interactivas para Clinicas": "Product Information",
#     "Hotel TV products": "Product Information",
#     "VRF": "Product Information",
#     "Preciso de um monitor médico para radiografia convencional e tomogrtafia.": "Sales Inquiry",
    "others": "Other",
    "Others": "Other",
    "other_": "Other",
    "other": "Other",
    "Etc.": "ETC.",
#     "window facing product": "Product Information",
#     "Digital platform": "Product Information",
#     "(Select ID_Needs)": "Other",
#     "One Quick:Flex": "Product Information",
#     "AIO": "Product Information",
#     "Needs": "Other",
#     "Hospital TV": "Product Information",
#     "i want to know the details about it": "Product Information",
#     "EDUCATIONAL EQUIPMENTS": "Product Information",
#     "TV interactive": "Product Information",
#     "Hola me pueden cotizar 19 pantallas interactivas de 100 pulgadas entregadas en Guayaquil -Ecuador.": "Request for quotation or purchase",
#     "teach": "Other",
#     "Display Textbook and photos": "Usage or technical consultation",
#     "High inch 86 / 98 or 110": "Product Information",
#     "quotation_": "Request for quotation or purchase",
#     "display product": "Product Information",
#     "first Info and pricing": "Quotation or Purchase Consultation",
#     "estoy buscando para Ecuador este producto LG MAGNIT micro LED, para un cliente de 138 pulgadas, con envió marítimo.": "Sales Inquiry",
#     "Evento_SdelEstero": "Other",
#     "probeam precio": "Sales Inquiry",
#     "media inquiry": "Sales Inquiry",
#     "Video Wall": "Product Information",
}


In [53]:

# 데이터 생성 및 전처리 함수 
def get_datas():
    train = pd.read_csv("train.csv")
    test = pd.read_csv("submission.csv").drop(['id','is_converted'], axis =1) # 테스트 데이터(제출파일의 데이터)
    train['is_converted']=np.where(train['is_converted']==True,1,0)
    return train, test 


def delete_cols(data, cols):
    data = data.drop(columns=cols)
    return data

def log_transform(data,cols):
    for col in cols :
        data[col+'log']=np.log1p(data[col]) 
    return data 


def eda_expected_timeline(df):
    
    def timeline_label(time):
    
        time = str(time).lower().replace(' ','').replace('_','').replace('/','').replace(',','').replace('~','').replace('&','').replace('-','').replace('.','')
        
        if time == 'lessthan3months':
            result = 'less than 3 months'
        elif time == '3months6months':
            result = '3 months ~ 6 months'
        elif time == '6months9months':
            result = '6 months ~ 9 months'
        elif time == '9months1year':
            result = '9 months ~ 1 year'
        elif time == 'morethanayear':
            result = 'more than a year'
        else:
            result = 'aimers_0203'
            
        return result
    
    df['expected_timeline'] = df['expected_timeline'].apply(timeline_label)
    
    return df


# inquiry type 전처리하기 
def eda_inquiry_type(df):
    df['inquiry_type']= df['inquiry_type'].map(mapping_dict).fillna(train['inquiry_type'])
    df.loc[df['inquiry_type'].str.contains('Solicito apoyo para realizar', na=False), 'inquiry_type'] = 'Quotation or Purchase Consultation'
    df['inquiry_type'] = df['inquiry_type'].str.lower()
    replacement = {'/': ' ', '-':' ', '_':' '}
    df['inquiry_type'].replace(replacement, regex=True, inplace=True)
    return df


#customer type 처리 
def customer_type(data):
    data['customer_type']=data['customer_type'].fillna('none') 
    return data

# total_area 변수로 통일
def eda_business_area(df):
    for col in ['business_area','business_subarea']:
        df[col] = df[col].str.lower()
        df[col] = df[col].str.replace(" ", "") 
        df[col] = df[col].str.replace(r'[^\w\s]', "") 
        df[col] = df[col].fillna('nan') 
    df['total_area'] = df['business_area'].astype(str) + df['business_subarea'].astype(str)
    return df 

# 새로운 국가명, 대륙 열을 만들기 
def get_nation_continent(df):
    nation_corp_reverse ={v:k for k , values in nation_corp.items() for v in values }
    df['nation']=df['response_corporate'].map(nation_corp_reverse)
    continent_nation_reverse ={v:k for k , values in continent_nation.items() for v in values }
    df['continent']=df['nation'].map(continent_nation_reverse)
#     df = df.drop('customer_country',axis=1) 
    return df 

#라벨 인코딩 
def label_encoding(series: pd.Series) -> pd.Series:
    my_dict = {}
    series = series.astype(str)
    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)
    return series

# com_reg_ver_win_rate 최빈값으로 채우기 
def com_reg_fill(train,test):
    train['com_reg_ver_win_rate'] = train['com_reg_ver_win_rate'].fillna(train['com_reg_ver_win_rate'].mode()[0])
    test['com_reg_ver_win_rate'] = test['com_reg_ver_win_rate'].fillna(train['com_reg_ver_win_rate'].mode()[0])
    return train,test

#****************************Feature Engineering*************************************#

# area,unit,continent ->comregverwin , fe9 
# area,unit ->ver_win_ratio_per_bu
# unit, continent -> fe1 
# owner ,unit  -> fe3 
# nation, job -> fe10 

# 회사별 Quotation or Purchase Consultation,Request for Partnership의 횟수
# PortugalChina U.A.E  United States Argentina
# 회사별 관료직, 혹은 성공률 높은 이들의 빈도 
# unit, position 로 그룹화   x 
# 국가별 지점의 개수  
# 국가별 성공률이 0.1이 넘는 국가 1 나머지 0 


def fe_1(train,test):
    # unit continent으로 엮어서 영업 전환율 살펴보기 -> 'unit_conti_mean'열 새로 생성
    # 대륙별로 어느 사업부에 영업 성공율이 높은 지 
    se=train.groupby(['business_unit','continent'])['is_converted'].agg(['mean'])
    se = se.rename(columns={'mean':'unit_conti_mean'})
    train =train.merge(se, on=['business_unit','continent'], how ='left')
    test =test.merge(se, on=['business_unit','continent'], how ='left')
    return train,test 

def fe_2(train,test):
    # 영업 당담자가 어느 정도로 다양한 회사(customer_idx)을 담당하고 있는 지 
    
#     count = train.groupby('lead_owner').size().reset_index(name='leadowner_cnt')     
#     train = train.merge(count, on='lead_owner', how='left')
#     train['leadowner_cnt']= np.log1p(train['leadowner_cnt'])
#     test = test.merge(count, on='lead_owner', how= 'left')
#     test['leadowner_cnt']=np.log1p(test['leadowner_cnt'])
    unique_count = train.groupby('lead_owner')['customer_idx'].nunique().reset_index(name='unique_cusidx_cnt')
    train = train.merge(unique_count, on='lead_owner', how='left')
    test = test.merge(unique_count,on ='lead_owner',how ='left')
    train['unique_cusidx_cnt']= np.log1p(train['unique_cusidx_cnt'])
    test['unique_cusidx_cnt']= np.log1p(test['unique_cusidx_cnt'])
    
    return train, test 

def fe_3(train,test):
    # 영업담당자와 사업부로 영업전환 성공률 살펴보기 -> 어느 사업부를 어느 담당자가 담당해야 성공율이 높나 확인 

    se = train.groupby(['lead_owner','business_unit'])['is_converted'].agg(['mean']).rename(columns={'mean': 'owner_unit_mean'})
    train = train.merge(se, on=['lead_owner','business_unit'], how='left')
    test = test.merge(se, on=['lead_owner','business_unit'],how='left')
    return train, test

def fe_4(train,test):
    # customer_idx가 대기업, 중소기업으로 분류되는 경우 1을 부여 
    se = train[train.groupby('customer_idx')['enterprise'].transform('nunique') > 1]
    multi_company=list(se['customer_idx'].unique())
    train['multi_company']=np.where(train['customer_idx'].isin(multi_company) ,1,0)
    test['multi_company']=np.where(test['customer_idx'].isin(multi_company) ,1,0)
    return train, test

def fe_5(train,test):
    # LG지점 , 사업부 , bantsubmit으로 영업 성공율 살펴보기 -> 너무 과적합됨으로 제외 
    se = train.groupby(['response_corporate','business_unit','bant_submit'])['is_converted'].agg(['mean']).rename(columns={'mean':'idx_unit_mean'})
    train=train.merge(se,on=['response_corporate','business_unit','bant_submit'], how ='left')
    test=test.merge(se,on=['response_corporate','business_unit','bant_submit'], how ='left')
    return train, test
def fe_6(train,test):
    # 영업사원, 사업부, bandsubmit 으로 영업 성공율 살펴보기 -> 과적합으로 제외 
    se = train.groupby(['lead_owner','business_unit','bant_submit'])['is_converted'].agg(['mean']).rename(columns={'mean':'idx_unit_mean'})
    train=train.merge(se,on=['lead_owner','business_unit','bant_submit'], how ='left')
    test =test.merge(se,on =['lead_owner','business_unit','bant_submit'], how ='left')
    return train, test


def fe_7(train,test):
    # bant submit 제곱하기 -> isconverted와 corr는 더 높지만 성능향상은 없음 
    train['bant_submit']=train['bant_submit']*train['bant_submit']
    test['bant_submit']=test['bant_submit']*test['bant_submit']
    return train, test


def fe_8(df):
    # 국가별로 북반구와 남반구 특성을 생성하기 
    hemisphere_reverse ={v:k for k , values in hemisphere.items() for v in values }
    df['hemisphere'] =df['nation'].map(hemisphere_reverse)
    return df 

def fe_9(train,test):
    # 대륙별,사업 분야별, 사업부로 영업 성공률 살피기 
    se=train.groupby(['business_area','business_unit','continent'])['is_converted'].agg(['mean'])
    se = se.rename(columns={'mean':'area_unit_conti_mean'})
    train =train.merge(se, on=['business_area','business_unit','continent'], how ='left')
    test =test.merge(se, on=['business_area','business_unit','continent'], how ='left')
    return train, test

def fe_10(train,test):
    #국가별, 고객의 직업에 따라서 영업 성공율 살펴보기 
    se =train.groupby(['nation','customer_job'])['is_converted'].agg(['mean']).rename(columns={'mean':'nat_job_mean'})
    
    train =train.merge(se, on =['nation','customer_job'], how = 'left')
    train['nat_job_mean']=train['nat_job_mean'].fillna(train['nat_job_mean'].mean())
    
    test =test.merge(se, on =['nation','customer_job'], how = 'left')
    test['nat_job_mean']=test['nat_job_mean'].fillna(train['nat_job_mean'].mean())
    return train,test 



# def fe_11(train,test):
#     se1 =train.groupby(['business_unit','business_area'])['is_converted'].agg(['mean']).rename(columns={'mean':'new_perbu'})
#     train =train.merge(se1,on=['business_unit','business_area'], how='left')
#     test =test.merge(se1,on=['business_unit','business_area'], how='left')
#     train = train.drop('ver_win_ratio_per_bu', axis = 1)
    
#     test = test.drop('ver_win_ratio_per_bu', axis = 1)
#     return train,test 

def fe_12(train,test):
    train['com_product'] = train['product_category'].apply(lambda x: 1 if 'signage' in str(x) else 0)
    
    se= train.groupby(['customer_idx'])['com_product'].agg(['mean']).rename(columns={'mean':'com_prod_mean'})
    train = train.merge(se, on =['customer_idx'], how ='left')
    test = test.merge(se, on =['customer_idx'], how ='left')
    train= train.drop('com_product', axis= 1 )
    return train,test
    
def fe_13(train,test):
    se = train.groupby(['nation', 'inquiry_type'])['is_converted'].agg(['mean']).rename(columns={'mean':'nat_inquiry_type_mean'})
    train =train.merge(se,on=['nation', 'inquiry_type'],how='left')
    train['nat_inquiry_type_mean']=train['nat_inquiry_type_mean'].fillna(train['nat_inquiry_type_mean'].mean())
    
    test =test.merge(se,on=['nation', 'inquiry_type'],how='left')
    test['nat_inquiry_type_mean']=test['nat_inquiry_type_mean'].fillna(train['nat_inquiry_type_mean'].mean())
    return train,test

def fe_14(train,test):
    se =train.groupby(['nation'])['response_corporate'].agg(['count']).rename(columns={'count':'nr_count'})
    train = train.merge(se, on='nation', how='left')
    test  = test.merge(se,on='nation',how='left')
    return train,test 


def fe_15(train,test):
    se =train.groupby(['business_unit','continent','customer_position'])['is_converted'].agg(['mean']).rename(columns={'mean':'unit_conti_pos'})
    
    train =train.merge(se, on =['business_unit','continent','customer_position'], how = 'left')
    train['unit_conti_pos']=train['unit_conti_pos'].fillna(train['unit_conti_pos'].mean())
    test =test.merge(se, on =['business_unit','continent','customer_position'], how = 'left')
    test['unit_conti_pos']=test['unit_conti_pos'].fillna(train['unit_conti_pos'].mean())
    return train,test

def fe_16(train,test):
    se =train.groupby(['continent','total_area'])['is_converted'].agg(['mean']).rename(columns={'mean':'total_conti_pos'})
    train =train.merge(se, on =['continent','total_area'], how = 'left')
    train['total_conti_pos']=train['total_conti_pos'].fillna(train['total_conti_pos'].mean())
    test =test.merge(se, on =['continent','total_area'], how = 'left')
    test['total_conti_pos']=test['total_conti_pos'].fillna(train['total_conti_pos'].mean())
    
    return train,test

def fe_17(train,test):
    train['good']=0 
    test['good'] = 0 
    train.loc[train['nation'].isin(['Taiwan', 'Latvia','Czech Republic','China','Romania','Morocco','Portugal','Thailand','Argentina','U.A.E','United States']), 'good'] = 1
    test.loc[test['nation'].isin(['Taiwan', 'Latvia','Czech Republic','China','Romania','Morocco','Portugal','Thailand','Argentina','U.A.E','United States']), 'good'] = 1
    return train,test 

def fe_18(train,test, col1,col2):
    time_avg = train[[col2, 'is_converted']].groupby(col2).mean()
    time_avg.columns = [f'{col2}_avg']

    timeline = train.loc[train[col2] != 'weoif', col1 + [col2]]
    timeline['cnt'] = 1
    timeline_se = timeline.groupby(col1 + [col2]).count()
    timeline_se.reset_index(inplace =True)
    temp2 = pd.merge(timeline_se, time_avg, how = 'left' , on=[col2])
    temp2['multip'] = temp2['cnt'] * temp2[f'{col2}_avg']
    temp2 = temp2.groupby(col1).sum().reset_index().drop([f'{col2}_avg'], axis =1)

    temp2[f'{col2}_mean'] = temp2['multip'] / temp2['cnt']
    temp2.drop(['multip','cnt'], axis=1 , inplace= True)

    train= pd.merge(train, temp2, how ='left' , on=col1)
    test= pd.merge(test, temp2, how ='left' , on=col1)
    return train,test 


def create_grouped_features(train, test, group, numeric_var):
    # 범주형 특성들에 대해서 다른 수치형 데이터의 중앙값, 최대, 합을 새로운 열로 추가하기 
    train = train.copy()
    test = test.copy()
    aggs = ['median', 'max','sum']
    for agg in aggs:
        # groupby 후 aggregation
        a1 = train.groupby([group])[numeric_var].agg(agg).to_dict()
        # 새로운 feature 생성
        train[numeric_var+'_'+group+'_'+agg] = train[group].map(a1)
        test[numeric_var+'_'+group+'_'+agg] = test[group].map(a1)
    return train, test

def do_scale(train,test, scale_cols) :
    for c in scale_cols:
        min_value = train[c].min()
        max_value = train[c].max()
        train[c+'sc'] = (train[c] - min_value) / (max_value - min_value)
        test[c+'sc'] = (test[c] - min_value) / (max_value - min_value)
    return train,test



### 모델생성

In [54]:
groups = ['business_unit','customer_idx']
numeric_vars = ['historical_existing_cnt', 'lead_desc_length']
scale_cols = ['com_reg_ver_win_rate','historical_existing_cnt', 'lead_desc_length','ver_win_rate_x'] 

In [55]:
# data 갖고오기 
train,test= get_datas() 

# 스케일링 하기 
train,test =do_scale(train,test,scale_cols)
# 범주형 데이터에 대해 수치형 데이터 통계값 추가
for group in groups:
    for numeric_var in numeric_vars:
        train, test = create_grouped_features(train, test, group, numeric_var)
        
        
# 전처리, 로그변환 수행하기 
columns_to_log=['com_reg_ver_win_rate','lead_desc_length']
train,test= log_transform(train,columns_to_log ),log_transform(test,columns_to_log)
train,test =eda_business_area(train),eda_business_area(test)
train,test= get_nation_continent(train),get_nation_continent(test)
train,test=eda_expected_timeline(train) ,eda_expected_timeline(test)
train,test=customer_type(train) ,customer_type(test)
train,test=eda_inquiry_type(train) ,eda_inquiry_type(test)

# Feature Engineering 
train,test = fe_1(train,test)
train,test = fe_2(train,test)
train,test = fe_3(train,test)
train,test = fe_9(train,test)
train,test = fe_18(train,test, ['continent', 'bant_submit'],'inquiry_type')


for col in ['customer_idx','customer_type',]:
    train[col+'count'] =train[col].map(train[col].value_counts())
    test[col+'count'] =test[col].map(train[col].value_counts())



In [56]:
columns_to_delete=['nation']
train,test =delete_cols(train, columns_to_delete), delete_cols(test,columns_to_delete)

cols = [     'customer_country',    "business_subarea",    "business_area",    "business_unit",    "customer_type",    "enterprise",    "customer_job",    "product_category",    "product_subcategory",    "product_modelname",    "customer_position",
      'customer_country.1', "response_corporate",  
     "expected_timeline",
'nation','continent',
'total_area']
label_columns =list(set(cols)-set(columns_to_delete))

from category_encoders import CatBoostEncoder
enc = CatBoostEncoder(cols=label_columns)
enc.fit(train[label_columns], train['is_converted'])  # 'target'은 실제 데이터의 타겟 변수 이름에 맞게 변경
# 인코딩 적용
train[label_columns] = enc.transform(train[label_columns])
test[label_columns] = enc.transform(test[label_columns])

In [57]:
train = train.fillna(0)
test = test.fillna(0)

In [74]:
train.dtypes == ''

array([dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('int64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('int64'), dtype('O'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('int64'), dtype('int64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('int64'), dtype('int64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('int64'), dtype('int64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('int64'), dtype('int64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64

In [69]:
numeric = ['bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type',
       'enterprise', 'historical_existing_cnt', 'id_strategic_ver',
       'it_strategic_ver', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'inquiry_type', 'product_category',
       'product_subcategory', 'product_modelname', 'customer_country.1',
       'customer_position', 'response_corporate', 'expected_timeline',
       'ver_cus', 'ver_pro', 'ver_win_rate_x', 'ver_win_ratio_per_bu',
       'business_area', 'business_subarea', 'lead_owner',
       'com_reg_ver_win_ratesc', 'historical_existing_cntsc',
       'lead_desc_lengthsc', 'ver_win_rate_xsc',
       'historical_existing_cnt_business_unit_median',
       'historical_existing_cnt_business_unit_max',
       'historical_existing_cnt_business_unit_sum',
       'lead_desc_length_business_unit_median',
       'lead_desc_length_business_unit_max',
       'lead_desc_length_business_unit_sum',
       'historical_existing_cnt_customer_idx_median',
       'historical_existing_cnt_customer_idx_max',
       'historical_existing_cnt_customer_idx_sum',
       'lead_desc_length_customer_idx_median',
       'lead_desc_length_customer_idx_max',
       'lead_desc_length_customer_idx_sum', 'com_reg_ver_win_ratelog',
       'lead_desc_lengthlog', 'total_area', 'continent',
       'unit_conti_mean', 'unique_cusidx_cnt', 'owner_unit_mean',
       'area_unit_conti_mean', 'inquiry_type_mean', 'customer_idxcount',
       'customer_typecount']

In [72]:

my_model = setup(session_id=CFG.user_seed, 
                 data=train, 
                 target=CFG.target, 
                 normalize=False, normalize_method='zscore',
                 transformation=False,  
                 numeric_features = numeric,
                 fold_strategy='stratifiedkfold',
                 fold=10,
                 fold_shuffle=True,
                 fix_imbalance = True, # data imbalance 를 sampling method로 보정
                 fix_imbalance_method = imblearn.over_sampling.RandomOverSampler(),
                 use_gpu=True)

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bi

[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1


[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'sales inquiry'