In [20]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

def get_clf_eval(y_test, y_pred=None):
    confusion = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ['T[0]', 'F[1]'], columns = ['pred_T[0]', 'pred_F[1]'])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [22]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [23]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/submission.csv')
df_all = pd.concat([df_train, df_test])
df_all.head()

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted,id
0,1.0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,1.0,
1,1.0,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,1.0,
2,1.0,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,1.0,
3,1.0,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,...,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,1.0,
4,1.0,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,1.0,


### 나라 전처리

In [24]:
df_all['customer_country'] = df_all['customer_country'].str.lower()
country = df_all['customer_country'].apply(lambda x : str(x).split('/')[-1].strip()).unique()
country = pd.DataFrame(country, columns=['country'])
country

Unnamed: 0,country
0,philippines
1,india
2,nigeria
3,saudi arabia
4,singapore
...,...
545,233 south beaudry avenue los angeles ca
546,nj
547,"3 center plz suite 330 boston, ma 02108"
548,nm


In [25]:
country['country'] = country['country'].apply(lambda x : 'usa' if 'united states' in x else x)
country['country'] = country['country'].apply(lambda x : 'usa' if 'usa' in x else x)
country['country'] = country['country'].apply(lambda x : 'etc' if '@' in x else x)
country['country'] = country['country'].replace(['us', 'usa'], 'usa')
country.drop_duplicates(inplace= True)
country.reset_index(drop = True, inplace = True)

In [26]:
# // [-1] 만 전처리한 것
country1 = pd.read_csv('country1.csv', encoding = 'cp949')
country['country1'] = country1['country1']
country['country1'].fillna(country['country'], inplace = True)
country

Unnamed: 0,country,country1
0,philippines,philippines
1,india,india
2,nigeria,nigeria
3,saudi arabia,saudi arabia
4,singapore,singapore
...,...,...
423,233 south beaudry avenue los angeles ca,usa
424,nj,nj
425,"3 center plz suite 330 boston, ma 02108",usa
426,nm,nm


In [27]:
df_all['country'] = df_all['customer_country'].apply(lambda x : str(x).split('/')[-1].strip())
df_all['country'] = df_all['country'].apply(lambda x : 'usa' if 'united states' in x else x)
df_all['country'] = df_all['country'].apply(lambda x : 'usa' if 'usa' in x else x)
df_all['country'] = df_all['country'].apply(lambda x : 'etc' if '@' in x else x)
df_all['country'] = df_all['country'].replace(['us', 'usa'], 'usa')
df_all[['customer_country','country']]

Unnamed: 0,customer_country,country
0,/quezon city/philippines,philippines
1,/ph-00/philippines,philippines
2,/kolkata /india,india
3,/bhubaneswar/india,india
4,/hyderabad/india,india
...,...,...
5266,/são paulo/brazil,brazil
5267,general / / united states,usa
5268,/ ouro branco / brazil,brazil
5269,/ / germany,germany


In [28]:
# df_all['country'].unique() == country['country'].values # 전부 True

In [29]:
df_all = pd.merge(df_all, country, how = 'left', on = 'country')
df_all.drop(columns = 'country', inplace = True)
df_all[['customer_country','country1']]

Unnamed: 0,customer_country,country1
0,/quezon city/philippines,philippines
1,/ph-00/philippines,philippines
2,/kolkata /india,india
3,/bhubaneswar/india,india
4,/hyderabad/india,india
...,...,...
64565,/são paulo/brazil,brazil
64566,general / / united states,usa
64567,/ ouro branco / brazil,brazil
64568,/ / germany,germany


In [30]:
temp = df_all[['customer_country', 'country1']]

In [31]:
temp['country1'].unique()

array(['philippines', 'india', 'nigeria', 'saudi arabia', 'singapore',
       'brazil', 'uae', 'south africa', 'usa', 'colombia', 'mexico',
       'ghana', 'egypt', 'congo', 'ethiopia', 'australia', 'nan', 'kenya',
       'indonesia', 'oman', 'pakistan', 'united kingdom', 'guatemala',
       'panama', 'canada', 'bangladesh', 'papua new guinea',
       'united republic of tanzania', 'qatar', 'afghanistan', 'chile',
       'mozambique', 'turkey', 'el salvador', 'togo', 'jordan', 'iraq',
       'israel', 'sri lanka', 'south korea', 'portugal', 'mauritania',
       'uruguay', 'peru', 'germany', 'romania', 'norway', 'jamaica', '',
       'hungary', 'poland', 'czech', 'spain', 'argentina', 'ecuador',
       'senegal', 'hong kong', 'malaysia', 'japan', 'kuwait', 'ireland',
       'albania', 'greece', 'algeria', 'nicaragua', 'slovenia', 'italy',
       'netherlands', 'dominican republic', 'france', 'etc', 'uganda',
       'iran', 'paraguay', 'bolivia', 'namibia', 'tunisia', 'puerto rico',
    

In [32]:
unknown = ['', 'br', 'nd', 'country', '5555', 'a', '48201', 'rj', 'ny', 'ne', 'nj', 'nm']

In [33]:
# temp[temp['country1'].isin(unknown)].drop_duplicates().to_csv('unknown_country.csv', index = 0)

In [34]:
unknown_country = pd.read_csv('unknown_country.csv', encoding='cp949')
unknown_country.head()

Unnamed: 0,customer_country,country1
0,//,etc
1,18000 w 9 mile rd /southfield/,usa
2,//india//,india
3,9365 counselors row /indianapolis/,usa
4,//india// maharashtra/,inda


In [35]:
dict_b = pd.Series(unknown_country.country1.values, index = unknown_country.customer_country).to_dict()

temp['country1'] = temp['customer_country'].map(dict_b).fillna(temp['country1'])
temp

Unnamed: 0,customer_country,country1
0,/quezon city/philippines,philippines
1,/ph-00/philippines,philippines
2,/kolkata /india,india
3,/bhubaneswar/india,india
4,/hyderabad/india,india
...,...,...
64565,/são paulo/brazil,brazil
64566,general / / united states,usa
64567,/ ouro branco / brazil,brazil
64568,/ / germany,germany


In [36]:
df_all['country1'] = temp['country1']

In [37]:
df_all.head()

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted,id,country1
0,1.0,/quezon city/philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,1,0,0.003079,0.026846,corporate / office,Engineering,0,1.0,,philippines
1,1.0,/ph-00/philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,1,0,0.003079,0.026846,corporate / office,Advertising,1,1.0,,philippines
2,1.0,/kolkata /india,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,1,0,0.003079,0.026846,corporate / office,Construction,2,1.0,,india
3,1.0,/bhubaneswar/india,AS,0.088889,4919,End-Customer,Enterprise,,,,...,1,0,0.003079,0.026846,corporate / office,IT/Software,3,1.0,,india
4,1.0,/hyderabad/india,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,0,0,0.003079,0.026846,corporate / office,,4,1.0,,india


In [43]:
df_all['country1'].unique()

array(['philippines', 'india', 'nigeria', 'saudi arabia', 'singapore',
       'brazil', 'uae', 'south africa', 'usa', 'colombia', 'mexico',
       'ghana', 'egypt', 'congo', 'ethiopia', 'australia', 'nan', 'kenya',
       'indonesia', 'oman', 'pakistan', 'united kingdom', 'guatemala',
       'panama', 'canada', 'bangladesh', 'papua new guinea',
       'united republic of tanzania', 'qatar', 'afghanistan', 'chile',
       'mozambique', 'turkey', 'el salvador', 'togo', 'jordan', 'iraq',
       'israel', 'sri lanka', 'south korea', 'portugal', 'mauritania',
       'uruguay', 'peru', 'germany', 'romania', 'norway', 'jamaica',
       'etc', 'hungary', 'poland', 'czech', 'spain', 'argentina',
       'ecuador', 'senegal', 'hong kong', 'malaysia', 'japan', 'kuwait',
       'ireland', 'albania', 'greece', 'algeria', 'nicaragua', 'slovenia',
       'italy', 'netherlands', 'dominican republic', 'france', 'uganda',
       'iran', 'paraguay', 'bolivia', 'namibia', 'tunisia', 'puerto rico',
       '

In [7]:
# col = ['customer_country', 'customer_idx', 'customer_job', 'product_category', 
#        'product_subcategory', 'product_modelname', 'customer_country.1', 'expected_timeline', 
#        'business_area', 'business_subarea', 'lead_owner']
drop_col = ['customer_country.1', 'customer_idx']
df_all.drop(columns=drop_col, inplace = True)

In [8]:
# 레이블 인코딩할 칼럼들
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]


for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

In [9]:
df_train = df_all.iloc[: len(df_train)]
df_test = df_all.iloc[len(df_train) :]

In [12]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [13]:
# 언더 샘플링
X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_resample(df_train.drop(["is_converted", 'id'], axis=1), df_train["is_converted"].astype(int))

x_train, x_val, y_train, y_val = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=400,
)

In [7]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop(["is_converted", 'id'], axis=1),
    df_train["is_converted"].astype(int),
    test_size=0.2,
    stratify = df_train["is_converted"],
    random_state=400,
)

In [14]:
y_train.sum(), y_train.count() # 언더샘플링 됐는지 확인

(3884, 7760)

## 모델 학습

In [55]:
model = DecisionTreeClassifier(random_state = 42)
model.fit(x_train.fillna(0), y_train)

y_pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

오차행렬:
 [[817 149]
 [166 808]]

정확도: 0.8376
정밀도: 0.8311
재현율: 0.8458
F1: 0.8384


In [56]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = model.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

(2228, 5271)

###  랜덤포레스트

In [68]:
rf_clf = RandomForestClassifier(random_state=42, n_estimators=500)
rf_clf.fit(x_train.fillna(0), y_train)

y_pred = rf_clf.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

오차행렬:
 [[907  59]
 [142 832]]

정확도: 0.8964
정밀도: 0.8646
재현율: 0.9389
F1: 0.9002


In [59]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = rf_clf.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

(2021, 5271)

In [None]:
help(RandomForestClassifier)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42)
param_grid = {'n_estimators' : np.arange(100, 501, 100),  'max_samples' : [0.8, 1], 'max_features' : [0.8, 1],
              'class_weight' : [{0: 5, 1:1}, {0:1, 1:1}],  'criterion' : ["gini", "entropy", "log_loss"]}

grid_rf_clf = GridSearchCV(rf_clf, param_grid, verbose = True, scoring='f1')
grid_rf_clf.fit(x_train.fillna(0), y_train)

print('최적의 파라미터 :', grid_rf_clf.best_params_) # 9, 1, 400

In [69]:
y_pred = grid_rf_clf.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

오차행렬:
 [[906  60]
 [139 835]]

정확도: 0.8974
정밀도: 0.8670
재현율: 0.9379
F1: 0.9010


In [70]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = grid_rf_clf.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

(1990, 5271)

### 여러 분류기 실험

In [17]:
y_train.sum(), y_train.count()

(3884, 7760)

In [47]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

bag = BaggingClassifier(random_state=42)
bag.fit(x_train.fillna(0), y_train)

y_pred = bag.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = bag.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

오차행렬:
 [[832 134]
 [135 839]]

정확도: 0.8613
정밀도: 0.8604
재현율: 0.8613
F1: 0.8608


(1790, 5271)

In [28]:
help(BaggingClassifier())

Help on BaggingClassifier in module sklearn.ensemble._bagging object:

class BaggingClassifier(sklearn.base.ClassifierMixin, BaseBagging)
 |  BaggingClassifier(estimator=None, n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0, base_estimator='deprecated')
 |  
 |  A Bagging classifier.
 |  
 |  A Bagging classifier is an ensemble meta-estimator that fits base
 |  classifiers each on random subsets of the original dataset and then
 |  aggregate their individual predictions (either by voting or by averaging)
 |  to form a final prediction. Such a meta-estimator can typically be used as
 |  a way to reduce the variance of a black-box estimator (e.g., a decision
 |  tree), by introducing randomization into its construction procedure and
 |  then making an ensemble out of it.
 |  
 |  This algorithm encompasses several works from the literature. When random
 |  subsets

In [32]:
from sklearn.ensemble import BaggingClassifier
bag = BaggingClassifier(random_state=42, oob_score = True)
param_grid = {'n_estimators' : np.arange(200, 501, 100),  'max_samples' : [0.8], 'max_features' : [0.8]}

grid_bag = GridSearchCV(bag, param_grid, verbose = True, scoring='f1')
grid_bag.fit(x_train.fillna(0), y_train)

print('최적의 파라미터 :', grid_bag.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
최적의 파라미터 : {'max_features': 0.8, 'max_samples': 0.8, 'n_estimators': 500}


In [46]:
y_pred = grid_bag.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = grid_bag.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

오차행렬:
 [[915  51]
 [134 840]]

정확도: 0.9046
정밀도: 0.8723
재현율: 0.9472
F1: 0.9082


(2003, 5271)

In [48]:
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(x_train.fillna(0), y_train)

y_pred = rf_clf.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = rf_clf.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

오차행렬:
 [[904  62]
 [138 836]]

정확도: 0.8969
정밀도: 0.8676
재현율: 0.9358
F1: 0.9004


(2036, 5271)

In [45]:
rf_clf = RandomForestClassifier(random_state=42, n_estimators=300, criterion = 'entropy', 
                                max_features = 0.8, max_samples = 0.8, oob_score = True)
rf_clf.fit(x_train.fillna(0), y_train)

y_pred = rf_clf.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = rf_clf.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

오차행렬:
 [[909  57]
 [141 833]]

정확도: 0.8979
정밀도: 0.8657
재현율: 0.9410
F1: 0.9018


(2020, 5271)

In [49]:
ada = AdaBoostClassifier(random_state=42)
ada.fit(x_train.fillna(0), y_train)

y_pred = ada.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = ada.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

오차행렬:
 [[787 179]
 [196 778]]

정확도: 0.8067
정밀도: 0.8006
재현율: 0.8147
F1: 0.8076


(1745, 5271)

In [37]:
help(AdaBoostClassifier())

Help on AdaBoostClassifier in module sklearn.ensemble._weight_boosting object:

class AdaBoostClassifier(sklearn.base.ClassifierMixin, BaseWeightBoosting)
 |  AdaBoostClassifier(estimator=None, *, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None, base_estimator='deprecated')
 |  
 |  An AdaBoost classifier.
 |  
 |  An AdaBoost [1] classifier is a meta-estimator that begins by fitting a
 |  classifier on the original dataset and then fits additional copies of the
 |  classifier on the same dataset but where the weights of incorrectly
 |  classified instances are adjusted such that subsequent classifiers focus
 |  more on difficult cases.
 |  
 |  This class implements the algorithm known as AdaBoost-SAMME [2].
 |  
 |  Read more in the :ref:`User Guide <adaboost>`.
 |  
 |  .. versionadded:: 0.14
 |  
 |  Parameters
 |  ----------
 |  estimator : object, default=None
 |      The base estimator from which the boosted ensemble is built.
 |      Support for sampl

In [52]:
ada = AdaBoostClassifier(random_state=42)
param_grid = {'n_estimators' : [800, 900, 1000, 1100], 'algorithm' : ['SAMME.R'], 'learning_rate' : [1]}

grid_ada = GridSearchCV(ada, param_grid, verbose = True, scoring='f1')
grid_ada.fit(x_train.fillna(0), y_train)

print('최적의 파라미터 :', grid_ada.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
최적의 파라미터 : {'algorithm': 'SAMME.R', 'learning_rate': 1, 'n_estimators': 1000}


In [53]:
y_pred = grid_ada.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = grid_ada.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

오차행렬:
 [[847 119]
 [174 800]]

정확도: 0.8490
정밀도: 0.8296
재현율: 0.8768
F1: 0.8525


(1405, 5271)

In [57]:
xgb = XGBClassifier(random_state=42)
xgb.fit(x_train.fillna(0), y_train)

y_pred = xgb.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = xgb.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

오차행렬:
 [[902  64]
 [126 848]]

정확도: 0.9021
정밀도: 0.8774
재현율: 0.9337
F1: 0.9047


(1818, 5271)

In [55]:
xgb = XGBClassifier(random_state=42)
param_grid = {'n_estimators' : [400, 500, 600, 700], 'learning_rate' : np.arange(0.07, 0.12, 0.02),
             'subsample' : [0.8], 'colsample_bytree' : [0.8]}

grid_xgb = GridSearchCV(xgb, param_grid, verbose = True, scoring='f1')
grid_xgb.fit(x_train.fillna(0), y_train)

print('최적의 파라미터 :', grid_xgb.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
최적의 파라미터 : {'colsample_bytree': 0.8, 'learning_rate': 0.09000000000000001, 'n_estimators': 500, 'subsample': 0.8}


In [56]:
y_pred = grid_xgb.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = grid_xgb.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

오차행렬:
 [[901  65]
 [122 852]]

정확도: 0.9036
정밀도: 0.8807
재현율: 0.9327
F1: 0.9060


(1774, 5271)

In [58]:
cat = CatBoostClassifier(random_state=42, verbose=False)
cat.fit(x_train.fillna(0), y_train)

y_pred = cat.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = cat.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

오차행렬:
 [[906  60]
 [123 851]]

정확도: 0.9057
정밀도: 0.8805
재현율: 0.9379
F1: 0.9083


(1920, 5271)

In [66]:
cat = CatBoostClassifier(random_state=42, verbose = False)
param_grid = {'iterations' : [700, 800, 900, 1000], 'learning_rate' : np.arange(0.07, 0.14, 0.02),
             'subsample' : [0.8], 'colsample_bylevel' : [0.8]}

grid_cat = GridSearchCV(cat, param_grid, verbose = True, scoring='f1')
grid_cat.fit(x_train.fillna(0), y_train.astype(int))

print('최적의 파라미터 :', grid_cat.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
최적의 파라미터 : {'colsample_bylevel': 0.8, 'iterations': 800, 'learning_rate': 0.09000000000000001, 'subsample': 0.8}


In [67]:
y_pred = grid_cat.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = grid_cat.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

오차행렬:
 [[902  64]
 [118 856]]

정확도: 0.9062
정밀도: 0.8843
재현율: 0.9337
F1: 0.9084


(1833, 5271)

In [68]:
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(x_train.fillna(0), y_train)

y_pred = lgbm.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = lgbm.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

오차행렬:
 [[914  52]
 [125 849]]

정확도: 0.9088
정밀도: 0.8797
재현율: 0.9462
F1: 0.9117


(1614, 5271)

In [None]:
lgbm = LGBMClassifier(random_state = 42, verbose = 0, force_row_wise = True)
param_grid = {'learning_rate': np.arange(0.07, 0.14, 0.02), 'n_estimators': [300, 400, 500, 600],
              'colsample_bytree': [0.8, 1.0],'subsample': [0.8, 1.0]}

grid_lgbm = GridSearchCV(lgbm, param_grid, verbose = True, scoring='f1')
grid_lgbm.fit(x_train.fillna(0), y_train)

print('최적의 파라미터 :', grid_lgbm.best_params_)

In [27]:
vote = VotingClassifier(estimators= [('xgb',xgb), ('rf_clf', rf_clf), ('cat', cat), ('lgbm', lgbm)], voting= 'soft')
vote.fit(x_train.fillna(0), y_train)

y_pred = vote.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

오차행렬:
 [[921  45]
 [124 850]]

정확도: 0.9129
정밀도: 0.8813
재현율: 0.9534
F1: 0.9160
