In [33]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
import matplotlib.pyplot as plt
import seaborn as sns

In [50]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [34]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [35]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/submission.csv')
df_all = pd.concat([df_train, df_test])
df_all.head()

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted,id
0,1.0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,1.0,
1,1.0,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,1.0,
2,1.0,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,1.0,
3,1.0,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,...,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,1.0,
4,1.0,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,1.0,


In [73]:
temp = pd.read_csv('data/train.csv')['customer_country'].str.split('/', n=2, expand=True)
temp

Unnamed: 0,0,1,2
0,,Quezon City,Philippines
1,,PH-00,Philippines
2,,Kolkata,India
3,,Bhubaneswar,India
4,,Hyderabad,India
...,...,...,...
59294,,Sląskie,Poland
59295,,Bogotá DC,Colombia
59296,,Pisco,Peru
59297,,santa cruz bolivia,Peru


In [74]:
temp.isnull().sum()

0     982
1    2181
2    2728
dtype: int64

In [36]:
# col = ['customer_country', 'customer_idx', 'customer_job', 'product_category', 
#        'product_subcategory', 'product_modelname', 'customer_country.1', 'expected_timeline', 
#        'business_area', 'business_subarea', 'lead_owner']
drop_col = ['customer_country.1', 'customer_idx']
df_all.drop(columns=drop_col, inplace = True)

In [37]:
# 레이블 인코딩할 칼럼들
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]


for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

In [38]:
df_train = df_all.iloc[: len(df_train)]
df_test = df_all.iloc[len(df_train) :]

In [54]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# 언더 샘플링
X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_resample(df_train.drop(["is_converted", 'id'], axis=1), df_train["is_converted"].astype(int))

x_train, x_val, y_train, y_val = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=400,
)

In [7]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop(["is_converted", 'id'], axis=1),
    df_train["is_converted"].astype(int),
    test_size=0.2,
    stratify = df_train["is_converted"],
    random_state=400,
)

In [57]:
y_train.sum(), y_train.count() # 언더샘플링 됐는지 확인

(3884, 7760)

## 모델 학습

In [55]:
model = DecisionTreeClassifier(random_state = 42)
model.fit(x_train.fillna(0), y_train)

y_pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

오차행렬:
 [[817 149]
 [166 808]]

정확도: 0.8376
정밀도: 0.8311
재현율: 0.8458
F1: 0.8384


In [56]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = model.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

(2228, 5271)

###  랜덤포레스트

In [68]:
rf_clf = RandomForestClassifier(random_state=42, n_estimators=500)
rf_clf.fit(x_train.fillna(0), y_train)

y_pred = rf_clf.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

오차행렬:
 [[907  59]
 [142 832]]

정확도: 0.8964
정밀도: 0.8646
재현율: 0.9389
F1: 0.9002


In [59]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = rf_clf.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

(2021, 5271)

In [None]:
help(RandomForestClassifier)

In [64]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42)
param_grid = {'n_estimators' : np.arange(100, 500, 100),  'max_samples' : [0.8, 1], 'max_features' : [0.8, 1],
              'class_weight' : [{0: 5, 1:1}, {0:1, 1:1}],  'criterion' : ["gini", "entropy", "log_loss"]}

grid_rf_clf = GridSearchCV(rf_clf, param_grid, verbose = True, scoring='f1')
grid_rf_clf.fit(x_train.fillna(0), y_train)

print('최적의 파라미터 :', grid_rf_clf.best_params_) # 9, 1, 400

Fitting 5 folds for each of 96 candidates, totalling 480 fits
최적의 파라미터 : {'class_weight': {0: 1, 1: 1}, 'criterion': 'entropy', 'max_features': 0.8, 'max_samples': 0.8, 'n_estimators': 200}


In [69]:
y_pred = grid_rf_clf.predict(x_val.fillna(0))
get_clf_eval(y_val, y_pred)

오차행렬:
 [[906  60]
 [139 835]]

정확도: 0.8974
정밀도: 0.8670
재현율: 0.9379
F1: 0.9010


In [70]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = grid_rf_clf.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

(1990, 5271)

### MLP

In [29]:
from sklearn.neural_network import MLPClassifier
mlp_clf = MLPClassifier(activation='relu', hidden_layer_sizes=(20,), learning_rate='invscaling', solver='sgd')
mlp_clf.fit(x_train.fillna(0), y_train)

y_pred = mlp_clf.predict(x_val.fillna(0))
get_clf_eval(y_val, pred)

오차행렬:
 [[  602   368]
 [  390 10500]]

정확도: 0.9361
정밀도: 0.6069
재현율: 0.6206
F1: 0.6137


In [30]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = mlp_clf.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

(31, 5271)

In [13]:
param_grid = {'activation' : ['relu', 'tanh'], 'hidden_layer_sizes' : [(5,), (10,), (20,), (30,), (40,)],
              'solver' : ['adam', 'sgd']}

from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()
grid_mlp = GridSearchCV(mlp, param_grid, scoring='f1', verbose = True)
grid_mlp.fit(x_train.fillna(0), y_train)

grid_mlp.best_params_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


{'activation': 'relu', 'hidden_layer_sizes': (20,), 'solver': 'adam'}

In [14]:
y_pred = grid_mlp.predict(x_val.fillna(0))
get_clf_eval(y_val, pred)

오차행렬:
 [[  602   368]
 [  390 10500]]

정확도: 0.9361
정밀도: 0.6069
재현율: 0.6206
F1: 0.6137


In [15]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = grid_mlp.predict(x_test.fillna(0))
sum(test_pred), len(test_pred) # True로 예측된 개수

(291, 5271)

### logistic