## Train

In [42]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import random
import os

from preprocess import preprocess

import warnings

# Ignore all warnings.
warnings.filterwarnings("ignore")

def label_encoding(series):
    """Convert a categorical Series into integer labels.

    Every element is first cast to ``str``; the distinct strings are then
    sorted and numbered 0, 1, 2, ... in that order, and each element is
    replaced by the number assigned to its string form.

    Args:
        series: pandas Series holding the categorical values.

    Returns:
        A Series with the same index as the input, holding the integer label
        of each element.
    """
    # Cast to str so that mixed-type columns can be sorted and looked up uniformly.
    as_text = series.astype(str)

    # Sorted distinct strings -> consecutive integer codes.
    codes = {label: code for code, label in enumerate(sorted(as_text.unique()))}

    # Replace every element by the code of its string form.
    return as_text.map(codes)

def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels. The ``None`` default is kept only so the
            signature stays unchanged; predictions must actually be supplied.
            NOTE(review): the ``labels=[True, False]`` arguments below suggest
            boolean labels are expected - confirm against the callers.

    Returns:
        None. The metrics are written to stdout.

    Raises:
        TypeError: If ``y_pred`` is omitted or ``None``.
    """
    # Fail fast with a clear message instead of handing None to scikit-learn.
    if y_pred is None:
        raise TypeError(
            "get_clf_eval() requires y_pred (the predicted labels); got None"
        )

    # Function-local import: the helper carries its own metric dependencies.
    from sklearn.metrics import (
        accuracy_score,
        confusion_matrix,
        f1_score,
        precision_score,
        recall_score,
    )

    # labels=[True, False] lists the True class first in the confusion matrix.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

def seed_everything(seed=42):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.
    NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    setting it here presumably only affects child processes - confirm.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Fix the random seeds before any data is loaded or split.
seed_everything()

# Load the data

data_path = "/home/workspace/LGamiers/"
train_csv = f"{data_path}train.csv"
test_csv = f"{data_path}submission.csv"

df_train = pd.read_csv(train_csv)  # training data
df_test = pd.read_csv(test_csv)  # submission file, used as the test frame below

In [43]:
# Columns to label-encode
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# Stack train on top of test so both frames share one label mapping per column.
df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

# Label encoding, one column at a time
for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

# Split the encoded rows back: the first len(df_train) rows are the training
# rows, the remainder belong to the test frame.
n_train = len(df_train)
encoded_train = df_all.iloc[:n_train]
encoded_test = df_all.iloc[n_train:]

for col in label_columns:
    df_train[col] = encoded_train[col]
    df_test[col] = encoded_test[col]

In [44]:
# Hold out 20% of the training frame for validation; "is_converted" is the target.
features = df_train.drop("is_converted", axis=1)
target = df_train["is_converted"]

x_train, x_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

## Evaluation

In [47]:
# Feature selection: drop every column that contains at least one null value.
null_counts = df_train.isnull().sum()
not_null_feature = [
    f for f in df_train.columns if f != 'is_converted' and null_counts[f] == 0
]
print(not_null_feature)

# Fit a decision tree on the selected features and score it on the validation split.
train_matrix = x_train[not_null_feature].fillna(0)
val_matrix = x_val[not_null_feature].fillna(0)

model = DecisionTreeClassifier(random_state=42)
model.fit(train_matrix, y_train)
pred = model.predict(val_matrix)
get_clf_eval(y_val, pred)

['bant_submit', 'customer_country', 'business_unit', 'customer_idx', 'customer_type', 'enterprise', 'customer_job', 'lead_desc_length', 'inquiry_type', 'product_category', 'product_subcategory', 'product_modelname', 'customer_country.1', 'customer_position', 'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro', 'business_area', 'business_subarea', 'lead_owner']
오차행렬:
 [[  751   234]
 [  230 10645]]

정확도: 0.9609
정밀도: 0.7655
재현율: 0.7624
F1: 0.7640
