## Train

In [67]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import random
import os

# from preprocess import preprocess

import warnings

# Ignore all warnings.
# NOTE(review): this filter also silences deprecation warnings raised by
# pandas / scikit-learn in the cells below — confirm that is intended.
warnings.filterwarnings("ignore")

def label_encoding(series):
    """Convert a categorical Series into integer codes.

    Every element is cast to ``str``, the distinct strings are sorted, and
    each element is replaced by the position of its string in that order.

    Args:
        series: pandas Series holding the categorical values.

    Returns:
        A Series with the same index whose values are the integer codes.
    """
    as_text = series.astype(str)
    # Sorted distinct strings -> consecutive integer codes starting at 0.
    code_of = {label: code for code, label in enumerate(sorted(as_text.unique()))}
    return as_text.map(code_of)

def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_test: ground-truth labels.
        y_pred: predicted labels; forwarded unchanged to the scikit-learn
            metric functions.

    Returns:
        None. The report is written to stdout.
    """
    from sklearn.metrics import (
        accuracy_score,
        confusion_matrix,
        f1_score,
        precision_score,
        recall_score,
    )

    class_order = [True, False]

    # Compute every metric before printing anything, so a failing metric
    # leaves no partial report behind.
    confusion = confusion_matrix(y_test, y_pred, labels=class_order)
    report_lines = (
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, labels=class_order)),
        ("재현율: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=class_order)),
    )

    print("오차행렬:\n", confusion)
    for template, score in report_lines:
        print(template.format(score))

def seed_everything(seed=42):
    """Seed Python's ``random`` module and NumPy's global RNG, and export the
    same value as the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: seed value applied to every source (default 42).
    """
    random.seed(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)
    
# Fix the random seeds before anything else runs.
seed_everything()

# Load the data.
data_path = "/home/workspace/LGamiers/"
df_train = pd.read_csv(f"{data_path}train.csv")  # training data
df_test = pd.read_csv(f"{data_path}submission.csv")  # used as the test frame below

In [68]:
# preprocess 함수 사용할 것
# 예시 코드 (같은 폴더 내)
# from preprocess import preprocess
# df_train = preprocess(df_train)


# 열 삭제
# 'customer_country.1','product_subcategory','product_modelname','id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver'

def eda_drop_column(df):
    """Return a copy of ``df`` without the six columns excluded during EDA.

    Args:
        df: DataFrame containing all of the columns listed below.

    Returns:
        A new DataFrame with those columns removed; ``df`` is not modified.

    Raises:
        KeyError: if any of the listed columns is missing from ``df``.
    """
    columns_to_remove = [
        'customer_country.1',
        'product_subcategory',
        'product_modelname',
        'id_strategic_ver',
        'it_strategic_ver',
        'idit_strategic_ver',
    ]
    return df.drop(columns=columns_to_remove)


# expected_timeline

def eda_expected_timeline(df):
    """Collapse the free-text ``expected_timeline`` column into fixed labels.

    Each value is converted to ``str``, lower-cased and stripped of spaces and
    the characters ``_ / , ~ & - .``. If the remainder matches one of the five
    known timelines it is replaced by that timeline's canonical spelling;
    anything else (including missing values) becomes ``'aimer_0203'``.

    Args:
        df: DataFrame with an ``expected_timeline`` column.

    Returns:
        The same DataFrame object, with ``expected_timeline`` overwritten.
    """
    # Deleting these characters makes spelling variants collapse to one key.
    strip_table = str.maketrans('', '', ' _/,~&-.')

    # Normalised key -> canonical label.
    canonical = {
        'lessthan3months': 'less than 3 months',
        '3months6months': '3 months ~ 6 months',
        '6months9months': '6 months ~ 9 months',
        '9months1year': '9 months ~ 1 year',
        'morethanayear': 'more than a year',
    }

    def timeline_label(time):
        key = str(time).lower().translate(strip_table)
        return canonical.get(key, 'aimer_0203')

    df['expected_timeline'] = df['expected_timeline'].apply(timeline_label)
    return df


# inquiry_type

def eda_inquiry_type_customer_position(df):
    """Normalise the text of ``inquiry_type`` and ``customer_position``.

    Both columns are lower-cased and every ``/``, ``-`` and ``_`` is replaced
    by a single space. Missing values stay missing.

    Args:
        df: DataFrame with string columns ``inquiry_type`` and
            ``customer_position``.

    Returns:
        The same DataFrame object, with the two columns overwritten.
    """
    # Separator characters and their replacement (keys are used as regex patterns).
    replacement = {'/': ' ', '-':' ', '_':' '}

    for col in ('inquiry_type', 'customer_position'):
        # Assign the result back instead of calling replace(..., inplace=True)
        # on df[col]: an in-place call on a selected column is chained
        # assignment and does not update df under pandas Copy-on-Write.
        df[col] = df[col].str.lower().replace(replacement, regex=True)

    # NOTE: merging categories that occur exactly once into a single
    # 'aimers_0203' bucket was considered but is still under discussion, so it
    # is not applied. The singleton categories of a column can be listed with:
    #     counts = df[col].value_counts()
    #     rare = counts[counts == 1].index
    return df


# business_area, business_subarea
# total_area 변수로 통일

def eda_business_area(df):
    """Clean ``business_area`` / ``business_subarea`` and add ``total_area``.

    Each of the two columns is lower-cased, stripped of spaces and of every
    character that is neither a word character nor whitespace, and missing
    values are replaced by the string ``'nan'``. ``total_area`` is the cleaned
    ``business_area`` followed directly by the cleaned ``business_subarea``.

    Args:
        df: DataFrame with string columns ``business_area`` and
            ``business_subarea``.

    Returns:
        The same DataFrame object, with both columns overwritten and a
        ``total_area`` column added.
    """
    for col in ['business_area','business_subarea']:
        lowered = df[col].str.lower()
        no_spaces = lowered.str.replace(" ", "", regex=False)
        # regex=True must be explicit: since pandas 2.0 str.replace treats the
        # pattern as a literal string by default, which would leave all
        # punctuation in place.
        no_punct = no_spaces.str.replace(r'[^\w\s]', "", regex=True)
        df[col] = no_punct.fillna('nan')

    df['total_area'] = df['business_area'].astype(str) + df['business_subarea'].astype(str)

    return df


def preprocess(df):
    """Run the currently enabled cleaning steps on ``df`` and return the result.

    Enabled steps, in order: ``eda_expected_timeline`` and then
    ``eda_inquiry_type_customer_position``.

    Args:
        df: raw DataFrame to clean.

    Returns:
        The DataFrame returned by the last enabled step.
    """
    # Currently disabled steps: eda_drop_column (would run first) and
    # eda_business_area (would run last).
    enabled_steps = (
        eda_expected_timeline,
        eda_inquiry_type_customer_position,
    )
    for step in enabled_steps:
        df = step(df)
    return df


In [69]:
# Apply the same preprocessing to the training frame, then to the test frame.
df_train, df_test = (preprocess(frame) for frame in (df_train, df_test))

In [70]:
# Display the column names of the training frame.
df_train.columns

Index(['bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted'],
      dtype='object')

In [71]:
# Columns to label-encode.
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# Stack train on top of test so both frames share one value -> code mapping.
n_train = len(df_train)
df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    encoded = label_encoding(df_all[col])
    df_all[col] = encoded
    # Split back: the first n_train rows came from df_train, the rest from df_test.
    df_train[col] = encoded.iloc[:n_train]
    df_test[col] = encoded.iloc[n_train:]

In [72]:
# Hold out 20% of the training rows for validation (shuffled, fixed seed).
features = df_train.drop(columns="is_converted")
target = df_train["is_converted"]
x_train, x_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

## Evaluation

In [73]:
# Original note: this result uses only inquiry type.
# NOTE(review): the classifier below is fit on every column of x_train —
# confirm what the original note refers to.
model = DecisionTreeClassifier(random_state=42)

# Missing values are replaced by 0 in both splits before fitting / predicting.
x_train_filled = x_train.fillna(0)
x_val_filled = x_val.fillna(0)

model.fit(x_train_filled, y_train)
pred = model.predict(x_val_filled)
get_clf_eval(y_val, pred)

오차행렬:
 [[  775   210]
 [  232 10643]]

정확도: 0.9627
정밀도: 0.7696
재현율: 0.7868
F1: 0.7781
