# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [64]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

In [65]:
# 출력 옵션 설정
pd.set_option('display.max_columns', None)  # 모든 열 출력

In [149]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

### 데이터 셋 읽어오기

In [150]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [151]:
# 	/Quezon City/Philippines -> Philippines 변경
df_train['customer_country'] = df_train['customer_country'].apply(lambda x: x.rsplit('/', 1)[-1].strip() if isinstance(x, str) else x)
df_train['customer_country'] = df_train['customer_country.1'].apply(lambda x: x.rsplit('/', 1)[-1].strip() if isinstance(x, str) else x)

# 	/Quezon City/Philippines -> Philippines 변경
df_test['customer_country'] = df_test['customer_country'].apply(lambda x: x.rsplit('/', 1)[-1].strip() if isinstance(x, str) else x)
df_test['customer_country'] = df_test['customer_country.1'].apply(lambda x: x.rsplit('/', 1)[-1].strip() if isinstance(x, str) else x)

In [152]:
df_all = pd.concat([df_train, df_test])
df_all.shape, df_train.shape, df_test.shape

((64570, 30), (59299, 29), (5271, 30))

In [153]:
# 열 삭제
drop_col = ['customer_country.1', 'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver',
            'product_subcategory', 'product_modelname', 'business_area', 'business_subarea', 'ver_cus', 'ver_pro']

df_all.drop(columns = drop_col, inplace = True)
df_all.isnull().sum()

bant_submit                    0
customer_country             982
business_unit                  0
com_reg_ver_win_rate       48214
customer_idx                   0
customer_type              45418
enterprise                     0
historical_existing_cnt    49539
customer_job               20172
lead_desc_length               0
inquiry_type                2233
product_category           21232
customer_position              0
response_corporate             0
expected_timeline          33271
ver_win_rate_x             43780
ver_win_ratio_per_bu       47360
lead_owner                     0
is_converted                   0
id                         59299
dtype: int64

In [154]:
# 결측값 0으로 넣을 컬럼
fillna_col = ['com_reg_ver_win_rate', 'historical_existing_cnt',
             'ver_win_rate_x', 'ver_win_ratio_per_bu']

for col in fillna_col:
    df_all[col] = df_all[col].fillna(0)

df_all.isnull().sum()

bant_submit                    0
customer_country             982
business_unit                  0
com_reg_ver_win_rate           0
customer_idx                   0
customer_type              45418
enterprise                     0
historical_existing_cnt        0
customer_job               20172
lead_desc_length               0
inquiry_type                2233
product_category           21232
customer_position              0
response_corporate             0
expected_timeline          33271
ver_win_rate_x                 0
ver_win_ratio_per_bu           0
lead_owner                     0
is_converted                   0
id                         59299
dtype: int64

In [155]:
df_all['customer_type'] = df_all['customer_type'].str.lower()
df_all['customer_type'] = df_all['customer_type'].replace(['etc.', 'other', 'others'], 'etc')
df_all['customer_type'] = df_all['customer_type'].replace(['end-customer', 'end customer', 'end-user'], 'end_user')
df_all['customer_type'] = df_all['customer_type'].replace(['specifier/ influencer', 'specifier / influencer'], 'influencer')
df_all['customer_type'] = df_all['customer_type'].replace(['homeowner', 'home owner'], 'home_owner')
df_all['customer_type'] = df_all['customer_type'].replace(['software/solution provider', 'software / solution provider'], 'solution_provider')
df_all['customer_type'] = df_all['customer_type'].replace(['engineer', 'hvac engineer'], 'engineer')
df_all['customer_type'] = df_all['customer_type'].replace(['distributor', 'dealer/distributor'], 'distributor')
df_all['customer_type'].fillna('etc', inplace = True)

In [156]:
# value_counts 가 1개인 것들 etc로 분류
value_counts = df_all['customer_type'].value_counts()
values_to_replace = value_counts[value_counts == 1].index
df_all['customer_type'] = df_all['customer_type'].apply(lambda x: 'etc' if x in values_to_replace else x)

In [157]:
df_all['inquiry_type'] = df_all['inquiry_type'].str.lower()
df_all['inquiry_type'] = df_all['inquiry_type'].replace(['etc.', 'other', 'other_', 'others'], 'etc')
df_all['inquiry_type'] = df_all['inquiry_type'].replace(['sales inquiry', 'sales', 'probeam precio'], 'sales inquiry')
df_all['inquiry_type'] = df_all['inquiry_type'].replace(['quotation or purchase consultation', 'request for quotation or purchase', 'quotation_or_purchase_consultation', 'purchase or quotation', 'quotation_', 'purchase'], 'quotation_or_purchase_consultation')
df_all['inquiry_type'] = df_all['inquiry_type'].replace(['usage or technical consultation', 'technical consultation', 'request for technical consulting', 'usage_or_technical_consultation', 'technical_consultation', 'technical'], 'usage or technical consultation')
df_all['inquiry_type'] = df_all['inquiry_type'].replace(['vui lòng báo giá giúp mình sản phẩm đo thân nhiệt xin cảm ơn', 'tôi cần tham khảo giá và giải pháp từ lg'], 'quotation_or_purchase_consultation')
df_all['inquiry_type'] = df_all['inquiry_type'].replace(['toi muon tim hieu thong tin ky thuat, gia ca cua sp de su dung'], 'product information')

In [158]:
# value_counts 가 1개인 것들 etc로 분류
value_counts = df_all['inquiry_type'].value_counts()
values_to_replace = value_counts[value_counts == 1].index
df_all['inquiry_type'] = df_all['inquiry_type'].apply(lambda x: 'etc' if x in values_to_replace else x)

In [159]:
df_all['inquiry_type'].fillna('etc', inplace = True)

In [160]:
# custoper_position 전처리
df_all['customer_position'] = df_all['customer_position'].replace(['ceo/founder', 'partner', 'vice president', 'c-level executive', 'director', 'vicepresident', 'c-levelexecutive', 'vp','leadership/executive office/owner', 'president'
                                                                   'principal & director', 'business partner', 'chairman', 'co-founder', 'chief executive officer', 'subsidiary sales (ise)', 'ceo/fundador', 'gerente', 'the big boss',
                                                                  'principal & director', 'president'], 'ceo')
df_all['customer_position'] = df_all['customer_position'].replace(['consultant', 'commercial consultant', 'architecture/consult', 'architect/consultant'], 'consult')
df_all['customer_position'] = df_all['customer_position'].replace(['customer', 'customer_position'], 'customer')
df_all['customer_position'] = df_all['customer_position'].replace(['decision-influencer', 'decision maker', 'decision influencer'], 'decision')
df_all['customer_position'] = df_all['customer_position'].replace(['distributor', 'cargo'], 'distributor')
df_all['customer_position'] = df_all['customer_position'].replace(['physics teacher','assistant professor','maths lecturer','science teacher','guest faculty','physics faculty','teacher/middle school coordinator','prof.',
                                                                   'academic specialist','principal at oxford integrated pu science college','math and physics teacher','professor of mathematics','physics and mathematics teacher',
                                                                   'assistant professor of english','educator','professor','quantitative aptitude faculty','english trainer for ielts,toefl,pte,gre,sat exams.','associate professor',
                                                                   'pgt physics','education professional','chemistry teacher','director cum faculty at gaining apex coaching centre','teacher','senior lecturer',
                                                                   'neet/ olympiad expert faculty','associate professor in electronics engg','education','pgt chemistry', 
                                                                   'academic coordinator/ post graduate teacher (accountancy, business studies)/ tgt (ict)','assistant professor of enlish'], 'education')
df_all['customer_position'] = df_all['customer_position'].replace(['entry level','intern','trainee','entrylevel','employee'], 'employee')

df_all['customer_position'] = df_all['customer_position'].replace(['exhibition','exhibitiontv'], 'exhibition')
df_all['customer_position'] = df_all['customer_position'].replace(['hospital', 'medical device manufacturer', 'medical imaging specialist', 'tierarzt', 'surgery professional', 'pathologist'], 'medical')
df_all['customer_position'] = df_all['customer_position'].replace(['not applicable','none','other','others'], 'etc')
df_all['customer_position'] = df_all['customer_position'].replace(['manager','associate/analyst','consulting','lider de desarrollo','decision-maker','business unit director','business development','operations',
                                                                   'product management','market intelligence/research'], 'pm')
df_all['customer_position'] = df_all['customer_position'].replace(['asst prof.','professional trainer','radiology professional'], 'professional')
df_all['customer_position'] = df_all['customer_position'].replace(['this is a consume display requirement for home purpose.','unpaid','homeowner','no influence'], 'unemployed')

In [161]:
# value_counts가 1인거 etc로 묶기
value_counts = df_all['customer_position'].value_counts()
values_to_replace = value_counts[value_counts == 1].index
df_all['customer_position'] = df_all['customer_position'].apply(lambda x: 'etc' if x in values_to_replace else x)
df_all['customer_position'].fillna('etc', inplace = True)

In [162]:
print(len(df_all.customer_position.unique()))
df_all.customer_position.unique()

27


array(['employee', 'ceo', 'pm', 'etc', 'unemployed', 'education',
       'av management', 'founder', 'engineering', 'installer', 'consult',
       'commercial end-user', 'bulgaria', 'administrative',
       'entrepreneurship', 'decision', 'customer', 'exhibition',
       'medical', 'end-user', 'government', 'manufacturer',
       'software /solution provider', 'system integrator', 'distributor',
       'other - please specify - cedia association', 'professional'],
      dtype=object)

In [163]:
print(len(df_all.product_category.unique()))


360


In [164]:
# product_category 전처리

df_all['product_category'] = df_all['product_category'].replace([
    'aire acondicionado residencial',
    'systèmes de débit à réfrigérant variable (drv)',
    'climatiseur résidentiel',
    'residential air conditioner',
    'solar,chiller',
    'system ac,rac',
    'pendingin',
    'ar condicionado residencial',
    'vrf,multi-split',
    'vrf,multi-split,chiller',
    'vrf,multi-split,single-split,chiller,heating',
    'system air conditioner',
    'system ac,chiller',
    'system ac'
], 'air_conditioner')

df_all['product_category'] = df_all['product_category'].replace([
    'aircare,built-in/cooking',
    'lg home bliss air solution',
    'lg paradise air solution',
    'chiller,aircare',
    'aircare,water care',
    'solar,aircare'
], 'air_care')

df_all['product_category'] = df_all['product_category'].replace([
    'signage care solution',
    'signage care solutions'
], 'care solution')

df_all['product_category'] = df_all['product_category'].replace([
    'chiller,refrigerator',
    'refrigerator,chiller'
], 'chiller_refrigerator')

df_all['product_category'] = df_all['product_category'].replace([
    'pc',
    'laptop',
    'notebook'
], 'computer')

df_all['product_category'] = df_all['product_category'].replace([
    'oled signage',
    'video wall',
    'one:quick series',
    'lg one:quick',
    'commercial display',
    'interactive digital board',
    'standard',
    'uhd signage',
    'lg one:quick series',
    'aio | one quick',
    'hospital tv',
    'medical display',
    'one quick works',
    'fhd series',
    'onequick series',
    'led aio 136',
    '110 + video wall',
    'aio',
    'one:quick flex',
    'collaboration displays',
    'meeting & screen sharedirect view leddirect view led',
    'window facing display',
    'led',
    'medical displays',
    'videwall',
    'laec015',
    'laec015-gn.awz',
    '55vm5e-a',
    'laec15',
    'standalone',
    'ledallinone',
    '55svh7f-a',
    '98uh5e',
    'pro:centric'
], 'display')

df_all['product_category'] = df_all['product_category'].replace([
    'energy storage system',
    'ess'
], 'ess')

df_all['product_category'] = df_all['product_category'].replace([
    'teto ou cassete inverter',
    'multi inverter'
], 'inventer')

df_all['product_category'] = df_all['product_category'].replace(['monitor',
    'monitor signage,commercial tv,monior/monitor tv',
    'monitor signage,monior/monitor tv',
    'monitor signage,commercial tv,monior/monitor tv,projector,tv',
    'monitor signage,commercial tv,monior/monitor tv,tv',
    'commercial tv,tv',
    'monior/monitor tv,tv',
    '28mq780',
    'computer monitors',
    'medical monitors',
    'monior/monitor tv,chiller',
    'monitor signage,commercial tv',
    'monitor signage,solar',
    'solar,monior/monitor tv',
    'monior/monitor tv,pc',
    'monitor signage,tv',
    'monitor signage,commercial tv,audio/video',
    'monitor signage,monior/monitor tv,commercial tv',
    'commercial tv,monior/monitor tv',
    'radiology displays',
    'high inch 86 / 98 or 110',
    'information display,monitor',
    'taa lcd lfd displays',
    'monior/monitor tv,audio/video',
    'medical monitor',
    'monior/monitor tv,projector,audio/video',
    'monitor signage,mobile',
    'monitorindustrial_rmk',
    'surgical monitor'
], 'monitor')

df_all['product_category'] = df_all['product_category'].replace([
    'multi-split',
    'multi split',
    'multi-split (plusieurs pi챔ces)'
], 'multi_split')

df_all['product_category'] = df_all['product_category'].replace([
    'etc.',
    'others',
    'outros',
    'other',
    'lainnya',
    'others',
    'not specified'
], 'etc')

df_all['product_category'] = df_all['product_category'].replace([
    'multi v',
    'multi v 5 air',
    'multi v water 5'
], 'outdoor_unit')

df_all['product_category'] = df_all['product_category'].replace([
    'projector',
    'bu50nst'
], 'projector')

df_all['product_category'] = df_all['product_category'].replace([
    'standard signage',
    'high brightness signage',
    'interactive signage',
    'video wall signage',
    'led signage',
    'special signage',
    'ur640',
    'ur640s',
    'high brightness',
    'digital signage',
    'ultra stretch series',
    'one quick:flex',
    'videowall signage',
    'tv signage',
    'ultra stretch signage',
    'digital signage or commercial tvs',
    'gscd046',
    '43uh5f-h.awzm',
    '49vl5g-m.awzm',
    '49vl5g-m',
    '55vm5j-h',
    '49vl5f',
    'signage',
    'videowall_rmk',
    '86uh5f',
    '55tc3d',
    'video wall + aio',
    'one:quick',
    'gsca046',
    'gscd100',
    'lsca039'
], 'signage')

df_all['product_category'] = df_all['product_category'].replace([
    'single-split',
    'split tunggal',
    'single split'
], 'single_split')

df_all['product_category'] = df_all['product_category'].replace([
    'solar,energy storage system',
    'solar,ess'
], 'ess_solar')

df_all['product_category'] = df_all['product_category'].replace([
    'software solution',
    'lg customer care program'
], 'software_solution')

df_all['product_category'] = df_all['product_category'].replace([
    'solar,system ac',
    'solar,ess,system ac'
], 'solar_airconditioner')

df_all['product_category'] = df_all['product_category'].replace([
    'hotel tv',
    'commercial tv',
    'smart tv signage',
    'commercial tv,audio/video',
    '50uq801c0sb.bwz',
    'tv',
    '55us660h0sd.bwz',
    '32lq621cbsb.awz',
    '55uq801c0sb.bwz',
    '43uq751c0sf.bwz',
    'htv',
    '43uq751c0sb.bwz',
    '50us660h0sd.bwz',
    'tv 60"',
    '43us660h (na)',
    'pro centric hotel',
    'comercial tv',
    'smart tv',
    'tv 55"',
    '32 pol',
    'tv 43 pol',
    '43 pol'
], 'tv')

df_all['product_category'] = df_all['product_category'].replace([
    'pc,tv',
    'tv,pc'
], 'pc_tv')

df_all['product_category'] = df_all['product_category'].replace([
    'vrf',
    'all lg vrf systems',
    'multi v5 vrf',
    'vrf - multi v s',
    'klimatyzacja vrf'
], 'vrf')

In [165]:
value_counts = df_all['product_category'].value_counts()
values_to_replace = value_counts[value_counts == 1].index
df_all['product_category'] = df_all['product_category'].apply(lambda x: 'etc' if x in values_to_replace else x)
df_all['product_category'].fillna('etc', inplace = True)

In [166]:
print(len(df_all.product_category.unique()))
df_all.product_category.unique()

105


array(['multi_split', 'single_split', 'vrf', 'chiller', 'etc', 'rac',
       'inventer', 'software_solution', 'ventilation',
       'a thermodynamic water heater', 'air_conditioner', 'heating',
       'חימום', 'signage', 'tv', 'care solution', 'display', 'idb',
       'accessories', 'webos', 'monitor', 'computer', 'projector',
       'cloud device', 'washing machine,dryer', 'air_care',
       'monitor signage,commercial tv,solar,ess,monior/monitor tv,pc,projector,robot,system ac,ems,rac,chill',
       'ess_solar', 'solar_airconditioner', '互動式顯示屏', 'led 顯示屏', '標準顯示屏',
       '43us660h0sd.awz', '酒店電視', 'robots', 'oled 顯示屏', 'id',
       'outdoor_unit', 'control', 'מזגנים למקום מגורים', 'مبرد (تشيلر)',
       'ahu', 'isıtma', 'technical support', 'تكييفات', 'sales inquiry',
       'تكييف وتبريد', 'error', 'monitor & pc', 'commercial tv,projector',
       'system ac,aircare', 'monitor signage,audio/video',
       'monitor signage,commercial tv,solar,ess,monior/monitor tv,pc',
       'monit

In [138]:
# 전처리 필요한 4개 컬럼 간단하게
value_counts = df_all['customer_job'].value_counts()
values_to_replace = value_counts[value_counts == 1].index
df_all['customer_job'] = df_all['customer_job'].apply(lambda x: 'other' if x in values_to_replace else x)
df_all['customer_job'].fillna('other', inplace = True)

value_counts = df_all['expected_timeline'].value_counts()
values_to_replace = value_counts[value_counts == 1].index
df_all['expected_timeline'] = df_all['expected_timeline'].apply(lambda x: 'etc' if x in values_to_replace else x)
df_all['expected_timeline'].fillna('etc', inplace = True)

value_counts = df_all['customer_country'].value_counts()
values_to_replace = value_counts[value_counts == 1].index
df_all['customer_country'] = df_all['customer_country'].apply(lambda x: 'etc' if x in values_to_replace else x)
df_all['customer_country'].fillna('etc', inplace = True)

In [139]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx + 1
    series = series.map(my_dict)

    return series

In [140]:
# 레이블 인코딩할 칼럼들
label_columns = [
    "customer_country",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

In [141]:
df_train = df_all.iloc[:len(df_train)]
df_test = df_all.iloc[len(df_train):]

In [142]:
def get_clf_eval(y_test, y_pred=None):
    confusion = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ['T[0]', 'F[1]'], columns = ['pred_T[0]', 'pred_F[1]'])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [143]:
df_train.shape, df_test.shape

((59299, 20), (5271, 20))

In [144]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

## 3. 모델 학습

### 모델 정의 

In [145]:
model = DecisionTreeClassifier(random_state = 400)

### 모델 학습

In [146]:
model.fit(x_train.fillna(0), y_train)

### 모델 성능 보기

In [147]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [148]:
pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, pred)

오차행렬:
 [[  756   191]
 [  233 10680]]

정확도: 0.9642
정밀도: 0.7644
재현율: 0.7983
F1: 0.7810
