# 영업 성공 여부 분류 경진대회

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 1. 데이터 확인

### 필수 라이브러리

In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

!pip install imbalanced-learn
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE



### 데이터 셋 읽어오기

In [3]:
df_train = pd.read_csv("/content/drive/MyDrive/train.csv") # 학습용 데이터
df_test = pd.read_csv("/content/drive/MyDrive/submission.csv") # 테스트 데이터(제출파일의 데이터)

In [4]:
df_train.head() # 학습용 데이터 살펴보기

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.0,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.0,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.0,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.0,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True


## 2. 데이터 전처리

### 레이블 인코딩

In [5]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [6]:
# 레이블 인코딩할 칼럼들
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

다시 학습 데이터와 제출 데이터를 분리합니다.

In [7]:
for col in label_columns:
    df_train[col] = df_all.iloc[: len(df_train)][col]
    df_test[col] = df_all.iloc[len(df_train) :][col]

### 2-2. 학습, 검증 데이터 분리

In [8]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,
)
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [9]:
# KNNImputer 객체 생성
imputer = KNNImputer(n_neighbors=5)

# 학습 데이터에 대해 결측치 처리
x_train_imputed = imputer.fit_transform(x_train)

# 검증 데이터에 대해 결측치 처리
x_val_imputed = imputer.transform(x_val)

# 결측치 처리 후 데이터를 DataFrame으로 변환
x_train = pd.DataFrame(x_train_imputed, columns=x_train.columns)
x_val = pd.DataFrame(x_val_imputed, columns=x_val.columns)

# SMOTE 객체 생성
smote = SMOTE(random_state=0)

# 오버샘플링 진행
x_train, y_train = smote.fit_resample(x_train, y_train)

# KNNImputer 객체 생성
imputer = KNNImputer(n_neighbors=5)

# 학습 데이터에 대해 결측치 처리
x_train_imputed = imputer.fit_transform(x_train)

# 검증 데이터에 대해 결측치 처리
x_val_imputed = imputer.transform(x_val)

# 테스트 데이터에 대해 결측치 처리
x_test_imputed = imputer.transform(x_test)

# 결측치 처리 후 데이터를 DataFrame으로 변환
x_train = pd.DataFrame(x_train_imputed, columns=x_train.columns)
x_val = pd.DataFrame(x_val_imputed, columns=x_val.columns)
x_test = pd.DataFrame(x_test_imputed, columns=x_test.columns)

# 연속형 변수들만 선택
continuous_columns = [col for col in x_train.columns if col not in label_columns]

# 표준화를 위한 StandardScaler 객체 생성
scaler = StandardScaler()

# 학습 데이터에 대해 표준화 적용
x_train[continuous_columns] = scaler.fit_transform(x_train[continuous_columns])

# 검증 데이터에 대해 표준화 적용
x_val[continuous_columns] = scaler.transform(x_val[continuous_columns])

# 테스트 데이터에 대해 표준화 적용
x_test[continuous_columns] = scaler.transform(x_test[continuous_columns])

## 3. 모델 학습

### 모델 정의

In [10]:
!pip install catboost
!pip install lightgbm

from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score

# CatBoost 모델 생성
catboost_model = CatBoostClassifier(eval_metric='Logloss')

# 하이퍼파라미터 범위 정의
param_dist = {
    'iterations': [100, 200, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [3, 5, 7],
    'l2_leaf_reg': [1, 3, 5],
    'rsm': [0.6, 0.8, 1.0]  # colsample_bylevel 대신 rsm을 사용합니다.
}

# RandomizedSearchCV 객체 생성
catboost_random_search = RandomizedSearchCV(
    catboost_model,
    param_dist,
    cv=5,
    scoring='f1',
    verbose=2,
    n_jobs=-1,
    n_iter=100
)

catboost_random_search.fit(x_train.fillna(0), y_train)

# RandomForestClassifier 모델 생성
random_forest_model = RandomForestClassifier()

# LightGBMClassifier 모델 생성
lgbm_model = LGBMClassifier()

# VotingClassifier를 사용하여 앙상블 모델 초기화
model = VotingClassifier(
    estimators=[('catboost', catboost_random_search.best_estimator_), ('random_forest', random_forest_model), ('lgbm', lgbm_model)],
    voting='soft'
)


Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2
Fitting 5 folds for each of 100 candidates, totalling 500 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan]


0:	learn: 0.6181103	total: 94.4ms	remaining: 47.1s
1:	learn: 0.5514070	total: 137ms	remaining: 34s
2:	learn: 0.4930887	total: 177ms	remaining: 29.4s
3:	learn: 0.4395048	total: 219ms	remaining: 27.1s
4:	learn: 0.4051976	total: 259ms	remaining: 25.6s
5:	learn: 0.3716438	total: 302ms	remaining: 24.9s
6:	learn: 0.3450447	total: 343ms	remaining: 24.2s
7:	learn: 0.3277710	total: 383ms	remaining: 23.6s
8:	learn: 0.3067152	total: 423ms	remaining: 23.1s
9:	learn: 0.2892618	total: 467ms	remaining: 22.9s
10:	learn: 0.2776187	total: 510ms	remaining: 22.7s
11:	learn: 0.2699813	total: 550ms	remaining: 22.4s
12:	learn: 0.2617559	total: 603ms	remaining: 22.6s
13:	learn: 0.2532735	total: 644ms	remaining: 22.4s
14:	learn: 0.2428789	total: 686ms	remaining: 22.2s
15:	learn: 0.2365889	total: 736ms	remaining: 22.3s
16:	learn: 0.2296400	total: 776ms	remaining: 22.1s
17:	learn: 0.2221527	total: 816ms	remaining: 21.8s
18:	learn: 0.2170968	total: 854ms	remaining: 21.6s
19:	learn: 0.2131384	total: 895ms	remainin

### 모델 학습

In [11]:
model.fit(x_train.fillna(0), y_train)

# 최적의 임계값 찾기
thresholds = np.arange(0.1, 0.9, 0.1)
best_threshold = 0.5
best_score = 0
for threshold in thresholds:
    pred_proba = model.predict_proba(x_val)
    binary_pred = [output[1] > threshold for output in pred_proba]
    score = f1_score(y_val, binary_pred)
    if score > best_score:
        best_score = score
        best_threshold = threshold

# 최적의 임계값 적용
pred_proba = model.predict_proba(x_val)
pred = [output[1] > best_threshold for output in pred_proba]

0:	learn: 0.6181103	total: 44.5ms	remaining: 22.2s
1:	learn: 0.5514070	total: 85.3ms	remaining: 21.2s
2:	learn: 0.4930887	total: 128ms	remaining: 21.1s
3:	learn: 0.4395048	total: 184ms	remaining: 22.8s
4:	learn: 0.4051976	total: 222ms	remaining: 22s
5:	learn: 0.3716438	total: 264ms	remaining: 21.7s
6:	learn: 0.3450447	total: 305ms	remaining: 21.5s
7:	learn: 0.3277710	total: 346ms	remaining: 21.3s
8:	learn: 0.3067152	total: 385ms	remaining: 21s
9:	learn: 0.2892618	total: 426ms	remaining: 20.9s
10:	learn: 0.2776187	total: 473ms	remaining: 21s
11:	learn: 0.2699813	total: 512ms	remaining: 20.8s
12:	learn: 0.2617559	total: 552ms	remaining: 20.7s
13:	learn: 0.2532735	total: 602ms	remaining: 20.9s
14:	learn: 0.2428789	total: 645ms	remaining: 20.9s
15:	learn: 0.2365889	total: 692ms	remaining: 20.9s
16:	learn: 0.2296400	total: 734ms	remaining: 20.8s
17:	learn: 0.2221527	total: 773ms	remaining: 20.7s
18:	learn: 0.2170968	total: 811ms	remaining: 20.5s
19:	learn: 0.2131384	total: 871ms	remaining: 

### 모델 성능 보기

In [12]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [13]:
pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, pred)

오차행렬:
 [[  676   271]
 [   70 10843]]

정확도: 0.9712
정밀도: 0.9062
재현율: 0.7138
F1: 0.7986


## 4. 제출하기

### 테스트 데이터 예측

In [14]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [15]:
test_pred = model.predict(x_test.fillna(0))
sum(test_pred) # True로 예측된 개수

2874

### 제출 파일 작성

In [16]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("/content/drive/MyDrive/submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("/content/drive/MyDrive/submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**