In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
import sklearn.metrics as mt
from sklearn.model_selection import cross_val_score, cross_validate
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
# from tensorflow.python.client import device_lib
# print(device_lib.list_local_devices())

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Ask TensorFlow which physical devices it can see, then report them.
physical_devices = tf.config.list_physical_devices()
print("TensorFlow has access to the following devices:", physical_devices)

TensorFlow has access to the following devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# 0. 데이터 셋

In [3]:
# Training data (labelled rows used to fit the models).
df_train = pd.read_csv(filepath_or_buffer="train.csv")
# Test data (the rows of the submission file).
df_test = pd.read_csv(filepath_or_buffer="submission.csv")

In [4]:
def label_encoding(series: pd.Series) -> pd.Series:
    """Convert a categorical series into integer codes.

    Every element is first cast to ``str``; the distinct strings are then
    sorted and numbered 0, 1, 2, ... in that order.

    Args:
        series: Categorical values to encode.

    Returns:
        A series with the same index whose values are the integer codes.
    """
    # Cast everything to text so mixed-type columns can be sorted together.
    as_text = series.astype(str)

    # Sorted distinct value -> position in the sorted order.
    code_of = {label: code for code, label in enumerate(sorted(as_text.unique()))}

    return as_text.map(code_of)

In [5]:
# Columns that get label-encoded.
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# Stack the train rows on top of the test rows so each column is encoded
# with a single value -> code mapping shared by both sets.
df_all = pd.concat(objs=[df_train[label_columns], df_test[label_columns]], axis=0)

# Overwrite each column of the stacked frame with its integer codes.
for col in label_columns:
    df_all[col] = label_encoding(series=df_all[col])

In [6]:
# Split the stacked, encoded frame back into its train part (first
# len(df_train) rows) and its test part (the remaining rows).
train_codes = df_all.iloc[: len(df_train)]
test_codes = df_all.iloc[len(df_train) :]

# Write the encoded columns back into the original frames.
for col in label_columns:
    df_train[col] = train_codes[col]
    df_test[col] = test_codes[col]

In [7]:
# Turn is_converted into 1 / 0.
def convert_is_converted(is_converted):
    """Return 1 when ``is_converted`` is truthy, otherwise 0."""
    return 1 if is_converted else 0

# Replace the target column of both frames with its 1 / 0 form.
df_train["is_converted"] = [convert_is_converted(flag) for flag in df_train["is_converted"]]
df_test["is_converted"] = [convert_is_converted(flag) for flag in df_test["is_converted"]]

In [8]:
# Replace NaN with 0 in the three *_strategic_ver columns of the training set.
for strategic_col in ("id_strategic_ver", "it_strategic_ver", "idit_strategic_ver"):
    df_train[strategic_col] = df_train[strategic_col].fillna(0)

In [9]:
# Replace NaN with 0 in the three *_strategic_ver columns of the test set.
for strategic_col in ("id_strategic_ver", "it_strategic_ver", "idit_strategic_ver"):
    df_test[strategic_col] = df_test[strategic_col].fillna(0)

In [10]:
# For now, simply fill every remaining missing value with 0.
df_train = df_train.fillna(value=0)
df_test = df_test.fillna(value=0)

In [11]:
# Column dtypes and non-null counts of the cleaned training frame.
df_train.info()
# A notebook cell only renders its last expression automatically, so the
# summary statistics have to be displayed explicitly or they are discarded.
display(df_train.describe())

# Class balance of the target column.
target = df_train["is_converted"]
target.value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59299 entries, 0 to 59298
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              59299 non-null  float64
 1   customer_country         59299 non-null  int64  
 2   business_unit            59299 non-null  int64  
 3   com_reg_ver_win_rate     59299 non-null  float64
 4   customer_idx             59299 non-null  int64  
 5   customer_type            59299 non-null  int64  
 6   enterprise               59299 non-null  int64  
 7   historical_existing_cnt  59299 non-null  float64
 8   id_strategic_ver         59299 non-null  float64
 9   it_strategic_ver         59299 non-null  float64
 10  idit_strategic_ver       59299 non-null  float64
 11  customer_job             59299 non-null  int64  
 12  lead_desc_length         59299 non-null  int64  
 13  inquiry_type             59299 non-null  int64  
 14  product_category      

is_converted
0    54449
1     4850
Name: count, dtype: int64

In [12]:
# Hold out 20% of the rows for validation; features are every column except
# the target, and the fixed seed keeps the shuffled split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop(columns="is_converted"),
    df_train["is_converted"],
    random_state=400,
    shuffle=True,
    test_size=0.2,
)

In [13]:
# Peek at the first five training labels.
y_train.head(n=5)

43573    0
25857    0
42264    0
7665     0
23088    0
Name: is_converted, dtype: int64

In [14]:
# Standardize the feature matrices (X only).
#
# The scaler is fitted on the training split alone, so validation rows do not
# leak into the learned mean / variance, and the target is never scaled.
# x_train and x_val are replaced by their scaled versions so that the model
# cells below train on standardized features. Wrapping the result in a
# DataFrame keeps the column names and the row index aligned with
# y_train / y_val.
#
# Before predicting on the submission data, transform the same feature
# columns of df_test with this fitted std_scaler (do not refit it).
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
std_scaler.fit(x_train)

x_train = pd.DataFrame(
    std_scaler.transform(x_train), columns=x_train.columns, index=x_train.index
)
x_val = pd.DataFrame(
    std_scaler.transform(x_val), columns=x_val.columns, index=x_val.index
)

# 1. SVM 선형 분리 학습

In [15]:
# SVM with kernel='linear', i.e. a linear decision boundary.
svm_clf = svm.SVC(kernel="linear")

# 5-fold cross-validation, run once. cross_validate returns the fit/score
# times together with the per-fold test scores, so there is no need to fit
# the (expensive) SVM a second time with cross_val_score.
cv_results = cross_validate(svm_clf, x_train, y_train, cv=5)

# Per-fold scores, the same values cross_val_score would return.
scores = cv_results["test_score"]

print('교차검증 평균 : ', scores.mean())

# Last expression of the cell, so the per-fold table is actually rendered.
pd.DataFrame(cv_results)