In [11]:
# required package import
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import random

### 데이터 불러오기

In [12]:
# load data
# Read the preprocessed train and validation splits, in that order.
train_df, valid_df = (
    pd.read_csv(_csv_path)
    for _csv_path in (
        '../data/preprocessed_data/train_data.csv',
        '../data/preprocessed_data/valid_data.csv',
    )
)

In [13]:
# Targets: the 'label' column of each split.
y_train, y_valid = train_df['label'], valid_df['label']

# Features: every column except the 'cust_no' and 'label' columns.
x_train = train_df.drop(['cust_no', 'label'], axis = 1)
x_valid = valid_df.drop(['cust_no', 'label'], axis = 1)

# Report the shape of each piece, one line per object.
for _caption, _data in (
    ("x_train.shape: ", x_train),
    ("x_valid.shape: ", x_valid),
    ("y_train.shape: ", y_train),
    ("y_valid.shape: ", y_valid),
):
    print(_caption, _data.shape)

x_train.shape:  (125290, 83)
x_valid.shape:  (10000, 83)
y_train.shape:  (125290,)
y_valid.shape:  (10000,)


### I1 결측치 채우기


In [14]:
# Fit a classifier that predicts the I1 column from the remaining feature
# columns, using only complete rows, then score it on the train and valid rows.
# The fitted `rf` stays in scope for later cells.

# data for train
# dropna() discards every row that has a NaN in any column, not just in I1.
tmp_train = x_train.dropna()
tmp_train_x = tmp_train.drop('I1', axis = 1)
tmp_train_y = tmp_train['I1']

tmp_valid = x_valid.dropna()
tmp_valid_x = tmp_valid.drop('I1', axis = 1)
tmp_valid_y = tmp_valid['I1']

# model training
# (train note)
# logistic regression : no convergence
# random forest(default) : train acc = 1.0 , valid acc = 0.65
# random forest(n_estimators = 100, max_depth = 15) : train acc = 0.77, valid acc = 0.63
# random forest(n_estimators = 500(default), max_depth = 15) : train acc = 0.77, valid acc = 0.64
# NOTE(review): scikit-learn documents 100 as the default n_estimators, not 500 —
# confirm which setting produced the last row of the note above.
start_T = time.time()
# random_state fixes the forest's bootstrap / feature sampling so the fitted
# model, and therefore the I1 values it predicts, are the same on every run.
rf = RandomForestClassifier(max_depth = 15, random_state = 2023)
rf.fit(tmp_train_x, tmp_train_y)
end_T = time.time()

print("model training time : ", end_T - start_T)

# prediction and evaluation
# train data (the same rows the model was fitted on)
tmp_train_pred_y = rf.predict(tmp_train_x)
print('Accuracy: {:.2f}'.format(accuracy_score(tmp_train_y, tmp_train_pred_y)))
print(confusion_matrix(tmp_train_y, tmp_train_pred_y))
print(classification_report(tmp_train_y, tmp_train_pred_y))

# valid data (complete rows of the validation split)
tmp_valid_pred_y = rf.predict(tmp_valid_x)
print('Accuracy: {:.2f}'.format(accuracy_score(tmp_valid_y, tmp_valid_pred_y)))
print(confusion_matrix(tmp_valid_y, tmp_valid_pred_y))
print(classification_report(tmp_valid_y, tmp_valid_pred_y))

model training time :  38.150205850601196
Accuracy: 0.78
[[60601  7273]
 [20658 36704]]
              precision    recall  f1-score   support

         0.0       0.75      0.89      0.81     67874
         1.0       0.83      0.64      0.72     57362

    accuracy                           0.78    125236
   macro avg       0.79      0.77      0.77    125236
weighted avg       0.79      0.78      0.77    125236

Accuracy: 0.64
[[4169 1270]
 [2297 2261]]
              precision    recall  f1-score   support

         0.0       0.64      0.77      0.70      5439
         1.0       0.64      0.50      0.56      4558

    accuracy                           0.64      9997
   macro avg       0.64      0.63      0.63      9997
weighted avg       0.64      0.64      0.64      9997



In [15]:
# data having null data
# Fill each missing I1 with the value the fitted `rf` model predicts from the
# remaining columns. The fill is skipped when nothing is missing: predict()
# raises on a zero-row input, which is what a re-run of this cell would
# otherwise pass once the NaNs have been filled.
for _features in (x_train, x_valid):
    _missing_I1 = _features['I1'].isna()
    if _missing_I1.any():
        _features.loc[_missing_I1, 'I1'] = rf.predict(_features.loc[_missing_I1].drop('I1', axis = 1))

In [16]:
# Total number of NaN cells left in each feature frame.
for _caption, _frame in (
    ("train null data num : ", x_train),
    ("valid null data num : ", x_valid),
):
    _per_column_nulls = _frame.isna().sum()
    print(_caption, _per_column_nulls.sum())

train null data num :  0
valid null data num :  0


In [17]:
# tmp data save
# Write the four objects to CSV without an index column.
for _out_path, _data in (
    ('./tmp_data/x_train.csv', x_train),
    ('./tmp_data/x_valid.csv', x_valid),
    ('./tmp_data/y_train.csv', y_train),
    ('./tmp_data/y_valid.csv', y_valid),
):
    _data.to_csv(_out_path, index = None)

### modeling

In [2]:
# Reload the saved features and labels from the tmp CSV files.
x_train, x_valid, y_train, y_valid = (
    pd.read_csv(_csv_path)
    for _csv_path in (
        './tmp_data/x_train.csv',
        './tmp_data/x_valid.csv',
        './tmp_data/y_train.csv',
        './tmp_data/y_valid.csv',
    )
)

### SVC modeling

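- time complexity : between $O(n_{features} \times n_{samples}^2)$ and $O(n_{features} \times n_{samples}^3)$ for the libsvm-based `SVC`, depending on the dataset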

#### 1. 기본 데이터

- sampling을 100개를 해서 돌려도 안 돌아가서 차원축소 후에 돌려보려고 함.
- 일단 DNN하고 다시 돌아오는 걸로.

In [6]:
# Flatten the labels to 1-D arrays. Passing a column vector as y triggers:
# DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y_train = np.array(y_train).reshape(-1)
y_valid = np.array(y_valid).reshape(-1)

In [7]:
# check data shape
# One line per object: features first, then labels.
for _caption, _data in (
    ("x_train.shape: ", x_train),
    ("x_valid.shape: ", x_valid),
    ("y_train.shape: ", y_train),
    ("y_valid.shape: ", y_valid),
):
    print(_caption, _data.shape)

x_train.shape:  (125290, 83)
x_valid.shape:  (10000, 83)
y_train.shape:  (125290,)
y_valid.shape:  (10000,)


In [15]:
# Draw 100 distinct row positions from the training set and take the matching
# feature rows and labels.
# np.random.choice draws from NumPy's global RNG, which random.seed() does not
# affect, so NumPy is seeded too; otherwise every run picks a different sample.
random.seed(2023)
np.random.seed(2023)
tmp_idx = np.random.choice(np.arange(0,x_train.shape[0]), 100, replace = False)
tmp_x_train = x_train.iloc[tmp_idx]
tmp_y_train = y_train[tmp_idx]
print("tmp_x_train: ", tmp_x_train.shape)
print("tmp_y_train: ", tmp_y_train.shape)

tmp_x_train:  (100, 83)
tmp_y_train:  (100,)


In [None]:
# Fit an SVC on the sampled training rows and report accuracy, confusion
# matrix and classification report on that sample and on the validation set.

# model training
# (training note)
# no sampling : did not finish.
# sampling_num = 100 , kernel = linear : time : did not finish.
# sampling_num = 100 , kernel = rbf : time : probably will not finish either..
# NOTE(review): the features are passed to SVC without standardization;
# scikit-learn recommends scaling inputs for SVMs — worth trying before
# concluding that the fit cannot complete.
start_T = time.time()
svc = SVC()
svc.fit(tmp_x_train, tmp_y_train)
end_T = time.time()
print("model training time: ", end_T - start_T)

# prediction and evaluation
# train data
# Predictions come from the SVC fitted above and are compared with the labels
# of the same sampled rows, so y_true and y_pred have matching lengths.
train_pred_y = svc.predict(tmp_x_train)
print('Accuracy: {:.2f}'.format(accuracy_score(tmp_y_train, train_pred_y)))
print(confusion_matrix(tmp_y_train, train_pred_y))
print(classification_report(tmp_y_train, train_pred_y))

# valid data
valid_pred_y = svc.predict(x_valid)
print('Accuracy: {:.2f}'.format(accuracy_score(y_valid, valid_pred_y)))
print(confusion_matrix(y_valid, valid_pred_y))
print(classification_report(y_valid, valid_pred_y))


"### DNN modeling

#### 1. 기본 데이터