In [33]:
import pandas as pd
import numpy as np
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [25]:
# Read the preprocessed train / validation / test splits from CSV, in that order.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/preprocessed_data/train_data.csv',
        '../data/preprocessed_data/valid_data.csv',
        '../data/preprocessed_data/test_data.csv',
    )
)

In [26]:
# Separate each split into model inputs (x_*) and target (y_*).
# The inputs are every column except 'cust_no' and 'label'; the target is 'label'.
x_train, y_train = train_df.drop(['cust_no', 'label'], axis = 1), train_df['label']
x_valid, y_valid = valid_df.drop(['cust_no', 'label'], axis = 1), valid_df['label']
x_test, y_test = test_df.drop(['cust_no', 'label'], axis = 1), test_df['label']

In [27]:
# Sanity check: report the shape of every feature frame and label series.
for _caption, _split in (
    ('x_train.shape: ', x_train),
    ("y_train.shape: ", y_train),
    ('x_valid.shape: ', x_valid),
    ("y_valid.shape: ", y_valid),
    ('x_test.shape: ', x_test),
    ("y_test.shape: ", y_test),
):
    print(_caption, _split.shape)

x_train.shape:  (125290, 83)
y_train.shape:  (125290,)
x_valid.shape:  (10000, 83)
y_valid.shape:  (10000,)
x_test.shape:  (10000, 83)
y_test.shape:  (10000,)


### I1 결측치 채우기

In [35]:
# Train a classifier that predicts the 'I1' column from the remaining feature
# columns, using only the rows that have no missing values, then report how
# well it reproduces the observed 'I1' values on the train and valid splits.
#
# NOTE: run this cell before the cell that fills the missing 'I1' values.
# Once x_train / x_valid have been filled in place, dropna() no longer
# excludes the imputed rows and the model would be fitted on its own output.

# Fixed seed so the fitted forest (and therefore every imputed value that is
# later written to disk) is the same on every Restart & Run All.
I1_MODEL_SEED = 42


def _print_i1_scores(y_true, y_pred):
    """Print accuracy and F1 (default averaging) of predicted vs. observed I1."""
    print('Accuracy: {:.2f}'.format(accuracy_score(y_true, y_pred)))
    print('f1-score: {:.2f}'.format(f1_score(y_true, y_pred)))


# data for train: complete rows only, 'I1' is the target
tmp_train = x_train.dropna()
tmp_train_x = tmp_train.drop('I1', axis = 1)
tmp_train_y = tmp_train['I1']

# data for validation: same construction on the valid split
tmp_valid = x_valid.dropna()
tmp_valid_x = tmp_valid.drop('I1', axis = 1)
tmp_valid_y = tmp_valid['I1']

# model training
# (train note)
# logistic regression : no convergence
# random forest(default) : train acc = 1.0 , valid acc = 0.65
# random forest(n_estimators = 100, max_depth = 15) : train acc = 0.77, valid acc = 0.63
# random forest(n_estimators = 500(default), max_depth = 15) : train acc = 0.77, valid acc = 0.64
# NOTE(review): the last note labels 500 as the default n_estimators, which
# conflicts with the note above it; current scikit-learn documents 100 as the
# default - confirm which setting that run actually used.
start_T = time.time()
rf = RandomForestClassifier(max_depth = 15, random_state = I1_MODEL_SEED)
rf.fit(tmp_train_x, tmp_train_y)
end_T = time.time()

print("model training time : ", end_T - start_T)

# prediction and evaluation
# train data
tmp_train_pred_y = rf.predict(tmp_train_x)
_print_i1_scores(tmp_train_y, tmp_train_pred_y)

# valid data
tmp_valid_pred_y = rf.predict(tmp_valid_x)
_print_i1_scores(tmp_valid_y, tmp_valid_pred_y)

model training time :  36.02245092391968
Accuracy: 0.78
f1-score: 0.72
Accuracy: 0.64
f1-score: 0.56


In [36]:
def _fill_missing_i1(features, model):
    """Fill the missing 'I1' entries of ``features`` in place with model predictions.

    Rows whose 'I1' is NaN are passed to ``model.predict`` without the 'I1'
    column, and the predictions are written back into 'I1' for those rows.

    If no row is missing 'I1', the frame is left untouched. This matters when
    the cell is re-run after the values were already filled, or when a split
    has no gaps: the model is then never asked to predict on a zero-row frame,
    which scikit-learn estimators reject with a ValueError.
    """
    missing_i1 = features['I1'].isna()
    if not missing_i1.any():
        return
    features.loc[missing_i1, 'I1'] = model.predict(features.loc[missing_i1].drop('I1', axis = 1))


# data having null data: impute 'I1' in every split with the trained model
_fill_missing_i1(x_train, rf)
_fill_missing_i1(x_valid, rf)
_fill_missing_i1(x_test, rf)

In [37]:
# Report the total number of missing cells left in each feature frame.
for _caption, _frame in (
    ("train null data num : ", x_train),
    ("valid null data num : ", x_valid),
    ("test null data num : ", x_test),
):
    print(_caption, _frame.isna().sum().sum())

train null data num :  0
valid null data num :  0
test null data num :  0


### label = 0 데이터 제거

In [43]:
# Keep only the rows whose label is not 0, filtering features and labels together.
# Each right-hand side is fully evaluated before the names are rebound, so both
# masks are computed from the still-unfiltered label series.
x_train, y_train = x_train.loc[~(y_train == 0)], y_train.loc[~(y_train == 0)]
x_valid, y_valid = x_valid.loc[~(y_valid == 0)], y_valid.loc[~(y_valid == 0)]
x_test, y_test = x_test.loc[~(y_test == 0)], y_test.loc[~(y_test == 0)]

In [45]:
# Write the filled and filtered splits to disk, without the row index.
for _frame, _csv_path in (
    (x_train, '../data/fill_data/x_train.csv'),
    (y_train, '../data/fill_data/y_train.csv'),
    (x_valid, '../data/fill_data/x_valid.csv'),
    (y_valid, '../data/fill_data/y_valid.csv'),
    (x_test, '../data/fill_data/x_test.csv'),
    (y_test, '../data/fill_data/y_test.csv'),
):
    _frame.to_csv(_csv_path, index = False)

### Combine train + valid into a single dataset

In [49]:
# Re-attach the label column to the features of each split (column-wise concat).
train_full = pd.concat([x_train, y_train], axis = 1)
valid_full = pd.concat([x_valid, y_valid], axis = 1)

# Stack train on top of valid (row-wise concat). The two splits were read from
# separate CSV files and each keeps its own row labels, so stacking them as-is
# can repeat index labels in the result; ignore_index gives full_data a fresh
# 0..n-1 index instead.
full_data = pd.concat([train_full, valid_full], axis = 0, ignore_index = True)

In [50]:
# Persist the combined train+valid frame.
# NOTE(review): unlike the per-split saves, no index = False is passed here, so
# the row index is written as an extra leading column - confirm that the
# consumers of this file expect it.
full_data.to_csv(path_or_buf = '../data/fill_data/full_data.csv')