In [None]:
# 라이브러리 임포트
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import xgboost
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, RidgeCV, RidgeClassifier, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

from torch import Tensor
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Mounted at /content/drive


## Pre-processing Data

In [None]:
df = pd.read_csv('/content/drive/MyDrive/competition_1st/train.csv')

In [None]:
dcs_tree_clf = DecisionTreeClassifier(random_state=0)
dcs_tree_clf.fit(df.drop(columns=['depvar']), df['depvar'])
dcs_tree_clf.score(df.drop(columns=['depvar']), df['depvar'])

feature_imp = dcs_tree_clf.feature_importances_

for_del_idx = [i if (_<0.015) else None for i, _ in enumerate(feature_imp)]

extract_del_idx = list()
for i in for_del_idx:
  if i is not None:
    extract_del_idx.append(i)

col_list = list(df.columns)

extract_del_idx.reverse()

proc_col = list()
for i in extract_del_idx:
  del col_list[i]
  
proc_col = col_list

df2 = df[proc_col]

In [None]:
# Feature와 result값 나누기
X = df2.drop('depvar', axis=1)
y = df2['depvar']

# Train Test Split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

In [None]:
df2.nunique()

int_rate                256
annual_inc             8905
dti                    4148
inq_last_6mths            7
revol_bal             36002
total_acc               107
tot_cur_bal           80613
funded_amnt            1355
funded_amnt_inv        1391
total_rec_late_fee     2752
open_acc                 62
installment           25708
revol_util             1101
total_rec_int         85095
fico_range_low           38
fico_range_high          38
depvar                    2
dtype: int64

In [None]:
X_valid.head()

Unnamed: 0,int_rate,annual_inc,dti,inq_last_6mths,revol_bal,total_acc,tot_cur_bal,funded_amnt,funded_amnt_inv,total_rec_late_fee,open_acc,installment,revol_util,total_rec_int,fico_range_low,fico_range_high
5654,0.1899,63504.0,21.69,0,6517,19,130875,11675,11675.0,0.0,9,427.91,0.179,969.99,675,679
49590,0.1757,45000.0,29.76,1,7969,28,97800,6925,6900.0,0.0,28,248.87,0.664,2020.52,670,674
86065,0.1269,25000.0,24.49,1,8476,17,13451,8000,7900.0,0.0,9,268.36,0.422,1655.26,675,679
74271,0.1099,100000.0,28.95,0,15908,37,246365,18000,18000.0,0.0,10,391.28,0.644,5365.28,715,719
20871,0.0789,24000.0,7.45,0,319,16,9974,9600,9600.0,0.0,9,300.35,0.01,1206.18,710,714


In [None]:
y_train.head()

45338    0
46685    0
57679    1
34902    0
5522     1
Name: depvar, dtype: int64

In [None]:
type(y_train)

pandas.core.series.Series

In [None]:
y_valid.head()

5654     0
49590    0
86065    0
74271    0
20871    0
Name: depvar, dtype: int64

In [None]:
y_type(y_valid)

pandas.core.series.Series

In [None]:
# 정규화 // 오히려 정규화하고 점수 더 안좋아지는 경향

X_train_sc = StandardScaler().fit_transform(X_train)
X_valid_sc = StandardScaler().fit_transform(X_valid)

X_train_sc[1]

array([ 0.31249451, -0.12897104, -0.49902377,  1.4031173 , -0.31304601,
        0.87430439, -0.5771987 ,  0.97822402,  0.97926856, -0.16423559,
        1.72567781,  1.21714745, -0.35371743, -0.28592391, -1.09822279,
       -1.09820525])

In [None]:
# Data 증축 전, 결과 값 비율
print("Before OverSampling, counts of label '0': {}".format(sum(y_train==0)))
print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))

Before OverSampling, counts of label '0': 53930
Before OverSampling, counts of label '1': 26070


In [None]:
def summarize_classification_result(model, X, y):
    y_pred = model.predict(X)
    print(accuracy_score(y, y_pred))
    print(f1_score(y, y_pred))
    print(classification_report(y, y_pred))
    print(confusion_matrix(y, y_pred))
    # sns.heatmap(confusion_matrix(y, y_pred), annot=True)

In [None]:
# 평가 함수 정의
def get_clf_eval(y_actual, y_pred):
    accuracy = accuracy_score(y_actual, y_pred)
    precision = precision_score(y_actual, y_pred)
    recall = recall_score(y_actual, y_pred)
    AUC = roc_auc_score(y_actual, y_pred)
    F1 = f1_score(y_actual, y_pred)
    print('\n정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('AUC: {:.4f}'.format(AUC))
    print('F1: {:.4f}'.format(F1))
    
    # sns.heatmap(confusion_matrix(y_actual, y_pred), annot=True, fmt='d', cmap='YlGnBu')

In [None]:
class TrainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


class ValidData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


class TestData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    

train_data = TrainData(torch.FloatTensor(X_train_sc), torch.FloatTensor(y_train.to_numpy()))
valid_data = ValidData(torch.FloatTensor(X_valid_sc), torch.FloatTensor(y_valid.to_numpy()))
# y_train과 y_valid는 pandas.Series타입이라서 그대로 Tensor에 넣으면 에러발생함. numpy로 바꿔주고 진행

In [None]:
EPOCHS = 500
BATCH_SIZE = 1024
LEARNING_RATE = 0.001

# data loader 세팅
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=1)

In [None]:
class BinaryClassification(nn.Module):
    def __init__(self):
        super(BinaryClassification, self).__init__()
        # Number of input features is 16
        self.layer_1 = nn.Linear(16, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x

In [None]:
# torch.cuda.device_count() #gpu 갯수
print(device) #cuda 만 있을 때 default로 0번 gpu 배정됨. cuda:0 이랑 같은 것

cuda


In [None]:
# 모델 객체 생성
model = BinaryClassification()
model.to(device)
print(model)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

BinaryClassification(
  (layer_1): Linear(in_features=16, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [None]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [None]:
# Training
EPOCHS=2
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    if e % 100 == 1:
      print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

Epoch 001: | Loss: 0.00000 | Acc: 50.114


In [None]:
# Validation 
y_pred_array = np.array([])
# y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch, _ in valid_loader:
        X_batch = X_batch.to(device)
        y_valid_pred = model(X_batch)
        y_valid_pred = torch.sigmoid(y_valid_pred)
        # y_pred_tag = torch.round(y_valid_pred)
        # y_pred_array.append(y_valid_pred.cpu().numpy())
        y_pred_array = np.append(y_pred_array, y_valid_pred.cpu().numpy())

# y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [None]:
model(X_batch)

tensor([[0.3155]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [None]:
len(y_pred_array)

20000

In [None]:
# np.append(y_pred_array, y_valid_pred.cpu().numpy())

array([0.73389482])

In [None]:
# np.append(y_pred_array, y_valid_pred.cpu().numpy())

array([0.73389482])

In [None]:
# y_valid_pred.cpu().numpy()

array([[0.7338948]], dtype=float32)

In [None]:
# y_pred_list[0].item()

0.695928692817688

In [None]:
get_clf_eval(y_valid, y_pred_array>0.3)


정확도: 0.7006
정밀도: 0.5271
재현율: 0.7652
AUC: 0.7174
F1: 0.6242


## Linear Regression()

In [None]:
reg_model = LinearRegression()

In [None]:
reg_model.fit(X_train_sc, y_train)

LinearRegression()

In [None]:
y_hat = reg_model.predict(X_valid_sc)

In [None]:
y_hat = y_hat>0.3

In [None]:
df_predictions = pd.DataFrame({'actuals':y_valid, 'predictions':y_hat, 'resid': y_valid - y_hat})
change = {True:1, False:0}
df_predictions.predictions = df_predictions.predictions.map(change)
df_predictions.head()

Unnamed: 0,actuals,predictions,resid
86605,1,0,1
67931,0,0,0
65523,0,0,0
21787,0,0,0
68815,1,1,0


In [None]:
acc_reg = df_predictions[df_predictions['resid']==0].shape[0] / df_predictions.shape[0] * 100 
acc_reg

64.795

In [None]:
reg_model.coef_

array([ 0.15701945, -0.00526872,  0.03308215,  0.01558603, -0.00580517,
       -0.02015316, -0.02757787,  1.45755628, -1.12049107,  0.06038941,
        0.02331513, -0.23135743, -0.01088862, -0.15222726, -2.28034324,
        2.24417722])

In [None]:
reg_model.coef_.shape

(16,)

In [None]:
reg_model.intercept_

0.32657500000000017

In [None]:
reg_summary = pd.DataFrame(data = X_train.columns, columns=['Features'])
reg_summary ['Coefficients'] = np.round(reg_model.coef_,4)
reg_summary

Unnamed: 0,Features,Coefficients
0,int_rate,0.157
1,annual_inc,-0.0053
2,dti,0.0331
3,inq_last_6mths,0.0156
4,revol_bal,-0.0058
5,total_acc,-0.0202
6,tot_cur_bal,-0.0276
7,funded_amnt,1.4576
8,funded_amnt_inv,-1.1205
9,total_rec_late_fee,0.0604


## Elastic Net

In [None]:
from sklearn.linear_model import Lasso,ElasticNet,Ridge

lasso = Lasso(alpha=0.001)
lasso.fit(X_train_sc, y_train)
pred = lasso.predict(X_valid_sc)

y_pred_array = pred
get_clf_eval(y_valid, y_pred_array>0.3)



정확도: 0.6466
정밀도: 0.4702
재현율: 0.7664
AUC: 0.6780
F1: 0.5829


In [None]:
y_pred_array

tensor([0.5641, 0.5334, 0.5771,  ..., 0.6337, 0.6442, 0.4832])

In [None]:
pred.shape

(20000,)

## Data 편향 제거 처리

In [None]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=27) 
X_train_smt, y_train_smt = sm.fit_resample(X_train, y_train)

In [None]:
print("After OverSampling, counts of label '0': {}".format(sum(y_train_smt==0)))
print("After OverSampling, counts of label '1': {}".format(sum(y_train_smt==1)))

After OverSampling, counts of label '0': 53930
After OverSampling, counts of label '1': 53930


In [None]:
X_train_smt.head()

Unnamed: 0,int_rate,annual_inc,dti,inq_last_6mths,revol_bal,total_acc,tot_cur_bal,funded_amnt,funded_amnt_inv,total_rec_late_fee,open_acc,installment,revol_util,total_rec_int,fico_range_low,fico_range_high
0,0.0917,70000.0,17.5,0,4577,16,35757,8000,8000.0,0.0,5,255.04,0.87,392.34,660,664
1,0.1899,85000.0,23.11,1,45406,31,327652,23850,23850.0,0.0,18,874.13,0.929,6243.43,660,664
2,0.0916,65000.0,25.48,0,16100,16,48197,17000,17000.0,0.0,10,541.87,0.531,1196.65,690,694
3,0.0818,60000.0,24.26,0,15712,18,50411,12000,12000.0,0.0,12,377.04,0.79,1567.73,670,674
4,0.2099,48000.0,37.53,2,11984,11,33598,15000,15000.0,20.29,9,405.72,0.599,9265.08,665,669


In [None]:
type(y_train_smt)

pandas.core.series.Series

In [None]:
train_data_smt = TrainData(torch.FloatTensor(X_train_smt), torch.FloatTensor(y_train_smt.to_numpy()))
valid_data_smt = ValidData(torch.FloatTensor(X_valid), torch.FloatTensor(y_valid.to_numpy()))

ValueError: ignored

In [None]:
EPOCHS = 500
BATCH_SIZE = 1024
LEARNING_RATE = 0.001

# data loader 세팅
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=1)

In [None]:
# Training

model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    if e % 100 == 1:
      print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

In [None]:
# SMOTE - 오히려 더 감소하는 경향
# get_clf_eval(y_valid, stacking_clf_smt.predict_proba(X_valid)[:,1]>0.4) #0.6166
# get_clf_eval(y_valid, stacking_clf_smt.predict_proba(X_valid)[:,1]>0.35)  #0.6288
get_clf_eval(y_valid, stacking_clf_smt.predict_proba(X_valid)[:,1]>0.3)  #0.6299


정확도: 0.7006
정밀도: 0.5304
재현율: 0.7753
AUC: 0.7197
F1: 0.6299


## 제출

In [None]:
# 제출 양식 다운로드
submit = pd.read_csv('/content/drive/MyDrive/competition_1st/sample_submission.csv')

# prediction 수행
df_test = pd.read_csv('/content/drive/MyDrive/competition_1st/test.csv')
submit['answer'] = stacking_clf.predict_proba(df_test.drop(columns=['ID']))[:,1]>0.35
change = {True:1, False:0}
submit.answer = submit.answer.map(change)
# 제출 파일 저장
submit.to_csv('/content/drive/MyDrive/competition_1st/submission_5.csv', index=False)

In [None]:
submit.head()

Unnamed: 0,ID,answer
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
