In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, Dataset, DataLoader
from torch.nn import ModuleList
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# 데이터 불러오기

In [2]:
df = pd.read_csv("bank-direct-marketing-campaigns.csv")
print('data frame shape :', df.shape)
df.head()

data frame shape : (41188, 20)


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  campaign        41188 non-null  int64  
 11  pdays           41188 non-null  int64  
 12  previous        41188 non-null  int64  
 13  poutcome        41188 non-null  object 
 14  emp.var.rate    41188 non-null  float64
 15  cons.price.idx  41188 non-null  float64
 16  cons.conf.idx   41188 non-null  float64
 17  euribor3m       41188 non-null 

# 데이터 전처리

In [4]:
# 중복 제거
print('data frame shape :', df.shape)

df = df.drop_duplicates(keep='last')
print('data frame shape :', df.shape)

data frame shape : (41188, 20)
data frame shape : (39404, 20)


In [5]:
# binary variable 처리 
df = df.replace({'y':{'no':0, 'yes':1}})
df = df.replace({'contact':{'telephone':0, 'cellular':1}})
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,0,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high.school,unknown,no,no,0,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,basic.6y,no,no,no,0,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high.school,no,no,yes,0,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
5,45,services,married,basic.9y,unknown,no,no,0,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0


In [6]:
# One-Hot Encoding
df = pd.get_dummies(df, drop_first = True)
df.head()

Unnamed: 0,age,contact,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,56,0,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
1,57,0,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
3,40,0,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
4,56,0,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
5,45,0,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0


In [7]:
df['y'].value_counts()

0    34806
1     4598
Name: y, dtype: int64

In [8]:
# train dataset, test dataset 으로 분리 + oversampling

X = df.drop('y', axis=1)
y = df['y']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X,y)

X_resampled['y'] = y_resampled
dataset = X_resampled

majority_dataset = df[df['y']==0].sample(df['y'].value_counts()[1])
minority_dataset = df[df['y']==1]

dataset = dataset[~dataset.isin(majority_dataset)].dropna()
dataset = dataset[~dataset.isin(minority_dataset)].dropna()

training_dataset = dataset
test_dataset = pd.concat([majority_dataset, minority_dataset])

X_train = training_dataset.drop(['y'],axis=1)
y_train = training_dataset[['y']]
X_test = test_dataset.drop(['y'],axis=1)
y_test = test_dataset[['y']]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(60416, 52) (60416, 1) (9196, 52) (9196, 1)


# 데이터 로더

In [9]:
# train data
class trainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

train_data = trainData(torch.FloatTensor(X_train.values)
                       , torch.tensor(y_train.values))


# test data    
class testData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    

test_data = trainData(torch.FloatTensor(X_test.values)
                       , torch.tensor(y_test.values))


In [10]:
batch_size = 128

train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size)

In [11]:
train_loader.dataset.X_data

tensor([[56.,  0.,  1.,  ...,  0.,  1.,  0.],
        [57.,  0.,  1.,  ...,  0.,  1.,  0.],
        [40.,  0.,  1.,  ...,  0.,  1.,  0.],
        ...,
        [55.,  1.,  7.,  ...,  0.,  1.,  0.],
        [26.,  1.,  1.,  ...,  0.,  0.,  1.],
        [72.,  1.,  1.,  ...,  0.,  1.,  0.]])

In [12]:
train_loader.dataset.y_data

tensor([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]], dtype=torch.float64)

# DNN

In [13]:
class BinaryClassification(nn.Module):
    def __init__(self):
        super(BinaryClassification, self).__init__()
        
        self.in_dim = X_train.shape[1]
        self.out_dim = 1
        
        self.fc1 = nn.Linear(self.in_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, self.out_dim)

        self.relu = nn.ReLU()
        
    def forward(self, x):
        a1 = self.relu(self.fc1(x))
        a2 = self.relu(self.fc2(a1))
        a3 = self.relu(self.fc3(a2))
        a4 = self.relu(self.fc4(a3))
        logit = self.fc5(a4)
        
        return logit

In [14]:
model = BinaryClassification()

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)

model.parameters

<bound method Module.parameters of BinaryClassification(
  (fc1): Linear(in_features=52, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=32, bias=True)
  (fc5): Linear(in_features=32, out_features=1, bias=True)
  (relu): ReLU()
)>

In [15]:
def binary_acc(y_pred, y_test):
    y_pred_class = torch.round(torch.sigmoid(y_pred))
    correct_results_sum = (y_pred_class == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    return acc

In [16]:
epochs = 200
for epoch in range(1, epochs+1):
    model.train() # 모델을 학습 모드로 변환
    epoch_loss = 0
    epoch_acc = 0
    
    for data in train_loader:
        
        # input shape : (batch_size x 45)
        inputs, labels = data
        
        optimizer.zero_grad() # gradient를 0으로 초기화
        outputs = model(inputs.float())
        
        loss = criterion(outputs, labels.float())
        acc = binary_acc(outputs, labels.float())
        
        loss.backward() # loss 역전파
        optimizer.step() # 가중치 update
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    if epoch % 10 == 0:
        print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')
print('Finished Training')


Epoch 010: | Loss: 0.34291 | Acc: 85.229
Epoch 020: | Loss: 0.32583 | Acc: 86.051
Epoch 030: | Loss: 0.31622 | Acc: 86.591
Epoch 040: | Loss: 0.31182 | Acc: 86.737
Epoch 050: | Loss: 0.31232 | Acc: 86.864
Epoch 060: | Loss: 0.30684 | Acc: 86.970
Epoch 070: | Loss: 0.30912 | Acc: 86.797
Epoch 080: | Loss: 0.30398 | Acc: 87.108
Epoch 090: | Loss: 0.30334 | Acc: 87.265
Epoch 100: | Loss: 0.29955 | Acc: 87.468
Epoch 110: | Loss: 0.29751 | Acc: 87.373
Epoch 120: | Loss: 0.29963 | Acc: 87.369
Epoch 130: | Loss: 0.29725 | Acc: 87.403
Epoch 140: | Loss: 0.29560 | Acc: 87.536
Epoch 150: | Loss: 0.29552 | Acc: 87.587
Epoch 160: | Loss: 0.29589 | Acc: 87.456
Epoch 170: | Loss: 0.29249 | Acc: 87.600
Epoch 180: | Loss: 0.29719 | Acc: 87.375
Epoch 190: | Loss: 0.29000 | Acc: 87.725
Epoch 200: | Loss: 0.29096 | Acc: 87.661
Finished Training


In [18]:
# 모델 평가 
model.eval()

test_y_pred = torch.empty(0)
# print(test_loader.dataset.y_data.shape)

with torch.no_grad():    
    for data in test_loader: 

        inputs, labels = data
        
        outputs = model(inputs)
        outputs = torch.sigmoid(outputs)
        outputs = torch.round(outputs)
        outputs = outputs.squeeze(1)
        test_y_pred = torch.cat((test_y_pred, outputs))
print(classification_report(test_loader.dataset.y_data, test_y_pred, target_names=['no', 'yes']))

              precision    recall  f1-score   support

          no       0.65      0.75      0.70      4598
         yes       0.70      0.59      0.64      4598

    accuracy                           0.67      9196
   macro avg       0.68      0.67      0.67      9196
weighted avg       0.68      0.67      0.67      9196

