## SMOTE practice with bank dataset

#### import packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

#### Load data and feature engineering

In [2]:
# NOTE(review): hardcoded, machine-specific absolute path. The relative path
# "data/bank-full.csv" used when loading the data is resolved against this directory.
%cd c:/users/60223/desktop/SMOTE
%pwd

c:\users\60223\desktop\SMOTE


'c:\\users\\60223\\desktop\\SMOTE'

In [3]:
# Read the semicolon-separated bank file; the literal "unknown" is parsed as missing (NaN).
bank = pd.read_csv(
    "data/bank-full.csv",
    sep=";",
    na_values="unknown",
)
# Preview the first five rows.
bank.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no


In [4]:
# Show each column name followed by the distinct values found in that column.
for name, values in bank.items():
    print(name, ': ', values.unique())

age :  [58 44 33 47 35 28 42 43 41 29 53 57 51 45 60 56 32 25 40 39 52 46 36 49
 59 37 50 54 55 48 24 38 31 30 27 34 23 26 61 22 21 20 66 62 83 75 67 70
 65 68 64 69 72 71 19 76 85 63 90 82 73 74 78 80 94 79 77 86 95 81 18 89
 84 87 92 93 88]
job :  ['management' 'technician' 'entrepreneur' 'blue-collar' nan 'retired'
 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid' 'student']
marital :  ['married' 'single' 'divorced']
education :  ['tertiary' 'secondary' nan 'primary']
default :  ['no' 'yes']
balance :  [ 2143    29     2 ...  8205 14204 16353]
housing :  ['yes' 'no']
loan :  ['no' 'yes']
contact :  [nan 'cellular' 'telephone']
day :  [ 5  6  7  8  9 12 13 14 15 16 19 20 21 23 26 27 28 29 30  2  3  4 11 17
 18 24 25  1 10 22 31]
month :  ['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'jan' 'feb' 'mar' 'apr' 'sep']
duration :  [ 261  151   76 ... 1298 1246 1556]
campaign :  [ 1  2  3  5  4  6  7  8  9 10 11 12 13 19 14 24 16 32 18 22 15 17 25 21
 43 51 63 41 26 28 55 50 38 23 

In [5]:
# Encode the yes/no columns (including the target "y") as 0/1.
yes_no = {"no": 0, "yes": 1}
for col in ("default", "housing", "loan", "y"):
    bank[col] = bank[col].map(yes_no)

# Ordinal-encode education; values not present in the mapping (missing entries) stay NaN.
education_rank = {"primary": 0, "secondary": 1, "tertiary": 2}
bank["education"] = bank["education"].map(education_rank)

# Convert the three-letter month abbreviation to its month number.
bank["month"] = pd.to_datetime(bank["month"], format="%b").dt.month

In [6]:
# Number of missing values in each column.
bank.isna().sum()

age              0
job            288
marital          0
education     1857
default          0
balance          0
housing          0
loan             0
contact      13020
day              0
month            0
duration         0
campaign         0
pdays            0
previous         0
poutcome     36959
y                0
dtype: int64

In [7]:
# Remove the two columns with the most missing values, then drop every row
# that still contains a NaN.
bank = bank.drop(columns=["poutcome", "contact"]).dropna()
# One-hot encode the remaining non-numeric columns, omitting the first level of each.
bank = pd.get_dummies(bank, drop_first=True)

In [8]:
# Class balance of the target column.
bank["y"].value_counts()

0    38172
1     5021
Name: y, dtype: int64

In [9]:
# Separate the target column from the predictor columns.
y = bank["y"]
X = bank.drop(columns="y")

In [10]:
# Baseline: stratified train/test split, then a default logistic regression
# trained on the imbalanced training split.
# (The former mid-cell `y_train.value_counts()` was a discarded expression that
# was never displayed, so it has been removed.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y)

lr = LogisticRegression()
lr.fit(X_train, y_train)

# Predictions on the held-out split, used by the metric cells that follow.
y_pred = lr.predict(X_test)



In [11]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[9371,  173],
       [ 982,  273]], dtype=int64)

In [12]:
# Fraction of test rows predicted correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8930456523752199

In [13]:
# Recall for the positive class (y == 1) on the test split.
recall_score(y_true=y_test, y_pred=y_pred)

0.21752988047808766

#### SMOTE

In [14]:
# Same stratified split as before (identical random_state), recomputed so this
# section starts from an unresampled training set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [15]:
# Oversample the minority class in the training split only; the test split is untouched.
# `fit_resample` replaces `fit_sample`, which current imbalanced-learn releases no longer provide.
# SMOTE synthesises samples randomly, so a fixed seed keeps the results reproducible.
smt = SMOTE(random_state=1)
X_train, y_train = smt.fit_resample(X_train, y_train)

# Per-class counts after resampling.
np.bincount(y_train)

array([28628, 28628], dtype=int64)

In [16]:
# Refit a default logistic regression on the current (resampled) training data
# and predict the held-out split.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)



In [17]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[7665, 1879],
       [ 245, 1010]], dtype=int64)

In [18]:
# Fraction of test rows predicted correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8033151217705343

#### NearMiss

In [19]:
# Recreate the original stratified split (same random_state) so the training
# data used in this section is the unresampled one.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [20]:
# Undersample the majority class in the training split only; the test split is untouched.
# `fit_resample` replaces `fit_sample`, which current imbalanced-learn releases no longer provide.
nr = NearMiss()
X_train, y_train = nr.fit_resample(X_train, y_train)

# Per-class counts after resampling.
np.bincount(y_train)


array([3766, 3766], dtype=int64)

In [22]:
# Refit a default logistic regression on the current (resampled) training data
# and predict the held-out split.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)



In [23]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[5102, 4442],
       [ 162, 1093]], dtype=int64)

In [24]:
# Fraction of test rows predicted correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.573664228169275

In [25]:
# Recall for the positive class (y == 1) on the test split.
recall_score(y_true=y_test, y_pred=y_pred)

0.8709163346613545