## Imports

In [1]:
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
from sklearn.svm import SVC
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 15})
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
# Import custom classes
# `%run -i` executes each helper script inside this notebook's own namespace,
# so the names the scripts define become available to the cells below.
# NOTE(review): presumably these scripts provide the transformation and ML
# helper classes used later — confirm against the scripts themselves.
%run -i "../src/helper/01_transfxn.py"
%run -i "../src/helper/02_ml.py"

## Load the data

In [3]:
# Read the cleaned dataset from disk, report its dimensions, then preview it
file_path = '../data/clean_data.csv'
df = pd.read_csv(file_path)

print(f'Data size {df.shape}')
df.head()

Data size (40000, 14)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,class
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,0


In [4]:
# Print each column's dtype and non-null count to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40000 non-null  int64 
 1   job        40000 non-null  object
 2   marital    40000 non-null  object
 3   education  40000 non-null  object
 4   default    40000 non-null  object
 5   balance    40000 non-null  int64 
 6   housing    40000 non-null  object
 7   loan       40000 non-null  object
 8   contact    40000 non-null  object
 9   day        40000 non-null  int64 
 10  month      40000 non-null  object
 11  duration   40000 non-null  int64 
 12  campaign   40000 non-null  int64 
 13  class      40000 non-null  int64 
dtypes: int64(6), object(8)
memory usage: 4.3+ MB


## Class distribution

In [5]:
# Tabulate how many rows fall in each class and what share of the data that is
label_ct = df['class'].value_counts()
label_pct = df['class'].value_counts(normalize=True) * 100
pd.DataFrame(
    dict(
        labels=label_pct.index,
        count=label_ct.values,
        percentage=label_pct.values,
    )
)

Unnamed: 0,labels,count,percentage
0,0,37104,92.76
1,1,2896,7.24


## Descriptive statistics

In [6]:
# Summary statistics of the numeric predictors (target excluded), one row per column
df.drop(columns='class').describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,40000.0,40.5446,9.641776,19.0,33.0,39.0,48.0,95.0
balance,40000.0,1274.27755,2903.769716,-8019.0,54.0,407.0,1319.0,102127.0
day,40000.0,16.017225,8.278127,1.0,8.0,17.0,21.0,31.0
duration,40000.0,254.8243,259.366498,0.0,100.0,175.0,313.0,4918.0
campaign,40000.0,2.882175,3.239051,1.0,1.0,2.0,3.0,63.0


In [7]:
# Split feature and target vectors.
# The column is dropped via the `columns=` keyword: passing the axis
# positionally (`df.drop("class", 1)`) was deprecated in pandas 1.3 and
# raises TypeError from pandas 2.0 onwards.
X = df.drop(columns="class")
y = df["class"]

print('Data size:', X.shape, y.shape)

Data size: (40000, 13) (40000,)


## Create test and train set

Split the dataset into an 80% training set and a 20% test set, stratified on the target so that both sets keep the same class proportions.

In [8]:
# Hold out 20% of the rows as a test set; stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [9]:
# Show the (features, target) shapes of each split as a sanity check on the 80/20 partition
print(f'Training set size: {X_train.shape, y_train.shape}')
print(f'Test set size: {X_test.shape, y_test.shape}')

Training set size: ((32000, 13), (32000,))
Test set size: ((8000, 13), (8000,))


## Class distribution in training and test sets

In [10]:
# Class percentages in each split, to confirm stratification preserved the ratio.
# The Series is printed on its own line: embedding it after "\n " in an f-string
# indented only the first row and misaligned the table. `normalize=True` replaces
# the manual division by the row count.
print('Training set class distribution:')
print(y_train.value_counts(normalize=True) * 100)
print('--' * 15)
print('Test set class distribution:')
print(y_test.value_counts(normalize=True) * 100)

Training set class distribution:
 0    92.759375
1     7.240625
Name: class, dtype: float64
------------------------------
Test set class distribution:
 0    92.7625
1     7.2375
Name: class, dtype: float64


## Preprocessing

Data preprocessing involves the following steps.

- Numerical Predictors

    - Impute the missing values with the median
    - Standardize and scale the predictors

- Categorical Predictors

    - Impute the missing values with a constant "NA"
    - One-hot encode the predictors

In [11]:
# Preprocessing
# NOTE(review): the recorded output of this cell is
# "NameError: name 'transfxn' is not defined" — no visible cell assigns
# `transfxn`. Presumably an instance of a class defined in
# ../src/helper/01_transfxn.py must be created first — confirm and add that
# step so the notebook runs from a fresh kernel.
X_train_scaled, X_test_scaled, feat_names = transfxn.preprocessing(X_train, X_test)


NameError: name 'transfxn' is not defined