In [9]:
# import libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Load the bank marketing data set (semicolon-delimited CSV).
# The location defaults to the original local checkout, but it can be overridden
# through the BANK_DATA_PATH environment variable so the notebook is not tied to
# a single machine's directory layout.
DATA_PATH = os.environ.get(
    'BANK_DATA_PATH',
    r'C:\GitHub_Projects\ML_projects\Logistic_Regression\Data\bank-full.csv',
)
df = pd.read_csv(DATA_PATH, sep=';')
df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [7]:
# Verify that the data set contains no missing values. isnull() is only an alias
# of isna(), and a bare expression in the middle of a cell is never displayed,
# so the check is made explicit with an assertion instead.
missing_per_column = df.isna().sum()
assert missing_per_column.sum() == 0, (
    f'unexpected missing values:\n{missing_per_column[missing_per_column > 0]}'
)

# Identify the features and the target; the target is encoded as yes -> 1, no -> 0.
X = df.drop('y', axis = 1)
y = df['y'].map({'yes': 1,'no': 0})
# map() turns any label that is not a dictionary key into NaN, so fail early
# with a clear message if one appears.
assert y.notna().all(), "target column 'y' contains labels other than 'yes'/'no'"

# Separate the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
print('categorical_cols:', categorical_cols)
print('numerical_cols:', numerical_cols)

categorical_cols: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
numerical_cols: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']


In [14]:
# Preprocessing
# Numerical columns are standardized with StandardScaler; categorical columns are
# expanded into indicator columns with OneHotEncoder, dropping the first level of
# each feature. ColumnTransformer applies each transformer to its own column list
# and concatenates the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first'), categorical_cols),
    ]
)

# Pipeline: preprocessing followed by the logistic regression classifier.
clf = Pipeline(
    steps=[
        ('preprocess', preprocessor),
        ('logreg', LogisticRegression(max_iter=1000)),
    ]
)

In [15]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
# NOTE(review): the recorded evaluation shows an imbalanced target (11967 vs 1597
# test rows) - consider stratify=y; not applied here because it changes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [19]:
# Train the pipeline on the training split, then predict labels for the test split.
# fit() returns the fitted pipeline itself, so the two calls can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

In [18]:
# Evaluate on the held-out split: confusion matrix first, then per-class metrics.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

[[11694   273]
 [ 1015   582]]
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     11967
           1       0.68      0.36      0.47      1597

    accuracy                           0.91     13564
   macro avg       0.80      0.67      0.71     13564
weighted avg       0.89      0.91      0.89     13564

