In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
df = pd.read_parquet('clean.parquet')

### One-Hot Encoding for Categorical Values

In [3]:
def one_hot_encode(data, column):
    encoded = pd.get_dummies(data[column], drop_first= True) 
    data = data.drop(column, axis = 1)
    data = data.join(encoded)
    return data

In [4]:
df = one_hot_encode(df, 'Marital_Status')

### Ordinal Encoding for Education Status

In [5]:
ranking_order = {'Basic': 1, 'Graduation': 2, '2n Cycle': 3, 'Master': 4, 'PhD': 5}
df['Education'] = df['Education'].map(ranking_order)

### Splitting Output from Input Space

In [6]:
X = df.drop('Response', axis= 1) 
y = df['Response']

### Splitting Training and Test

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 42)

### Modelling

In [8]:
#Standardization
sc = StandardScaler()
X_train = sc.fit_transform(X_train)

In [9]:
#PolynomialFeatures 
poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=True)
X_train = poly.fit_transform(X_train)

In [10]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
preds = clf.predict(poly.transform(sc.transform(X_test)))

In [11]:
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)
auc = roc_auc_score(y_test, preds)

In [12]:
print("Accuracy: %.4f" % acc)
print("Precision: %.4f" % prec)
print("Recall: %.4f" % rec)
print("F1: %.4f" % f1)
print("AUC: %.4f" % auc)

Accuracy: 0.8517
Precision: 0.4737
Recall: 0.2812
F1: 0.3529
AUC: 0.6144
