In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
df = pd.read_parquet('clean.parquet')

In [3]:
df.sample(10)

Unnamed: 0,Education,Marital_Status,Income,Recency,NumDealsPurchases,Response,Complain,MntSpent,NumPurchases,Age,DaysEnrolled
751,PhD,Married,50150.0,32,2,0,0,273,14,41,3898
162,Graduation,Together,29672.0,6,1,0,0,13,3,59,3732
364,Graduation,Married,18929.0,15,1,0,0,40,5,34,4022
1949,PhD,Married,51012.0,86,1,0,0,174,9,78,3961
2190,Graduation,Divorced,69932.0,97,2,0,0,737,24,60,3729
1632,PhD,Married,56242.0,72,5,0,0,864,23,55,4007
1601,Master,Divorced,61787.0,71,1,0,0,1108,18,37,3911
538,Basic,Married,30801.0,23,2,0,0,17,4,49,4135
1324,Master,Together,65176.0,57,4,0,0,1171,21,55,4132
15,Graduation,Married,54809.0,0,4,0,0,126,8,47,3756


### One-Hot Encoding for Categorical Values

In [4]:
def one_hot_encode(data, column):
    encoded = pd.get_dummies(data[column], drop_first= True) 
    data = data.drop(column, axis = 1)
    data = data.join(encoded)
    return data

In [5]:
df = one_hot_encode(df, 'Marital_Status')

### Ordinal Encoding for Education Status

In [6]:
ranking_order = {'Basic': 1, 'Graduation': 2, '2n Cycle': 3, 'Master': 4, 'PhD': 5}
df['Education'] = df['Education'].map(ranking_order)

In [7]:
df.dtypes

Education              int64
Income               float64
Recency                int64
NumDealsPurchases      int64
Response               int64
Complain               int64
MntSpent               int64
NumPurchases           int64
Age                    int64
DaysEnrolled           int64
Married                 bool
Single                  bool
Together                bool
Widow                   bool
dtype: object

### Splitting Output from Input Space

In [8]:
X = df.drop('Response', axis= 1) 
y = df['Response']

### Splitting Training and Test

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 42)

### Modelling

In [10]:
#Standardization
sc = StandardScaler()
X_train = sc.fit_transform(X_train) 
X_test = sc.transform(X_test)

In [11]:
#Simple Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)
model.coef_

array([ 0.02970468,  0.00114245, -0.07000851,  0.01731303,  0.00591031,
        0.16376209, -0.08432386, -0.0180104 ,  0.05831902, -0.03111151,
        0.01311282, -0.03363883,  0.00835653])

In [12]:
y_preds_reg = model.predict(X_test)
y_preds_reg

array([ 0.07816582,  0.14715814,  0.37243212,  0.15808636,  0.17480179,
        0.07299931,  0.34222789, -0.01035451,  0.35131402, -0.00213231,
        0.15886947,  0.14437504,  0.16338659,  0.70154006,  0.18528789,
        0.13849264,  0.01580856,  0.05368099,  0.3097376 ,  0.07323415,
        0.39523137,  0.11718222,  0.41607408,  0.1426898 ,  0.25386681,
        0.10115331,  0.12571526,  0.15239345,  0.02759271,  0.15045378,
        0.16690177,  0.15440294,  0.13548307,  0.24687354,  0.15610171,
        0.3981934 ,  0.47738919,  0.34707408,  0.20560481, -0.08915522,
       -0.01104964,  0.15100366,  0.2419518 ,  0.03195836,  0.0298574 ,
        0.03662392,  0.09240813,  0.3618015 ,  0.42442872,  0.08516211,
        0.24510889,  0.09394253,  0.01564279,  0.23916221,  0.02791649,
        0.31103859,  0.05851035,  0.15086712,  0.07190918,  0.17373946,
        0.09857687,  0.05534348,  0.22675186,  0.44135377,  0.10962949,
        0.4639847 ,  0.70422319,  0.68595667,  0.06221031,  0.11

In [13]:
#convert regression values into binary classification
y_preds = np.where(y_preds_reg >= 0.5, 1, 0)
y_preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [14]:
acc = accuracy_score(y_test, y_preds)
prec = precision_score(y_test, y_preds)
rec = recall_score(y_test, y_preds)
f1 = f1_score(y_test, y_preds)
auc = roc_auc_score(y_test, y_preds)

In [15]:
print("Accuracy: %.4f" % acc)
print("Precision: %.4f" % prec)
print("Recall: %.4f" % rec)
print("F1: %.4f" % f1)
print("AUC: %.4f" % auc)

Accuracy: 0.8494
Precision: 0.4211
Recall: 0.1250
F1: 0.1928
AUC: 0.5481
