In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [None]:
df = pd.read_parquet('clean.parquet')

In [None]:
df.sample(10)

Unnamed: 0,Education,Marital_Status,Income,Recency,NumDealsPurchases,Response,Complain,MntSpent,NumPurchases,Age,DaysEnrolled
566,Master,Married,88097.0,24,1,1,0,643,19,46,4204
399,Graduation,Married,80763.0,17,1,1,0,950,22,74,3842
654,Graduation,Divorced,45146.0,28,2,0,0,38,4,61,3873
551,Master,Married,36262.0,24,1,1,0,181,11,41,3806
1825,PhD,Together,54132.0,81,1,0,0,39,4,55,3657
971,PhD,Single,34554.0,43,2,0,0,48,5,65,3615
25,Master,Single,33168.0,0,3,0,0,118,7,59,4148
197,PhD,Widow,67680.0,8,1,0,0,594,18,73,3759
443,Graduation,Married,56939.0,19,2,0,0,393,16,48,3808
1059,Graduation,Married,61456.0,47,4,0,0,1023,29,49,3983


### One-Hot Encoding for Categorical Values

In [None]:
def one_hot_encode(data, column):
    encoded = pd.get_dummies(data[column], drop_first= True) 
    data = data.drop(column, axis = 1)
    data = data.join(encoded)
    return data

In [None]:
df = one_hot_encode(df, 'Education')
df = one_hot_encode(df, 'Marital_Status')

In [None]:
df.dtypes

Income               float64
Recency                int64
NumDealsPurchases      int64
Response               int64
Complain               int64
MntSpent               int64
NumPurchases           int64
Age                    int64
DaysEnrolled           int64
Basic                   bool
Graduation              bool
Master                  bool
PhD                     bool
Married                 bool
Single                  bool
Together                bool
Widow                   bool
dtype: object

### Splitting Output from Input Space

In [None]:
X = df.drop('Response', axis= 1) 
y = df['Response']

### Splitting Training and Test

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 42)

### Modelling

In [None]:
#Standardization
sc = StandardScaler()
X_train = sc.fit_transform(X_train) 
X_test = sc.transform(X_test)

In [None]:
#PolynomialFeatures 
poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=True)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test) 

In [None]:
#Simple Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)
model.coef_

array([ 8.53161366e+07,  1.40190623e-02, -6.62011504e-02,  1.69779836e-02,
       -2.82465336e+11,  3.82430134e-02, -1.65517061e-02, -1.58745289e-02,
        6.29755466e-02,  9.16958613e+10, -1.51907177e+11, -6.58873184e+11,
       -1.02284535e+12, -1.09455120e+12, -8.06850859e+11,  4.24688308e+11,
        5.55662754e+11,  3.10120768e-02, -3.08651122e-02, -1.91215658e-02,
        8.84351292e-02, -1.44078123e-02, -8.33074421e-03,  9.12033988e-03,
        1.21815670e-02,  4.55998711e-02,  3.46410602e-02,  6.65879894e-02,
        9.02549389e-03,  1.27786118e-02, -9.69136998e-03,  6.84458705e-04,
       -2.20202351e-02,  3.69873047e-02, -6.30416870e-02,  2.08587646e-02,
        6.40869141e-03,  1.43890381e-02,  3.73077393e-03, -1.50299072e-02,
       -1.83410645e-02, -1.95312500e-03,  4.68444824e-03, -2.15530396e-02,
       -2.10571289e-03,  6.79016113e-03,  8.23211670e-03, -5.27954102e-03,
        3.17382812e-03,  3.91292572e-03,  1.73950195e-02,  1.00650787e-02,
        1.13220215e-02,  

In [None]:
y_preds_reg = model.predict(X_test)
y_preds_reg

array([ 3.60107422e-02,  4.89501953e-02,  3.38012695e-01,  4.72412109e-02,
        1.46606445e-01, -5.40771484e-02,  5.48461914e-01,  1.42822266e-02,
        2.97241211e-01,  1.14379883e-01,  7.23876953e-02,  1.69921875e-01,
        2.54516602e-01,  8.74633789e-01,  1.31958008e-01,  3.21044922e-02,
        3.06396484e-02,  1.10839844e-01,  1.72607422e-01,  7.23876953e-02,
        6.10961914e-01,  2.16064453e-01,  3.69506836e-01,  1.29516602e-01,
        2.17163086e-01,  1.21337891e-01,  1.08520508e-01,  2.56835938e-01,
        5.12695312e-03, -1.27929688e-01,  2.13989258e-01,  8.34960938e-02,
        2.99072266e-02,  1.67602539e-01,  4.77294922e-02,  4.53125000e-01,
        4.04174805e-01,  2.12524414e-01,  4.56665039e-01, -7.23876953e-02,
        3.12500000e-02,  1.61254883e-01,  2.08251953e-01,  3.96728516e-02,
        2.89184570e-01,  3.32031250e-02,  1.23901367e-01,  3.27636719e-01,
        3.59619141e-01, -5.43212891e-02,  2.75024414e-01,  8.17871094e-03,
        7.77587891e-02,  

In [None]:
#convert regression values into binary classification
y_preds = np.where(y_preds_reg >= 0.5, 1, 0)
y_preds

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
acc = accuracy_score(y_test, y_preds)
prec = precision_score(y_test, y_preds)
rec = recall_score(y_test, y_preds)
f1 = f1_score(y_test, y_preds)
auc = roc_auc_score(y_test, y_preds)

In [None]:
print("Accuracy: %.4f" % acc)
print("Precision: %.4f" % prec)
print("Recall: %.4f" % rec)
print("F1: %.4f" % f1)
print("AUC: %.4f" % auc)

Accuracy: 0.8517
Precision: 0.4583
Recall: 0.1719
F1: 0.2500
AUC: 0.5689
