In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
df = pd.read_parquet('clean.parquet')

In [3]:
df.sample(10)

Unnamed: 0,Education,Marital_Status,Income,Recency,NumDealsPurchases,Response,Complain,MntSpent,NumPurchases,Age,DaysEnrolled
1024,Graduation,Together,77298.0,46,1,0,0,832,23,45,4027
140,Graduation,Married,26304.0,5,1,0,0,10,3,55,3895
1311,PhD,Widow,25358.0,57,2,0,0,24,4,77,3866
97,Master,Widow,47570.0,3,3,1,0,88,6,75,3920
716,Graduation,Married,39922.0,30,2,0,0,100,7,41,4024
387,Graduation,Married,36715.0,16,6,1,0,307,13,53,4113
1094,Graduation,Married,74190.0,49,2,0,0,318,17,49,3365
809,Graduation,Single,55801.0,35,6,0,0,422,15,49,3804
1136,PhD,Married,16927.0,50,5,0,0,45,7,48,3794
1562,Graduation,Together,40689.0,69,7,0,0,300,13,73,3992


### One-Hot Encoding for Categorical Values

In [4]:
def one_hot_encode(data, column):
    encoded = pd.get_dummies(data[column], drop_first= True) 
    data = data.drop(column, axis = 1)
    data = data.join(encoded)
    return data

In [5]:
df = one_hot_encode(df, 'Education')
df = one_hot_encode(df, 'Marital_Status')

In [6]:
df.dtypes

Income               float64
Recency                int64
NumDealsPurchases      int64
Response               int64
Complain               int64
MntSpent               int64
NumPurchases           int64
Age                    int64
DaysEnrolled           int64
Basic                   bool
Graduation              bool
Master                  bool
PhD                     bool
Married                 bool
Single                  bool
Together                bool
Widow                   bool
dtype: object

### Splitting Output from Input Space

In [7]:
X = df.drop('Response', axis= 1) 
y = df['Response']

### Splitting Training and Test

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 42)

### Modelling

In [9]:
#Standardization
sc = StandardScaler()
X_train = sc.fit_transform(X_train) 
X_test = sc.transform(X_test)

In [10]:
#Simple Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)
model.coef_

array([-0.00320561, -0.07006386,  0.01577082,  0.00573416,  0.16345729,
       -0.08297773, -0.01890571,  0.05923792, -0.01187121,  0.01071516,
        0.02023727,  0.03638433, -0.03098353,  0.01326754, -0.0332351 ,
        0.008632  ])

In [11]:
y_preds_reg = model.predict(X_test)
y_preds_reg

array([ 8.42595343e-02,  1.11770189e-01,  3.75669905e-01,  1.65766621e-01,
        1.87180647e-01,  6.91079858e-02,  3.46176732e-01, -1.25398680e-02,
        3.59509818e-01,  1.00643815e-02,  1.67228554e-01,  1.60857633e-01,
        1.70923677e-01,  6.94061956e-01,  1.94815479e-01,  1.31282025e-01,
        2.38540565e-02,  6.61355362e-02,  3.24549114e-01,  7.98606310e-02,
        3.97180281e-01,  1.25358399e-01,  4.08528942e-01,  1.05591447e-01,
        2.62912348e-01,  1.15577777e-01,  9.23421918e-02,  1.56543986e-01,
        3.68459539e-02,  1.67098815e-01,  1.69822035e-01,  1.56294459e-01,
        1.49378674e-01,  2.54154408e-01,  1.49246965e-01,  4.03190550e-01,
        4.66477571e-01,  3.52402977e-01,  2.10614931e-01, -7.58904592e-02,
       -1.78534483e-03,  1.55979976e-01,  2.56158572e-01,  4.66551679e-02,
       -5.78501827e-03,  4.73657163e-02,  6.46230377e-02,  3.69115802e-01,
        4.28696153e-01,  8.24393526e-02,  2.50988725e-01,  8.55885973e-02,
        2.97817298e-02,  

In [12]:
#convert regression values into binary classification
y_preds = np.where(y_preds_reg >= 0.5, 1, 0)
y_preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [13]:
acc = accuracy_score(y_test, y_preds)
prec = precision_score(y_test, y_preds)
rec = recall_score(y_test, y_preds)
f1 = f1_score(y_test, y_preds)
auc = roc_auc_score(y_test, y_preds)

In [14]:
print("Accuracy: %.4f" % acc)
print("Precision: %.4f" % prec)
print("Recall: %.4f" % rec)
print("F1: %.4f" % f1)
print("AUC: %.4f" % auc)

Accuracy: 0.8472
Precision: 0.3889
Recall: 0.1094
F1: 0.1707
AUC: 0.5403
