In [63]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [64]:
# Read the Parquet file 'clean.parquet' (path relative to the working directory) into `df`.
df = pd.read_parquet('clean.parquet')

In [65]:
# Preview 10 randomly selected rows. No random_state is passed, so the rows
# displayed differ each time this cell is run.
df.sample(10)

Unnamed: 0,Education,Marital_Status,Income,Recency,NumDealsPurchases,Response,Complain,MntSpent,NumPurchases,Age,DaysEnrolled
2162,Graduation,Married,65747.0,96,4,0,0,411,15,55,3520
1025,Master,Together,43018.0,46,1,0,0,56,6,50,4058
1772,Graduation,Together,63381.0,78,4,0,0,763,22,64,4304
372,2n Cycle,Married,32146.0,16,2,0,0,52,6,41,3365
950,Graduation,Together,28587.0,42,3,0,0,43,6,46,3368
1824,Basic,Married,14421.0,81,1,0,0,2,3,28,3656
463,Graduation,Single,45906.0,20,2,0,0,335,13,60,3883
2220,Master,Together,24401.0,98,3,0,0,318,13,45,4191
1343,PhD,Married,58482.0,59,2,0,0,698,20,69,3367
119,Graduation,Single,70566.0,4,2,0,0,624,20,56,3908


### One-Hot Encoding for Categorical Values

In [66]:
def one_hot_encode(data, column, prefix=None, drop_first=True):
    """Replace a categorical column with its one-hot (dummy) columns.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the categorical column to encode.
    prefix : str, optional
        Prefix for the generated dummy column names (``"<prefix>_<level>"``).
        With the default ``None`` the dummy columns are named after the raw
        category levels, exactly as before. Pass a prefix when two encoded
        columns may share a level, or when a level may equal an existing
        column name: ``DataFrame.join`` raises ``ValueError`` on overlapping
        column names.
    drop_first : bool, default True
        Drop the first level so that k levels yield k-1 dummy columns.

    Returns
    -------
    pandas.DataFrame
        ``data`` without ``column``, followed by the dummy columns.
    """
    encoded = pd.get_dummies(data[column], prefix=prefix, drop_first=drop_first)
    # Drop the source column first, then attach the dummies aligned on the index.
    return data.drop(columns=column).join(encoded)

In [67]:
# One-hot encode both categorical columns, Education first and then Marital_Status.
df = (
    df
    .pipe(one_hot_encode, 'Education')
    .pipe(one_hot_encode, 'Marital_Status')
)

In [68]:
# Inspect the dtype of every column after the one-hot encoding step.
df.dtypes

Income               float64
Recency                int64
NumDealsPurchases      int64
Response               int64
Complain               int64
MntSpent               int64
NumPurchases           int64
Age                    int64
DaysEnrolled           int64
Basic                   bool
Graduation              bool
Master                  bool
PhD                     bool
Married                 bool
Single                  bool
Together                bool
Widow                   bool
dtype: object

### Separating the Features (X) from the Target (y)

In [69]:
# The target is the `Response` column; every other column is used as a feature.
y = df['Response']
X = df.drop(columns='Response')

### Splitting into Training and Test Sets

In [70]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Modelling

In [71]:
# Standardization: fit the scaler on the training split only, then apply the
# same fitted scaler to both splits so the test split does not influence it.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [72]:
# Simple linear regression fitted on the standardized training features.
# NOTE(review): `y` is the `Response` column, which appears to hold only 0/1
# values; if so, the predictions are unbounded scores rather than class
# labels. Confirm this is the intended model.
model = LinearRegression().fit(X_train, y_train)
model.coef_

array([ 0.00051835, -0.06744459,  0.00957543,  0.00307377,  0.07973415,
       -0.0266769 , -0.00480953,  0.05900836, -0.01028608,  0.00460138,
        0.00926648,  0.03717596, -0.04206181,  0.00817486, -0.04779051,
        0.00337671])

In [73]:
# Predict on the standardized test features. The displayed values are
# continuous scores (some are negative), not 0/1 labels.
y_preds = model.predict(X_test)
# Bare expression: displays the whole prediction array as the cell output.
y_preds

array([ 7.03725294e-02,  1.59823583e-01,  1.05966026e-01,  2.16599642e-01,
        1.97081096e-01,  8.98425779e-02, -7.62091870e-02,  1.20523708e-01,
        7.94373748e-02,  1.58501816e-01, -1.65575113e-02,  1.36494266e-01,
        1.17025288e-01,  9.68424312e-02, -5.47345551e-03,  3.19355062e-01,
        1.93765807e-02,  3.65351387e-01,  1.54152730e-01,  2.45822656e-01,
        1.55087645e-01,  1.42460083e-01,  1.98957414e-01,  3.42457800e-01,
        3.36484438e-01, -3.06367594e-02, -3.39353241e-02, -1.82672173e-01,
        1.71842371e-01,  2.91591743e-01,  2.29433972e-01,  1.59582167e-01,
        8.91624091e-02,  1.16247469e-01,  2.58635129e-01,  7.51869017e-02,
        7.53223855e-03,  2.16712838e-01,  9.19068161e-02,  1.32786913e-01,
        3.72960161e-01,  1.72677951e-02,  1.67276355e-01,  1.58404151e-01,
        2.12612920e-01,  1.21802298e-01,  1.72991593e-02,  2.14396837e-01,
        9.51267575e-02,  4.24872128e-02,  2.39584542e-01,  1.28419165e-01,
        9.26040623e-02,  

In [74]:
# Display the held-out `Response` values from the test split.
y_test

576     0
2107    0
445     0
559     0
204     1
       ..
475     0
1531    0
76      1
1997    0
2009    0
Name: Response, Length: 423, dtype: int64