In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
df = pd.read_parquet('clean.parquet')

### Splitting Output from Input Space

In [3]:
X = df.drop('Response', axis= 1) 
y = df['Response']

### Splitting Training and Test

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 42)

### Modelling

In [5]:
#Standardization
sc = StandardScaler()
X_train = sc.fit_transform(X_train) 
X_test = sc.transform(X_test)

In [6]:
#PolynomialFeatures 
poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=True)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test) 

In [7]:
#Simple Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)
model.coef_

array([ 3.74235523e+06,  3.63398932e-02,  1.50552656e-02, -7.02906165e-02,
        1.34912415e-02, -5.66136468e+10,  4.79136605e-02, -1.99344499e-02,
       -1.07412387e-02,  6.12942856e-02, -7.02399540e+11, -6.61339021e+11,
        1.81612649e+12, -5.91129249e+11,  1.60989516e-02,  3.17186262e-03,
        1.95227165e-02,  1.98620872e-02, -2.14862640e-02,  6.20003726e-03,
       -8.10111476e-03,  2.07443902e-02, -8.47407285e-03, -1.44721420e-03,
       -4.65072962e-03,  6.18088409e-03,  2.80771834e-02, -3.24368256e-02,
       -1.62781577e-03,  9.50321249e-02, -1.81908327e-02, -8.81695635e-03,
        8.44330876e-03,  8.91113281e-03,  1.62353516e-02, -1.29013062e-02,
        9.61303711e-04, -2.62451172e-02,  1.77917480e-02, -5.94177246e-02,
        2.09960938e-02,  2.80761719e-03,  1.31225586e-02,  1.15203857e-03,
       -2.50854492e-02, -6.01196289e-03,  3.75366211e-03,  4.66918945e-03,
       -3.30352783e-03,  9.15527344e-04,  4.94384766e-03,  1.75170898e-02,
        8.23974609e-03,  

In [8]:
y_preds_reg = model.predict(X_test)
y_preds_reg

array([ 6.02531433e-02,  1.11614227e-01,  2.98778534e-01,  7.41386414e-02,
        1.17626190e-01,  7.76290894e-03,  5.64403534e-01,  4.38690186e-04,
        2.75096893e-01,  1.16924286e-01,  8.53996277e-02,  1.64989471e-01,
        2.47753143e-01,  8.56395721e-01,  1.32488251e-01,  8.84208679e-02,
        4.04777527e-02,  9.76982117e-02,  1.50829315e-01,  8.03642273e-02,
        6.28368378e-01,  2.68749237e-01,  3.47515106e-01,  1.83544159e-01,
        2.15984344e-01,  1.13231659e-01,  1.41063690e-01,  2.69725800e-01,
       -1.05171204e-02, -1.71394348e-02,  2.02587128e-01,  6.29386902e-02,
        1.41105652e-02,  1.85009003e-01,  2.07328796e-02,  4.54540253e-01,
        4.25395966e-01,  2.19188690e-01,  3.77880096e-01, -4.74128723e-02,
        2.46391296e-02,  1.59770966e-01,  2.88768768e-01,  3.95011902e-02,
        2.12444305e-01,  4.58488464e-02,  1.44390106e-01,  3.11443329e-01,
        3.56731415e-01,  9.22775269e-03,  2.82176971e-01,  7.66410828e-02,
        6.74247742e-02,  

In [9]:
#convert regression values into binary classification
y_preds = np.where(y_preds_reg >= 0.5, 1, 0)
y_preds

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### Evaluation

In [10]:
acc = accuracy_score(y_test, y_preds)
prec = precision_score(y_test, y_preds)
rec = recall_score(y_test, y_preds)
f1 = f1_score(y_test, y_preds)
auc = roc_auc_score(y_test, y_preds)
cm = confusion_matrix(y_preds, y_test, labels=[1,0])
print(cm)

[[ 12  10]
 [ 52 371]]


In [11]:
print("Accuracy: %.4f" % acc)
print("Precision: %.4f" % prec)
print("Recall: %.4f" % rec)
print("F1: %.4f" % f1)
print("AUC: %.4f" % auc)

Accuracy: 0.8607
Precision: 0.5455
Recall: 0.1875
F1: 0.2791
AUC: 0.5806
