# Insurance Purchase Prediction Using Deep Neural Network

In this notebook, a deep neural network with logistic sigmoid is built using numpy. 

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
#load data set
insurance_df = pd.read_csv("training_data.csv",header =0)
insurance_df.head()

Unnamed: 0,STATE,REGION,SEX,PROFESSION,AGE,HAS_CHILDREN,SALARY,N_OF_DEPENDENTS,CAR_OWNERSHIP,HOUSE_OWNERSHIP,...,MORTGAGE_AMOUNT,N_TRANS_ATM,N_MORTGAGES,N_TRANS_TELLER,CREDIT_CARD_LIMITS,N_TRANS_KIOSK,N_TRANS_WEB_BANK,LTV,LTV_BIN,BUY_INSURANCE
0,CA,West,M,IT Staff,63,0,59461,0,1,0,...,0,0,0,0,900,2,0,22165.25,HIGH,0
1,CA,West,M,PROF-9,36,1,60271,1,1,1,...,3063,5,1,1,900,0,3063,22167.75,HIGH,1
2,FL,South,M,Programmer/Developer,21,0,64738,3,1,0,...,0,2,0,2,1000,4,0,13784.5,LOW,1
3,WA,West,F,IT Staff,47,1,65071,1,1,1,...,15000,6,1,7,500,3,3000,24467.75,HIGH,1
4,UT,Southwest,M,Technical Writer,26,0,61674,3,1,1,...,300,0,1,0,1500,5,300,23518.5,HIGH,0


## Preprocessing

In [2]:
# convert categorical variables to dummy variables
cat_vars=['STATE','REGION','SEX','PROFESSION','MARITAL_STATUS']
cat_df = pd.get_dummies(insurance_df[cat_vars], prefix='var')
insurance_df = pd.concat([insurance_df,cat_df],axis = 1) # concat horizontally with newly dummy features
# remove the original categorical variables
data_vars = insurance_df.columns.values.tolist()
selected_columns = list(set(data_vars).difference(set(cat_vars)))
# np.array(selected_columns)

# now only get variables after converting to dummy variables
data = insurance_df[selected_columns]
data.head()

Unnamed: 0,var_PROF-51,var_PROF-54,var_PROF-21,var_Software Engineer,var_PROF-20,var_PROF-43,var_Construction Laborer,var_PROF-4,var_South,var_Mason,...,var_PROF-32,var_Plumber,var_Midwest,var_Administrative Assistant,HAS_CHILDREN,var_School Teacher,var_WA,var_PROF-36,var_PROF-44,var_UT
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


Simply set a threshold on the correlation with the target variable

In [3]:
# Simple thresholds
relevant_features = []
for col in data.columns:
    thres = 0.1
    try:
        corr = data['BUY_INSURANCE'].corr(data[col])
    except:
        print(col)
        
    if abs(corr) > thres:
        relevant_features.append(col)
    
relevant_features

LTV_BIN


['N_OF_DEPENDENTS',
 'MONTHLY_CHECKS_WRITTEN',
 'var_Programmer/Developer',
 'MONEY_MONTLY_OVERDRAWN',
 'var_DIVORCED',
 'CHECKING_AMOUNT',
 'BUY_INSURANCE',
 'var_Nurse',
 'var_SINGLE',
 'var_M',
 'var_F',
 'var_PROF-8',
 'N_TRANS_TELLER',
 'BANK_FUNDS',
 'N_TRANS_ATM',
 'var_NC']

In [4]:
data = data[relevant_features]

In [5]:
from sklearn.model_selection import train_test_split
X = data[data.columns.difference(['BUY_INSURANCE','LTV_BIN'])]

In [6]:
# Min-max scaling. Not really necessary for NN, but a must if using any distance measure
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
y = data[['BUY_INSURANCE']].values.astype('float64')

This is an unbalance data set so we use imblearn to under/over-sample the data set.

In [7]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_resampled, y_resampled = RandomUnderSampler(random_state=0).fit_sample(X_train, y_train)
X_resampled, y_resampled = shuffle(X_resampled, y_resampled)
X_resampled.shape

(236, 15)

Now the class distribution is balanced

In [8]:
y_train_df= pd.DataFrame(data=y_resampled,columns=['BUY_INSURANCE'])
y_train_df.groupby(['BUY_INSURANCE'])[['BUY_INSURANCE']].count()

Unnamed: 0_level_0,BUY_INSURANCE
BUY_INSURANCE,Unnamed: 1_level_1
0.0,118
1.0,118


## Model Building

In summary, we decompose the model training into 4 parts. 

* `normalize()` is to scale input matrix `X`.

* `sigmoid()` and `gradient()` are about logistic sigmoid and its gradient respectively.

* `thetaL()`,`computeCost()`, `computeGrad()`, `forward_propagation()` are about neural network training. Additionally, `thetaL()` is to transform the flatten $\theta$ into matrix between two layers in the network. 

* `optimizeCost()` is to minimize the cost function by gradient descent. 

Besides, `predict()` is to output the results based on input $\theta$. 

In [9]:
import math

# function that normalizes each predictor
def normalize(X):
    """
    It normalized each column of X
    Xnorm returns the normalized X
    """
    p = X.shape[1]
    Xnorm = X.copy()
    for j in range(p):
        Xnorm[:, j] = (X[:, j] - X[:, j].mean()) / math.sqrt(X[:, j].var())
    return Xnorm

# function that return sigmoid of z
def sigmoid(z):
    """
    logistic 
    """
    g = 1 / (1 + np.exp(-z))
    return g

def gradient(z):
    """
    it returns the gradient of sigmoid
    here it is for logistic sigmoid a(1 - a)
    """
    g = z * (1 - z)
    return g

# add a function that returns theta^l
def thetaL(theta, L, input_num, hidden_num, label_num):
    """
    Suppose at each layer in hidden layers, the number of nodes 
    is the same
    
    L: the number of total layers
    input_num: the number of input features
    hidden_num: the number of nodes in hidden layers 
    label_num: the number of output labels 
    returns theta^l
    """
    thetal = []
    for i in range(L-1): # [0,L-2]
        if (i == 0):
            T = theta[0:(1 + input_num) * hidden_num].reshape(hidden_num, 1 + input_num, order="F")
            thetal.append(T)
        elif (i == L - 2):
            T = theta[-(hidden_num + 1):].reshape(label_num, hidden_num + 1, order="F")
            thetal.append(T)
        else:
            index_start = (1 + input_num) * hidden_num + (i - 1) * (hidden_num + 1) * hidden_num
            index_end = index_start + hidden_num * (hidden_num + 1)
            T = theta[index_start:index_end].reshape(hidden_num + 1, hidden_num, order = "F")
            thetal.append(T)
    
    return thetal
                

# cost function
def computeCost(X, y, theta, L, input_num, hidden_num, label_num, lambd):
    """
    it computes cost of logistic sigmoid
    """
    # Forward propagation
    A2, t = forward_propagation(X, theta, L, input_num, hidden_num, label_num)
    L = 3
    p, n = X.shape
    # The ith row of h represents h_k
    h = A2[L - 1].T
    M1 = y @ np.log(h)
    M2 = (1 - y) @ np.log(1 - h)
    # regularation
    R = 0
    for i in range(L-1):
        R += np.sum(t[i]**2)
    J = 1 / n * np.sum(-M1 - M2) + 0.5 * lambd / n * R
    
    return J

# gradient function
def computeGrad(X, y, theta, lambd, L, input_num, hidden_num, label_num):
    
    # t stores theta in a list
    # A stores a in a list
    A, t = forward_propagation(X, theta, L, input_num, hidden_num, label_num)
    p, n = X.shape
    # backpropagation
    # initialize delta 
    delta = [i for i in range(L)]
    # delta^L
    delta[L - 1] = A[L - 1] - y 
    for lay in range(L-2, 0, -1):
        M = t[lay].T @ delta[lay + 1 ] 
        delta[lay] = gradient(A[lay]) * M 
        # remove bias
        delta[lay] = delta[lay][1:, :] 
    
    
    # compute gradient
    # initialize Delta
    Delta = []
    grad = []
    for lay in range(L - 1):#0 1
        Delta = np.zeros(t[lay].shape) 
        Delta = Delta + delta[lay + 1] @ A[lay].T 
        grad.append(1 / n * Delta + lambd * t[lay])
    
    return grad

def predict(theta, X, L, input_num, hidden_num, label_num, score = True, threshold = 0.5):
    """
    for this binary classification,
    it returns the predicted value
    if score == True, it returns the score \in [0,1]
    else it returns {0,1} depending on threshold
    """
    A, t = forward_propagation(X, theta, L, input_num, hidden_num, label_num)
    sc = A[L - 1]
    if score == True:
        return sc
    else:
        th_index_g = sc > threshold
        sc[th_index_g] = 1
        th_index_l = sc < threshold
        sc[th_index_l] = 0
        return sc
    


def optimizeCost(theta, X, y, L, input_num, hidden_num, label_num, maxiter, lambd):
    """
    gradient descent
    """
    stop = 1
    convergence = False
    tol = 10**(-8)
    # create a list storing cost in each step
    cost = []
    for i in range(maxiter):
        if (stop > tol) :
            theta_old = theta.copy()
            cost_t = computeCost(X, y, theta_old, L, input_num, hidden_num, label_num, lambd)
            cost.append(cost_t)
            grad = computeGrad(X, y, theta_old, lambd, L, input_num, hidden_num, label_num)
            # flatten grad as the same shape of theta
            # initialize an np.array for flatten grad
            grad_f = np.zeros(theta_old.shape)
            # create a sequence storing the length of grad[i]
            seq = []
            for i in range(L - 1):
                m, n = grad[i].shape
                seq.append(m * n)
            for i in range(len(seq)):
                if i == 0:
                    grad_f[0: seq[i]] = grad[i].reshape([seq[i], 1])
                else:
                    grad_f[seq[i - 1]: seq[i - 1] + seq[i]] = grad[i].reshape([seq[i], 1])
            # gradient descent
            theta = theta_old - lambd * grad_f                
            stop = np.sum((theta - theta_old)**2) / (np.sum(theta_old**2))
        else:
            convergence = True
            return convergence, theta, cost, i
    return convergence, theta, cost, maxiter

def forward_propagation(X, theta, L, input_num, hidden_num, label_num):
    """
    it returns list A2 storing a
    list t storing theta as matrix
    """
    # Forward propagation
    p, n = X.shape
    # theta 
    t = thetaL(theta, L, input_num, hidden_num, label_num)
    # initialize A2, list stores a^l, 
    A2 = []
    # create a^1 and add bias
    A2.append(np.row_stack((np.ones([1, n]), X)))
    
    for lay in range(1,L): # [1,L-1]
        z = t[lay - 1] @ A2[lay - 1]
        a = sigmoid(z)
        if lay == L - 1:
            A2.append(a)
        else:
            A2.append(np.row_stack((np.ones([1, n]),a)))       
    return A2, t

Now we train our model using the given data set.

In [10]:
X = normalize(X_train.T)
Y = y_train.T
input_num = X.shape[0]
hidden_num = input_num + 1
label_num = Y.shape[0]
lambd = 0.001
maxiter = 10000
L = 3
Theta1 = np.random.randn(hidden_num, 1 + input_num) * 0.01
Theta2 = np.random.randn(1, 1 + hidden_num) * 0.01
theta = np.concatenate((Theta1.reshape(hidden_num * (1 + input_num), 1, order="F"),
                            Theta2.reshape(label_num * (1 + hidden_num), 1, order="F")))
theta_o = optimizeCost(theta, X, Y, L, input_num, hidden_num, label_num, maxiter, lambd)

We first see if the model is convergence.

In [11]:
theta_o[0]

True

The model converges, then we plot the cost at each iteration.

In [12]:
import matplotlib.pyplot as plt
plt.plot(theta_o[2])
plt.title('Cost at each Iteration')
plt.ylabel('Cost')
plt.xlabel('Iteration')
plt.show()

<Figure size 640x480 with 1 Axes>

Next, we check the predicted results.

In [13]:
from sklearn import metrics
pred = predict(theta_o[1], X_test.T, L, input_num, hidden_num, label_num, score = False)
print('\nTraining Accuracy: '+ str(np.mean((pred.T == y_test) * 100)))
print("Accuracy:",metrics.accuracy_score(y_test, pred.T))
print("Precision:",metrics.precision_score(y_test, pred.T))
print("Recall:",metrics.recall_score(y_test, pred.T))
print("Confusion Matrix:\n")
print(metrics.confusion_matrix(y_test, pred.T))


Training Accuracy: 70.4225352112676
Accuracy: 0.704225352112676
Precision: 0.0
Recall: 0.0
Confusion Matrix:

[[150   0]
 [ 63   0]]


## Scoring the leaderboard

We not use our model to score the test feature dataset in the Insurance Purchase Prediction project/contest below. 
http://www.scriptedin.com/contests/view/6

Now going through the same preprocessing step above for the test dataset

In [14]:
test_feature_df = pd.read_csv("test_features.csv",header =0)
test_feature_df.head()

# convert categorical variables to dummy variables
cat_vars=['STATE','REGION','SEX','PROFESSION','MARITAL_STATUS']
cat_list = pd.get_dummies(test_feature_df[cat_vars], prefix= 'var')
test_feature_df=pd.concat([test_feature_df,cat_list],axis = 1) # concat horizontally with newly dummy features


Reindexing the dataset. This is because the number of columns/variables are different. Filling zero in. 

In [15]:
transformed_cols = insurance_df.columns
test_feature_df = test_feature_df.reindex(columns = transformed_cols.difference(['BUY_INSURANCE']), fill_value=0)

Now remove the original categorical variables. We don't use them to train the model.

In [16]:
# remove the original categorical variables
data_vars=test_feature_df.columns.values.tolist()
selected_columns = list(set(data_vars).difference(set(cat_vars)))
np.array(selected_columns)

# now only get variables after converting to dummy variables
test_data = test_feature_df[selected_columns]
test_data.shape
test_data.head()

Unnamed: 0,var_PROF-51,var_PROF-54,var_PROF-21,var_Software Engineer,var_PROF-20,var_PROF-43,var_Construction Laborer,var_PROF-4,var_South,var_Mason,...,var_PROF-32,var_Plumber,var_Midwest,var_Administrative Assistant,HAS_CHILDREN,var_School Teacher,var_WA,var_PROF-36,var_PROF-44,var_UT
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
X_test_more = test_data[data.columns.difference(['BUY_INSURANCE','LTV_BIN'])]
X_test_more.shape

(305, 15)

Min-max scaling

In [18]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_test_more = scaler.fit_transform(X_test_more)
X_test_more.shape

(305, 15)

Assuming My_y_pred is the predicted result as a list. Export the result to a csv file as follows. The first column is numbered starting from 1;The second is the label of the prediction. The first row is the header. Submit the csv to the project at https://www.scriptedin.com/contests/view/6 via Add Submission to see where you are on the leaderboard

In [19]:
y_pred = predict(theta_o[1], normalize(X_test_more).T, L, input_num, hidden_num, label_num, score =False)
My_y_pred = []
for i in y_pred[0,:]:
    My_y_pred.append(i)

pd.DataFrame(list(zip(list(range(1, len(My_y_pred) + 1)),My_y_pred)), columns=['num','label']).to_csv(r'submission.csv', index=None, header=True)

Our score on the learderboard is 0.698361. 