In [101]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

sns.set()

path_to_train_csv = "./IA2-train.csv"
path_to_dev_csv = "./IA2-dev.csv"

originalData = pd.read_csv(path_to_train_csv)
testData = pd.read_csv(path_to_dev_csv)

In [102]:
originalData.head()

Unnamed: 0,dummy,Gender,Age,Driving_License,Previously_Insured,Vehicle_Damage,Annual_Premium,Vintage,Region_Code_0,Region_Code_1,...,Policy_Sales_Channel_153,Policy_Sales_Channel_154,Policy_Sales_Channel_155,Policy_Sales_Channel_156,Policy_Sales_Channel_157,Policy_Sales_Channel_158,Policy_Sales_Channel_159,Policy_Sales_Channel_160,Policy_Sales_Channel_163,Response
0,1,1,24,1,0,1,2630,187,0,0,...,0,0,0,1,0,0,0,0,0,1
1,1,0,44,1,0,1,56865,150,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,32,1,0,1,50126,188,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,1,56,1,0,1,45578,89,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,1,32,1,0,1,2630,227,0,0,...,0,0,0,0,0,0,0,0,0,1


## Data processing

Normalize Age, Annual_Premium, Vintage:

In [103]:
normalizeData = originalData.copy()
normalizeData['Age'] = (originalData['Age'] - originalData['Age'].mean()) / originalData['Age'].std()
normalizeData['Annual_Premium'] = (originalData['Annual_Premium'] - originalData['Annual_Premium'].mean()) / originalData['Annual_Premium'].std()
normalizeData['Vintage'] = (originalData['Vintage'] - originalData['Vintage'].mean()) / originalData['Vintage'].std()

normalizeData.head()

Unnamed: 0,dummy,Gender,Age,Driving_License,Previously_Insured,Vehicle_Damage,Annual_Premium,Vintage,Region_Code_0,Region_Code_1,...,Policy_Sales_Channel_153,Policy_Sales_Channel_154,Policy_Sales_Channel_155,Policy_Sales_Channel_156,Policy_Sales_Channel_157,Policy_Sales_Channel_158,Policy_Sales_Channel_159,Policy_Sales_Channel_160,Policy_Sales_Channel_163,Response
0,1,1,-1.171848,1,0,1,-1.605168,0.382089,0,0,...,0,0,0,1,0,0,0,0,0,1
1,1,0,0.242588,1,0,1,1.488182,-0.060277,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,-0.606074,1,0,1,1.103816,0.394044,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,1,1.091249,1,0,1,0.844416,-0.789583,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,1,-0.606074,1,0,1,-1.605168,0.860322,0,0,...,0,0,0,0,0,0,0,0,0,1


Setting up validation data with the same normalization parameter as train:

In [104]:
validationData = testData.copy()
validationData['Age'] = (validationData['Age'] - originalData['Age'].mean()) / originalData['Age'].std()
validationData['Annual_Premium'] = (validationData['Annual_Premium'] - originalData['Annual_Premium'].mean()) / originalData['Annual_Premium'].std()
validationData['Vintage'] = (validationData['Vintage'] - originalData['Vintage'].mean()) / originalData['Vintage'].std()

validationData.head()

Unnamed: 0,dummy,Gender,Age,Driving_License,Previously_Insured,Vehicle_Damage,Annual_Premium,Vintage,Region_Code_0,Region_Code_1,...,Policy_Sales_Channel_153,Policy_Sales_Channel_154,Policy_Sales_Channel_155,Policy_Sales_Channel_156,Policy_Sales_Channel_157,Policy_Sales_Channel_158,Policy_Sales_Channel_159,Policy_Sales_Channel_160,Policy_Sales_Channel_163,Response
0,1,0,-1.24257,1,1,0,-0.259802,1.230953,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,-0.0403,1,0,1,0.288714,1.338555,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,1,-0.747518,1,0,1,0.226259,0.406,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,1,-1.101127,1,1,0,-1.605168,0.274486,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,-0.111021,1,0,1,1.489208,-1.638447,0,0,...,0,0,0,0,0,0,0,0,0,1


Create a table with only features for algorithm implementations:

In [105]:
featuresOnlyData = normalizeData.copy().drop(['Response'], axis = 1)
featuresOnlyData.head()

Unnamed: 0,dummy,Gender,Age,Driving_License,Previously_Insured,Vehicle_Damage,Annual_Premium,Vintage,Region_Code_0,Region_Code_1,...,Policy_Sales_Channel_152,Policy_Sales_Channel_153,Policy_Sales_Channel_154,Policy_Sales_Channel_155,Policy_Sales_Channel_156,Policy_Sales_Channel_157,Policy_Sales_Channel_158,Policy_Sales_Channel_159,Policy_Sales_Channel_160,Policy_Sales_Channel_163
0,1,1,-1.171848,1,0,1,-1.605168,0.382089,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,0,0.242588,1,0,1,1.488182,-0.060277,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,-0.606074,1,0,1,1.103816,0.394044,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1.091249,1,0,1,0.844416,-0.789583,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,-0.606074,1,0,1,-1.605168,0.860322,0,0,...,0,0,0,0,0,0,0,0,0,0


## Part 1: Average Perceptron

Algorithm 1:

In [113]:
MAX_ITER = 100

def averagePerceptron(inputData, resultData, maxIter = MAX_ITER):
    w = np.repeat(0.0, len(inputData.columns))
    wBar = w.copy()
    s = 1
    wRecord = []
    wBarRecord = []
    for i in tqdm(range(0,maxIter)):
        for j in range(0,len(inputData.index)):
            correctPredict = resultData[j]*(inputData.loc[j].mul(w).sum())
            if (correctPredict <= 0):
                w += resultData[j]*inputData.loc[j]
            wBar = (s*wBar + w)/(s+1)
            s += 1
        wRecord.append(w)
        wBarRecord.append(wBar)
    return {'w': w, 'wBar': wBar, 'wRecord': wRecord, 'wBarRecord': wBarRecord}

### 1a:

In [109]:
result1a = averagePerceptron(featuresOnlyData, normalizeData['Response'])

100%|██████████| 100/100 [07:24<00:00,  4.44s/it]


In [112]:
len(result1a['wRecord'])

101