In [1]:
import glob
import os
import numpy as np
import csv
import multiprocessing as mp

In [2]:
def loadData(filename):   
    with open(filename,'r') as file:  
        dataSet = [[1]+line[0].split('\t') for line in csv.reader(file)]
    return np.array(dataSet, dtype = "float32") 

class dataLoader():
    def __init__(self, fn):
        self.dataSet = loadData(fn)
        self.len = len(self.dataSet)
        self.nonlinearized = False
        self.expandedDataSet = None
        self.fn = fn
    
    def getData(self, i):
        dataSet = self.expandedDataSet if self.nonlinearized else self.dataSet
        index = min(i,self.len-1)
        xn, yn = dataSet[index][:-1], dataSet[index][-1]
        return np.reshape(xn,(len(xn),1)), int(yn)
    
    def getRandomData(self):
        return self.getData(np.random.randint(self.len))
    
    def getDataSet(self):
        return [self.getData(i) for i in range(self.len)]
    
    def nonlinearExpension(self, q):
        self.nonlinearized = False
        xSet = [arg[0][1:] for arg in self.getDataSet()]
        ySet = [arg[1] for arg in self.getDataSet()]
        expandedList = [[float(xi)**(i+1) for i in range(q) for xi in xn] for xn in xSet]
        expandedList = [[1]+xn+[ySet[i]] for i, xn in enumerate(expandedList)]
        self.expandedDataSet = np.array(expandedList)
        self.nonlinearized = True
        print(self.fn,'has been nonlinearly expanded')
        
    def getMatX(self):
        dataSet = self.expandedDataSet if self.nonlinearized else self.dataSet
        return dataSet[:,:-1]
    
    def getVecY(self):
        dataSet = self.expandedDataSet if self.nonlinearized else self.dataSet
        return dataSet[:,-1:]
            
    def __len__(self):
        return self.len

In [3]:
trainData = dataLoader('hw3_train.dat')
testData = dataLoader('hw3_test.dat')

In [4]:
# question 14
def linearRegression(data = trainData):
    X, y = data.getMatX(), data.getVecY()
    return np.linalg.pinv(X).dot(y)

def squaredE(wt, data = trainData):
    X, y = data.getMatX(), data.getVecY()
    return np.linalg.norm(X.dot(wt)-y)**2/len(data)

w_lin = linearRegression()
Ein_lin = squaredE(w_lin)

print('Ein =', Ein_lin)

Ein = 0.6053223755451654


In [5]:
# question 15
def stochasticGradientDescent(lr = 0.001):
    wt = np.zeros((11,1))
    iteration = 0
    while squaredE(wt) > 1.01*Ein_lin:
        xn, yn = trainData.getRandomData()
        wt += -lr*2*(wt.transpose().dot(xn)-yn)*xn
        iteration += 1
    return iteration

def mpStochasticGradientDescent(index):
    return stochasticGradientDescent()

pool = mp.Pool(mp.cpu_count())
iterationList = pool.map(mpStochasticGradientDescent, range(100))
print('average:', np.average(iterationList))

average: 1650.27


In [5]:
# question 16
def theta(s):
    return 1/(1+np.exp(-s))

def crossEntropy(wt):
    cossEntropyList = [-np.log(theta(yn*wt.transpose().dot(xn))) for xn, yn in trainData.getDataSet()]
    return np.average(cossEntropyList)
        
def stochasticGradientDescent(lr = 0.001, max_ite = 500):
    wt = np.zeros((11,1))
    for i in range(max_ite):
        xn, yn = trainData.getRandomData()
        wt += lr*theta(-yn*wt.transpose().dot(xn))*yn*xn
    return crossEntropy(wt)

def mpStochasticGradientDescent(index):
    return stochasticGradientDescent()

pool = mp.Pool(mp.cpu_count())
iterationList = pool.map(mpStochasticGradientDescent, range(1000))
print('average:', np.average(iterationList))

average: 0.5684016888300671


In [6]:
# question 17
def stochasticGradientDescent(lr = 0.001, max_ite = 500):
    wt = w_lin
    for i in range(max_ite):
        xn, yn = trainData.getRandomData()
        wt += lr*theta(-yn*wt.transpose().dot(xn))*yn*xn
    return crossEntropy(wt)

def mpStochasticGradientDescent(index):
    return stochasticGradientDescent()

pool = mp.Pool(mp.cpu_count())
iterationList = pool.map(mpStochasticGradientDescent, range(1000))
print('average:', np.average(iterationList))

average: 0.442359


In [7]:
# question 18
def sign(x):
    return 2*(0 < x)-1

def zeroOneE(data):
    w_lin = linearRegression()
    logicList = [sign(w_lin.transpose().dot(xn)) == yn for xn, yn in data.getDataSet()]
    return np.average(logicList)

print('ans =', abs(zeroOneE(trainData)-zeroOneE(testData)))

ans = 0.3226666666666667


In [8]:
# question 19
trainData.nonlinearExpension(3)
testData.nonlinearExpension(3)
print('ans =', abs(zeroOneE(trainData)-zeroOneE(testData)))

hw3_train.dat has been nonlinearly expanded
hw3_test.dat has been nonlinearly expanded
ans = 0.3736666666666667


In [9]:
# question 20
trainData.nonlinearExpension(10)
testData.nonlinearExpension(10)
print('ans =', abs(zeroOneE(trainData)-zeroOneE(testData)))

hw3_train.dat has been nonlinearly expanded
hw3_test.dat has been nonlinearly expanded
ans = 0.44666666666666666
