In [1]:
import glob
import os
import numpy as np
import csv
import multiprocessing as mp

In [2]:
def loadData(filename):   
    with open(filename,'r') as file:  
        dataSet = [[1]+line[0].split('\t') for line in csv.reader(file)]
    return np.array(dataSet, dtype = "float32") 

class dataLoader():
    def __init__(self, fn):
        self.dataSet = loadData(fn)
        self.len = len(self.dataSet)
    
    def getData(self, i):
        index = min(i,self.len-1)
        data, target = self.dataSet[index][:-1], self.dataSet[index][-1]
        return np.reshape(data,(len(data),1)), int(target)
            
    def __len__(self):
        return self.len

In [3]:
trainData = dataLoader('hw3_train.dat')
testData = dataLoader('hw3_test.dat')

In [4]:
# question 14
def linearRegressionAlgorithm():
    X, y = trainData.dataSet[:,:-1], trainData.dataSet[:,-1:]
    return np.linalg.pinv(X).dot(y)

def averagedSquareError(w):
    X, y = trainData.dataSet[:,:-1], trainData.dataSet[:,-1:]
    return np.linalg.norm(X.dot(w)-y)**2/len(trainData)

w_opt = linearRegressionAlgorithm()
Ein_opt = averagedSquareError(w_opt)

print('Ein =', Ein_opt)

Ein = 0.6053223755451654
[[ 0.00754448]
 [ 0.14325774]
 [ 0.03548978]
 [ 0.02775754]
 [ 0.02547401]
 [-0.08173502]
 [ 0.0807915 ]
 [-0.1092945 ]
 [ 0.07789521]
 [ 0.1640752 ]
 [ 0.14069183]]


In [5]:
# question 15
def stochasticGradientDescent(lr = 0.001):
    wt = np.zeros((11,1))
    iteration = 0
    while averagedSquareError(wt) > 1.01*Ein_opt:
        xn, yn = trainData.getData(np.random.randint(len(trainData)))
        wt += -lr*2*(wt.transpose().dot(xn)-yn)*xn
        iteration += 1
    return iteration

def mpStochasticGradientDescent(index):
    return stochasticGradientDescent()

pool = mp.Pool(mp.cpu_count())
iterationList = pool.map(mpStochasticGradientDescent, range(100))
print('average:', np.average(iterationList))

average: 2099.51


In [6]:
# question 16
def theta(args, w):
    x, y = args
    return 1/(1+np.exp(-y*w.transpose().dot(x)))

def crossEntropy(wt):
    cossEntropyList = [-np.log(theta(trainData.getData(i), wt)) for i in range(len(trainData))]
    return np.average(cossEntropyList)
        
def stochasticGradientDescent(lr = 0.001, max_ite = 500):
    wt = np.zeros((11,1))
    for i in range(max_ite):
        xn, yn = trainData.getData(np.random.randint(len(trainData)))
        wt += lr*theta((xn,-yn), wt)*yn*xn
    return crossEntropy(wt)

def mpStochasticGradientDescent(index):
    return stochasticGradientDescent()

pool = mp.Pool(mp.cpu_count())
iterationList = pool.map(mpStochasticGradientDescent, range(1000))
print('average:', np.average(iterationList))

average: 0.5686190349471474


In [7]:
# question 17
def theta(args, w):
    x, y = args
    return 1/(1+np.exp(-y*w.transpose().dot(x)))

def crossEntropy(wt):
    cossEntropyList = [-np.log(theta(trainData.getData(i), wt)) for i in range(len(trainData))]
    return np.average(cossEntropyList)
        
def stochasticGradientDescent(lr = 0.001, max_ite = 500):
    wt = w_opt
    for i in range(max_ite):
        xn, yn = trainData.getData(np.random.randint(len(trainData)))
        wt += lr*theta((xn,-yn), wt)*yn*xn
    return crossEntropy(wt)

def mpStochasticGradientDescent(index):
    return stochasticGradientDescent()

pool = mp.Pool(mp.cpu_count())
iterationList = pool.map(mpStochasticGradientDescent, range(1000))
print('average:', np.average(iterationList))

average: 0.44229922


In [8]:
# question 18
def sign(x):
    return 2*(0 < x)-1

def linearRegressionAlgorithm(data):
    X, y = data.dataSet[:,:-1], data.dataSet[:,-1:]
    return np.linalg.pinv(X).dot(y)

def testSample(args,w):
    x, y = args
    return sign(w.transpose().dot(x)) == y

def averagedZeroOneError(data):
    X, y = data.dataSet[:,:-1], data.dataSet[:,-1:]
    w = linearRegressionAlgorithm(trainData)
    logicList = [testSample(data.getData(i),w) for i in range(len(data))]
    return np.average(logicList)

print('ans =', abs(averagedZeroOneError(trainData)-averagedZeroOneError(testData)))

ans = 0.3226666666666667


In [9]:
# question 19
def loadDataNonlinear(filename, q):   
    with open(filename,'r') as file:  
        dataSet = [nonlinearExpension(line[0].split('\t'), q) for line in csv.reader(file)]
    return np.array(dataSet, dtype = "float32") 

def nonlinearExpension(vector, q):
    expandedList = []
    for i in range(q):
        expandedList += [float(x)**(i+1) for x in vector[:-1]]
    return [1]+expandedList+[vector[-1]]
    
class dataLoader():
    def __init__(self, fn, q):
        self.dataSet = loadDataNonlinear(fn, q)
        self.len = len(self.dataSet)
    
    def getData(self, i):
        index = min(i,self.len-1)
        data, target = self.dataSet[index][:-1], self.dataSet[index][-1]
        return np.reshape(data,(len(data),1)), int(target)
            
    def __len__(self):
        return self.len

trainData = dataLoader('hw3_train.dat',3)
testData = dataLoader('hw3_test.dat',3)
print('ans =', abs(averagedZeroOneError(trainData)-averagedZeroOneError(testData)))

ans = 0.3736666666666667


In [10]:
# question 20
trainData = dataLoader('hw3_train.dat',10)
testData = dataLoader('hw3_test.dat',10)
print('ans =', abs(averagedZeroOneError(trainData)-averagedZeroOneError(testData)))

ans = 0.44666666666666666
