# Perceptron and Logistic Regression from Scratch

In [1]:
import numpy as np
import urllib
from urllib.request import urlopen
import pandas as pd
import time 

In [2]:
#helper function to read the file
def read_file(filename):
    """Load a whitespace-delimited numeric text file into a NumPy array.

    Every non-blank line becomes one row; each token on the line is parsed
    with ``float``.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.

    Returns
    -------
    numpy.ndarray
        Array with one row per non-blank line of the file.

    Raises
    ------
    ValueError
        If a token cannot be parsed as a float.
    """
    rows = []
    with open(filename) as f:
        for line in f:
            # split() without an argument collapses runs of whitespace and
            # drops the trailing newline, so trailing spaces, tabs or double
            # spaces no longer yield empty tokens that float() rejects.
            tokens = line.split()
            if not tokens:
                continue  # skip blank lines rather than appending an empty row
            rows.append([float(token) for token in tokens])
    return np.array(rows)

In [3]:
def split_label(x):
    """Separate each row of ``x`` into its features and its trailing label.

    Returns a pair ``(features, labels)`` of NumPy arrays: ``features`` holds
    every element of each row except the last one, and ``labels`` holds the
    last element of each row.
    """
    rows = list(x)
    features = [row[:-1] for row in rows]
    labels = [row[-1] for row in rows]
    return np.array(features), np.array(labels)

In [4]:
# Read the training set and the test set (no validation set is loaded here).
# split_label later treats the last value of every row as the class label.
train_data = read_file('train_set.txt')
test_data = read_file('test_set.txt')

1. Write a linear classifier with the perceptron algorithm that predicts whether a post belongs to class 1 or
class 2. For this purpose, the training data is the subset of train_set.txt that has label 1 or 2, and
the test data is the subset of test_set.txt that has label 1 or 2.
Assume that the data is linearly separable by a hyperplane through the origin. Run two, three and four
passes of perceptron on the training dataset to find classifiers that separate the two classes.


In [5]:
def select_12(data_set, labels=(1, 2)):
    """Return the rows of ``data_set`` whose last column is one of ``labels``.

    Parameters
    ----------
    data_set : numpy.ndarray
        2-D array whose last column holds the class label of each row.
    labels : sequence, optional
        Label values to keep. Defaults to ``(1, 2)``, the two classes used by
        the binary perceptron / logistic-regression parts of the notebook.

    Returns
    -------
    numpy.ndarray
        A new array (never a view of ``data_set``) containing only the
        matching rows, in their original order. When no row matches, the
        result has zero rows but keeps the column count of ``data_set``.
    """
    data = np.asarray(data_set)
    if data.size == 0:
        # Nothing to filter (e.g. an empty input file); return an empty copy.
        return data.copy()
    # Boolean mask over the label column; indexing with it copies the rows.
    keep = np.isin(data[:, -1], labels)
    return data[keep]

In [6]:
def change_label(y):
    """Return a copy of ``y`` in which every label equal to 2 is replaced by -1.

    All other labels are left as they are; the input itself is not modified.
    """
    relabeled = y.copy()
    for idx, label in enumerate(relabeled):
        if label == 2:
            relabeled[idx] = -1
    return relabeled

In [7]:
# Keep only the rows labelled 1 or 2, then separate the features from the labels.
x_train, y_train = split_label(select_12(train_data))
x_test, y_test= split_label(select_12(test_data))

In [8]:
# Recode the labels for the +1/-1 classifiers: class 2 becomes -1, class 1 stays 1.
new_y_train = change_label(y_train)
new_y_test = change_label(y_test)

In [9]:
def update_w(x_train, y_train,n):
    """Train a perceptron (hyperplane through the origin) for ``n`` passes.

    Starting from the all-zeros weight vector, every sample is visited in
    order on each pass; whenever ``y * (w . x)`` is not strictly positive the
    weights are moved by ``x * y``.

    Parameters: ``x_train`` is the sequence of feature rows, ``y_train`` the
    matching +1/-1 labels, and ``n`` the number of passes over the data.
    Returns the final weight vector.
    """
    n_features = len(x_train[0])
    weights = np.array([0] * n_features)
    for _pass in range(n):
        for idx, sample in enumerate(x_train):
            target = y_train[idx]
            margin = target * np.dot(weights, sample)
            # A zero margin counts as a mistake, so the first sample always updates.
            if margin <= 0:
                weights = weights + sample * target
    return weights

In [10]:
def predict(row,weight):
    """Classify one sample with a linear classifier through the origin.

    Returns 1 when the dot product of ``row`` and ``weight`` is non-negative,
    and -1 otherwise.
    """
    return 1 if np.dot(row, weight) >= 0 else -1

In [11]:
def prediction_error(x_train,y_train,the_w):
    """Return the fraction of samples that ``predict`` labels incorrectly.

    ``x_train`` holds the feature rows, ``y_train`` the expected +1/-1 labels
    and ``the_w`` the weight vector handed to ``predict``. Despite the
    parameter names, any data set can be passed (the notebook also uses it on
    the test set).
    """
    predicted = np.array([predict(sample, the_w) for sample in x_train])
    n_correct = sum(predicted == y_train)
    return 1-(n_correct/len(y_train))

In [12]:
# Train the perceptron with 2, 3 and 4 passes and record the training and
# test error obtained with each resulting weight vector.
train_error = []
test_error = []
for i in range(2,5):
    # i is the number of passes over the training data.
    the_w = update_w(x_train,new_y_train,i)
    train_error.append(prediction_error(x_train,new_y_train,the_w))
    test_error.append(prediction_error(x_test,new_y_test,the_w))

In [13]:
# The '# of perceptron' column is the number of training passes (2, 3, 4) used in the loop that filled the error lists.
pd.DataFrame({'# of perceptron':[2,3,4],'train_error':train_error,'test_error':test_error})

Unnamed: 0,# of perceptron,train_error,test_error
0,2,0.03578,0.061008
1,3,0.018349,0.045093
2,4,0.016514,0.045093


2. The training data is the subset of train_set.txt
that has label 1 or 2, and the test data is the subset of test_set.txt that has label 1 or 2.
Again, the classifier is a hyperplane through the origin. Starting with the initial point $w_0$ set to the
all-zeros vector, run 10, 50 and 100 iterations of gradient descent on the following logistic regression
loss function with learning rate $\eta = 0.001$:

$$L(w) = \sum_{i=1}^{n} \log\left(1 + e^{-y_i w^T x_i}\right),$$

where $\{x_i, y_i\}_{i=1}^{n}$ is the dataset, $x_i \in \mathbb{R}^d$, $y_i \in \{-1, 1\}$ and $w \in \mathbb{R}^d$
is the parameter vector.

In [14]:
def update_w_logistic(x_train, y_train, n, learning_rate=0.001):
    """Fit logistic regression (no intercept) by batch gradient descent.

    Minimises ``L(w) = sum_i log(1 + exp(-y_i * w.x_i))`` starting from the
    all-zeros vector. The gradient is
    ``-sum_i y_i * x_i / (1 + exp(y_i * w.x_i))``, so each iteration performs
    ``w <- w + learning_rate * sum_i y_i * x_i / (1 + exp(y_i * w.x_i))``.

    Parameters
    ----------
    x_train : array-like, shape (n_samples, n_features)
        Feature rows.
    y_train : array-like, shape (n_samples,)
        Labels, expected to be +1 / -1.
    n : int
        Number of gradient-descent iterations.
    learning_rate : float, optional
        Step size; defaults to 0.001, the value previously hard-coded here.

    Returns
    -------
    numpy.ndarray
        The weight vector after ``n`` iterations.
    """
    X = np.asarray(x_train, dtype=float)
    y = np.asarray(y_train, dtype=float)
    weights = np.zeros(X.shape[1])
    for _ in range(n):
        margins = y * X.dot(weights)
        # 1 / (1 + exp(m)) is evaluated as exp(-log(1 + exp(m))) through
        # logaddexp, which stays finite for large m instead of overflowing
        # inside exp and emitting RuntimeWarnings.
        per_sample = y * np.exp(-np.logaddexp(0.0, margins))
        # X.T.dot(per_sample) sums y_i * x_i / (1 + exp(m_i)) over all samples,
        # i.e. the negative gradient of the loss.
        weights = weights + learning_rate * X.T.dot(per_sample)
    return weights

In [15]:
def predict_logistic(row,weight):
    """Classify one sample with a logistic-regression weight vector.

    The logistic function exceeds 0.5 exactly when its argument is positive,
    so the decision is taken on the sign of ``weight . row`` directly. This
    avoids evaluating ``exp`` (which overflows for strongly negative
    activations) and also accepts plain sequences for ``weight``.

    Returns 1 when ``weight . row`` is strictly positive, otherwise -1
    (an activation of exactly zero maps to -1, as before).
    """
    activation = np.dot(weight, row)
    if activation > 0:
        return 1
    return -1

In [16]:
# Weights after only 2 gradient-descent iterations; their training error is checked in a later cell.
aa = update_w_logistic(x_train,new_y_train,2)

In [17]:
def prediction_error_logistic(x_train,y_train,the_w):
    """Return the fraction of samples that ``predict_logistic`` labels incorrectly.

    ``x_train`` holds the feature rows, ``y_train`` the expected +1/-1 labels
    and ``the_w`` the weight vector handed to ``predict_logistic``.
    """
    predicted = np.array([predict_logistic(sample, the_w) for sample in x_train])
    n_correct = sum(predicted == y_train)
    return 1-(n_correct/len(y_train))

In [18]:
# Training error of the 2-iteration weights `aa`.
prediction_error_logistic(x_train,new_y_train,aa)

  


0.4972477064220183

In [19]:
# Re-train logistic regression from scratch for 10, 50 and 100 iterations and
# record the training and test error of each resulting weight vector.
# This rebinds train_error / test_error, replacing any earlier contents.
train_error = []
test_error = []
number_iteration = [10,50,100]
for i in number_iteration:
    the_w = update_w_logistic(x_train,new_y_train,i)
    train_error.append(prediction_error_logistic(x_train,new_y_train,the_w))
    test_error.append(prediction_error_logistic(x_test,new_y_test,the_w))

  
  


In [20]:
# NOTE(review): the iteration counts are repeated literally here and must match the `number_iteration` list used for training.
pd.DataFrame({'number_iteration':[10,50,100],'train_error':train_error,'test_error':test_error})

Unnamed: 0,number_iteration,train_error,test_error
0,10,0.297248,0.297082
1,50,0.03945,0.061008
2,100,0.020183,0.045093


3. Consider the perceptron classifier w that was built by running three passes over the data. I now
try to interpret this classifier.
Find the three coordinates in w with the highest and lowest values. What are the words (from
dictionary.txt) that correspond to these coordinates? The three highest coordinates are those
words whose presence indicates the positive class most strongly, and the three lowest coordinates are
those words whose presence indicates the negative class most strongly.


In [21]:
def read_file_dictionary(filename):
    """Read every whitespace-separated token of a text file into one flat array.

    Tokens are returned in file order as a 1-D NumPy array of strings; line
    boundaries are not preserved.
    """
    with open(filename) as handle:
        words = [token for line in handle for token in line.split()]
    return np.array(words)

In [22]:
# Vocabulary used to name weight coordinates.
# NOTE(review): assumes the k-th word in the file corresponds to feature index k; confirm against the data description.
dictionary_array = read_file_dictionary('pa3dictionary.txt')

In [23]:
# Perceptron weights after three passes (rebinds the_w left over from earlier cells).
the_w = update_w(x_train,new_y_train,3)

most negative (the three lowest weights)

In [24]:
# argsort() is ascending, so the first three indices are the three LOWEST weights,
# i.e. the words that push hardest towards the -1 class (original label 2).
dictionary_array[the_w.argsort()[:3]]

array(['he', 'team', 'game'], dtype='<U12')

most positive (the three highest weights)

In [25]:
# Reversing the ascending argsort puts the three HIGHEST weights first,
# i.e. the words that push hardest towards the +1 class (original label 1).
dictionary_array[the_w.argsort()[::-1][:3]]

array(['file', 'program', 'line'], dtype='<U12')

4.Repeat Part (3) of the question on the logistic regression classifier that you got after 50 iterations of
gradient descent in part (2).

In [26]:
# Logistic-regression weights after 50 gradient-descent iterations (rebinds the_w).
the_w = update_w_logistic(x_train,new_y_train,50)

  


most negative (the three lowest weights)

In [27]:
# argsort() is ascending, so the first three indices are the three LOWEST weights,
# i.e. the words that push hardest towards the -1 class (original label 2).
dictionary_array[the_w.argsort()[:3]]

array(['he', 'game', 'they'], dtype='<U12')

most positive (the three highest weights)

In [28]:
# Reversing the ascending argsort puts the three HIGHEST weights first,
# i.e. the words that push hardest towards the +1 class (original label 1).
dictionary_array[the_w.argsort()[::-1][:3]]

array(['window', 'file', 'use'], dtype='<U12')

5. I will build a one-vs-all multi-class classifier with a Don’t Know option.
For each class i = 1, . . . , 6, run a single pass of the perceptron algorithm on the training dataset to
compute a linear classifier separating the training data points in class i from the training data points
not in class i. Call this classifier Ci. I will now use these classifiers to construct a one-vs-all multiclass
classifier.
Given a test example x, the one-vs-all classifier predicts as follows. If Ci(x) = i for exactly one
i = 1, . . . , 6, then predict label i. If Ci(x) = i for more than one i in 1, . . . , 6, or if Ci(x) = i for no i,
then report Don’t Know.
I will build a confusion matrix, that indicates how well a multiclass classifier can distinguish between
classes. Recall from lecture that a confusion matrix is a 6×6 matrix, where each row is labelled 1, . . . , 6
and each column is labelled 1, . . . , 6. The entry of the matrix at row i and column j is Cij/Nj where
Cij is the number of test examples that have label j but are classified as label i by the classifier, and
Nj is the number of test examples that have label j. Since the one-vs-all classifier can also predict
Don’t Know, the confusion matrix will now be a 7 × 6 matrix – that is, it will have an extra row
corresponding to the Don’t Know predictions.
Write down the confusion matrix for the one-vs-all classifier trained on the training data in train_set.txt
and evaluated on the test data in test_set.txt.
Looking at the confusion matrix, what are the i and j in the following statements?
(a) The perceptron classifier has the highest accuracy for examples that belong to class i.
(b) The perceptron classifier has the least accuracy for examples that belong to class i.
(c) The perceptron classifier most often mistakenly classifies an example in class j as belonging to
class i, for i, j ∈ {1, 2, 3, 4, 5, 6} (i.e., excluding Don’t Know)

In [29]:
# Switch to the full data set (all labels): this rebinds x_train / y_train /
# x_test / y_test, replacing the class-1/2 subsets used by the earlier cells.
x_train, y_train = split_label(train_data)
x_test, y_test= split_label(test_data)

In [30]:
def change_label_muti(y,class1):
    """Build one-vs-rest labels: 1 where ``y`` equals ``class1``, -1 elsewhere.

    Works on a copy, so the labels passed in are left untouched.
    """
    binary = y.copy()
    for idx, label in enumerate(binary):
        binary[idx] = 1 if label == class1 else -1
    return binary

In [31]:
def prediction_error_q5(x_train,y_train,the_w):
    """Return the list of +1/-1 predictions of ``predict`` for every row.

    Despite its name this does not compute an error: ``y_train`` is accepted
    but never read, and the raw predictions for ``x_train`` under the weight
    vector ``the_w`` are returned as a plain list.
    """
    return [predict(sample, the_w) for sample in x_train]

In [32]:
# Train one single-pass perceptron per class (class i versus the rest) and
# collect each classifier's +1/-1 predictions on the test set, one row per class.
total = []
for i in range(1,7):
    new_y_train = change_label_muti(y_train,i)
    # NOTE(review): new_y_test is only handed to prediction_error_q5, which never reads it.
    new_y_test = change_label_muti(y_test,i)
    the_w = update_w(x_train,new_y_train,1)
    total.append(prediction_error_q5(x_test,new_y_test,the_w))
# Rows are the six classifiers, columns are the test examples.
total = np.array(total)

In [33]:
# Combine the six binary classifiers: predict class k only when exactly one
# classifier outputs +1; otherwise record -1, which stands for "Don't Know".
one_vs_all = []
for i in range(len(total[0])):
    if np.count_nonzero(total[:,i] ==1) ==1:
        # Row index of the single firing classifier, shifted to a 1-based class label.
        one_vs_all.append((np.where(total[:,i] == 1)[0][0])+1)
    else:
        one_vs_all.append(-1)

In [34]:
# 7 rows x 6 columns of zeros; the tallying cell adds counts to it in place.
confusion_matrix = np.zeros(shape = (7,6))

In [35]:
# Tally every test example into the count matrix: the row is the predicted
# label (row index 6 collects the -1 / "Don't Know" predictions) and the
# column is the true label. Counts are added in place, so re-running this
# cell without resetting confusion_matrix doubles them.
for i, predicted in enumerate(one_vs_all):
    true_col = int(y_test[i]) - 1
    # For true labels in 1..6 this is simply "a class was predicted"; the
    # equality test is kept so the tally matches the original rule exactly.
    if predicted == y_test[i] or predicted != -1:
        confusion_matrix[int(predicted) - 1][true_col] += 1
    if predicted == -1:
        confusion_matrix[6][true_col] += 1

In [36]:
# Raw counts: rows = predicted class (last row = Don't Know), columns = true class.
confusion_matrix

array([[133.,   2.,   6.,   4.,   0.,   0.],
       [  2., 126.,   6.,   5.,   2.,   2.],
       [  0.,   3.,  65.,   0.,   0.,   3.],
       [  3.,   1.,   0., 126.,   0.,   0.],
       [  3.,   6.,  13.,   1., 125.,  13.],
       [  1.,   2.,   6.,   0.,  11.,  54.],
       [ 43.,  52.,  79.,  48.,  18.,  36.]])

In [37]:
# Wrap the counts in a labelled frame: index = predicted label (plus "dont know"), columns = true label.
df = pd.DataFrame(data=confusion_matrix, index=["1", "2","3", "4","5", "6","dont know"], columns=["1", "2","3", "4","5", "6"])
df

Unnamed: 0,1,2,3,4,5,6
1,133.0,2.0,6.0,4.0,0.0,0.0
2,2.0,126.0,6.0,5.0,2.0,2.0
3,0.0,3.0,65.0,0.0,0.0,3.0
4,3.0,1.0,0.0,126.0,0.0,0.0
5,3.0,6.0,13.0,1.0,125.0,13.0
6,1.0,2.0,6.0,0.0,11.0,54.0
dont know,43.0,52.0,79.0,48.0,18.0,36.0


In [38]:
# df.sum() totals each column (the number of test examples per true class), so this
# turns every column into fractions that sum to 1.
df / df.sum()

Unnamed: 0,1,2,3,4,5,6
1,0.718919,0.010417,0.034286,0.021739,0.0,0.0
2,0.010811,0.65625,0.034286,0.027174,0.012821,0.018519
3,0.0,0.015625,0.371429,0.0,0.0,0.027778
4,0.016216,0.005208,0.0,0.684783,0.0,0.0
5,0.016216,0.03125,0.074286,0.005435,0.801282,0.12037
6,0.005405,0.010417,0.034286,0.0,0.070513,0.5
dont know,0.232432,0.270833,0.451429,0.26087,0.115385,0.333333


a) The perceptron classifier has the highest accuracy for examples that belong to class 5

b) The perceptron classifier has the least accuracy for examples that belong to class 3

c)The perceptron classifier most often mistakenly classifies an example in class 6 as class 5