Programming assignment 4: Implementing linear classifiers

In [20]:
# import needed libraries for all the tasks
import pandas as pd 
import numpy as np
import re
import matplotlib
import matplotlib.pyplot as plt
import os
import seaborn as sns
from sklearn.model_selection import train_test_split
from cleantext import clean
from string import digits
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm  import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Perceptron
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import Normalizer
import random
import time

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

Exercise question

In [2]:
X1 = [{'city':'Gothenburg', 'month':'July'},
      {'city':'Gothenburg', 'month':'December'},
      {'city':'Paris', 'month':'July'},
      {'city':'Paris', 'month':'December'}]
Y1 = ['rain', 'rain', 'sun', 'rain']

X2 = [{'city':'Sydney', 'month':'July'},
      {'city':'Sydney', 'month':'December'},
      {'city':'Paris', 'month':'July'},
      {'city':'Paris', 'month':'December'}]
Y2 = ['rain', 'sun', 'sun', 'rain']

classifier1 = make_pipeline(DictVectorizer(), Perceptron(max_iter=10))
classifier1.fit(X1, Y1)
guesses1 = classifier1.predict(X1)
print(accuracy_score(Y1, guesses1))

classifier2 = make_pipeline(DictVectorizer(), Perceptron(max_iter=10))
classifier2.fit(X2, Y2)
guesses2 = classifier2.predict(X2)
print(accuracy_score(Y2, guesses2))

1.0
0.5


Since the perceptron algorithm is a linear classifier that finds a hyperplane to separate data into different classes. If there is clear separation between the classes, it can perfectly classify the data,same as X1, that has a clear separation between the two classes of "rain" and "sun" based on the month and city. On the other hand, in X2 we see overlapping  between the classes, such as Paris in July, that can be classified as both "sun" and "rain". This overlap means there is no single hyperplane that will perfectly classify all instances. As can be seen below, the accuracy does not imprve by using Linear SVC.

In [3]:
classifier3 = make_pipeline(DictVectorizer(), LinearSVC())
classifier3.fit(X1, Y1)
guesses3 = classifier3.predict(X1)
print(accuracy_score(Y1, guesses3))

classifier4 = make_pipeline(DictVectorizer(), LinearSVC())
classifier4.fit(X2, Y2)
guesses3 = classifier4.predict(X2)
print(accuracy_score(Y2, guesses3))

1.0
0.5


Introduction

In [5]:
class LinearClassifier(BaseEstimator):
    """
    General class for binary linear classifiers. Implements the predict
    function, which is the same for all binary linear classifiers. There are
    also two utility functions.
    """

    def decision_function(self, X):
        """
        Computes the decision function for the inputs X. The inputs are assumed to be
        stored in a matrix, where each row contains the features for one
        instance.
        """
        return X.dot(self.w)

    def predict(self, X):
        """
        Predicts the outputs for the inputs X. The inputs are assumed to be
        stored in a matrix, where each row contains the features for one
        instance.
        """

        # First compute the output scores
        scores = self.decision_function(X)

        # Select the positive or negative class label, depending on whether
        # the score was positive or negative.
        out = np.select([scores >= 0.0, scores < 0.0],
                        [self.positive_class,
                         self.negative_class])
        return out

    def find_classes(self, Y):
        """
        Finds the set of output classes in the output part Y of the training set.
        If there are exactly two classes, one of them is associated to positive
        classifier scores, the other one to negative scores. If the number of
        classes is not 2, an error is raised.
        """
        classes = sorted(set(Y))
        if len(classes) != 2:
            raise Exception("this does not seem to be a 2-class problem")
        self.positive_class = classes[1]
        self.negative_class = classes[0]

    def encode_outputs(self, Y):
        """
        A helper function that converts all outputs to +1 or -1.
        """
        return np.array([1 if y == self.positive_class else -1 for y in Y])


class Perceptron(LinearClassifier):
    """
    A straightforward implementation of the perceptron learning algorithm.
    """

    def __init__(self, n_iter=20):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter

    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        self.w = np.zeros(n_features)

        # Perceptron algorithm:
        for i in range(self.n_iter):
            for x, y in zip(X, Ye):

                # Compute the output score for this instance.
                score = x.dot(self.w)

                # If there was an error, update the weights.
                if y*score <= 0:
                    self.w += y*x


In [6]:
# reading data from corpus file and spliting each line into
# its label and text, and appends them to the Y and X lists
def read_data(corpus_file):
    X = []
    Y = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            _, y, _, x = line.split(maxsplit=3)
            X.append(x.strip())
            Y.append(y)
    return X, Y


if __name__ == '__main__':
    
    # Read all the documents.
    X, Y = read_data('data/all_sentiment_shuffled.txt')
    
    # Split into training and test parts.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, random_state=0)

    # preprocessing steps
    # 1st: converts the text into a sparse matrix of TF-IDF features
    # 2nd: elects the top 1000 features based on their score
    # 3rd: normalizes the feature vectors to unit length
    # 4th: Using Perception as the Linear Classifier
    pipeline = make_pipeline(
        TfidfVectorizer(),
        SelectKBest(k=1000),
        Normalizer(),
        Perceptron()  
    )

    # Train the classifier
    pipeline.fit(X_train, Y_train)

    # Evaluate on the test set.
    Y_guess = pipeline.predict(X_test)
    Accuracy = accuracy_score(Y_test, Y_guess)
    print(f"Accuracy for perception is:  {Accuracy}")

Accuracy for perception is:  0.7918590012589173


Implementing the SVC

In [7]:
class SVC(LinearClassifier):
    
    # A straightforward implementation of the perceptron learning algorithm.

    def __init__(self, reg_lambda = 0.001, n_iter=20):
        # n_iter specifies the number of iterations through the training set,chosen by constructor

        self.n_iter = n_iter
        self.reg_lambda = reg_lambda

    def fit(self, X, Y):
        """
        Train a linear classifier using the SVC learning algorithm.
        """
        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        self.w = np.zeros(n_features)

        for i in range(self.n_iter):
            t = 0
            for x, y in zip(X, Ye):
                t = t + 1
                eta = 1 / (self.reg_lambda * t)
                score = np.dot(x, self.w)
                self.w = (1 - eta * self.reg_lambda) * self.w
                # if misclassification: w is updated using the perceptron update rule
                if score*y < 1:
                    self.w += np.dot(x , eta * y)

In [31]:


# Set up the preprocessing steps and the classifier.
pipeline = make_pipeline(
        TfidfVectorizer(),
        SelectKBest(k=1000),
        Normalizer(),
        SVC()
    )

# Training the classifier an calculating the training time
t0 = time.time()
pipeline.fit(X_train, Y_train)
t1 = time.time()

# Evaluate on the test set.
Y_guess = pipeline.predict(X_test)
Accuracy = accuracy_score(Y_test, Y_guess)

print(f"Accuracy for Linear SCV is:  % {Accuracy * 100}")
print(f"Training time for Linear SCV: {t1 - t0}")

Accuracy for Linear SCV is:  %81.91355434326479
Training time for Linear SCV: 7.603312969207764


Implementing Logistic Regression 

In [17]:
class LogisticRegression(LinearClassifier):
    
    def __init__(self, reg_lambda = 0.0001, n_iter=30):
        # n_iter specifies the number of iterations through the training set,chosen by constructor

        self.n_iter = n_iter
        self.reg_lambda = reg_lambda

    def fit(self, X, Y):
        """
        Train a linear classifier using the logistic regression learning algorithm.
        """
        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        self.w = np.zeros(n_features)

        for i in range(self.n_iter):
            t = 0
            for x, y in zip(X, Ye):
                t = t + 1
                eta = 1 / (self.reg_lambda * t)
                score = np.dot(x, self.w)
                self.w = (1 - eta * self.reg_lambda) * self.w + y / (1 + np.exp(y * self.w * x)) * x

In [33]:
# Set up the preprocessing steps and the classifier.
pipeline = make_pipeline(
        TfidfVectorizer(),
        SelectKBest(k=1000),
        Normalizer(),
        LogisticRegression()
    )

# Training the classifier and calculating training time
t0 = time.time()
pipeline.fit(X_train, Y_train)
t1 = time.time()

# Evaluate on the test set.
Y_guess = pipeline.predict(X_test)
Accuracy = accuracy_score(Y_test, Y_guess)

print(f"Accuracy for Logistic Regression is: % {Accuracy * 100}")
print(f"Training time for Logistic Regression: {t1 - t0}")

Accuracy for Logistic Regression is: % 83.71800251783466
Training time for Logistic Regression: 2.445495128631592
