In [1]:
import random
import project1 as p1
import utils
import numpy as np
import matplotlib.pyplot as plt
from string import punctuation, digits

In [2]:
# Load the three review splits from TSV files located in the working directory.
# NOTE(review): the return type of utils.load_data is not visible here; the code
# below only requires an iterable of mappings with 'text' and 'sentiment' keys.
train_data = utils.load_data('reviews_train.tsv')
val_data = utils.load_data('reviews_val.tsv')
test_data = utils.load_data('reviews_test.tsv')

# Split every sample into parallel (texts, labels) tuples.
# zip(*pairs) transposes the (text, sentiment) pairs; unpacking into two names
# raises ValueError if a split contains no samples.
train_texts, train_labels = zip(*((sample['text'], sample['sentiment']) for sample in train_data))
val_texts, val_labels = zip(*((sample['text'], sample['sentiment']) for sample in val_data))
test_texts, test_labels = zip(*((sample['text'], sample['sentiment']) for sample in test_data))

# The vocabulary is built from the training texts only; validation and test
# texts are featurised against that same dictionary.
# NOTE(review): this uses the project1 (p1) implementations, not the functions
# of the same name defined in later cells of this notebook -- confirm intended.
dictionary = p1.bag_of_words(train_texts)

train_bow_features = p1.extract_bow_feature_vectors(train_texts, dictionary)
val_bow_features = p1.extract_bow_feature_vectors(val_texts, dictionary)
test_bow_features = p1.extract_bow_feature_vectors(test_texts, dictionary)

In [3]:
def get_order(n_samples):
    """Return the order in which sample indices should be visited.

    If a file named ``<n_samples>.txt`` exists in the working directory, its
    first line is parsed as comma-separated integers and returned as a list.
    Otherwise the indices ``0 .. n_samples - 1`` are shuffled after seeding
    the ``random`` module with 1, so the fallback order is reproducible.

    Args:
        n_samples: number of samples; also determines the file name looked up.

    Returns:
        A list of ints.

    Raises:
        ValueError: if the file exists but a field is not a valid integer.
    """
    try:
        order_file = str(n_samples) + '.txt'
        with open(order_file) as handle:
            first_line = handle.readline()
            return [int(field) for field in first_line.split(',')]
    except FileNotFoundError:
        # No precomputed order on disk: fall back to a seeded shuffle.
        # Note that this reseeds the module-level generator of `random`.
        random.seed(1)
        shuffled = list(range(n_samples))
        random.shuffle(shuffled)
        return shuffled


In [4]:
def hinge_loss_single(feature_vector, label, theta, theta_0):
    """Hinge loss of a linear classifier on one example.

    Args:
        feature_vector: features of the example.
        label: target label of the example.
        theta: weight vector of the classifier.
        theta_0: offset of the classifier.

    Returns:
        ``1 - label * (theta @ feature_vector + theta_0)`` when that quantity
        is positive, otherwise 0.
    """
    agreement = (theta @ feature_vector + theta_0) * label
    shortfall = 1 - agreement
    return shortfall if shortfall > 0 else 0

In [5]:
def hinge_loss_full(feature_matrix, labels, theta, theta_0):
    """Mean hinge loss of a linear classifier over a set of examples.

    Args:
        feature_matrix: one example per row.
        labels: target label for each row of ``feature_matrix``.
        theta: weight vector of the classifier.
        theta_0: offset of the classifier.

    Returns:
        The mean over all rows of ``max(1 - label * score, 0)``, where
        ``score = row @ theta + theta_0``.
    """
    agreements = (feature_matrix @ theta + theta_0) * labels
    return np.mean(np.maximum(1 - agreements, 0.0))


In [6]:
def perceptron_single_step_update(
        feature_vector,
        label,
        current_theta,
        current_theta_0):
    """Apply one perceptron step for a single example.

    Args:
        feature_vector: features of the example.
        label: target label of the example.
        current_theta: weight vector before the step.
        current_theta_0: offset before the step.

    Returns:
        A ``(theta, theta_0)`` tuple. When the signed agreement
        ``label * (theta . x + theta_0)`` is at most 1e-7 the parameters are
        moved by ``label * feature_vector`` and ``label``; otherwise the
        inputs are returned unchanged.
    """
    agreement = label * (np.dot(current_theta, feature_vector) + current_theta_0)
    # The small tolerance treats values numerically indistinguishable from
    # zero as mistakes.
    needs_update = agreement <= 1e-7
    if not needs_update:
        return current_theta, current_theta_0
    return current_theta + label * feature_vector, current_theta_0 + label


In [7]:
def perceptron(feature_matrix, labels, T):
    """Train a perceptron classifier.

    Starts from zero weights and a zero offset and makes ``T`` passes over
    the data; the visiting order of each pass comes from ``get_order``.

    Args:
        feature_matrix: 2-D array with one example per row.
        labels: target label for each row of ``feature_matrix``.
        T: number of passes over the data.

    Returns:
        A ``(theta, theta_0)`` tuple with the parameters after the last step.
    """
    n_rows, n_cols = feature_matrix.shape
    weights, offset = np.zeros(n_cols), 0.0
    for _ in range(T):
        for idx in get_order(n_rows):
            weights, offset = perceptron_single_step_update(
                feature_matrix[idx], labels[idx], weights, offset)
    return weights, offset


In [8]:
def average_perceptron(feature_matrix, labels, T):
    """Train an averaged perceptron classifier.

    Runs the same steps as the plain perceptron but returns the average of
    the parameters observed after every single step (whether or not that
    step changed them).

    Args:
        feature_matrix: 2-D array with one example per row.
        labels: target label for each row of ``feature_matrix``.
        T: number of passes over the data.

    Returns:
        A ``(theta, theta_0)`` tuple: the accumulated weights and offset, each
        divided by ``n_rows * T``.
    """
    n_rows, n_cols = feature_matrix.shape
    weights = np.zeros(n_cols)
    offset = 0.0
    weights_total = np.zeros(n_cols)
    offset_total = 0.0

    for _ in range(T):
        for idx in get_order(n_rows):
            weights, offset = perceptron_single_step_update(
                feature_matrix[idx], labels[idx], weights, offset)
            # Accumulate after every step, not only after mistakes.
            weights_total += weights
            offset_total += offset

    n_steps = n_rows * T
    return weights_total / n_steps, offset_total / n_steps

In [9]:
def pegasos_single_step_update(
        feature_vector,
        label,
        L,
        eta,
        current_theta,
        current_theta_0):
    """Apply one Pegasos step for a single example.

    Args:
        feature_vector: features of the example.
        label: target label of the example.
        L: regularisation coefficient (lambda).
        eta: learning rate for this step.
        current_theta: weight vector before the step.
        current_theta_0: offset before the step.

    Returns:
        A ``(theta, theta_0)`` tuple. The weights are always scaled by
        ``1 - eta * L``. When ``label * (x . theta + theta_0) <= 1`` the
        weights are additionally moved by ``eta * label * feature_vector``
        and the offset by ``eta * label``; otherwise the offset is unchanged.
    """
    shrunk_theta = (1 - eta * L) * current_theta
    agreement = label * (np.dot(feature_vector, current_theta) + current_theta_0)
    within_margin = agreement <= 1
    if not within_margin:
        # Outside the margin only the regularisation shrinkage applies.
        return shrunk_theta, current_theta_0
    signed_step = eta * label
    return shrunk_theta + (signed_step * feature_vector), current_theta_0 + signed_step


In [10]:
def pegasos(feature_matrix, labels, T, L):
    """Train a linear classifier with the Pegasos algorithm.

    Starts from zero weights and a zero offset and makes ``T`` passes over
    the data; the visiting order of each pass comes from ``get_order``. The
    learning rate of the k-th step overall (counting from 1) is
    ``1 / sqrt(k)``.

    Args:
        feature_matrix: 2-D array with one example per row.
        labels: target label for each row of ``feature_matrix``.
        T: number of passes over the data.
        L: regularisation coefficient (lambda).

    Returns:
        A ``(theta, theta_0)`` tuple with the parameters after the last step.
    """
    n_rows, n_cols = feature_matrix.shape
    weights = np.zeros(n_cols)
    offset = 0
    step = 0

    for _ in range(T):
        for idx in get_order(n_rows):
            # The step counter runs across passes, so the rate keeps decaying.
            step += 1
            learning_rate = 1.0 / np.sqrt(step)
            weights, offset = pegasos_single_step_update(
                feature_matrix[idx], labels[idx], L, learning_rate, weights, offset)
    return weights, offset


In [11]:
def classify(feature_matrix, theta, theta_0):
    """Predict +1 / -1 labels with a linear classifier.

    Args:
        feature_matrix: one example per row.
        theta: weight vector of the classifier.
        theta_0: offset of the classifier.

    Returns:
        Floating-point predictions: 1.0 where ``row @ theta + theta_0`` is
        greater than 1e-7, and -1.0 everywhere else.
    """
    scores = feature_matrix @ theta + theta_0
    is_positive = scores > 1e-7
    # Map the boolean mask {False, True} onto {-1.0, 1.0}.
    return is_positive * 2.0 - 1


In [12]:
def classifier_accuracy(
        classifier,
        train_feature_matrix,
        val_feature_matrix,
        train_labels,
        val_labels,
        **kwargs):
    """Train a classifier and score it on the training and validation sets.

    Args:
        classifier: callable invoked as
            ``classifier(train_feature_matrix, train_labels, **kwargs)`` and
            returning a ``(theta, theta_0)`` pair.
        train_feature_matrix: training examples, one per row.
        val_feature_matrix: validation examples, one per row.
        train_labels: labels for the training examples.
        val_labels: labels for the validation examples.
        **kwargs: extra keyword arguments forwarded to ``classifier``.

    Returns:
        A ``(train_accuracy, validation_accuracy)`` tuple.
    """
    theta, theta_0 = classifier(train_feature_matrix, train_labels, **kwargs)
    # Predict on both splits with the single trained model before scoring.
    train_predictions, val_predictions = (
        classify(features, theta, theta_0)
        for features in (train_feature_matrix, val_feature_matrix)
    )
    return (
        accuracy(train_predictions, train_labels),
        accuracy(val_predictions, val_labels),
    )


In [13]:
def extract_words(input_string):
    """Split a string into lower-cased tokens.

    Every punctuation character and every digit is surrounded with spaces so
    that it becomes a token of its own; the text is then lower-cased and
    split on whitespace.

    Args:
        input_string: the text to tokenise.

    Returns:
        A list of lower-cased tokens.
    """
    # One translation pass pads each special character with spaces.
    padding = str.maketrans({symbol: ' ' + symbol + ' ' for symbol in punctuation + digits})
    padded = input_string.translate(padding)
    return padded.lower().split()


In [14]:
def bag_of_words(texts):
    """Build a vocabulary from a collection of texts.

    Args:
        texts: iterable of strings.

    Returns:
        A dict mapping each distinct token (as produced by ``extract_words``)
        to a unique integer index, assigned in order of first appearance
        starting from 0.
    """
    word_to_index = {}
    for text in texts:
        for token in extract_words(text):
            # setdefault leaves the index of an already-known token untouched.
            word_to_index.setdefault(token, len(word_to_index))
    return word_to_index

In [15]:
def extract_bow_feature_vectors(reviews, dictionary):
    """Encode reviews as binary bag-of-words feature vectors.

    Args:
        reviews: sequence of review strings.
        dictionary: mapping from token to column index.

    Returns:
        A ``len(reviews) x len(dictionary)`` array in which entry ``(i, j)``
        is 1 if the token with index ``j`` occurs in review ``i`` and 0
        otherwise. Tokens absent from ``dictionary`` are ignored.
    """
    n_rows, n_cols = len(reviews), len(dictionary)
    feature_matrix = np.zeros((n_rows, n_cols))

    for row, review in enumerate(reviews):
        for token in extract_words(review):
            if token not in dictionary:
                continue
            # Presence indicator: repeated tokens do not increase the value.
            feature_matrix[row, dictionary[token]] = 1
    return feature_matrix

In [16]:
def accuracy(preds, targets):
    """Fraction of predictions that match their targets.

    Args:
        preds: array of predicted labels.
        targets: true labels, compared element-wise against ``preds``.

    Returns:
        The mean of the element-wise equality ``preds == targets``.
    """
    matches = preds == targets
    return matches.mean()
