In [1]:
#===============================
# Translation from pure python to notebook for easier read!
#===============================
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer

import torch
from torch.autograd import Variable
from torch import optim

In [2]:
# ====================
# HW - Predict the centiment of the product with Logistic Regression! 
# (Using pytorch!)
# 
# Using product review dataset for sentiment analysis:
# http://people.mpi-inf.mpg.de/~smukherjee/data/
#
# Based on the example:
# http://pytorch.org/tutorials/beginner/nlp/deep_learning_tutorial.html#sphx-glr-beginner-nlp-deep-learning-tutorial-py)
# ====================

In [3]:
def build_model(input_dim, output_dim):
    # We don't need the softmax layer here since CrossEntropyLoss already
    # uses it internally.
    model = torch.nn.Sequential()
    model.add_module("linear", torch.nn.Linear(input_dim, output_dim, bias=False))
    # Output -> sigmoid 
    model.add_module("output", torch.nn.Sigmoid())
    return model

In [4]:
def train(model, loss, optimizer, x_val, y_val):
    x = Variable(x_val, requires_grad=False)
    y = Variable(y_val, requires_grad=False)
    # Reset gradient
    optimizer.zero_grad()
    # Forward
    fx = model.forward(x)
    output = loss.forward(fx, y)
    # Backward
    output.backward()
    # Update parameters
    optimizer.step()
    return output.data[0]

def predict(model, x_val):
    x = Variable(x_val, requires_grad=False)
    output = model.forward(x)
    return output.data.numpy().argmax(axis=1)

In [5]:
# Load data and stem it
# This part could been done better, eh!

def stemMyStrings(words):
    stemmer = PorterStemmer()  # SnowballStemmer('english')
    newListSAS = []
    for word in words:
        newListSAS.append(stemmer.stem(word))
    return "".join(newListSAS)

def load_data():
    rTokenizer = RegexpTokenizer('\w+')
    path = "Dataset2.txt"
    Reviews = []
    # Read review and return tuples of ('Comment', 'sentiment = pos | neg')
    with open(path) as f:
        for line in f.readlines():
            review = line.split('$')
            tokenized = rTokenizer.tokenize(review[2].lower().strip())
            stemmed = stemMyStrings(" ".join(tokenized))

            review[2] = " ".join(tokenized)
            Reviews.append((stemmed, review[1]))
    return Reviews

In [6]:
# Turn a word into BOW -presentation
def make_bow_vector(sentence, word_to_ix):
    vec = np.zeros(len(word_to_ix))
    for word in sentence:
        vec[word_to_ix[word]] += 1
    return vec

# Target ID of positive or negative comment
def make_target(label):
    label_to_ix = {"neg": 0, "pos": 1}
    return label_to_ix[label.strip()]

In [7]:
def main():
    torch.manual_seed(42)
    all_data = load_data()
    # Split data to training and testing!
    offset = int(0.8 * len(all_data))
    train_data = [(all_data[tmp][0].split(), all_data[tmp][1]) for tmp in range(0, offset)]
    test_data =  [(all_data[tmp][0].split(), all_data[tmp][1]) for tmp in range(offset + 1, len(all_data))]
    
    # Make a mapping from word to a number! (BOW-presentation)
    word_to_ix = {}
    for sent, _ in train_data + test_data:
        for word in sent:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
    # Words are then features
    n_features = len(word_to_ix)
    n_classes = 2
    
    # Make sentances to vectors from (BOW) - initialize torch vectors
    train_X = np.array([make_bow_vector(data[0], word_to_ix) for data in train_data])
    test_X = np.array([make_bow_vector(data[0], word_to_ix) for data in test_data])
    train_Y = np.array([make_target(data[1]) for data in train_data])
    test_Y = np.array([make_target(data[1]) for data in test_data])
    train_X = torch.from_numpy(train_X).float()
    test_X = torch.from_numpy(test_X).float()
    train_Y = torch.from_numpy(train_Y).long()
    
    # Make model
    model = build_model(n_features, n_classes)
    loss = torch.nn.CrossEntropyLoss(size_average=True)
    optimizer = optim.SGD(model.parameters(), lr=0.025, momentum=0.9)
    batch_size = 50
    num_batches = len(train_data) // batch_size
    # Loop num of epochs and batches
    for epoch in range(200):
        cost = 0.
        for k in range(num_batches):
            start, end = k * batch_size, (k + 1) * batch_size
            cost += train(model, loss, optimizer, train_X[start:end], train_Y[start:end])
            
        # Test
        pred_Y = predict(model, test_X)
        if (epoch + 1) % 20 == 0:
            print("Epoch %d, cost = %f, acc = %.2f%%" % (epoch + 1, cost / 10, 100. * np.mean(pred_Y == test_Y)))
        
main()

Epoch 20, cost = 3.487330, acc = 71.15%
Epoch 40, cost = 3.303030, acc = 73.24%
Epoch 60, cost = 3.194058, acc = 74.41%
Epoch 80, cost = 3.116109, acc = 75.85%
Epoch 100, cost = 3.054919, acc = 76.11%
Epoch 120, cost = 3.004274, acc = 76.50%
Epoch 140, cost = 2.961023, acc = 76.24%
Epoch 160, cost = 2.923334, acc = 76.63%
Epoch 180, cost = 2.889997, acc = 76.63%
Epoch 200, cost = 2.860154, acc = 76.37%
