In [167]:
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
import os, sys, numpy as np

data = open("dataset-fb-valence-arousal-anon.csv", encoding="utf8")
#Read and skip first line
data.readline()
corpora = []
scores = [] #this is the score for corpora at the same index, first value in pair is valence, second arousal

#Parse data
for line in data.readlines():
    msg, quantifiers = parse_data(line)
    v1 = quantifiers[0]
    v2 = quantifiers[1]
    a1 = quantifiers[2]
    a2 = quantifiers[3]
    update_data(msg, v1, v1_data)
    update_data(msg, v2, v2_data)
    update_data(msg, a1, a1_data)
    update_data(msg, a2, a2_data)
    corpora.append(msg)
    vAvg = (int(v1)+int(v2))//2
    aAvg = (int(a1)+int(a2))//2
    scores.append((vAvg, aAvg))
    
for value in v1_data.keys():
    data_list = v1_data[value]
    
    #split data list in half for training and testing
    length = len(data_list)
    
    train_list = data_list[:length//2]
    create_data_file("Train/v1_training.csv", train_list, int(value))
    
    test_list = data_list[length//2:]
    create_data_file("Test/v1_testing.csv", test_list, int(value))
    
    
for value in v2_data.keys():
    data_list = v2_data[value]
    
    #split data list in half for training and testing
    length = len(data_list)
    
    train_list = data_list[:length//2]
    create_data_file("Train/v2_training.csv", train_list, int(value))
    
    test_list = data_list[length//2:]
    create_data_file("Test/v2_testing.csv", test_list, int(value))
    
for value in a1_data.keys():
    data_list = a1_data[value]
    
    #split data list in half for training and testing
    length = len(data_list)
    
    train_list = data_list[:length//2]
    create_data_file("Train/a1_training.csv", train_list, int(value))
    
    test_list = data_list[length//2:]
    create_data_file("Test/a1_testing.csv", test_list, int(value))
    
for value in a2_data.keys():
    data_list = a2_data[value]
  
    #split data list in half for training and testing
    length = len(data_list)
    
    train_list = data_list[:length//2]
    create_data_file("Train/a2_training.csv", train_list, int(value))
    
    test_list = data_list[length//2:]
    create_data_file("Test/a2_testing.csv", test_list, int(value))  

In [169]:
import re

vectorizer = CountVectorizer(corpora)
vocab = vectorizer.fit_transform(corpora)

#initialize some values
#tensors = vocab.toarray()

def process_file():
    input = []
    word_regx = re.compile('^[a-z]*$')
    for sentence in corpora:
        words = sentence.split(" ")
        for word in words:
            word = re.sub('[.,!?\\-()\"]', "", word).lower()
            match = word_regx.match(word)
            if match != None and word not in input:
                input.append(word)
    return input

def tensorize_sentence(sentence):
    sentence = re.sub('[.,!?\\-()\"]', "", sentence).lower()
    words = sentence.split(" ")
    sentence_tokens = [word for word in words if word in tokens]
    tensor = torch.zeros(len(sentence_tokens), 1, len(tokens))
    for index, word in enumerate(sentence_tokens):
        i = tokens.index(word)
        tensor[index][0][i] = 1
    return tensor

tokens = process_file()

In [176]:
from __future__ import unicode_literals, print_function, division
from io import open
import random
import string
import torch
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.activate = nn.Tanh()

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.activate(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

# set values used for training iterations
criterion = nn.NLLLoss()
learning_rate = 0.000001
iterations = 10000
print_freq = 100

# Define words used to identify names
dictionary = {}
n_words = len(dictionary)

# Define categories for naming
s1 = "1"
s2 = "2"
s3 = "3"
s4 = "4"
s5 = "5"
s6 = "6"
s7 = "7"
s8 = "8"
s9 = "9"
categories = [s1, s2, s3, s4, s5, s6, s7, s8, s9]


In [177]:
def train(category_tensor, sentence_tensor):
    hidden = rnn.initHidden()

    rnn.zero_grad()
    for i in range(sentence_tensor.size()[0]):
        word_tensor = sentence_tensor[i].clone().detach()#torch.tensor(sentence_tensor[i], dtype=torch.long)
        output, hidden = rnn(word_tensor, hidden)

    loss = criterion(output, category_tensor)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item()


def getCategoryFromOutput (output):
    top_n, top_i = output.topk(1)
    category_index = top_i[0].item()
    return categories[category_index], category_index


#Train RNN to clasify valence first
n_hidden = 1
n_categories = len(categories)
n_words = len(tokens)
rnn = RNN(n_words, n_hidden, n_categories)

# Establish loss tracking variables
current_loss = 0
all_losses = []

# Start trainin
print("Begin training model to guess valence scores using training data.")
for i in range(iterations):
    val = random.randint(0, 2316)
    sentence_tensor = tensorize_sentence(corpora[val])
    if sentence_tensor.numel() == 0:
        continue
    category_tensor = torch.tensor([scores[val][0] - 1], dtype=torch.long)
    category = categories[category_tensor[0]]
    output, loss = train(category_tensor, sentence_tensor)
    current_loss += loss
    guess, guess_i = getCategoryFromOutput(output)
    if i % print_freq == 0:
        correct = '✓' if guess == category else '✗'
        print(correct)



Begin training model using training data.
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✓
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✓
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✓
✗
✗
✗
✗
✗
✗
✗
✗
✗
✓
✗
✓
✗
✗
✗
✗
✗
✗
✗


In [188]:
print("Valence training has completed. Parsing testing data.")
# After training has completed, establish a confusion matrix to determine accuracy
# Keep track of correct guesses in a confusion matrix
confusion = torch.zeros(n_categories, n_categories)

# Return an evaluation based on the current training data
def classifySentence(sentence_tensor):
    hidden = rnn.initHidden()

    for i in range(sentence_tensor.size()[0]):
        word_tensor = sentence_tensor[i].clone().detach()
        output, hidden = rnn(word_tensor, hidden)
    return getCategoryFromOutput(output)

# Try values at random and record results
print("Creating confusion matrix.")
correct = 0
total = 0
for i in range(2317, 2895):
    sentence_tensor = tensorize_sentence(corpora[i])
    category_tensor = torch.tensor([scores[i][0] - 1], dtype=torch.long)
    category = categories[category_tensor[0]]
    
    if sentence_tensor.numel() == 0:
        continue
    
    guess, guess_i = classifySentence(sentence_tensor)
    category_i = categories.index(category)
    if category_i == guess_i:
        correct += 1
    total += 1
    confusion[category_i][guess_i] += 1

# Normalize by dividing every row by its sum
print("Normalizing data.")
for i in range(n_categories):
    confusion[i] = confusion[i] / confusion[i].sum()

print("\nResults:")
print('The model was able to correctly evaluate valence with an accuracy of %3d%%\n' % (correct/total * 100))

Valence training has completed. Parsing testing data.
Creating confusion matrix.
Normalizing data.

Results:
tensor([[0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.2000, 0.0000, 0.0000, 0.7000, 0.1000],
        [0.0000, 0.2105, 0.1053, 0.0000, 0.2105, 0.0000, 0.0000, 0.3947, 0.0789],
        [0.0000, 0.2151, 0.0968, 0.0000, 0.2151, 0.0000, 0.0000, 0.4194, 0.0538],
        [0.0000, 0.1917, 0.0414, 0.0000, 0.3083, 0.0038, 0.0000, 0.3910, 0.0639],
        [0.0000, 0.2453, 0.0189, 0.0000, 0.2264, 0.0000, 0.0000, 0.4811, 0.0283],
        [0.0000, 0.2400, 0.0400, 0.0000, 0.2800, 0.0000, 0.0000, 0.3800, 0.0600],
        [0.0000, 0.3750, 0.1250, 0.0000, 0.3750, 0.0000, 0.0000, 0.1250, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000]])
The model was able to correctly evaluate arousal with an accuracy of  15%



In [180]:
# Start trainin
rnn = RNN(n_words, n_hidden, n_categories)

print("Begin training model to guess arousal scores using training data.")
for i in range(iterations):
    val = random.randint(0, 2316)
    sentence_tensor = tensorize_sentence(corpora[val])
    if sentence_tensor.numel() == 0:
        continue
    category_tensor = torch.tensor([scores[val][1] - 1], dtype=torch.long)
    category = categories[category_tensor[0]]
    output, loss = train(category_tensor, sentence_tensor)
    current_loss += loss
    guess, guess_i = getCategoryFromOutput(output)
    if i % print_freq == 0:
        correct = '✓' if guess == category else '✗'
        print(correct)

Begin training model to guess arousal scores using training data.
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✓
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✓
✗
✓
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗
✓
✗
✗
✗
✗
✗
✓
✗
✗
✗
✗
✓
✗
✗
✓
✗
✗
✗
✗
✗
✓
✗
✗
✗
✗
✗
✗
✗
✗
✗
✗


In [187]:
print("Valence training has completed. Parsing testing data.")
# After training has completed, establish a confusion matrix to determine accuracy
# Keep track of correct guesses in a confusion matrix
confusion = torch.zeros(n_categories, n_categories)

# Return an evaluation based on the current training data
def classifySentence(sentence_tensor):
    hidden = rnn.initHidden()

    for i in range(sentence_tensor.size()[0]):
        word_tensor = sentence_tensor[i].clone().detach()
        output, hidden = rnn(word_tensor, hidden)
    return getCategoryFromOutput(output)

# Try values at random and record results
print("Creating confusion matrix.")
correct = 0
total = 0
for i in range(2317, 2895):
    sentence_tensor = tensorize_sentence(corpora[i])
    category_tensor = torch.tensor([scores[i][0] - 1], dtype=torch.long)
    category = categories[category_tensor[0]]
    
    if sentence_tensor.numel() == 0:
        continue
    
    guess, guess_i = classifySentence(sentence_tensor)
    category_i = categories.index(category)
    
    if category_i == guess_i:
        correct += 1
    total += 1
    confusion[category_i][guess_i] += 1

# Normalize by dividing every row by its sum
print("Normalizing data.")
for i in range(n_categories):
    confusion[i] = confusion[i] / confusion[i].sum()

print("\nResults:")
print('The model was able to correctly evaluate arousal with an accuracy of %3d%%\n' % (correct/total * 100))

Valence training has completed. Parsing testing data.
Creating confusion matrix.
Normalizing data.

Results:
The model was able to correctly evaluate arousal with an accuracy of  15%

