In [69]:
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
import os, sys, numpy as np

#Parse data into testing and training examples
def parse_data(line):
    """Split one raw CSV record into its message and its trailing rating fields.

    The message is the first column and may itself contain commas (such
    records start with a quotation mark in the file), so the line is split
    from the right: the last four comma-separated fields are the ratings and
    everything in front of them is the message.

    Args:
        line: One line of the CSV file, with or without its line terminator.

    Returns:
        A ``(msg, quantifiers)`` tuple. ``msg`` is the message text exactly as
        it appears in the file (surrounding quotation marks are kept).
        ``quantifiers`` is the list of the remaining fields as strings, in
        file order.
    """
    # Drop the line terminator first so the last rating is "3" rather than
    # "3\n"; otherwise the same rating yields different dictionary keys
    # depending on whether the line happens to end with a newline.
    record = line.rstrip("\r\n")

    # Splitting from the right works for quoted and unquoted messages alike
    # and keeps any commas inside the message intact.
    fields = record.rsplit(",", 4)

    msg = fields.pop(0)
    return msg, fields
    

def update_data(msg, value, data):
    """Record ``msg`` under ``data[value]`` unless it is already listed there.

    Args:
        msg: The message to record.
        value: The key whose list should receive the message.
        data: Mapping from key to list of messages; it is indexed with
            ``data[value]``, so the key must exist or the mapping must
            supply a default list.
    """
    bucket = data[value]
    if msg in bucket:
        # Already recorded for this key; keep the list free of duplicates.
        return
    bucket.append(msg)
    
    

def create_data_file(filename, data, value):
    """Append one ``"<item>, <value>"`` line per item in ``data`` to ``filename``.

    The file is opened in append mode with UTF-8 encoding, so existing
    content is kept and repeated calls accumulate lines.

    Args:
        filename: Path of the file to append to; its directory must already
            exist.
        data: Iterable of items to write, one per line.
        value: Integer written after every item (formatted with ``%d``).
    """
    # The context manager closes the handle even if a write raises.
    with open(filename, "a", encoding="utf8") as output:
        output.writelines("%s, %d\n" % (item, value) for item in data)
    
    
# Messages grouped by each individual rating column: the key is the rating
# string as read from the file, the value is the list of distinct messages
# that received that rating.
a1_data = defaultdict(list)
a2_data = defaultdict(list)
v1_data = defaultdict(list)
v2_data = defaultdict(list)

corpora = []
scores = [] #this is the score for corpora at the same index, first value in pair is valence, second arousal

# Open the dataset in a context manager so the handle is closed once parsing
# is done (or if parsing raises) instead of staying open for the kernel's life.
with open("dataset-fb-valence-arousal-anon.csv", encoding="utf8") as data:
    #Read and skip first line
    data.readline()

    #Parse data
    for line in data:
        # A blank line carries no record; skipping it avoids an IndexError
        # when the rating fields are read below.
        if not line.strip():
            continue

        msg, quantifiers = parse_data(line)
        v1 = quantifiers[0]
        v2 = quantifiers[1]
        a1 = quantifiers[2]
        a2 = quantifiers[3]

        update_data(msg, v1, v1_data)
        update_data(msg, v2, v2_data)
        update_data(msg, a1, a1_data)
        update_data(msg, a2, a2_data)

        corpora.append(msg)
        # Integer (floor) mean of the two ratings for each dimension.
        vAvg = (int(v1)+int(v2))//2
        aAvg = (int(a1)+int(a2))//2
        scores.append((vAvg, aAvg))
    
# create_data_file opens its target with mode "a", which fails when the parent
# directory is missing, so make sure both output directories exist first.
os.makedirs("Train", exist_ok=True)
os.makedirs("Test", exist_ok=True)

# NOTE(review): because the files are opened in append mode, re-running this
# cell adds the same rows again; delete the Train/ and Test/ files before a
# re-run if duplicates are not wanted.
for name, grouped in (("v1", v1_data), ("v2", v2_data), ("a1", a1_data), ("a2", a2_data)):
    for value, data_list in grouped.items():
        #split data list in half for training and testing
        length = len(data_list)

        # First half (rounded down) goes to training, the rest to testing.
        train_list = data_list[:length//2]
        create_data_file("Train/%s_training.csv" % name, train_list, int(value))

        test_list = data_list[length//2:]
        create_data_file("Test/%s_testing.csv" % name, test_list, int(value))

In [72]:
# CountVectorizer's first parameter is `input` (how the documents are supplied),
# not the corpus itself, so the corpus is only handed to fit_transform.
# min_df=2 keeps only terms that occur in at least two messages.
vectorizer = CountVectorizer(min_df=2)
vocab = vectorizer.fit_transform(corpora)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides and
# keep `tokens` a plain list either way.
if hasattr(vectorizer, "get_feature_names_out"):
    tokens = vectorizer.get_feature_names_out().tolist()
else:
    tokens = vectorizer.get_feature_names()

#initialize some values
# Dense document-term count matrix: one row per message, one column per token.
vocabA = vocab.toarray()

#randomly initialize weights to start off with
# One weight per vocabulary token, drawn uniformly from [0, 1).
weights = np.random.rand(len(tokens)).tolist()

#function to compute dot product
def dot(weights, feature):
    """Return the dot product of ``weights`` with the leading entries of ``feature``.

    Exactly ``len(weights)`` positions are multiplied and summed, so
    ``feature`` must be at least that long; any further entries are ignored.
    An empty ``weights`` yields ``0``.
    """
    # sum() starts from 0 and adds the products left to right.
    return sum(w * feature[idx] for idx, w in enumerate(weights))

# Show the feature vector and the weighted sum for the first ten messages;
# the bound is capped at the number of rows so a smaller corpus does not
# raise an IndexError.
for i in range(min(10, len(vocabA))):
    #calculate model guess
    output = dot(weights, vocabA[i])
    print(vocabA[i])

    print(output)


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[0 0 0 ... 0 0 0]
5.888442711112049
[0 0 0 ... 0 0 0]
2.754033784338631
[0 0 0 ... 0 0 0]
1.688728268896487
[0 0 0 ... 0 0 0]
8.712522851123037
[0 0 0 ... 0 0 0]
2.2483798418446033
[0 0 0 ... 0 0 0]
2.5805543605424694
[0 0 0 ... 0 0 0]
4.618608323884309
[0 0 0 ... 0 0 0]
16.81623559067609
[0 0 0 ... 0 0 0]
7.203351178864781
[0 0 0 ... 0 0 0]
2.853933134295421
