In [None]:
# load debate data
import csv
# newline='' hands line-ending handling to the csv module, which the csv docs
# require so that line breaks inside quoted fields are parsed correctly.
# The handle is intentionally left open: later cells rewind it with
# data.seek(0) and read it again through csv.reader.
data = open('resources/QEC Parliament.csv', 'r', encoding='mac_roman', newline='')

### Lexicon approach

In [None]:
# load sentiment lexicon
# Read the whole lexicon into a list of lines; the context manager closes the
# file handle as soon as the text is in memory.
with open('resources/SentiWordNet_3.0.0.txt') as lexicon_file:
    sentiwordnet = lexicon_file.read().split('\n')

In [None]:
from collections import OrderedDict

# Rewind the shared file handle and read it through a fresh CSV reader.
data.seek(0)
debates = csv.reader(data)

# Two-level grouping of the CSV rows:
#   outer key = second-to-last column
#   inner key = first column (insertion-ordered)
#   value     = list of last-column entries (presumably the statement text)
debates_over_time = {}
for row in debates:
    grouped = debates_over_time.setdefault(row[-2], OrderedDict())
    grouped.setdefault(row[0], []).append(row[-1])

In [None]:
import spacy # pos tagging
pos = spacy.load('en_core_web_sm')

pos_dict = {'v': 'VER', 'n': 'NOU', 'r': 'ADV', 'a': 'ADJ'} # translate between sentiwordnet and spacy pos tag schema

# Number of leading lexicon lines that are consulted.
# NOTE(review): a cap of 10 means almost none of the lexicon is used (and in the
# stock SentiWordNet file the first lines are presumably header comments).
# Set to None to score against the full lexicon; kept at 10 to preserve the
# existing output.
LEXICON_LINE_LIMIT = 10


def _build_lexicon_index(lines):
    """Parse lexicon lines once into {term: [(spacy_pos_prefix, polarity), ...]}.

    Each usable line is split on whitespace: field 0 is the lexicon POS tag,
    fields 2 and 3 are numeric scores (polarity = field 2 - field 3, presumably
    positive minus negative), and field 4 is 'term#sense', of which only the
    term is kept. Lines that start with '#', have fewer than five fields, or
    carry non-numeric scores are skipped instead of raising. Entries keep the
    order of the input lines so scores accumulate in the same order as a
    line-by-line scan.
    """
    index = {}
    for line in lines:
        fields = line.split()
        if line.startswith('#') or len(fields) < 5:
            continue
        try:
            polarity = float(fields[2]) - float(fields[3])
        except ValueError:
            continue
        term = fields[4].split('#')[0]
        # Unknown POS tags map to None, which never equals a spaCy tag prefix.
        index.setdefault(term, []).append((pos_dict.get(fields[0]), polarity))
    return index


# Built once up front instead of re-splitting every lexicon line for every token.
lexicon_index = _build_lexicon_index(sentiwordnet[:LEXICON_LINE_LIMIT])

print('Date      | Score | Score with POS tagging')

# for each outer group, for each inner group, print sentiment score
for key, val in debates_over_time.items():
    print('\n',key)
    for k, v in val.items():
        score, score_pos = 0.0, 0.0
        for statement in v:
            for word in pos(statement):
                # look up the lower-cased token text in the lexicon
                for lexicon_pos, polarity in lexicon_index.get(str(word).lower(), ()):
                    score += polarity
                    # only count towards the POS-aware score when the lexicon
                    # tag agrees with the first three letters of spaCy's tag
                    if lexicon_pos == word.pos_[:3]:
                        score_pos += polarity
        print(k, score, score_pos)

### Machine learning approach

In [None]:
# Target record layout:
#   id,title,motion,manual motion,govt/opp motion,motion party affiliation,
#   utt1,utt2,utt3,utt4,utt5,manual speech,vote speech,party affiliation,name,rebellion %

debates = csv.reader(data)
data.seek(0) # rewind the shared file handle before iterating

debates_dict = {}

for row in debates:
    # Column 0 holds three '/'-separated parts; concatenate them in reverse
    # order (presumably dd/mm/yyyy -> yyyymmdd -- confirm against the CSV).
    date_parts = row[0].split('/')
    date = date_parts[2] + date_parts[1] + date_parts[0]
    record_id = date + ' ' + row[3]

    record = debates_dict.get(record_id)
    if record is None:
        # First row for this id: the last column fills the first utterance slot.
        debates_dict[record_id] = [
            '', '', '', '',
            '', row[-1], '', '', '', '',
            '', '', row[5], row[3], 0,
        ]
        continue

    # Later rows for the same id go into the first still-empty slot among
    # indices 6..9; once those are full, further rows are dropped.
    for slot in range(6, 10):
        if record[slot] == '':
            record[slot] = row[-1]
            break

# Flatten every record to "<id>, <list repr without the surrounding brackets>".
debates_dict2 = {
    record_id: record_id + ', ' + str(record)[1:-1]
    for record_id, record in debates_dict.items()
}

In [None]:
# load pretrained ML model
# The pickle holds a (vectorizer, classifier) pair.
# NOTE(review): pickle.load can execute arbitrary code -- only load model files
# from a trusted source.
import pickle
with open('pretrainedmodel.pkl', 'rb') as model_file:
    vectorizer, SVM = pickle.load(model_file)

In [None]:
# For every outer group, classify the concatenated statements of each inner
# group with the pretrained model and print the predicted label.
for outer_key, groups in debates_over_time.items():
    print(outer_key)
    for inner_key, statements in groups.items():
        # Join all statements into one document, each followed by a space.
        speech = ''.join(statement + ' ' for statement in statements)
        X_new = vectorizer.transform([speech])
        predicted_label = SVM.predict(X_new)[0]
        print(inner_key, predicted_label)
    print('\n')

In [None]:
# Sanity check: vectorise one clearly positive and one clearly negative sample.
sample_texts = [
    "approve great fantastic love",
    "condemn disgrace awful disappointing",
]
X_new = vectorizer.transform(sample_texts)

In [None]:
# Show the classifier's labels for the vectorised inputs currently in X_new.
sample_predictions = SVM.predict(X_new)
print(sample_predictions)