In [1]:
!pip install -q torch
!pip install -q evaluate
!pip install -q matplotlib
!pip install -q transformers
!pip install -q huggingface_hub
!pip install -q transformers datasets



import os
import torch
import random
import evaluate
import datasets
import numpy as np
import pandas as pd
import torch.nn as nn
import matplotlib as mpl
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from datasets import list_datasets, load_dataset
from transformers import (
    AdamW,
    AutoTokenizer,
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    EvalPrediction,
    TrainingArguments,
    Trainer,
)

In [2]:
# Emotion label set used by the classifier: 27 emotions followed by 'neutral'.
# A label's position in this list is its integer class id.
labels = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
    'relief', 'remorse', 'sadness', 'surprise', 'neutral',
]
id2label = dict(enumerate(labels))
label2id = {name: class_id for class_id, name in id2label.items()}

# Two-letter codes of the 50 US states; list position is the state id.
states = (
    'AL AK AZ AR CA CO CT DE FL GA '
    'HI ID IL IN IA KS KY LA ME MD '
    'MA MI MN MS MO MT NE NV NH NJ '
    'NM NY NC ND OH OK OR PA RI SC '
    'SD TN TX UT VT VA WA WV WI WY'
).split()
id2state = dict(enumerate(states))
state2id = {code: state_id for state_id, code in id2state.items()}

# Two disjoint groups that together cover every entry of `states`.
# NOTE(review): the criterion behind this split is not stated in the notebook - document the source.
black_states = 'AL AR DE FL GA IL LA MD MI MS NC NY SC TN VA'.split()
white_states = (
    'AK AZ CA CO CT HI ID IN IA KS KY ME MA MN MO MT NE NV '
    'NH NJ NM ND OH OK OR PA RI SD TX UT VT WA WV WI WY'
).split()

# Speakers grouped by party.
democrats = ['Biden', 'Harris', 'Obama', 'Sanders']
republicans = ['Pence', 'Trump']

# States grouped by party; together the two lists cover every entry of `states`.
# NOTE(review): presumably the party that carried each state in an election - confirm which one.
republicans_results = (
    'AL AK AR FL ID IN IA KS KY LA MS MO MT '
    'NE NC ND OH OK SC SD TN TX UT WV WY'
).split()
democrats_results = (
    'AZ CA CO CT DE GA HI IL ME MD MA MI MN '
    'NV NH NJ NM NY OR PA RI VT VA WA WI'
).split()

# Root folder of the notebook project. The default is the original author's
# machine; set the NOTEBOOK_WORK_DIR environment variable to run elsewhere
# without editing this cell.
notebook_root = os.environ.get("NOTEBOOK_WORK_DIR", "C:/Users/m1500/Documents/Notebook/")

# Switch into the "data" sub-folder so relative paths used by later cells
# (for example the result CSV) resolve there. os.path.join is used instead of
# concatenating "\\data", which only produced a valid path on Windows.
os.chdir(os.path.join(notebook_root, "data"))

# From here on `work_dir` is the absolute path of the data folder; later cells
# look for one sub-folder per candidate inside it.
work_dir = os.getcwd()

# Speakers to process, in output order.
candidates = ['Biden', 'Harris', 'Obama', 'Sanders', 'Pence', 'Trump']

In [3]:
# Tokenizer is loaded from the base "roberta-large" checkpoint, and the padding
# collator is built on top of it.
tokenizer = AutoTokenizer.from_pretrained("roberta-large", do_lower_case=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Multi-label emotion classifier with one output per entry of `labels`
# (28 in total); id2label / label2id attach readable names to the output ids.
pt_model = AutoModelForSequenceClassification.from_pretrained(
    "IsaacZhy/roberta-large-goemotions",
    problem_type="multi_label_classification",
    num_labels=28,
    id2label=id2label,
    label2id=label2id,
)

# No TrainingArguments are supplied, so Trainer falls back to its defaults.
# Later cells reach the model through `trainer.model`.
trainer = Trainer(model=pt_model, tokenizer=tokenizer, data_collator=data_collator)

In [4]:
import re
import csv

# A label counts as present in a sentence when its sigmoid probability is at
# least this value.
PROB_THRESHOLD = 0.5
# Class id of the 'neutral' label, which is excluded from the emotion shares.
NEUTRAL_ID = label2id['neutral']

sigmoid = torch.nn.Sigmoid()


def _split_sentences(speech_text):
    """Split a speech on '!', '?' and '.' and drop blank fragments.

    re.split leaves empty or whitespace-only pieces (after the final
    punctuation mark, between consecutive marks, or for a bare newline).
    Those carry no text, so they are not passed on to the classifier.
    Kept fragments are returned unmodified.
    """
    return [piece for piece in re.split('[!?.]', speech_text) if piece.strip()]


def _count_emotions(sentences):
    """Count, for every label, the sentences in which the model predicts it.

    Returns a float array with one entry per label in `labels`.
    """
    model = trainer.model
    model.eval()
    counts = np.zeros(len(labels))
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for sentence in sentences:
            # truncation=True keeps over-long fragments within the model's
            # maximum input length instead of failing inside the forward pass.
            encoding = tokenizer(sentence, return_tensors="pt", truncation=True)
            encoding = {k: v.to(model.device) for k, v in encoding.items()}
            logits = model(**encoding).logits
            probs = sigmoid(logits.squeeze(0).cpu())
            counts += (probs >= PROB_THRESHOLD).numpy().astype(float)
    return counts


def _build_row(counts, candidate, state):
    """Turn per-label counts into one CSV row.

    The row holds the share of each non-neutral label among all non-neutral
    predictions, followed by two flags:
      * 1 if `state` is in `black_states`, else 0;
      * 1 if `candidate in democrats` and `state in democrats_results` agree
        (both true or both false), else 0.

    When no non-neutral label was predicted at all, the shares are written as
    zeros rather than dividing by zero (which previously produced NaN).
    """
    emotion_counts = np.delete(counts, NEUTRAL_ID)
    total = emotion_counts.sum()
    if total > 0:
        shares = emotion_counts / total
    else:
        shares = np.zeros(len(emotion_counts))
    in_black_states = 1 if state in black_states else 0
    party_matches_result = int((candidate in democrats) == (state in democrats_results))
    return np.append(shares, [in_black_states, party_matches_result])


with open("result.csv", 'w', newline="") as result_file:
    writer = csv.writer(result_file)
    for candidate in candidates:
        candidate_dir = os.path.join(work_dir, candidate)
        for text in sorted(os.listdir(candidate_dir)):
            # File names look like "<candidate>_<ST>_<date>.txt": the two
            # characters after the first underscore are the state code.
            state = text[len(candidate) + 1:len(candidate) + 3]
            with open(os.path.join(candidate_dir, text), 'r', encoding='utf-8') as txt:
                # NOTE(review): only the first line of the file is read -
                # confirm that every speech is stored on a single line.
                speech_text = txt.readline()
            counts = _count_emotions(_split_sentences(speech_text))
            writer.writerow(_build_row(counts, candidate, state))
            print(text + " done!")
print("Processing over")

Biden_AZ_Oct8.txt done!
Biden_CA_Mar3.txt done!
Biden_DE_Aug13.txt done!
Biden_DE_Jun30.txt done!
Biden_DE_Oct28.txt done!


KeyboardInterrupt: 