# Final Project for CSC 440

US election 2020

## Before we start: check for deps and data paths

Some deps to install

- **jupyter**: install these prior to starting the Jupyter notebook (otherwise you need to restart the kernel after installing).
- **colab**: just run the `%pip` cell below (uncomment the `%pip` lines first if they are commented out).

In [2]:
# Install the notebook's dependencies into the kernel's environment.
# On plain Jupyter, run this PRIOR to starting the notebook
# (otherwise the kernel has to be restarted afterwards).
# Hugging Face stack plus tokenizer dependencies; protobuf is pinned to 3.20.
%pip install -qqq -U transformers[torch] accelerate datasets sentencepiece protobuf==3.20
# PyTorch packages, fetched from the cu121 wheel index.
%pip install -qqq torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Import the things we need.

In [3]:
# os commands
import os
import sys
from collections import defaultdict as ddict

# data basics
import numpy as np
import pandas as pd

# visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# pytorch and huggingface
import torch
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_dataset

# progress bar
from tqdm.auto import tqdm

Make sure we are using a GPU; otherwise inference will be terribly slow.

In [4]:
# Device index handed to the pipelines below: 0 when CUDA is available,
# otherwise -1 (which the transformers pipeline treats as CPU).
if torch.cuda.is_available():
    device = 0
else:
    device = -1
device

0

Check that our data files are already present under the data path.

In [5]:
# Fetch the CSV files first if they are not on disk yet, e.g. with curl:
# %curl -O some_url

# Local run: the data files sit in the current directory.
DATA_ROOT = './'

# Google Colab: files should be placed under /data/cs440/ on the mounted Drive.
# DATA_ROOT = '/content/drive/MyDrive/data/cs440/'

# Report a missing data directory on stderr (execution continues either way).
if not os.path.exists(DATA_ROOT):
    print(f'error: {DATA_ROOT} does not exist', file=sys.stderr)

# Walk DATA_ROOT recursively and print the path of every file found.
found_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk(DATA_ROOT)
    for name in names
]
for found_path in found_paths:
    print(found_path)

./CSC440_Final_Project.ipynb
./desktop.ini
./hashtag_donaldtrump.csv
./hashtag_joebiden.csv


## Reading the data

Hugging Face docs: [datasets.load_dataset](https://huggingface.co/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset)

In [6]:
# Load both tweet CSVs as two named splits ('trump' and 'biden') of one dataset dict.
# Paths are built with os.path.join so DATA_ROOT works with or without a
# trailing slash (plain string concatenation silently required one).
dataset = load_dataset(
    "csv",
    data_files={
        'trump': os.path.join(DATA_ROOT, 'hashtag_donaldtrump.csv'),
        'biden': os.path.join(DATA_ROOT, 'hashtag_joebiden.csv'),
    },
    # NOTE(review): presumably forwarded to the CSV reader so that only "\n"
    # terminates a record — confirm against the datasets csv builder docs.
    lineterminator="\n"
)

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating trump split: 0 examples [00:00, ? examples/s]

Generating biden split: 0 examples [00:00, ? examples/s]

In [7]:
# Display the loaded dataset dict (split names, columns and row counts).
dataset

DatasetDict({
    trump: Dataset({
        features: ['created_at', 'tweet_id', 'tweet', 'likes', 'retweet_count', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location', 'lat', 'long', 'city', 'country', 'continent', 'state', 'state_code', 'collected_at'],
        num_rows: 970919
    })
    biden: Dataset({
        features: ['created_at', 'tweet_id', 'tweet', 'likes', 'retweet_count', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location', 'lat', 'long', 'city', 'country', 'continent', 'state', 'state_code', 'collected_at'],
        num_rows: 776886
    })
})

## Define Pipe Functions

In [11]:
def pipe_and_save(task: str, pipe, dataset,
                  batch_size: int = 1024, top_k: int = 3, max_length: int = 128):
    """Run ``pipe`` over the 'tweet' column of every split and save the scores as CSV.

    For each ``{name: split}`` pair in ``dataset`` one file
    ``<DATA_ROOT>/<task>/<name>.csv`` is written. Each file has one row per
    tweet (in the order the pipeline yields them) and one column per label.
    A cell holds the score of that label for that tweet, or is empty (NaN)
    when the label was not among the ``top_k`` results for that tweet.

    Args:
        task: name of the output sub-directory created under ``DATA_ROOT``.
        pipe: callable invoked with a ``KeyDataset`` and keyword arguments; for
            every tweet it must yield an iterable of dicts with 'label' and
            'score' keys.
        dataset: mapping of split name -> split; each split must support ``len()``.
        batch_size: batch size forwarded to ``pipe``
            (the default 1024 was chosen for 16GB of GPU memory).
        top_k: number of top-scoring labels requested per tweet.
        max_length: truncation length forwarded to ``pipe``.

    Returns:
        None. Results are written to disk only.
    """
    # ensure the task output dir exists
    out_dir = os.path.join(DATA_ROOT, task)
    os.makedirs(out_dir, exist_ok=True)

    # for each partition in data, get {name: data} pair
    for par_name, par_data in dataset.items():
        print(par_name, par_data)

        # Collect one {label: score} record per tweet. Appending to one list per
        # label (the previous approach) loses the row <-> tweet alignment and
        # produces lists of unequal length whenever the model has more labels
        # than top_k, which makes the DataFrame constructor raise.
        rows = []
        for out in tqdm(pipe(KeyDataset(par_data, 'tweet'),
                             batch_size=batch_size,
                             truncation=True,
                             top_k=top_k,
                             max_length=max_length),
                        total=len(par_data)):
            rows.append({pred['label']: pred['score'] for pred in out})

        # save output to dir; labels missing from a row become empty cells
        pd.DataFrame(rows).to_csv(os.path.join(out_dir, f'{par_name}.csv'), index=False)

In [14]:
def pipe_and_sample(n_samples: int, pipe, dataset, seed: int = 42):
    """Run ``pipe`` on a random sample of tweets from every split.

    Side effect: sets the global pandas display options so that returned frames
    are shown with all rows and untruncated column contents.

    Args:
        n_samples: number of tweets to draw per split. Clamped to the split
            size, so a small split no longer makes ``select`` fail.
        pipe: callable invoked with a ``KeyDataset``; must yield one dict per tweet.
        dataset: mapping of split name -> split supporting ``len()``,
            ``shuffle(seed=...)`` and ``select(...)``.
        seed: shuffle seed used to pick the sample (default 42, as before).

    Returns:
        dict mapping each split name to a DataFrame holding the pipeline's
        output fields plus a 'tweet' column with the input text.
    """
    # force pandas to display all rows and all contents inside every column
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.max_rows', None)

    collect = {}
    for par_name, par_data in dataset.items():
        print(par_name, par_data)

        # never ask for more rows than the split holds (and never a negative count)
        n_take = max(0, min(n_samples, len(par_data)))

        # get samples from dataset with a fixed seed for reproducibility
        samples = KeyDataset(par_data.shuffle(seed=seed).select(range(n_take)), 'tweet')

        res = []
        for i, out in enumerate(tqdm(pipe(samples,
                                          truncation=True,
                                          max_length=128),
                                     total=n_take)):
            # build a new record instead of mutating the pipeline's output dict
            res.append({**out, 'tweet': samples[i]})
        collect[par_name] = pd.DataFrame(res)
    return collect

## Predicting language

In [7]:
# Language detection setup: TASK names the output sub-directory used by
# pipe_and_save, model_ckpt is the checkpoint loaded into the pipeline.
TASK = 'lang'
model_ckpt = "papluca/xlm-roberta-base-language-detection"
pipe = pipeline("text-classification", model=model_ckpt, device=device)

In [8]:
# Run language detection over every split and write the scores under DATA_ROOT/lang/.
pipe_and_save(TASK, pipe, dataset)

trump Dataset({
    features: ['created_at', 'tweet_id', 'tweet', 'likes', 'retweet_count', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location', 'lat', 'long', 'city', 'country', 'continent', 'state', 'state_code', 'collected_at'],
    num_rows: 970919
})


  0%|          | 0/970919 [00:00<?, ?it/s]

biden Dataset({
    features: ['created_at', 'tweet_id', 'tweet', 'likes', 'retweet_count', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location', 'lat', 'long', 'city', 'country', 'continent', 'state', 'state_code', 'collected_at'],
    num_rows: 776886
})


  0%|          | 0/776886 [00:00<?, ?it/s]

In [None]:
# Alternative approach, kept for reference only (not executed): classify via
# dataset.map instead of iterating over the pipeline output.
# NOTE(review): the splits loaded in this notebook are named 'trump' and
# 'biden', so the 'train' lookup on the last line would need adjusting before
# this is re-enabled.
#def lang_cls(samples):
#    return {"lang": pipe(samples['tweet'], truncation=True, max_length=128)}
#trump_lang = dataset.map(lang_cls, batched=True)
#use num_proc=16 on cpu to speed this up
#trump_lang['train'].to_csv(f"{DATA_ROOT}hashtag_donaldtrump_lang.csv")

## Sentiment Analysis

In [10]:
# Sentiment setup: `task` names the output sub-directory used by pipe_and_save;
# the same checkpoint is passed as both model and tokenizer.
task = 'sent'
model_ckpt = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
pipe = pipeline("sentiment-analysis", model=model_ckpt, tokenizer=model_ckpt, device=device)

In [11]:
# Score every split with the current `pipe` and write the results under DATA_ROOT/<task>/.
pipe_and_save(task, pipe, dataset)

trump Dataset({
    features: ['created_at', 'tweet_id', 'tweet', 'likes', 'retweet_count', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location', 'lat', 'long', 'city', 'country', 'continent', 'state', 'state_code', 'collected_at'],
    num_rows: 970919
})


  0%|          | 0/970919 [00:00<?, ?it/s]

biden Dataset({
    features: ['created_at', 'tweet_id', 'tweet', 'likes', 'retweet_count', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location', 'lat', 'long', 'city', 'country', 'continent', 'state', 'state_code', 'collected_at'],
    num_rows: 776886
})


  0%|          | 0/776886 [00:00<?, ?it/s]

## Emotion

In [8]:
# Emotion setup: rebinds `task`, `model_ckpt` and `pipe`, replacing the
# values set by any earlier task cell.
task = 'emotion'
model_ckpt = "02shanky/finetuned-twitter-xlm-roberta-base-emotion"
pipe = pipeline("text-classification", model=model_ckpt, device=device)

config.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [10]:
# Score every split with the current `pipe` and write the results under DATA_ROOT/<task>/.
pipe_and_save(task, pipe, dataset)

trump Dataset({
    features: ['created_at', 'tweet_id', 'tweet', 'likes', 'retweet_count', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location', 'lat', 'long', 'city', 'country', 'continent', 'state', 'state_code', 'collected_at'],
    num_rows: 970919
})


  0%|          | 0/970919 [00:00<?, ?it/s]

biden Dataset({
    features: ['created_at', 'tweet_id', 'tweet', 'likes', 'retweet_count', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location', 'lat', 'long', 'city', 'country', 'continent', 'state', 'state_code', 'collected_at'],
    num_rows: 776886
})


In [15]:
# Classify a 500-tweet sample of each split with the current `pipe` for manual inspection.
res = pipe_and_sample(500, pipe, dataset)

trump Dataset({
    features: ['created_at', 'tweet_id', 'tweet', 'likes', 'retweet_count', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location', 'lat', 'long', 'city', 'country', 'continent', 'state', 'state_code', 'collected_at'],
    num_rows: 970919
})


  0%|          | 0/500 [00:00<?, ?it/s]

biden Dataset({
    features: ['created_at', 'tweet_id', 'tweet', 'likes', 'retweet_count', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location', 'lat', 'long', 'city', 'country', 'continent', 'state', 'state_code', 'collected_at'],
    num_rows: 776886
})


  0%|          | 0/500 [00:00<?, ?it/s]

In [16]:
# Show the sampled predictions for the 'trump' split.
res['trump']

Unnamed: 0,label,score,tweet
0,anger,0.895359,#erdogan and his sister wives #aliyev #donaldtrump and #benjaminnetanyahu \n🐑🤡💩🚽\n#RecognizeArtsakh #SanctionErdogan #SanctionTurkey #SanctionAzerbaijan #sanctionaliyev
1,joy,0.578686,Ok America. This is your day. It's not to late to redeem yourself. Vote the human blimp #Trump out of Office\n\n#Election2020 #ElectionDay #GoVote #Biden2020
2,joy,0.963349,"Hijo menor de #Trump, Barron, dio positivo a #COVID19👉👉👉https://t.co/lT4UycbQLI https://t.co/nlYTdZfiUs"
3,joy,0.552454,Mis hijos festejando. A ver si #Trump acepta la derrota. Mis hijos me preguntan: \n-¿Y el #Senado? \nLa generación que salvará lo que dejemos de éste maltrecho planeta.
4,anger,0.627494,How #Trump and #Bolsonaro broke Latin America´s COVID-19 defenses https://t.co/ywlhFNsrj9
5,joy,0.697675,Australian way of life \n\n#PTEAcademic #Study #Australia #immigration #visa #workpermit #USA #UK #canada #Biden #trump #election #vote #choice #choice #future #life #work #success #inspiration #joy https://t.co/aO4lCxToN1
6,joy,0.879899,ก่อนวันเลือกตั้งประธานาธิบดีสหรัฐครั้งนี้ ทุกโพลชี้คะแนนนิยม Joe Biden เหนือกว่า แต่ล่าสุด Donald Trump ตีตื้นและกล้าประกาศชัยชนะทั้งที่การนับคะแนนยังไม่เสร็จสิ้น..\n#MarketeerOnline #Thepeople #เลือกตั้งประธานาธิบดีสหรัฐ2020 #Trump #Biden #เลือกตั้งสหรัฐฯ2020 #Elections2020 https://t.co/1nSjmLBa3L
7,joy,0.770123,"Un Presidente di transizione, gradito all'establishment del Partito Democratico \nhttps://t.co/MNJNNUMLRa\n#USAelection2020 #BidenHarris #Trump"
8,anger,0.463768,"#Biden takes the lead in Georgia.\n\n#Trump: ""We should bomb on the USSR.""\n\n#Election2020"
9,anger,0.96431,@MarkMeadows Now do the #Trump kids.\n\nI'll wait.


In [17]:
# Show the sampled predictions for the 'biden' split.
res['biden']

Unnamed: 0,label,score,tweet
0,joy,0.977317,MANA MANA MANA!\n\nMake\nAmerica\nNormal \nAgain!\n\n#Biden \n#USElection \n#Elections2020
1,joy,0.790871,#JoeBiden ..... https://t.co/0YrYK8ZCfH
2,anger,0.99569,"@PresidentRuvi If there was a #USA administration hostile to #Israel it was #Obama #Biden .... we'll see with #BidenHarris. It will almost certainly no longer be the unconditional support, no ifs and buts of the #Trump administration."
3,joy,0.998503,"#Election2020 @JoeBiden and @KamalaHarris win and for the first time, a woman will be Vice-president of US🇺🇸👏🥳 Congrats. Happy for all my friends in #US. Now the world can look forward to a more constructive relationship with your great country. #Biden #Trump @TheDemocrats https://t.co/GYr1mPzVLU"
4,anger,0.948419,"Un vecchio corrotto e imbroglione è il nuovo presidente degli Stati Uniti! Hanno fatto fuori un presidente scomodo per fare i loro comodi! La Sconfitta"" di #Trump è la sconfitta del popolo il quale non conta più niente! #Trump2020 #Biden #Fake"
5,anger,0.984465,"Strategic mistake Republican party. Never should've stuck with this ding bat. \n\nNow, you will pay the price.\n\n@GOP \n\n#JoeBiden"
6,anger,0.336361,#Biden-Capitan America trionfa su #Trump-Thanos: il video che fa impazzire i vip Usa https://t.co/WNmBPYnhZ6 https://t.co/03xnLvNtxW
7,anger,0.78053,No comment: Some world leaders silent on Biden win #Biden #USElectionResults2020 \n\nhttps://t.co/s79lZ57Prs
8,fear,0.869998,"Yo creo que la corte suprema tiene que ver una opción de neutralidad para las elecciones de Estados Unidos o nuevas votaciones, sin correo electrónicos.\nY creo que fue muy raro el conteo rápido de los votos del día miércoles. \n#Election2020 @realDonaldTrump and #Biden https://t.co/k0vDQwZrR5"
9,joy,0.991253,"A Big congratulations @KamalaHarris 👏👏 For the first time in the history of 244 year American Presidential Elections, a lady has been elected as Vice President !!\n\n#KamalaHarris #USElections #JoeBiden #JOEBIDEN2020 #JoeBidenKamalaHarris2020 https://t.co/xZe6qw5VUO"


## Stance detection

(Currently this only works for English tweets.)

In [18]:
# Stance setup: the checkpoint name is built from `candidate`, so this cell
# loads the stance model for that one candidate only.
candidate = 'trump'
model_ckpt = f"kornosk/bert-election2020-twitter-stance-{candidate}-KE-MLM"
pipe = pipeline("text-classification", model=model_ckpt, device=device)
# Score every split with this model; results go under DATA_ROOT/stance-<candidate>/.
# NOTE(review): predictions come back as LABEL_0/LABEL_1/LABEL_2; the mapping to
# stance classes is not shown in this notebook — check the model card.
pipe_and_save(f"stance-{candidate}", pipe, dataset)

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [19]:
# Classify a 500-tweet sample of each split with the current `pipe` for manual inspection.
res = pipe_and_sample(500, pipe, dataset)

trump Dataset({
    features: ['created_at', 'tweet_id', 'tweet', 'likes', 'retweet_count', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location', 'lat', 'long', 'city', 'country', 'continent', 'state', 'state_code', 'collected_at'],
    num_rows: 970919
})


  0%|          | 0/500 [00:00<?, ?it/s]

biden Dataset({
    features: ['created_at', 'tweet_id', 'tweet', 'likes', 'retweet_count', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location', 'lat', 'long', 'city', 'country', 'continent', 'state', 'state_code', 'collected_at'],
    num_rows: 776886
})


  0%|          | 0/500 [00:00<?, ?it/s]

In [20]:
# Show the sampled predictions for the 'trump' split.
res['trump']

Unnamed: 0,label,score,tweet
0,LABEL_2,0.758162,#erdogan and his sister wives #aliyev #donaldtrump and #benjaminnetanyahu \n🐑🤡💩🚽\n#RecognizeArtsakh #SanctionErdogan #SanctionTurkey #SanctionAzerbaijan #sanctionaliyev
1,LABEL_1,0.861713,Ok America. This is your day. It's not to late to redeem yourself. Vote the human blimp #Trump out of Office\n\n#Election2020 #ElectionDay #GoVote #Biden2020
2,LABEL_2,0.796699,"Hijo menor de #Trump, Barron, dio positivo a #COVID19👉👉👉https://t.co/lT4UycbQLI https://t.co/nlYTdZfiUs"
3,LABEL_2,0.719681,Mis hijos festejando. A ver si #Trump acepta la derrota. Mis hijos me preguntan: \n-¿Y el #Senado? \nLa generación que salvará lo que dejemos de éste maltrecho planeta.
4,LABEL_2,0.751148,How #Trump and #Bolsonaro broke Latin America´s COVID-19 defenses https://t.co/ywlhFNsrj9
5,LABEL_2,0.792715,Australian way of life \n\n#PTEAcademic #Study #Australia #immigration #visa #workpermit #USA #UK #canada #Biden #trump #election #vote #choice #choice #future #life #work #success #inspiration #joy https://t.co/aO4lCxToN1
6,LABEL_2,0.717523,ก่อนวันเลือกตั้งประธานาธิบดีสหรัฐครั้งนี้ ทุกโพลชี้คะแนนนิยม Joe Biden เหนือกว่า แต่ล่าสุด Donald Trump ตีตื้นและกล้าประกาศชัยชนะทั้งที่การนับคะแนนยังไม่เสร็จสิ้น..\n#MarketeerOnline #Thepeople #เลือกตั้งประธานาธิบดีสหรัฐ2020 #Trump #Biden #เลือกตั้งสหรัฐฯ2020 #Elections2020 https://t.co/1nSjmLBa3L
7,LABEL_2,0.779654,"Un Presidente di transizione, gradito all'establishment del Partito Democratico \nhttps://t.co/MNJNNUMLRa\n#USAelection2020 #BidenHarris #Trump"
8,LABEL_2,0.744859,"#Biden takes the lead in Georgia.\n\n#Trump: ""We should bomb on the USSR.""\n\n#Election2020"
9,LABEL_2,0.717609,@MarkMeadows Now do the #Trump kids.\n\nI'll wait.


In [21]:
# Show the sampled predictions for the 'biden' split.
res['biden']

Unnamed: 0,label,score,tweet
0,LABEL_2,0.519615,MANA MANA MANA!\n\nMake\nAmerica\nNormal \nAgain!\n\n#Biden \n#USElection \n#Elections2020
1,LABEL_2,0.714601,#JoeBiden ..... https://t.co/0YrYK8ZCfH
2,LABEL_1,0.371001,"@PresidentRuvi If there was a #USA administration hostile to #Israel it was #Obama #Biden .... we'll see with #BidenHarris. It will almost certainly no longer be the unconditional support, no ifs and buts of the #Trump administration."
3,LABEL_2,0.54692,"#Election2020 @JoeBiden and @KamalaHarris win and for the first time, a woman will be Vice-president of US🇺🇸👏🥳 Congrats. Happy for all my friends in #US. Now the world can look forward to a more constructive relationship with your great country. #Biden #Trump @TheDemocrats https://t.co/GYr1mPzVLU"
4,LABEL_0,0.827349,"Un vecchio corrotto e imbroglione è il nuovo presidente degli Stati Uniti! Hanno fatto fuori un presidente scomodo per fare i loro comodi! La Sconfitta"" di #Trump è la sconfitta del popolo il quale non conta più niente! #Trump2020 #Biden #Fake"
5,LABEL_2,0.562948,"Strategic mistake Republican party. Never should've stuck with this ding bat. \n\nNow, you will pay the price.\n\n@GOP \n\n#JoeBiden"
6,LABEL_2,0.786664,#Biden-Capitan America trionfa su #Trump-Thanos: il video che fa impazzire i vip Usa https://t.co/WNmBPYnhZ6 https://t.co/03xnLvNtxW
7,LABEL_2,0.727549,No comment: Some world leaders silent on Biden win #Biden #USElectionResults2020 \n\nhttps://t.co/s79lZ57Prs
8,LABEL_2,0.716009,"Yo creo que la corte suprema tiene que ver una opción de neutralidad para las elecciones de Estados Unidos o nuevas votaciones, sin correo electrónicos.\nY creo que fue muy raro el conteo rápido de los votos del día miércoles. \n#Election2020 @realDonaldTrump and #Biden https://t.co/k0vDQwZrR5"
9,LABEL_1,0.627392,"A Big congratulations @KamalaHarris 👏👏 For the first time in the history of 244 year American Presidential Elections, a lady has been elected as Vice President !!\n\n#KamalaHarris #USElections #JoeBiden #JOEBIDEN2020 #JoeBidenKamalaHarris2020 https://t.co/xZe6qw5VUO"
