# Final Project for CSC 440

US election 2020

## Before we start: check for deps and data paths

Some deps to install

- **jupyter**: install the dependencies before launching Jupyter Notebook (otherwise you need to restart the kernel after installing).
- **colab**: just uncomment the `%pip` line

In [None]:
# google colab
# %pip install datasets -q

In [1]:
# run this PRIOR to starting jupyter notebook
# pip3 install -q datasets transformers sentencepiece
# pip3 install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# jupyter notebook

import the things we need

In [2]:
# os commands
import os
import sys

# data basics
import numpy as np
import pandas as pd

# visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# pytorch and huggingface
import torch
from torch.utils.data import Dataset
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_dataset, Dataset

# progress bar
from tqdm.auto import tqdm

Make sure we are using a GPU; otherwise inference will be terribly slow.

In [3]:
# Device index handed to the Hugging Face pipelines below:
# 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1
device

0

Check that the data files are already present under the data path.

In [4]:
# To fetch the data from the notebook, download it with curl first:
# !curl -O some_url

# Local run: the CSV files sit next to this notebook.
DATA_ROOT = './'

# Google Colab: place the files on Drive under data/cs440/ and use this instead.
# DATA_ROOT = '/content/drive/MyDrive/data/cs440/'

# Report a missing root on stderr; the walk below then simply lists nothing.
if not os.path.exists(DATA_ROOT):
    print(f'error: {DATA_ROOT} does not exist', file=sys.stderr)

# List every file below DATA_ROOT so the expected CSVs can be checked by eye.
for folder, _, names in os.walk(DATA_ROOT):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

./CSC440_Final_Project.ipynb
./desktop.ini
./hashtag_donaldtrump.csv
./hashtag_joebiden.csv
./.ipynb_checkpoints\CSC440_Final_Project-checkpoint.ipynb


## Reading the data

Hugging Face docs: [datasets.load_dataset](https://huggingface.co/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset)

In [5]:
# Load both hashtag CSVs as the named splits 'trump' and 'biden' of one DatasetDict.
# Paths are built with os.path.join so DATA_ROOT works with or without a
# trailing separator (plain string concatenation required the trailing '/').
dataset = load_dataset(
    "csv",
    data_files={
        'trump': os.path.join(DATA_ROOT, 'hashtag_donaldtrump.csv'),
        'biden': os.path.join(DATA_ROOT, 'hashtag_joebiden.csv'),
    },
    # NOTE(review): presumably set so that only "\n" ends a record and stray
    # carriage returns inside tweets do not split rows -- confirm against the data.
    lineterminator="\n"
)

In [6]:
# Show the loaded DatasetDict: split names, column names, and row counts.
dataset

DatasetDict({
    trump: Dataset({
        features: ['created_at', 'tweet_id', 'tweet', 'likes', 'retweet_count', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location', 'lat', 'long', 'city', 'country', 'continent', 'state', 'state_code', 'collected_at'],
        num_rows: 970919
    })
    biden: Dataset({
        features: ['created_at', 'tweet_id', 'tweet', 'likes', 'retweet_count', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location', 'lat', 'long', 'city', 'country', 'continent', 'state', 'state_code', 'collected_at'],
        num_rows: 776886
    })
})

## Predicting language

In [7]:
# Checkpoint used to predict the language of each tweet.
model_ckpt = "papluca/xlm-roberta-base-language-detection"

# Text-classification pipeline on the device selected earlier in the notebook.
pipe = pipeline(
    task="text-classification",
    model=model_ckpt,
    device=device,
)

In [8]:
# Predict a language label for every tweet in each split and save one CSV per
# split at <DATA_ROOT>/lang/<split>.csv, with a single column named 'lang'.
TASK = 'lang'

# os.path.join avoids the doubled separator ('.//lang') that string formatting
# produced when DATA_ROOT already ends with '/'.
task_dir = os.path.join(DATA_ROOT, TASK)
os.makedirs(task_dir, exist_ok=True)

for par_name, par_data in dataset.items():
    print(par_name, par_data)
    # KeyDataset feeds only the 'tweet' column to the pipeline, which runs in
    # batches of 1024 and truncates each input to 128 tokens. Only the
    # predicted label is kept, one per row and in dataset order.
    res = [
        out['label']
        for out in tqdm(pipe(KeyDataset(par_data, 'tweet'), batch_size=1024,
                             truncation=True, max_length=128),
                        total=len(par_data), desc=f'{TASK}/{par_name}')
    ]
    pd.Series(res, name=TASK).to_csv(os.path.join(task_dir, f'{par_name}.csv'), index=False)

trump Dataset({
    features: ['created_at', 'tweet_id', 'tweet', 'likes', 'retweet_count', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location', 'lat', 'long', 'city', 'country', 'continent', 'state', 'state_code', 'collected_at'],
    num_rows: 970919
})


  0%|          | 0/970919 [00:00<?, ?it/s]

biden Dataset({
    features: ['created_at', 'tweet_id', 'tweet', 'likes', 'retweet_count', 'source', 'user_id', 'user_name', 'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location', 'lat', 'long', 'city', 'country', 'continent', 'state', 'state_code', 'collected_at'],
    num_rows: 776886
})


  0%|          | 0/776886 [00:00<?, ?it/s]

In [None]:
#def lang_cls(samples):
#    return {"lang": pipe(samples['tweet'], truncation=True, max_length=128)}
#trump_lang = dataset.map(lang_cls, batched=True)
#use num_proc=16 on cpu to speed this up
#trump_lang['train'].to_csv(f"{DATA_ROOT}hashtag_donaldtrump_lang.csv")

## Sentiment Analysis

In [9]:
# Multilingual Twitter sentiment checkpoint.
# Loading its tokenizer needs the `sentencepiece` package: without it this cell
# fails with "ValueError: Couldn't instantiate the backend tokenizer ...".
model_ckpt = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# Sentiment pipeline; model and tokenizer both come from the same checkpoint.
pipe = pipeline(
    task="sentiment-analysis",
    model=model_ckpt,
    tokenizer=model_ckpt,
    device=device,
)

config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

ValueError: Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece installed to convert a slow tokenizer to a fast one.

In [None]:
# Predict a sentiment label for every tweet in each split and save one CSV per
# split at <DATA_ROOT>/sent/<split>.csv, with a single column named 'sent'.
TASK = 'sent'

# os.path.join avoids the doubled separator ('.//sent') that string formatting
# produced when DATA_ROOT already ends with '/'.
task_dir = os.path.join(DATA_ROOT, TASK)
os.makedirs(task_dir, exist_ok=True)

for par_name, par_data in dataset.items():
    print(par_name, par_data)
    # KeyDataset feeds only the 'tweet' column to the pipeline, which runs in
    # batches of 1024 and truncates each input to 128 tokens. Only the
    # predicted label is kept, one per row and in dataset order.
    res = [
        out['label']
        for out in tqdm(pipe(KeyDataset(par_data, 'tweet'), batch_size=1024,
                             truncation=True, max_length=128),
                        total=len(par_data), desc=f'{TASK}/{par_name}')
    ]
    pd.Series(res, name=TASK).to_csv(os.path.join(task_dir, f'{par_name}.csv'), index=False)