**By: Jacob van Steyn, ...**

# DS3010 Project: Twitter Political Analysis


In [None]:
%%capture
!pip install tweepy pymongo alive-progress pandas matplotlib python-dotenv fastparquet

In [None]:
# Mount Google Drive at /content/drive/ so the notebook can read the project's
# .env, CSV and parquet files and write checkpoints under "MyDrive".
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# VARIABLES
# Run-control flags read by later cells.
# SHOULD_CLEAR_DB: when True, the MongoDB setup cell deletes every document in
# the dem_tweets and rep_tweets collections.
SHOULD_CLEAR_DB = False
# SHOULD_SCRAPE: when True, the scraping cell downloads user timelines from
# Twitter; when False it only prints "Not Scraping!".
SHOULD_SCRAPE = False


In [None]:
#----------------------------------------------
import pymongo
import os
import pprint
from dotenv import load_dotenv

# Using dotenv (loads from a file called '.env') to keep secrets safe
load_dotenv("/content/drive/MyDrive/DS 3010 - Group Project/.env")
MONGO_USERNAME = os.getenv("MONGO_USERNAME")
MONGO_PASSWORD = os.getenv("MONGO_PASSWORD")
MONGO_URL = os.getenv("MONGO_URL")

# Fail early with a readable message: without this check a missing .env entry is
# silently formatted into the URI as the literal text "None" and only surfaces
# later as an opaque DNS/authentication error.
_missing_settings = [
    name
    for name, value in (
        ("MONGO_USERNAME", MONGO_USERNAME),
        ("MONGO_PASSWORD", MONGO_PASSWORD),
        ("MONGO_URL", MONGO_URL),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(f"Missing MongoDB settings in .env: {', '.join(_missing_settings)}")

client = pymongo.MongoClient(f"mongodb+srv://{MONGO_USERNAME}:{MONGO_PASSWORD}@{MONGO_URL}/?retryWrites=true&w=majority")

dblist = client.list_database_names()
if "DS3010" in dblist:
    print("The database already exists!")
else:
    # Nothing has been created at this point: MongoDB only materialises a
    # database once a document is written to it.
    print("Database does not exist yet; MongoDB will create it on first insert.")

# Handles to the database and the two per-party tweet collections
db = client.DS3010
dem_tweets = db.dem_tweets
rep_tweets = db.rep_tweets

if SHOULD_CLEAR_DB:
  # As mongodb won't create the database until a document is created,
  # insert a placeholder document into each collection first...
  for label, collection in (("Dem", dem_tweets), ("Rep", rep_tweets)):
    collection.insert_one({"test":"temp document!"})
    print(f"Num {label} Tweets Stored: {collection.count_documents({})}")

  # ...then clear the DB: remove every document (placeholder included)
  dem_tweets.delete_many({})
  rep_tweets.delete_many({})
  for label, collection in (("Dem", dem_tweets), ("Rep", rep_tweets)):
    print(f"Num {label} Tweets Stored: {collection.count_documents({})}")




The database already exists!


In [None]:
#----------------------------------------------
import json
import time
import re

import tweepy
from alive_progress import alive_bar
import pandas as pd
import matplotlib.pyplot as plt


# Build the tweepy client used for every Twitter request in this notebook
def setup():
    """Create an app-authenticated tweepy API wrapper.

    The consumer key and secret are read from the TWITTER_CONSUMER_KEY and
    TWITTER_CONSUMER_SECRET environment variables.

    Returns:
        A tweepy.API instance constructed with wait_on_rate_limit=True.
    """
    # Application-only authentication from the Twitter API key and secret
    auth = tweepy.AppAuthHandler(
        os.getenv("TWITTER_CONSUMER_KEY"),
        os.getenv("TWITTER_CONSUMER_SECRET"),
    )
    # Wrapper for the Twitter API
    return tweepy.API(auth, wait_on_rate_limit=True)
    
# Helper generator that pulls results from a cursor one at a time
# and shows an alive_bar progress bar while they are fetched
def limit_handler(cursor, total):
    """Yield every item produced by `cursor` while advancing a progress bar.

    Args:
        cursor: object exposing next(), which raises StopIteration when exhausted.
        total: number of items the progress bar expects.

    Yields:
        Each item returned by cursor.next(), in order.

    The bar is advanced when the consumer comes back for the following item.
    If the cursor ends with the bar count different from `total`, a notice
    with the count is printed.
    """
    with alive_bar(total, force_tty=True) as progress:
        while True:
            try:
                item = cursor.next()
            except StopIteration:
                # Cursor is exhausted; report when the expected total was not reached
                if progress.current != total:
                    print(f"[Stopped!] Found {progress.current} tweets for specifed term!")
                return
            yield item
            progress()
                
# Shared tweepy API client used by the scraping cell below
api = setup()

In [None]:
# Read accounts from CSV
import csv

# Maps each Twitter handle (without its leading "@") to True when the party
# column is "D" and False otherwise.
user_map = {}
# newline="" is what the csv module expects so quoted fields containing line
# breaks are parsed correctly.
with open("/content/drive/MyDrive/DS 3010 - Group Project/accounts.csv", "r", newline="") as file:
  # reading the CSV file
  csvFile = csv.reader(file)

  for lines in csvFile:
    # Skip blank/short rows (previously an IndexError) and rows with an
    # empty handle or party column.
    if len(lines) < 2 or not lines[0] or not lines[1]:
      continue
    handle = lines[0]
    # Only drop the first character when it really is "@"; cutting it
    # unconditionally mangles entries written without one.
    if handle.startswith("@"):
      handle = handle[1:]
    user_map[handle] = lines[1] == "D"

print(len(user_map))

390


In [None]:
from tweepy.parsers import TweepError
# Maximum number of tweets to collect per account (replies and retweets are excluded)
count = 270

print("Setup! Retrieving Tweets now...")

# Search for tweets using Tweepy
# NOTE(original): "Have first 45" — presumably a leftover progress note from a partial scrape.

# Accounts whose timeline request raised a TweepError
deleted_users = []

if SHOULD_SCRAPE:
  for i, (user, is_dem) in enumerate(user_map.items()):
    print(f"User: {user}, Index: {i}, isDem? {is_dem}")
    search = limit_handler(tweepy.Cursor(api.user_timeline,
                            screen_name=user,
                            exclude_replies=True,
                            include_rts=False,
                            count=90).items(count), count)

    # Only the Twitter fetch is guarded, so a database failure is never
    # misreported as a missing user.
    try:
      returned_tweets = [result._json for result in search]
    except TweepError:
      deleted_users.append(user)
      print("USER NOT FOUND!")
      continue

    # insert_many() rejects an empty list, so an account with no qualifying
    # tweets must be skipped rather than aborting the whole scrape.
    if not returned_tweets:
      print(f"No tweets returned for {user}; nothing to insert.")
      continue

    # Pick the target collection and its label once, so the log lines always
    # name the party that was actually written.
    collection, party = (dem_tweets, "Dem") if is_dem else (rep_tweets, "Rep")
    print(f"Inserting Tweets to MongoDB...\nCurrent # {party} Tweets Stored: {collection.count_documents({})}")
    collection.insert_many(returned_tweets)
    print(f"New {party} Tweets Stored: {collection.count_documents({})}")
else:
  print("Not Scraping!")
  
  

# Process the results from the search using Tweepy




Setup! Retrieving Tweets now...
User: BarackObama, Index: 0, isDem? True
|████████████████████████████████████████| 270/270 [100%] in 2.4s (101.54/s)                                            
Inserting Tweets to MongoDB...
Current # Dem Tweets Stored: 0
New Dem Tweets Stored: 270
User: realDonaldTrump, Index: 1, isDem? False
|████████████████████████████████████████| 270/270 [100%] in 1.5s (165.20/s)                                            
Inserting Tweets to MongoDB...
Current # Rep Tweets Stored: 0
New Dem Tweets Stored: 270
User: SpeakerPelosi, Index: 2, isDem? True
|████████████████████████████████████████| 270/270 [100%] in 1.3s (200.00/s)                                            
Inserting Tweets to MongoDB...
Current # Dem Tweets Stored: 270
New Dem Tweets Stored: 540
User: RepAndyBiggsAZ, Index: 3, isDem? False
|████████████████████████████████████████| 270/270 [100%] in 2.0s (138.94/s)                                            
Inserting Tweets to MongoDB...
Current #

In [None]:
#----------------------------------------------

# Deleted accounts: ['RepKClark', 'WhipClyburn', 'RepTomEmmer', 'o official', 'RepKiggans', 'RepLuttrell_TX8', 'teammoulton', 'RepMullin']
#missing_accounts = db.missing_accounts
#print(deleted_users)
#missing_accounts.insert_many({usr: user_map[usr] for usr in missing_accounts})

# Get total number of tweets

# Save tweet text into republican and democratic lists
import pandas as pd
from datetime import datetime

# Pre-existing senators dataset, stored as separate train/test parquet files
file_prefix = "/content/drive/MyDrive/DS 3010 - Group Project/senators_"

df_train = pd.read_parquet(f"{file_prefix}train.parquet", engine='fastparquet')
df_test = pd.read_parquet(f"{file_prefix}test.parquet", engine='fastparquet')
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the rows the same way (original indexes are kept).
df = pd.concat([df_train, df_test])


parsed_tweets = db.parsed_tweets
# Rows built by get_tweets(): [date, id, username, text, party, labels]
formatted = []

def get_tweets(database, is_dem):
  """Append one formatted row per stored tweet to the module-level `formatted` list.

  Args:
    database: MongoDB collection of raw tweet documents; each document must
      provide 'created_at', 'id', 'text' and user.screen_name.
    is_dem: True labels rows "Democrat"/1, False labels them "Republican"/0.

  Documents are read in natural order and a progress bar sized by the
  collection's document count is shown while iterating.
  """
  party_name = "Democrat" if is_dem else "Republican"
  party_label = 1 if is_dem else 0
  total_docs = database.count_documents({})
  with alive_bar(total_docs, force_tty=True) as bar:
    cursor = database.find().sort([('$natural', 1)])
    for tweet in cursor:
      # Twitter timestamps look like "Sun Feb 19 17:35:30 +0000 2023"
      created = datetime.strptime(tweet['created_at'], "%a %b %d %H:%M:%S %z %Y")
      row = [
        created.strftime("%Y-%m-%d %H:%M:%S"),
        tweet['id'],
        tweet['user']['screen_name'],
        tweet['text'],
        party_name,
        party_label,
      ]
      formatted.append(row)
      bar()

print("Retrieving Dem tweets from Mongodb...")
get_tweets(dem_tweets, True)
# This message previously said "Dem" even though the Republican collection is read here
print("Retrieving Rep tweets from Mongodb...")
get_tweets(rep_tweets, False)

# Same column layout as the senators dataset so the two frames can be stacked
formatted_df = pd.DataFrame(formatted,columns=['date','id','username','text','party','labels'])
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# gives the same result (rows stacked, index renumbered from 0).
df = pd.concat([df, formatted_df], ignore_index=True)

print("Total Stored Tweets:")
print(len(df))
display(df.head(10))


79754
19939


Unnamed: 0,date,id,username,text,party,labels
0,2021-10-13 19:47:44,1448374915636383745,SenatorHassan,Happy th birthday to the @USNavy! The strength...,Democrat,1
1,2021-06-30 14:53:13,1410250073003462656,SenatorMenendez,The greatest generation's investment in infras...,Democrat,1
2,2021-08-08 01:11:29,1424176405881966599,SenBillCassidy,"Thanks to @SenTedCruz and @SenatorWarnock, th...",Republican,0
3,2021-04-14 14:02:49,1382333523567185921,SenBlumenthal,/ To get lasting change we cant just lock up t...,Democrat,1
4,2021-12-11 16:06:38,1469700160934621188,SenatorBraun,Today were celebrating years of the Hoosier st...,Republican,0
5,2021-05-11 17:18:50,1392167324401143812,SenJeffMerkley,The #ForthePeopleAct includes reforms that are...,Democrat,1
6,2021-08-10 16:22:55,1425130548620578816,SenBlumenthal,"Todays strong, bipartisan vote is just the beg...",Democrat,1
7,2021-08-12 01:13:26,1425626448518385664,SenatorHagerty,Supporting crime victims requires holding crim...,Republican,0
8,2021-12-11 16:27:51,1469705498224209928,SenBillCassidy,We in Louisiana know how natural disasters cha...,Republican,0
9,2021-01-20 13:55:04,1351890994220957698,CoryBooker,Today we start anew.,Democrat,1


99693
Retrieving Dem tweets from Mongodb...
|████████████████████████████████████████| 46460/46460 [100%] in 17.5s (2638.00/s)                                      
Retrieving Dem tweets from Mongodb...
|████████████████████████████████████████| 45327/45327 [100%] in 18.3s (2471.67/s)                                      


Unnamed: 0,date,id,username,text,party,labels
0,2023-02-19 17:35:30,1627361243689254914,BarackObama,Dr. Blank's four years serving in my administr...,Democrat,1
1,2023-02-14 14:01:18,1625495400470958082,BarackObama,"Happy Valentine’s Day to the one and only, @Mi...",Democrat,1
2,2023-02-13 14:43:21,1625143593286438912,BarackObama,"Congratulations to the Kansas City @Chiefs, Pa...",Democrat,1
3,2023-02-07 01:35:28,1622770990710235136,BarackObama,The scale of devastation after the earthquakes...,Democrat,1
4,2023-02-01 22:01:41,1620905250742730756,BarackObama,Black History Month is about the shared experi...,Democrat,1
...,...,...,...,...,...,...
91782,2022-05-13 15:00:51,1525128949583462402,RepWalberg,Religious liberty is foundational to America. ...,Republican,0
91783,2022-05-12 20:36:47,1524851099211120640,RepWalberg,"During #NationalPoliceWeek, I want to give a h...",Republican,0
91784,2022-05-12 17:47:11,1524808421488336896,RepWalberg,The same week that gas prices hit a record hig...,Republican,0
91785,2022-05-12 15:26:15,1524772953501712384,RepWalberg,Amid another mounting crisis—a nationwide shor...,Republican,0


Total Stored Tweets:
191480


Unnamed: 0,date,id,username,text,party,labels
191479,2022-05-12 14:42:31,1524761948155437059,RepWalberg,"Unreal. Just one more reason the @DHSgov ""Dis...",Republican,0


In [None]:
# Drop tweets whose 'id' already appeared earlier in the frame
# (the recorded run reported 989 such rows)
duplicate_mask = df.duplicated(subset='id')
duplicates = df[duplicate_mask]
print(f"Num duplicate tweets: {len(duplicates)}")
# Keeps the first occurrence of every id and modifies df in place
df.drop_duplicates(subset='id', inplace=True)
print(len(df))


Num duplicate tweets: 989
190491


In [None]:
# Persist the combined, de-duplicated tweets so later cells can reload them
# without touching MongoDB again.
df.to_parquet('/content/drive/MyDrive/DS 3010 - Group Project/formatted_data.parquet', engine='fastparquet')

In [None]:
# Read all data back from the parquet file written by the previous cell
# and print the row count as a sanity check.

df_all = pd.read_parquet("/content/drive/MyDrive/DS 3010 - Group Project/formatted_data.parquet", engine='fastparquet')
print(len(df_all))


190491


**Model Training**

In [None]:
%%capture
!pip install transformers datasets evaluate

In [None]:
# Report whether PyTorch can see a CUDA device in this runtime
import torch
torch.cuda.is_available()

True

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Tokenizer matching the "distilbert-base-uncased" checkpoint fine-tuned below
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(data):
    """Tokenize the "text" field of a dataset batch.

    Every sequence is truncated and padded to the tokenizer's maximum length,
    so all examples end up with the same number of tokens.
    """
    texts = data["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# The published dataset only has a 'train' split; carve out 20% for evaluation
dataset = load_dataset('Jacobvs/PoliticalTweets', split='train')
# Fixed seed so every session produces the same train/test partition. Training
# is resumed from checkpoints across sessions; with an unseeded shuffle, rows
# used for training in an earlier session can land in the test split later and
# inflate the reported accuracy.
# NOTE(review): checkpoints written before this change were trained on a
# different (unseeded) split — retrain from scratch for a clean evaluation.
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=88)

print(dataset['test'][0])

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading and preparing dataset parquet/Jacobvs--PoliticalTweets to /root/.cache/huggingface/datasets/Jacobvs___parquet/Jacobvs--PoliticalTweets-a1faa32ac2009b25/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/Jacobvs___parquet/Jacobvs--PoliticalTweets-a1faa32ac2009b25/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.
{'index': 173218, 'date': '2022-12-22 15:55:29', 'id': 1605955192335192064, 'username': 'RepNancyMace', 'text': 'Last week we called on Congressional leadership to bring the Equal Pay for Team USA Act to the Floor for a vote - t… https://t.co/5HVSEcAvvJ', 'party': 'Republican', 'labels': 0}


In [None]:
# Apply tokenize_function to both splits in batches; the tokenizer output
# (e.g. input_ids) is added as new columns next to the original fields.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/152392 [00:00<?, ? examples/s]

Map:   0%|          | 0/38099 [00:00<?, ? examples/s]

In [None]:
# Peek at one tokenized training example
print(tokenized_datasets['train'][0])

# Shuffle both splits with a fixed seed.
# NOTE: despite the "small_" prefix no subset is selected here — these are the
# full train and test splits in shuffled order.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=88)
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=88)

{'index': 117897, 'date': '2023-01-22 19:08:15', 'id': 1617237727895818244, 'username': 'RepStenyHoyer', 'text': 'Americans woke up to news of yet another senseless mass shooting today and our hearts are with the loved ones of th… https://t.co/444J9XldAZ', 'party': 'Democrat', 'labels': 1, 'input_ids': [101, 4841, 8271, 2039, 2000, 2739, 1997, 2664, 2178, 3168, 3238, 3742, 5008, 2651, 1998, 2256, 8072, 2024, 2007, 1996, 3866, 3924, 1997, 16215, 1529, 16770, 1024, 1013, 1013, 1056, 1012, 2522, 1013, 4008, 2549, 3501, 2683, 2595, 15150, 2480, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
from transformers import AutoModelForSequenceClassification

# DistilBERT encoder with a 2-class sequence-classification head
# (labels in this dataset: 0 = Republican, 1 = Democrat).
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [None]:
import numpy as np
import evaluate

# Accuracy metric from the `evaluate` library, shared by compute_metrics()
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: pair of (logits, labels) as passed by the Trainer.

    Returns:
        The result of the accuracy metric, with the predicted class taken as
        the argmax over the last axis of the logits.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: checkpoints go to the Drive folder below, a checkpoint
# is saved every 1000 steps, metrics are logged every 3500 steps and evaluation
# runs on a step schedule (the recorded run evaluated at steps 49000, 52500 and
# 56000). All other hyper-parameters keep the TrainingArguments defaults.
training_args = TrainingArguments(output_dir="/content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2", 
                                  logging_steps=3500,
                                  save_steps=1000,
                                  evaluation_strategy="steps")

In [None]:

# Bundle the model, training configuration, datasets and metric into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

# Continue training from a checkpoint saved by an earlier session rather than
# starting over; the checkpoint directory must already exist on Drive.
resume_checkpoint = '/content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-48000'
trainer.train(resume_from_checkpoint=resume_checkpoint)

Loading model from /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-48000.
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: date, username, id, index, text, party. If date, username, id, index, text, party are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 152392
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 57147
  Number of trainable parameters = 66955010
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 2
  Continuing training from global step 48000
  Will skip the first 2 epochs then the first 9902 batches in the first epoch. If this takes a lot of time, you can

  0%|          | 0/9902 [00:00<?, ?it/s]

Step,Training Loss,Validation Loss,Accuracy
49000,0.2449,0.243717,0.930707
52500,0.2443,0.244062,0.930051


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: date, username, id, index, text, party. If date, username, id, index, text, party are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 38099
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-49000
Configuration saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-49000/config.json
Model weights saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-49000/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-50000
Configuration saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-50000/config.json
Model weigh

Step,Training Loss,Validation Loss,Accuracy
49000,0.2449,0.243717,0.930707
52500,0.2443,0.244062,0.930051
56000,0.2436,0.240343,0.931442


Saving model checkpoint to /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-54000
Configuration saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-54000/config.json
Model weights saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-54000/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-55000
Configuration saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-55000/config.json
Model weights saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-55000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: date, username, id, index, text, party. If date, username, id, index, text, party are not expected by `DistilBertForSequenceClassification.forwa

TrainOutput(global_step=57147, training_loss=0.03892265353333905, metrics={'train_runtime': 5404.5185, 'train_samples_per_second': 84.591, 'train_steps_per_second': 10.574, 'total_flos': 6.056091544795546e+16, 'train_loss': 0.03892265353333905, 'epoch': 3.0})

In [None]:
# evaluate the current model after training
# trainer.evaluate()
# saving the fine tuned model & tokenizer
# NOTE(review): these paths live under "training_outputs", whereas the
# TrainingArguments output_dir is "training_outputs_2" — confirm this is intended.
model_path = "/content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

# Second copy of the model, written through the Trainer API
trainer.save_model("/content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model_trainer_save_model")
# NOTE(review): the recorded output of this cell ends with "AttributeError: ignored"
# after the save_model log lines, so the failure presumably happens on this call —
# reproduce and fix (or drop the call) before relying on the saved trainer state.
trainer.save_state()

Configuration saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model/config.json
Model weights saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model/special_tokens_map.json
Saving model checkpoint to /content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model_trainer_save_model
Configuration saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model_trainer_save_model/config.json
Model weights saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model_trainer_save_model/pytorch_model.bin


AttributeError: ignored

Reference: [Fine-tuning BERT with Hugging Face Transformers](https://www.thepythoncode.com/code/finetuning-bert-using-huggingface-transformers-python)