**By: Jacob van Steyn, ...**

# DS3010 Project: Twitter Political Analysis


In [None]:
%%capture
!pip install tweepy pymongo alive-progress pandas matplotlib python-dotenv fastparquet

In [None]:
# Mount Google Drive at /content/drive/ so later cells can reach the project folder
# (the .env file, celebs.csv and the parquet/CSV outputs all live under /content/drive/MyDrive/).
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# VARIABLES
# Run-control flags read by later cells.
#
# SHOULD_CLEAR_DB: when True, the MongoDB setup cell runs delete_many({}) on the
# celeb_tweets collection, i.e. it removes EVERY stored tweet. Set to False to keep
# previously scraped data.
SHOULD_CLEAR_DB = True
# SHOULD_SCRAPE: when True, the scraping cell fetches each account's timeline and inserts
# the tweets into MongoDB; when False it only prints "Not Scraping!".
SHOULD_SCRAPE = True


In [None]:
#----------------------------------------------
import pymongo
import os
import pprint
from urllib.parse import quote_plus
from dotenv import load_dotenv

# Using dotenv (loads from a file called '.env') to keep secrets safe
load_dotenv("/content/drive/MyDrive/DS 3010 - Group Project/.env")
MONGO_USERNAME = os.getenv("MONGO_USERNAME")
MONGO_PASSWORD = os.getenv("MONGO_PASSWORD")
MONGO_URL = os.getenv("MONGO_URL")

# Fail early with a readable message instead of building a URI that contains the text "None".
missing_settings = [
    name
    for name, value in (
        ("MONGO_USERNAME", MONGO_USERNAME),
        ("MONGO_PASSWORD", MONGO_PASSWORD),
        ("MONGO_URL", MONGO_URL),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(f"Missing MongoDB settings in .env: {', '.join(missing_settings)}")

# Username and password are percent-escaped before being placed in the URI; characters such
# as '@', ':' or '/' would otherwise corrupt it. Store the raw (unescaped) values in .env.
client = pymongo.MongoClient(
    f"mongodb+srv://{quote_plus(MONGO_USERNAME)}:{quote_plus(MONGO_PASSWORD)}@{MONGO_URL}/?retryWrites=true&w=majority"
)

dblist = client.list_database_names()
if "DS3010" in dblist:
    print("The database already exists!")
else:
    # MongoDB only materialises a database once a document is written to it, so at this
    # point nothing has been created yet.
    print("Database does not exist yet; it will be created on the first insert.")

# Handles to the project database and the collection that holds the raw tweet documents.
db = client.DS3010
celeb_tweets = db.celeb_tweets

if SHOULD_CLEAR_DB:
  # Inserting a throw-away document forces the database/collection to exist before clearing.
  document = celeb_tweets.insert_one({"test":"temp document!"})
  print(f"Num Tweets Stored: {celeb_tweets.count_documents({})}")

  # Clear DB: removes every document in the collection, including the temporary one above.
  celeb_tweets.delete_many({})
  print(f"Num Tweets Stored: {celeb_tweets.count_documents({})}")




The database already exists!
Num Tweets Stored: 9747
Num Tweets Stored: 0


In [None]:
#----------------------------------------------
import json
import time
import re

import tweepy
from alive_progress import alive_bar
import pandas as pd
import matplotlib.pyplot as plt


# Initialize our tweepy client
def setup():
    """Create the Tweepy API wrapper used by the scraping cell.

    Reads TWITTER_CONSUMER_KEY and TWITTER_CONSUMER_SECRET from the environment,
    authenticates with ``tweepy.AppAuthHandler`` and returns a ``tweepy.API`` object
    constructed with ``wait_on_rate_limit=True``.
    """
    credentials = (
        os.getenv("TWITTER_CONSUMER_KEY"),
        os.getenv("TWITTER_CONSUMER_SECRET"),
    )
    app_auth = tweepy.AppAuthHandler(*credentials)
    return tweepy.API(app_auth, wait_on_rate_limit=True)
    
# Helper generator that walks a cursor while showing a progress bar
def limit_handler(cursor, total):
    """Yield items from ``cursor`` one by one, ticking an ``alive_bar`` sized to ``total``.

    The bar advances after each yielded item. When ``cursor.next()`` raises
    ``StopIteration`` the generator ends; if fewer than ``total`` items were seen, a notice
    with the number actually found is printed first. No rate-limit logic lives here (the
    API object from ``setup()`` is created with ``wait_on_rate_limit=True``), and any
    other exception raised by the cursor propagates to the caller.
    """
    with alive_bar(total, force_tty=True) as bar:
        while True:
            try:
                item = cursor.next()
                yield item
                bar()
            except StopIteration:
                if bar.current == total:
                    return
                print(f"[Stopped!] Found {bar.current} tweets for specifed term!")
                return
                
# Build the API client once; the scraping cell uses this `api` object for user_timeline calls.
api = setup()

In [None]:
# Read accounts from CSV
import csv

# Maps the second CSV column to the first; the scraping cell uses the keys as screen names.
user_map = {}
# newline="" is what the csv module asks for so that quoted fields containing line breaks
# are parsed correctly.
with open("/content/drive/MyDrive/DS 3010 - Group Project/celebs.csv", "r", newline="") as file:
  # reading the CSV file
  csvFile = csv.reader(file)

  for lines in csvFile:
    # Blank lines come back as [] and short rows have no second column; indexing them
    # would raise IndexError, so skip them.
    if len(lines) < 2:
      continue
    # Only keep rows where both columns are non-empty.
    if lines[0] and lines[1]:
      user_map[lines[1]] = lines[0]

print(len(user_map))

95


In [None]:
# Tweepy 4 renamed TweepError to TweepyException (and tweepy.parsers no longer exposes the
# old name), so bind whichever one the installed version provides to `TweepError`.
try:
  from tweepy import TweepyException as TweepError
except ImportError:
  from tweepy import TweepError

# Maximum number of tweets to keep per account
count = 180

print("Setup! Retrieving Tweets now...")

# Accounts that returned no tweets or raised an API error (printed by a later cell)
deleted_users = []

if SHOULD_SCRAPE:
  for i, user in enumerate(user_map):
    print(f"User: {user}, Index: {i}")
    # Page through the account's timeline (90 tweets per request, replies and retweets
    # excluded) until `count` tweets have been produced or the timeline is exhausted.
    search = limit_handler(tweepy.Cursor(api.user_timeline,
                            screen_name=user,
                            exclude_replies=True,
                            include_rts=False,
                            count=90).items(count), count)

    returned_tweets = []
    try:
      # Only the fetch can raise a Tweepy error, so keep the try block limited to it.
      for result in search:
        returned_tweets.append(result._json)
    except TweepError as err:
      # Any API failure lands here (not just unknown users), so show the reason as well.
      deleted_users.append(user)
      print("USER NOT FOUND!")
      print(f"Twitter API error for {user}: {err}")
      continue

    if returned_tweets:
      print(f"Inserting Tweets to MongoDB...\nCurrent # Celeb Tweets Stored: {celeb_tweets.count_documents({})}")
      celeb_tweets.insert_many(returned_tweets)
      print(f"New Celeb Tweets Stored: {celeb_tweets.count_documents({})}")
    else:
      deleted_users.append(user)
      print("NO TWEETS FOR USER!")
else:
  print("Not Scraping!")
  
  

# Process the results from the search using Tweepy




Setup! Retrieving Tweets now...
User: @justinbieber, Index: 0
|████████████████████████████████████████| 180/180 [100%] in 1.5s (110.68/s)                                            
Inserting Tweets to MongoDB...
Current # Celeb Tweets Stored: 0
New Celeb Tweets Stored: 180
User: @katyperry, Index: 1
|████████████████████████████████████████| 180/180 [100%] in 1.2s (147.91/s)                                            
Inserting Tweets to MongoDB...
Current # Celeb Tweets Stored: 180
New Celeb Tweets Stored: 360
User: @rihanna, Index: 2
|████████████████████████████████████████| 180/180 [100%] in 1.2s (143.94/s)                                            
Inserting Tweets to MongoDB...
Current # Celeb Tweets Stored: 360
New Celeb Tweets Stored: 540
User: @ZooeyDeschanel, Index: 3
|████████████████████████████████████████| 180/180 [100%] in 1.2s (122.36/s)                                            
Inserting Tweets to MongoDB...
Current # Celeb Tweets Stored: 540
New Celeb Tweets Stor

In [None]:
#----------------------------------------------

# Accounts the scraping cell could not collect tweets for.
# Deleted accounts (noted from an earlier run): ['RepKClark', 'WhipClyburn', 'RepTomEmmer', 'o official', 'RepKiggans', 'RepLuttrell_TX8', 'teammoulton', 'RepMullin']
#missing_accounts = db.missing_accounts
print(deleted_users)
#missing_accounts.insert_many({usr: user_map[usr] for usr in missing_accounts})

import pandas as pd
from datetime import datetime

# Disabled: loading the senators train/test parquet files.
# file_prefix = "/content/drive/MyDrive/DS 3010 - Group Project/senators_"

# df_train = pd.read_parquet(f"{file_prefix}train.parquet", engine='fastparquet')
# df_test = pd.read_parquet(f"{file_prefix}test.parquet", engine='fastparquet')
# df = df_train.append(df_test)


parsed_tweets = db.parsed_celeb_tweets
# Rows of [date, id, username, text]; filled by get_tweets()
formatted = []

def get_tweets(database):
  """Append one row per document in ``database`` to the module-level ``formatted`` list.

  Documents are read in natural order. Each row is
  ``[created_at reformatted as "%Y-%m-%d %H:%M:%S", id, user.screen_name, text]``.
  A progress bar sized to the collection's document count is shown while reading.
  Returns nothing.
  """
  total_docs = database.count_documents({})
  with alive_bar(total_docs, force_tty=True) as bar:
    cursor = database.find().sort([('$natural', 1)])
    for tweet in cursor:
      created = datetime.strptime(tweet['created_at'], "%a %b %d %H:%M:%S %z %Y")
      row = [
          created.strftime("%Y-%m-%d %H:%M:%S"),
          tweet['id'],
          tweet['user']['screen_name'],
          tweet['text'],
      ]
      formatted.append(row)
      bar()

print("Retrieving celeb tweets from Mongodb...")
get_tweets(celeb_tweets)

# `df` and `formatted_df` are two names for the same DataFrame object.
df = formatted_df = pd.DataFrame(formatted, columns=['date','id','username','text'])

print("Total Stored Tweets:", len(df), sep="\n")
display(df.head(10))


['@VenessaHudgens', '@LanaDelRey', '@kanyewest', '@Nicki Minaj', '@JimCarrey']
Retrieving celeb tweets from Mongodb...
|████████████████████████████████████████| 15886/15886 [100%] in 5.0s (3076.75/s)                                       
Total Stored Tweets:
15886


Unnamed: 0,date,id,username,text
0,2022-12-01 19:24:25,1598397624750866432,justinbieber,long time coming and excited to finally announ...
1,2022-09-07 01:14:34,1567320386546589696,justinbieber,https://t.co/qRY7ltRkV0
2,2022-07-07 19:33:51,1545128982143651840,justinbieber,1 year of preparation 🔥 @FreeFire_NA #GarenaFr...
3,2022-02-16 03:05:25,1493783548276314115,justinbieber,"Go for the Gold, Ladies!!!!!! Cannot wait to w..."
4,2021-12-30 22:33:21,1476682851324239873,justinbieber,The countdown begins… https://t.co/S5qZP92Ziz
5,2021-12-14 00:14:59,1470547831434153984,justinbieber,https://t.co/j7EuZcZqnh
6,2021-12-13 17:36:04,1470447442558689289,justinbieber,LOVE MY FAMILY. 📷: @RoryKramer https://t.co/R3...
7,2021-12-13 17:33:29,1470446792588337155,justinbieber,📷: @RoryKramer https://t.co/PgZIWI1OFr
8,2021-12-13 17:30:19,1470445993229537283,justinbieber,London Town wif my baby Hailey Bieber https://...
9,2021-12-06 20:30:00,1467954498085789696,justinbieber,“Virgil was here” https://t.co/SbFar4nqu9


In [None]:
# Drop rows whose tweet 'id' already appeared earlier in the frame
# (the original note here recorded 989 duplicates from an earlier run).
is_repeat = df.duplicated('id')
duplicates = df[is_repeat]
print(f"Num duplicate tweets: {len(duplicates)}")
# In-place on purpose: `formatted_df` refers to the same object and is updated too.
df.drop_duplicates('id', inplace=True)
print(len(df))


Num duplicate tweets: 0
15886


In [None]:
# Write the tweet table to the shared Drive folder as parquet.
df.to_parquet(
    path='/content/drive/MyDrive/DS 3010 - Group Project/formatted_celeb_data.parquet',
    engine='fastparquet',
)

In [None]:
# Load the saved parquet file back and report how many rows it holds
df_all = pd.read_parquet(
    "/content/drive/MyDrive/DS 3010 - Group Project/formatted_celeb_data.parquet",
    engine='fastparquet',
)
print(len(df_all))


15886


**Model Training**

In [None]:
%%capture
!pip install transformers datasets evaluate

In [None]:
import torch
# Displays whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

True

In [None]:
# AutoTokenizer is imported here as well: it is used below, and on a fresh kernel the only
# other import of it sits in a much later cell, which made this cell fail with NameError.
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

# Load dataset
dataset = load_dataset('Jacobvs/CelebrityTweets', split='train')

# Load model and tokenizer (both come from the same model repository)
model = AutoModelForSequenceClassification.from_pretrained("Jacobvs/PoliticalTwitterAnalysis-Distillbert")
tokenizer = AutoTokenizer.from_pretrained("Jacobvs/PoliticalTwitterAnalysis-Distillbert")

# Create the classification pipeline. It is built with the "sentiment-analysis" task name;
# downstream cells read its 'label' and 'score' fields and compare the label to 'LABEL_1'.
recognizer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)



In [None]:
# Define a function to apply the pipeline to each example in the dataset
def predict_sentiment(example):
    """Classify one dataset row and return the new columns for it.

    Args:
        example: a row with at least 'text' and 'username'.

    Returns:
        dict with 'username', 'is_dem' (the label string of the pipeline's first result)
        and 'sentiment_score' (that result's score).

    The function deliberately has no side effects: ``Dataset.map`` may reuse a cached
    result and never call it, so anything collected here could silently end up empty.
    """
    top_result = recognizer(example['text'])[0]
    return {
        'username': example['username'],
        'is_dem': top_result['label'],
        'sentiment_score': top_result['score'],
    }

# Generate sentiment predictions on the loaded dataset
predictions = dataset.map(predict_sentiment)

# Per-user list of weights, rebuilt from the mapped rows so it is always in sync with
# `predictions`: the score itself when the label is LABEL_1, otherwise 1 - score.
outputs = {}
for row in predictions:
    score = row['sentiment_score']
    weight = score if row['is_dem'] == 'LABEL_1' else 1 - score
    outputs.setdefault(row['username'], []).append(weight)

# Print the sentiment predictions
print(predictions)
print(len(outputs))



Map:   0%|          | 0/15886 [00:00<?, ? examples/s]

Dataset({
    features: ['index', 'date', 'id', 'username', 'text', 'is_dem', 'sentiment_score'],
    num_rows: 15886
})
90


In [None]:
# username -> (list of "label is LABEL_1" booleans, list of scores), in dataset order
vals = {}
for row in predictions:
  dem_flags, confidences = vals.setdefault(row['username'], ([], []))
  dem_flags.append(row['is_dem'] == 'LABEL_1')
  confidences.append(row['sentiment_score'])

# username -> [tweet count, mean weight, share of LABEL_1 tweets, mean score, per-tweet flags]
weights = {}
for uname, user_weights in outputs.items():
  dem_flags, confidences = vals[uname]
  weights[uname] = [
      len(user_weights),
      sum(user_weights) / len(user_weights),
      sum(dem_flags) / len(dem_flags),
      sum(confidences) / len(confidences),
      dem_flags,
  ]



In [None]:
# Export the per-user summary to a CSV file on Drive
with open('/content/drive/MyDrive/DS 3010 - Group Project/prediction_output_2.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)

    # Header row, followed by one row per user: the username and its five summary values
    writer.writerow(['Username', 'Num Tweets', 'Weighted Prediction', 'Absolute Prediction', 'Average Confidence', 'Scores'])
    writer.writerows([key, *value[:5]] for key, value in weights.items())

# Unused Code

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Tokenizer saved alongside the fine-tuned model on Drive
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model")

def tokenize_function(data):
    """Tokenize ``data["text"]`` with padding="max_length" and truncation enabled."""
    tokenizer_options = {"padding": "max_length", "truncation": True}
    return tokenizer(data["text"], **tokenizer_options)

dataset = load_dataset('Jacobvs/CelebrityTweets', split='train')

# Show the first row as a quick sanity check
print(dataset[0])



{'index': 0, 'date': '2022-12-01 19:24:25', 'id': 1598397624750866432, 'username': 'justinbieber', 'text': 'long time coming and excited to finally announce https://t.co/dFc8NBoFs0'}


In [None]:
# Run tokenize_function over the dataset in batches and display the resulting Dataset summary.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
display(tokenized_datasets)

Map:   0%|          | 0/15886 [00:00<?, ? examples/s]

Dataset({
    features: ['index', 'date', 'id', 'username', 'text', 'input_ids', 'attention_mask'],
    num_rows: 15886
})

In [None]:
# print(tokenized_datasets['train'][0])

# small_train_dataset = tokenized_datasets["train"].shuffle(seed=88)
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=88)

{'index': 117897, 'date': '2023-01-22 19:08:15', 'id': 1617237727895818244, 'username': 'RepStenyHoyer', 'text': 'Americans woke up to news of yet another senseless mass shooting today and our hearts are with the loved ones of th… https://t.co/444J9XldAZ', 'party': 'Democrat', 'labels': 1, 'input_ids': [101, 4841, 8271, 2039, 2000, 2739, 1997, 2664, 2178, 3168, 3238, 3742, 5008, 2651, 1998, 2256, 8072, 2024, 2007, 1996, 3866, 3924, 1997, 16215, 1529, 16770, 1024, 1013, 1013, 1056, 1012, 2522, 1013, 4008, 2549, 3501, 2683, 2595, 15150, 2480, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
from transformers import AutoModelForSequenceClassification

#model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model", num_labels=2)

In [None]:
from transformers import pipeline
# Load the classifier by its model id and wrap it, together with the `tokenizer` created in
# an earlier cell, in a "sentiment-analysis" pipeline.
model = AutoModelForSequenceClassification.from_pretrained("Jacobvs/PoliticalTwitterAnalysis-Distillbert")
recognizer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [None]:
# The pipeline tokenizes for itself and expects raw text (a string or a list of strings);
# handing it the tokenized Dataset object raised ValueError. Classify a small sample of the
# raw tweet texts instead.
recognizer(tokenized_datasets[:5]["text"])

ValueError: ignored

In [None]:
import numpy as np
import evaluate

# Accuracy metric object used by compute_metrics
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy metric for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class for each example
    is the argmax of the logits over the last axis, and the result is whatever
    ``metric.compute`` returns for those predictions against the labels.
    """
    scores, references = eval_pred
    predicted_labels = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_labels, references=references)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: checkpoints and logs go to training_outputs_2 on Drive, with a
# checkpoint saved every 1000 steps and logging every 3500 steps.
# NOTE(review): no eval_steps is given; the recorded evaluations at steps 49000/52500/56000
# suggest evaluation follows logging_steps — confirm against the installed transformers version.
training_args = TrainingArguments(output_dir="/content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2", 
                                  logging_steps=3500,
                                  save_steps=1000,
                                  evaluation_strategy="steps")

In [None]:

# NOTE(review): small_train_dataset / small_eval_dataset are only assigned in commented-out
# lines of an earlier cell — they must be defined before this cell can run.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Continue training from the saved step-48000 checkpoint
trainer.train(
    resume_from_checkpoint='/content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-48000'
)

Loading model from /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-48000.
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: date, username, id, index, text, party. If date, username, id, index, text, party are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 152392
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 57147
  Number of trainable parameters = 66955010
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 2
  Continuing training from global step 48000
  Will skip the first 2 epochs then the first 9902 batches in the first epoch. If this takes a lot of time, you can

  0%|          | 0/9902 [00:00<?, ?it/s]

Step,Training Loss,Validation Loss,Accuracy
49000,0.2449,0.243717,0.930707
52500,0.2443,0.244062,0.930051


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: date, username, id, index, text, party. If date, username, id, index, text, party are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 38099
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-49000
Configuration saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-49000/config.json
Model weights saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-49000/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-50000
Configuration saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-50000/config.json
Model weigh

Step,Training Loss,Validation Loss,Accuracy
49000,0.2449,0.243717,0.930707
52500,0.2443,0.244062,0.930051
56000,0.2436,0.240343,0.931442


Saving model checkpoint to /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-54000
Configuration saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-54000/config.json
Model weights saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-54000/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-55000
Configuration saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-55000/config.json
Model weights saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs_2/checkpoint-55000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: date, username, id, index, text, party. If date, username, id, index, text, party are not expected by `DistilBertForSequenceClassification.forwa

TrainOutput(global_step=57147, training_loss=0.03892265353333905, metrics={'train_runtime': 5404.5185, 'train_samples_per_second': 84.591, 'train_steps_per_second': 10.574, 'total_flos': 6.056091544795546e+16, 'train_loss': 0.03892265353333905, 'epoch': 3.0})

In [None]:
# evaluate the current model after training
# trainer.evaluate()
# Save the fine-tuned model and the tokenizer into the same directory so they can be
# reloaded together with from_pretrained(model_path).
model_path = "/content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

# Also save a second copy of the model through the Trainer, into a separate directory.
trainer.save_model("/content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model_trainer_save_model")
# NOTE(review): the recorded output shows the three saves above succeeding and then an
# AttributeError, presumably raised by this call — confirm before relying on it.
trainer.save_state()

Configuration saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model/config.json
Model weights saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model/special_tokens_map.json
Saving model checkpoint to /content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model_trainer_save_model
Configuration saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model_trainer_save_model/config.json
Model weights saved in /content/drive/MyDrive/DS 3010 - Group Project/training_outputs/final_model_trainer_save_model/pytorch_model.bin


AttributeError: ignored

Reference: [Fine-tuning BERT with Hugging Face Transformers](https://www.thepythoncode.com/code/finetuning-bert-using-huggingface-transformers-python)