In [1]:
!pip install gensim==3.8.3
!pip install keras --upgrade
!pip install pandas --upgrade
!pip install tensorflow --upgrade
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')

# Word2vec
import gensim

#transformers
from transformers import BertTokenizerFast

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#vocab_size = 290419
# DATASET
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8

# TEXT CLENAING
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC 
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

# KERAS
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
#LOAD DATASET
main_dir = '/content/drive/MyDrive/nns/'
df = pd.read_csv(main_dir+'training.1600000.processed.noemoticon.csv', encoding = DATASET_ENCODING,  names=DATASET_COLUMNS)
neg_df = df[df.target == 0].sample(n=25000, random_state=5)
pos_df = df[df.target == 4].sample(n=25000, random_state=5)

In [10]:
decode_map = {0: "NEGATIVE", 2:"NEUTRAL", 4: "POSITIVE"}
def decode_sentiment(label):
    return decode_map[int(label)]

In [12]:
stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")
def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = str(text).lower().strip()
    text = re.sub("\n", "", text)
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

In [13]:
%%time
df.text = df.text.apply(lambda x: preprocess(x))

CPU times: user 53 s, sys: 273 ms, total: 53.3 s
Wall time: 55.3 s


In [14]:
df_train, df_test = train_test_split(df, test_size=1-TRAIN_SIZE, random_state=42)
print("TRAIN size:", len(df_train))
print("TEST size:", len(df_test))

TRAIN size: 1280000
TEST size: 320000


In [15]:
#LOAD POLITICIANS TWEETS
twt_load_dir = main_dir+'twitter_api_data/original/'

# tweets regarding the politicans
mehmet_oz_df = pd.read_csv(twt_load_dir+"adam_laxalt.csv", encoding =DATASET_ENCODING)
john_fetterman_df = pd.read_csv(twt_load_dir+"john_fetterman.csv")
adam_laxalt_df = pd.read_csv(twt_load_dir+"adam_laxalt.csv")
catherine_cortez_masto_df = pd.read_csv(twt_load_dir+"catherine_cortez_masto.csv")
ron_johnson_df = pd.read_csv(twt_load_dir+"ron_johnson.csv")
mandela_barnes_df = pd.read_csv(twt_load_dir+"mandela_barnes.csv")
donald_bolduc_df = pd.read_csv(twt_load_dir+"donald_bolduc.csv")
maggie_hassan_df = pd.read_csv(twt_load_dir+"maggie_hassan.csv")
ted_budd_df = pd.read_csv(twt_load_dir+"ted_budd.csv")
cheri_beasly_df = pd.read_csv(twt_load_dir+"cheri_beasly.csv")
joe_pinion_df = pd.read_csv(twt_load_dir+"joe_pinion.csv")
charles_schumer_df = pd.read_csv(twt_load_dir+"charles_schumer.csv")
jd_vance_df = pd.read_csv(twt_load_dir+"jd_vance.csv")
tim_ryan_df = pd.read_csv(twt_load_dir+"tim_ryan.csv")

In [16]:
#LOAD RNN MODEL
main_dir = '/content/drive/MyDrive/nns/'
nn_load_dir = main_dir+'saved_nn_models/'
model = tf.keras.models.load_model(nn_load_dir+'RNN.h5')
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 300, 300)          87125700  
                                                                 
 dropout_6 (Dropout)         (None, 300, 300)          0         
                                                                 
 gru_7 (GRU)                 (None, 1024)              4073472   
                                                                 
 dense_5 (Dense)             (None, 1)                 1025      
                                                                 
Total params: 91,200,197
Trainable params: 4,074,497
Non-trainable params: 87,125,700
_________________________________________________________________


In [17]:
def decode_sentiment(score, include_neutral=False):
    if include_neutral:        
        label = NEUTRAL
        if score <= SENTIMENT_THRESHOLDS[0]:
            label = NEGATIVE
        elif score >= SENTIMENT_THRESHOLDS[1]:
            label = POSITIVE

        return label
    else:
        return 0 if score < 0.5 else 1

In [18]:
#LOAD TOKENIZER
tokenizer_path = main_dir+'tokenizer.pkl'

with open(tokenizer_path, 'rb') as f:
  tokenizer = pickle.load(f)

In [26]:
def predict(text, include_neutral=False):
    # Tokenize text
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=SEQUENCE_LENGTH)
    # Predict
    score = model.predict([x_test], verbose=0)[0]
    # Decode sentiment
    label = decode_sentiment(score, include_neutral=include_neutral)
    return label

In [31]:
def run_model(df):
  df.Tweet = df.Tweet.apply(lambda x: preprocess(x))
  df['sentiment'] = 0
  length = len(df)
  for i in range(length):
    text = df.Tweet[i]
    label = predict(text, include_neutral=False)
    df.at[i, 'sentiment'] = label
    

In [32]:
# run model on all csvs
run_model(mehmet_oz_df)
run_model(john_fetterman_df)
run_model(adam_laxalt_df)
run_model(catherine_cortez_masto_df)
run_model(ron_johnson_df)
run_model(mandela_barnes_df)
run_model(donald_bolduc_df)
run_model(maggie_hassan_df)
run_model(ted_budd_df)
run_model(cheri_beasly_df)
run_model(joe_pinion_df)
run_model(charles_schumer_df)
run_model(jd_vance_df)
run_model(tim_ryan_df)

In [35]:
# get number of positive and negatives tweets from each df
def get_sentiment(df):
    pos = 0
    neg = 0
    for index, row in df.iterrows():
        if row['sentiment'] == 0:
            neg += 1
        else:
            pos += 1
    return pos, neg

# get pos/neg ,pos/all, neg/all ratio for all dfs
def get_ratios(df):
    pos, neg = get_sentiment(df)
    pos_all = pos / (pos + neg)
    neg_all = neg / (pos + neg)
    pos_neg = pos / neg
    # round all ratios to 2 decimal places
    pos_all = round(pos_all, 2)
    neg_all = round(neg_all, 2)
    pos_neg = round(pos_neg, 2)
    return pos_all, neg_all, pos_neg

# sum number of positive and negatives tweets from list of df
def sum_sentiment(dfs):
    pos = 0
    neg = 0
    for df in dfs:
        pos_df, neg_df = get_sentiment(df)
        pos += pos_df
        neg += neg_df
    return pos, neg

# average the ratios of winners and losers
def avg_ratios(dfs):
    pos_all = 0
    neg_all = 0
    pos_neg = 0
    for df in dfs:
        pos_all_df, neg_all_df, pos_neg_df = get_ratios(df)
        pos_all += pos_all_df
        neg_all += neg_all_df
        pos_neg += pos_neg_df
    pos_all = pos_all / len(dfs)
    neg_all = neg_all / len(dfs)
    pos_neg = pos_neg / len(dfs)
    return pos_all, neg_all, pos_neg

In [36]:
# get number of positive and negatives tweets from each df
pos, neg = get_sentiment(mehmet_oz_df)
print("Mehmet Oz: ", pos, neg)
pos, neg = get_sentiment(john_fetterman_df)
print("John Fetterman: ", pos, neg)
pos, neg = get_sentiment(adam_laxalt_df)
print("Adam Laxalt: ", pos, neg)
pos, neg = get_sentiment(catherine_cortez_masto_df)
print("Catherine Cortez Masto: ", pos, neg)
pos, neg = get_sentiment(ron_johnson_df)
print("Ron Johnson: ", pos, neg)
pos, neg = get_sentiment(mandela_barnes_df)
print("Mandela Barnes: ", pos, neg)
pos, neg = get_sentiment(donald_bolduc_df)
print("Donald Bolduc: ", pos, neg)
pos, neg = get_sentiment(maggie_hassan_df)
print("Maggie Hassan: ", pos, neg)
pos, neg = get_sentiment(ted_budd_df)
print("Ted Budd: ", pos, neg)
pos, neg = get_sentiment(cheri_beasly_df)
print("Cheri Beasley: ", pos, neg)
pos, neg = get_sentiment(joe_pinion_df)
print("Joe Pinion: ", pos, neg)
pos, neg = get_sentiment(charles_schumer_df)
print("Charles Schumer: ", pos, neg)
pos, neg = get_sentiment(jd_vance_df)
print("JD Vance: ", pos, neg)
pos, neg = get_sentiment(tim_ryan_df)
print("Tim Ryan: ", pos, neg)

Mehmet Oz:  145 75
John Fetterman:  75 425
Adam Laxalt:  145 75
Catherine Cortez Masto:  100 30
Ron Johnson:  485 15
Mandela Barnes:  340 100
Donald Bolduc:  15 10
Maggie Hassan:  115 15
Ted Budd:  60 440
Cheri Beasley:  155 10
Joe Pinion:  30 20
Charles Schumer:  115 60
JD Vance:  305 195
Tim Ryan:  300 200


In [37]:
#get ratios of pos:neg tweets from all dfs
pos_all, neg_all, pos_neg = get_ratios(mehmet_oz_df)
print("Mehmet Oz: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(john_fetterman_df)
print("John Fetterman: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(adam_laxalt_df)
print("Adam Laxalt: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(catherine_cortez_masto_df)
print("Catherine Cortez Masto: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(ron_johnson_df)
print("Ron Johnson: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(mandela_barnes_df)
print("Mandela Barnes: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(donald_bolduc_df)
print("Donald Bolduc: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(maggie_hassan_df)
print("Maggie Hassan: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(ted_budd_df)
print("Ted Budd: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(cheri_beasly_df)
print("Cheri Beasley: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(joe_pinion_df)
print("Joe Pinion: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(charles_schumer_df)
print("Charles Schumer: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(jd_vance_df)
print("JD Vance: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(tim_ryan_df)
print("Tim Ryan: ", pos_all, neg_all, pos_neg)

Mehmet Oz:  0.66 0.34 1.93
John Fetterman:  0.15 0.85 0.18
Adam Laxalt:  0.66 0.34 1.93
Catherine Cortez Masto:  0.77 0.23 3.33
Ron Johnson:  0.97 0.03 32.33
Mandela Barnes:  0.77 0.23 3.4
Donald Bolduc:  0.6 0.4 1.5
Maggie Hassan:  0.88 0.12 7.67
Ted Budd:  0.12 0.88 0.14
Cheri Beasley:  0.94 0.06 15.5
Joe Pinion:  0.6 0.4 1.5
Charles Schumer:  0.66 0.34 1.92
JD Vance:  0.61 0.39 1.56
Tim Ryan:  0.6 0.4 1.5


In [38]:
# winners vs losers
winners = [john_fetterman_df, catherine_cortez_masto_df, ron_johnson_df, maggie_hassan_df, ted_budd_df, charles_schumer_df, jd_vance_df]
losers = [mehmet_oz_df, adam_laxalt_df, mandela_barnes_df, donald_bolduc_df, cheri_beasly_df, joe_pinion_df, tim_ryan_df]

In [39]:
#sum number of pos and neg tweets from list of df
pos, neg = sum_sentiment(winners)
print("Winners: ", pos, neg)
pos, neg = sum_sentiment(losers)
print("Losers: ", pos, neg)

Winners:  1255 1180
Losers:  1130 490


In [40]:
# average the ratios of winners and losers
pos_all, neg_all, pos_neg = avg_ratios(winners)
print("Winners: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = avg_ratios(losers)
print("Losers: ", pos_all, neg_all, pos_neg)

Winners:  0.5942857142857143 0.4057142857142857 6.732857142857143
Losers:  0.6899999999999998 0.31 3.894285714285714


In [41]:
# democrats vs republicans
dems = [mehmet_oz_df, adam_laxalt_df, ron_johnson_df, donald_bolduc_df, ted_budd_df, joe_pinion_df, jd_vance_df]
reps = [john_fetterman_df, catherine_cortez_masto_df, mandela_barnes_df, maggie_hassan_df, cheri_beasly_df, charles_schumer_df, tim_ryan_df]


In [42]:
#sum numer of dem and rep tweets from list of df
pos, neg = sum_sentiment(dems)
print("Democrats: ", pos, neg)
pos, neg = sum_sentiment(reps)
print("Republicans: ", pos, neg)

Democrats:  1185 830
Republicans:  1200 840


In [43]:
#average the ratios of dems and reps
pos_all, neg_all, pos_neg = avg_ratios(dems)
print("Democrats: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = avg_ratios(reps)
print("Republicans: ", pos_all, neg_all, pos_neg)

Democrats:  0.602857142857143 0.3971428571428572 5.841428571428572
Republicans:  0.6814285714285714 0.3185714285714286 4.785714285714286


In [44]:
# convert all dfs to csv
path = main_dir+'politician_csvs/'
john_fetterman_df.to_csv(path+"john_fetterman_annotated.csv")
mehmet_oz_df.to_csv(path+"mehmet_oz_annotated.csv")
adam_laxalt_df.to_csv(path+"adam_laxalt_annotated.csv")
catherine_cortez_masto_df.to_csv(path+"catherine_cortez_masto_annotated.csv")
ron_johnson_df.to_csv(path+"ron_johnson_annotated.csv")
mandela_barnes_df.to_csv(path+"mandela_barnes_annotated.csv")
donald_bolduc_df.to_csv(path+"donald_bolduc_annotated.csv")
maggie_hassan_df.to_csv(path+"maggie_hassan_annotated.csv")
ted_budd_df.to_csv(path+"ted_budd_annotated.csv")
cheri_beasly_df.to_csv(path+"cheri_beasly_annotated.csv")
joe_pinion_df.to_csv(path+"joe_pinion_annotated.csv")
charles_schumer_df.to_csv(path+"charles_schumer_annotated.csv")
jd_vance_df.to_csv(path+"jd_vance_annotated.csv")
tim_ryan_df.to_csv(path+"tim_ryan_annotated.csv")