In [1]:
import pandas as pd
import re
import os
import numpy as np
import gensim
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Convolution1D, MaxPooling1D, LSTM
from keras.layers import Input, Conv1D, MaxPool1D, Bidirectional, GRU
from keras.models import Sequential, Model
from keras.layers.merge import concatenate
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 520 kB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 78.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 70.7 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 81.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [None]:
## Clean text
def preprocess(text, remove_stop_words = False, stem=True):
    # remove link
    text = re.sub(r"(http?\://|https?\://|www)\S+", " ", str(text).lower()).strip()
    # remove newlines
    text = re.sub(r'\n', ' ', text)
    # remove puctuations and special characters
    text = re.sub(r'\W+', ' ', text)
    # Substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    # remove first space
    text = re.sub(r'^\s+', '', text)
    # Removing prefixed 'b'
    text = re.sub(r'^b\s+', '', text)
    
    if remove_stop_words or stem:
        tokens = []
        for token in text.split():
            if remove_stop_words:
                if token not in stop_words:
                    if stem:
                        tokens.append(stemmer.stem(token))
                    else:
                        tokens.append(token)
            else:
                if stem:
                    tokens.append(stemmer.stem(token))
                else:
                    tokens.append(token)
        return(" ".join(tokens))
    else:
        return(text)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
## load data
df = pd.read_csv("/content/drive/My Drive/colab_data/cong_politician_tweets_2020-3-12-2021-5-28_text_party_balanced.csv", encoding = "UTF-8")

In [None]:
df.head()

Unnamed: 0,user_id,status_id,created_at,screen_name,text,party
0,x20467163,x649659448751992833,2015-10-01 18:57:37,VernBuchanan,Sgt. Martland should be commended - not punish...,R
1,x247334603,x1089996836725841926,2019-01-28 21:21:11,SenatorDurbin,Don’t be fooled by the President’s tweets. The...,D
2,x63150856,x1155945861739716610,2019-07-29 20:59:04,VoteMarsha,As if decriminalizing illegal immigration and ...,R
3,x142332083,x711713134331826176,2016-03-21 00:36:48,RonJohnsonWI,Good luck to @BadgerMBB! #OnWisconsin #MarchMa...,R
4,x312134473,x1029813906234122242,2018-08-15 19:35:42,RepLindaSanchez,"6 years ago, the Obama Administration gave #DR...",D


In [None]:
%%time
df.text = df.text.apply(lambda x: preprocess(x, remove_stop_words = False, stem=False))

CPU times: user 1min 36s, sys: 1.09 s, total: 1min 37s
Wall time: 1min 46s


In [4]:
## drop NAs
df = df[df['party'].notna()]

In [5]:
possible_labels = df.party.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
print(label_dict)

df['party'] = df.party.replace(label_dict)

{'R': 0, 'D': 1}


In [None]:
from transformers import BertTokenizer, BertModel, TFBertForSequenceClassification
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
input_ids=[]
attention_masks=[]
labels = []

for indx, row in df.iterrows():
    if isinstance(row.text, str):
        bert_inp = tokenizer.encode_plus(row.text,
                                          add_special_tokens = True,
                                          max_length =64,
                                          pad_to_max_length = True,
                                          return_attention_mask = True)
        input_ids.append(bert_inp['input_ids'])
        attention_masks.append(bert_inp['attention_mask'])
        labels.append(row.party)

input_ids=np.asarray(input_ids)
attention_masks=np.array(attention_masks)
labels = np.array(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
np.save('/content/drive/My Drive/colab_data/cong_politician_tweets_2020-3-12-2021-5-28_input_ids.npy', input_ids)
np.save('/content/drive/My Drive/colab_data/cong_politician_tweets_2020-3-12-2021-5-28_atttion_masks.npy', attention_masks)
np.save('/content/drive/My Drive/colab_data/cong_politician_tweets_2020-3-12-2021-5-28_labels.npy', labels)

In [None]:
input_ids = np.load('/content/drive/My Drive/colab_data/cong_politician_tweets_2020-3-12-2021-5-28_input_ids.npy')
attention_masks = np.load('/content/drive/My Drive/colab_data/cong_politician_tweets_2020-3-12-2021-5-28_atttion_masks.npy')
labels = np.load('/content/drive/My Drive/colab_data/cong_politician_tweets_2020-3-12-2021-5-28_labels.npy')

In [None]:
train_inp,val_inp,train_label,val_label,train_mask,val_mask=train_test_split(input_ids,
                                                                             labels,
                                                                             attention_masks,
                                                                             test_size=0.2, 
                                                                             random_state = 42)

print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(train_inp.shape,val_inp.shape,train_label.shape,val_label.shape,train_mask.shape,val_mask.shape))


Train inp shape (2003936, 64) Val input shape (500984, 64)
Train label shape (2003936,) Val label shape (500984,)
Train attention mask shape (2003936, 64) Val attention mask shape (500984, 64)


In [None]:
del input_ids, attention_masks, labels

In [None]:
import tensorflow as tf

In [None]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-07)

print('\nBert Model',model.summary())
model.compile(loss=loss,optimizer=optimizer,metrics=[metric])

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________

Bert Model None


In [None]:
callbacks = [ tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)]

In [None]:
history=model.fit([input_ids,attention_masks],
                  labels,
                  batch_size=300,
                  epochs=14,
                  validation_data=([val_inp,val_mask],val_label),
                  #validation_split = 0.2,
                  verbose = 1,
                  callbacks=callbacks)

Epoch 1/14