In [None]:
!pip install -qq transformers

[K     |████████████████████████████████| 2.3MB 6.9MB/s 
[K     |████████████████████████████████| 3.3MB 38.2MB/s 
[K     |████████████████████████████████| 901kB 45.2MB/s 
[?25h

In [None]:
import os

import torch
import transformers
from google.colab import drive
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

In [None]:
import pandas as pd

In [None]:
import numpy as np
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict,abc
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
from google.colab import files

In [None]:
# Show which GPU is attached to the runtime and how much of its memory is in use.
!nvidia-smi

Fri May 21 17:19:47 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    10W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# The dataset lives on Google Drive, so mount the drive first; without this the
# path below does not exist on a freshly started runtime.
drive.mount('/content/drive')

# One tweet per row (`Tweets`) with an integer `Political_leaning` label.
df = pd.read_csv('/content/drive/MyDrive/dataScience/final_final_version.csv')

In [None]:
# Preview the loaded frame (the notebook renders a truncated head/tail view).
df

Unnamed: 0,Tweets,Political_leaning
0,These men and women made the ultimate sacrific...,1
1,"Yesterday, I cosponsored a resolution that pas...",1
2,Today and I introduced legislation to establis...,1
3,Today I sent a letter about his denial on ‘cat...,1
4,FAA has awarded $12.28M in grant funding to im...,1
...,...,...
973502,- thank you for great tour -- and reminder of ...,1
973503,Obama considering slashing our nuclear arsenal...,1
973504,- thanks to you and whole crew at your great s...,1
973505,Tune in tonight guest hosting discussing Obama...,1


In [None]:
# Column dtypes and non-null counts, to check for missing tweets or labels.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 973507 entries, 0 to 973506
Data columns (total 2 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Tweets             973507 non-null  object
 1   Political_leaning  973507 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 14.9+ MB


In [None]:
# Human-readable names for the integer labels, indexed by label value.
# NOTE(review): assumes 0 = Democratic and 1 = Republican — confirm against how the CSV was labelled.
class_names = ['Democratic','Republican']

In [None]:
# Hugging Face checkpoint name used for both the tokenizer and the BERT encoder.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

In [None]:
# Tokenizer for the checkpoint named in PRE_TRAINED_MODEL_NAME (downloaded on first use).
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




In [None]:
# Sanity-check the tokenizer on a single tweet before building the dataset.
encoding = tokenizer.encode_plus(
  df.loc[0,'Tweets'],
  max_length=120,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  padding='max_length',  # replaces the deprecated `pad_to_max_length=True`
  truncation=True,  # explicit, so over-long tweets are cut to `max_length` without a warning
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)
encoding.keys()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


dict_keys(['input_ids', 'attention_mask'])

In [None]:
# encoding['input_ids'].flatten()

In [None]:
# token_lens = []
# for txt in tqdm_notebook(df.Tweets):
#   tokens = tokenizer.encode(txt,max_length=240)
#   token_lens.append(len(tokens))

In [None]:
# sns.distplot(token_lens)
# plt.xlim([0, 256]);
# plt.xlabel('Token count');

In [None]:
from tqdm import tqdm_notebook

In [None]:
# Token length passed as `max_len` to the data loaders (and from there to the tokenizer).
MAX_LEN=100

In [None]:
class PoliticalLeaning(Dataset):
  """Map-style dataset that tokenizes tweets on the fly.

  Args:
    tweets: indexable, sized collection of tweet texts.
    targets: indexable, sized collection of integer class labels aligned with `tweets`.
    tokenizer: object exposing a Hugging Face style `encode_plus` method.
    max_len: number of tokens every encoded tweet is padded/truncated to.

  Raises:
    ValueError: if `tweets` and `targets` do not have the same length.
  """

  def __init__(self, tweets, targets, tokenizer, max_len):
    # Fail early on misaligned inputs instead of raising an IndexError mid-epoch.
    if len(tweets) != len(targets):
      raise ValueError(
        f'tweets and targets must have the same length, got {len(tweets)} and {len(targets)}'
      )
    self.tweets = tweets
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of tweets in the dataset."""
    return len(self.tweets)

  def __getitem__(self,item):
    """Return the raw text, token ids, attention mask and label of tweet `item`."""
    tweets = str(self.tweets[item])
    target = self.targets[item]

    encoding = self.tokenizer.encode_plus(
      tweets,
      max_length=self.max_len,
      add_special_tokens=True, # Add '[CLS]' and '[SEP]'
      return_token_type_ids=False,
      padding='max_length',  # replaces the deprecated `pad_to_max_length=True`
      truncation=True,  # explicit, so every item has exactly `max_len` tokens
      return_attention_mask=True,
      return_tensors='pt',  # Return PyTorch tensors
    )

    # The tokenizer returns tensors with a leading batch axis of size 1; flatten it
    # away so the DataLoader can stack items into (batch, max_len).
    return {'tweets_text':tweets,
            'input_ids':encoding['input_ids'].flatten(),
            'attention_mask':encoding['attention_mask'].flatten(),
            'targets':torch.tensor(target,dtype=torch.long)}

In [None]:
# Train on a stratified 10% sample and hold out a second, disjoint 10% for evaluation.
# Validation and test rows must come from the hold-out portion: carving them out of
# `df_train` would score the model on tweets it was trained on.
df_train, df_holdout = train_test_split(
  df,
  train_size=0.1,
  test_size=0.1,
  random_state=RANDOM_SEED,
  stratify=df['Political_leaning'],
)
df_val, df_test = train_test_split(
  df_holdout,
  test_size=0.5,
  random_state=RANDOM_SEED,
  stratify=df_holdout['Political_leaning'],
)

# Guard against leakage between the three splits.
assert df_train.index.intersection(df_val.index).empty
assert df_train.index.intersection(df_test.index).empty
assert df_val.index.intersection(df_test.index).empty

In [None]:
# Row/column counts of each split.
df_train.shape, df_val.shape, df_test.shape

((97350, 2), (48675, 2), (48675, 2))

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=2):
  """Wrap a dataframe of tweets in a `PoliticalLeaning` dataset and a `DataLoader`.

  Args:
    df: dataframe with a `Tweets` text column and a `Political_leaning` label column.
    tokenizer: tokenizer handed to the dataset.
    max_len: token length every tweet is padded/truncated to.
    batch_size: number of tweets per batch.
    shuffle: reshuffle the rows every epoch; pass True for the training loader.
      Defaults to False, which keeps the rows in dataframe order.
    num_workers: number of worker processes used to tokenize batches.

  Returns:
    A `DataLoader` over the tokenized tweets.
  """
  ds = PoliticalLeaning(tweets=df.Tweets.to_numpy(),
                        targets=df.Political_leaning.to_numpy(),
                        tokenizer=tokenizer,
                        max_len=max_len)
  return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)

In [None]:
# Number of tweets per mini-batch, shared by all three data loaders.
BATCH_SIZE=16

In [None]:
# One loader per split, all built with the same tokenizer, sequence length and batch size.
train_data_loader, val_data_loader, test_data_loader = (
  create_data_loader(split_df, tokenizer, MAX_LEN, BATCH_SIZE)
  for split_df in (df_train, df_val, df_test)
)

In [None]:
# Pull one batch from the training loader to inspect what the dataset yields.
data = next(iter(train_data_loader))



In [None]:
# Fields present in a collated batch.
data.keys()

dict_keys(['tweets_text', 'input_ids', 'attention_mask', 'targets'])

In [None]:
# Shapes of the batched tensors and the number of raw texts in the batch.
data['input_ids'].shape,data['attention_mask'].shape, data['targets'].shape,len(data['tweets_text'])

(torch.Size([16, 100]), torch.Size([16, 100]), torch.Size([16]), 16)

Building our model

In [None]:
# Stand-alone BERT encoder, loaded here only to inspect its outputs in the next cells.
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# `return_dict=False` makes the model return a plain (last_hidden_state, pooled_output)
# tuple; on current transformers versions the default is an output object that does
# not unpack into the two tensors.
last_hidden_state,pooled_output = bert_model(input_ids=encoding['input_ids'],attention_mask=encoding['attention_mask'],return_dict=False)

In [None]:
# Run the sample encoding through BERT; `return_dict=False` requests a tuple so the
# two outputs can be unpacked directly.
last_hidden_state, pooled_output = bert_model(
  input_ids=encoding['input_ids'],
  attention_mask=encoding['attention_mask'],
  return_dict=False
)

In [None]:
# Shapes of the two BERT outputs for the single sample tweet.
last_hidden_state.shape, pooled_output.shape

(torch.Size([1, 120, 768]), torch.Size([1, 768]))

In [None]:
# Hidden size of the encoder; the classifier defined below uses it as the input width of its linear layer.
bert_model.config.hidden_size

768

In [None]:
class PoliticalLeaningClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear layer that emits one score per class."""

  def __init__(self,n_classes):
    """Load the pre-trained encoder and size the output layer for `n_classes`."""
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    hidden_size = self.bert.config.hidden_size
    self.out = nn.Linear(hidden_size,n_classes)

  def forward(self,input_ids,attention_mask):
    """Return the class scores for a batch of token ids and attention masks."""
    bert_outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False,
    )
    # With `return_dict=False` the encoder returns a tuple; its second entry is the pooled output.
    pooled_output = bert_outputs[1]
    return self.out(self.drop(pooled_output))

In [None]:
# Build the classifier with one output per entry in `class_names`, then move it to the selected device.
model = PoliticalLeaningClassifier(len(class_names))
model = model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Move the sample batch's inputs onto the same device as the model.
input_ids = data['input_ids'].to(device)
attention_mask= data['attention_mask'].to(device)

In [None]:
# Confirm the input shapes before the forward pass.
input_ids.shape,attention_mask.shape

(torch.Size([16, 100]), torch.Size([16, 100]))

In [None]:
# Smoke-test a forward pass on the sample batch: one row of scores per tweet, one column per class.
model(input_ids,attention_mask)

tensor([[-0.7212, -0.1746],
        [-0.4057, -0.0570],
        [-0.1577, -0.4531],
        [-1.0542, -0.5611],
        [ 0.4621, -0.8088],
        [-1.2529, -0.5181],
        [-0.6856, -0.6011],
        [-0.7108, -1.1961],
        [-0.1124, -1.0052],
        [-0.2163, -0.8994],
        [-0.4457, -0.9567],
        [-0.3381, -0.8772],
        [-0.9602, -0.5482],
        [-0.7608, -0.8844],
        [-0.4807, -0.7386],
        [-1.2756, -0.4617]], device='cuda:0', grad_fn=<AddmmBackward>)

In [None]:
# Highest score in each row together with its column index, i.e. the predicted class.
torch.max(model(input_ids,attention_mask),dim=1)

torch.return_types.max(values=tensor([-0.1929, -0.2416, -0.1280, -0.4141, -0.7498, -0.6918, -0.7081, -0.3081,
        -0.7332, -0.1420, -0.2414, -0.0744, -0.1184, -0.7588, -0.4415, -0.6538],
       device='cuda:0', grad_fn=<MaxBackward0>), indices=tensor([0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0], device='cuda:0'))

In [None]:
# Number of passes over the training loader.
EPOCHS=5
# AdamW here is the implementation imported from `transformers`.
# NOTE(review): `correct_bias` looks specific to that implementation — confirm before
# swapping in `torch.optim.AdamW`.
optimizer = AdamW(model.parameters(),lr=2e-5,correct_bias=False)
# `train_epoch` steps the scheduler once per batch, so the schedule spans batches-per-epoch times epochs.
total_steps = len(train_data_loader) * EPOCHS

# Linear learning-rate schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
# Check GPU memory again now that the model has been moved to the device.
!nvidia-smi

Fri May 21 17:22:20 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    27W /  70W |   3922MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
def train_epoch(model,data_loader,loss_fn,optimizer,device,scheduler,n_examples):
  """Run one optimisation pass over `data_loader`.

  Each batch is a dict with 'input_ids', 'attention_mask' and 'targets'. After every
  batch the gradients are clipped to norm 1.0, the optimizer and the scheduler are
  stepped, and the gradients are cleared.

  Returns:
    (accuracy, mean_loss): the number of correct predictions divided by `n_examples`
    as a float64 tensor, and the mean of the per-batch losses.
  """
  model.train()
  batch_losses = []
  n_correct = 0

  for batch in data_loader:
    targets = batch['targets'].to(device)
    logits = model(
      input_ids=batch['input_ids'].to(device),
      attention_mask=batch['attention_mask'].to(device),
    )
    predicted = torch.max(logits, dim=1).indices
    loss = loss_fn(logits, targets)

    n_correct = n_correct + (predicted == targets).sum()
    batch_losses.append(loss.item())

    # Backward pass, gradient clipping, then one optimizer and one scheduler step per batch.
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return n_correct.double() / n_examples, np.mean(batch_losses)

In [None]:
def eval_model(model,data_loader,loss_fn,device,n_examples):
  """Score `model` on `data_loader` with gradients disabled.

  Each batch is a dict with 'input_ids', 'attention_mask' and 'targets'.

  Returns:
    (accuracy, mean_loss): the number of correct predictions divided by `n_examples`
    as a float64 tensor, and the mean of the per-batch losses.
  """
  model.eval()
  batch_losses = []
  n_correct = 0

  with torch.no_grad():
    for batch in data_loader:
      targets = batch['targets'].to(device)
      logits = model(
        input_ids=batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
      )
      predicted = torch.max(logits, dim=1).indices

      batch_losses.append(loss_fn(logits, targets).item())
      n_correct = n_correct + (predicted == targets).sum()

  return n_correct.double() / n_examples, np.mean(batch_losses)

In [None]:
%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-'*10)

  train_acc,train_loss = train_epoch(model, train_data_loader,loss_fn, optimizer,device,scheduler,len(df_train))

  print(f'Train Loss {train_loss} Accuracy {train_acc}')

  val_acc,val_loss = eval_model(model, val_data_loader,loss_fn,device,len(df_val))

  print(f'Val Loss {val_loss} Accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc>best_accuracy:
    model_name = f'BERT_classifier_{epoch}_Political_leaning.pt'
    path = f"/content/drive/MyDrive/dataScience/Political_leaning/{model_name}" 
    torch.save(model.state_dict(), path)
    best_accuracy = val_acc

Epoch 1/5
----------




Train Loss 0.5424099762614085 Accuracy 0.7144838212634823




Val Loss 0.38008995583348865 Accuracy 0.8267077555213148



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 2/5
----------




Train Loss 0.37589761916557196 Accuracy 0.8284540318438623




Val Loss 0.2221718644224428 Accuracy 0.9124601951720596



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 3/5
----------




Train Loss 0.2525263201081077 Accuracy 0.900924499229584




Val Loss 0.11915884183368099 Accuracy 0.9566923472008217



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 4/5
----------




Train Loss 0.18609671953353105 Accuracy 0.9406574216743708




Val Loss 0.07398485005339123 Accuracy 0.9778325629173087



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch 5/5
----------




Train Loss 0.14656075898582963 Accuracy 0.960215716486903




Val Loss 0.04676122409488779 Accuracy 0.9869337442218798



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

CPU times: user 2h 57min 56s, sys: 2min 45s, total: 3h 42s
Wall time: 3h 2min 12s


In [None]:
# Score the model on the test loader; `eval_model` returns (accuracy, mean loss) and only the accuracy is kept.
test_acc, _ = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  len(df_test)
)
# `.item()` unwraps the accuracy tensor into a plain Python float for display.
test_acc.item()



0.9877144324601952

In [None]:
def get_predictions(model, data_loader, device=None):
  """Run `model` over `data_loader` and collect its predictions on the CPU.

  Args:
    model: classifier called as `model(input_ids=..., attention_mask=...)` that
      returns one row of class scores per example.
    data_loader: iterable of batches, each a dict with 'tweets_text', 'input_ids',
      'attention_mask' and 'targets'.
    device: device the inputs are moved to. Defaults to the device of the model's
      first parameter (CPU if the model has no parameters).

  Returns:
    (review_texts, predictions, prediction_probs, real_values):
      review_texts: list of the raw texts, in loader order.
      predictions: 1-D tensor with the predicted class index of every example.
      prediction_probs: 2-D tensor of softmax probabilities, one row per example.
      real_values: 1-D tensor with the true label of every example.
    All tensors are on the CPU and are empty when the loader yields no batches.
  """
  model = model.eval()
  if device is None:
    # Follow the model instead of depending on a notebook-level global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

  review_texts = []
  predictions = []
  prediction_probs = []
  real_values = []
  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      review_texts.extend(d["tweets_text"])
      # Collect whole batches (already moved to the CPU) and concatenate once at the end.
      predictions.append(preds.cpu())
      # The model emits raw scores; softmax turns them into the probabilities the name promises.
      prediction_probs.append(torch.softmax(outputs, dim=1).cpu())
      real_values.append(d["targets"].cpu())

  if not predictions:
    # Nothing to concatenate: return empty tensors rather than raising.
    empty_labels = torch.empty(0, dtype=torch.long)
    return review_texts, empty_labels, torch.empty((0, 0)), empty_labels.clone()

  predictions = torch.cat(predictions)
  prediction_probs = torch.cat(prediction_probs)
  real_values = torch.cat(real_values)
  return review_texts, predictions, prediction_probs, real_values

In [None]:
# Collect the texts, predicted classes, per-class model outputs and true labels for the whole test loader.
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
  model,
  test_data_loader
)



In [None]:
# Per-class precision, recall and F1 on the test predictions, labelled with `class_names`.
print(classification_report(y_test, y_pred, target_names=class_names))

              precision    recall  f1-score   support

  Democratic       0.99      0.99      0.99     24447
  Republican       0.99      0.99      0.99     24228

    accuracy                           0.99     48675
   macro avg       0.99      0.99      0.99     48675
weighted avg       0.99      0.99      0.99     48675



In [None]:
# !sudo chmod -R 755 /content/drive/MyDrive/leader_tweets

In [None]:
# new_df.to_csv('final_final_version.csv',index=False)

In [None]:
# %cp final_final_version.csv /content/drive/MyDrive/leader_tweets   

In [None]:
# !sudo chmod -R 755 /content/drive/MyDrive/leader_tweets

In [None]:
# History entries may be 0-d tensors living on the GPU, which matplotlib cannot
# convert to NumPy; `float()` accepts tensors and plain numbers alike.
train_acc_history = [float(acc) for acc in history['train_acc']]
val_acc_history = [float(acc) for acc in history['val_acc']]

plt.plot(train_acc_history, label='train accuracy')
plt.plot(val_acc_history, label='validation accuracy')
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);