In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

In [5]:
posts_30K_each = pd.read_csv('/content/drive/MyDrive/TCC II/all_30k_processed_posts.csv')

In [6]:
posts_30K_each

Unnamed: 0.1,Unnamed: 0,created_time,id,pre_processed_message,message_min_processed,shares,status_type,full_picture,reactions_like,reactions_haha,reactions_wow,reactions_sad,reactions_angry,reactions_love,has_textual_message,author,label
0,17518,2020-06-08T16:20:29+0000,167637636622585_3560581307328184,exministro juiz sergio moro detonou conducao g...,o ex-ministro e juiz sergio moro detonou a con...,291,shared_story,https://external.fplu2-1.fna.fbcdn.net/safe_im...,836,92,16,36,11,4,True,left_all_posts,0
1,36123,2018-11-05T14:01:00+0000,219188754789679_2341230762585457,orlandeli,por orlandeli .,1,added_photos,https://scontent.fplu2-1.fna.fbcdn.net/v/t1.64...,13,1,0,0,0,3,True,left_all_posts,0
2,2233,2019-06-02T00:30:19+0000,371913239843447_864924640542302,audio julgamento bolsonaro tribunal militar jo...,audio do julgamento de bolsonaro no tribunal m...,437,added_video,https://scontent.fplu2-1.fna.fbcdn.net/v/t15.5...,201,18,21,1,1,1,True,left_all_posts,0
3,24570,2020-05-20T02:37:30+0000,292074710916413_1524289557744683,conheca plataforma enfrentamento pandemia coro...,conheca a plataforma para o enfrentamento da p...,617,added_video,https://scontent.fplu2-1.fna.fbcdn.net/v/t15.5...,591,7,7,289,19,23,True,left_all_posts,0
4,65984,2020-02-18T21:54:34+0000,127835925882_10157935816100883,forcas opostas fundo manutencao desenvolviment...,forcas opostas : o fundo de manutencao e desen...,85,shared_story,https://external.fplu2-1.fna.fbcdn.net/safe_im...,41,0,1,14,23,2,True,left_all_posts,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61056,3318,2020-07-15T20:01:40+0000,890711084350263_3241472082607473,bolsonaro ainda covid segundo novo exame,bolsonaro ainda esta com covid-19 segundo novo...,5,shared_story,https://external.fplu2-1.fna.fbcdn.net/safe_im...,277,3,0,38,0,3,True,right_all_posts,1
61057,10,2018-03-05T21:23:11+0000,890711084350263_1688888311199199,amanha julgamento hc lula ladrao stj transmiti...,e amanha o julgamento do hc do lula ladrao . s...,69,shared_story,https://external.fplu2-1.fna.fbcdn.net/safe_im...,296,6,5,0,17,10,True,right_all_posts,1
61058,33469,2019-05-07T21:14:44+0000,1965770023473808_2188151608163126,bolsonaro garante recursos colegios militares,bolsonaro garante recursos para colegios milit...,70,added_video,https://scontent.fplu2-1.fna.fbcdn.net/v/t15.1...,404,3,2,0,0,68,True,right_all_posts,1
61059,8196,2019-02-20T02:26:23+0000,550142935316937_847946782203216,presidente jair bolsonaro realidade petistas c...,nosso presidente jair bolsonaro . realidade qu...,214,added_photos,https://scontent.fplu2-1.fna.fbcdn.net/v/t1.64...,440,0,3,0,0,45,True,right_all_posts,1


In [7]:
text = posts_30K_each.pre_processed_message.values
labels = posts_30K_each.label.values

In [8]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
    )

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 32,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
val_ratio = 0.2
batch_size = 16


train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels)


train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])


train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [11]:
def b_tp(preds, labels):
  return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_fp(preds, labels):
  return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
  return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_fn(preds, labels):
  return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_metrics(preds, labels):
  preds = np.argmax(preds, axis = 1).flatten()
  labels = labels.flatten()
  tp = b_tp(preds, labels)
  tn = b_tn(preds, labels)
  fp = b_fp(preds, labels)
  fn = b_fn(preds, labels)
  b_accuracy = (tp + tn) / len(labels)
  b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
  b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
  b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
  return b_accuracy, b_precision, b_recall, b_specificity

In [12]:

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )

model.cuda()

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

epochs = 4

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    model.train()
    
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)

        train_output.loss.backward()
        optimizer.step()
 
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========


    model.eval()

    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')

Epoch:  25%|██▌       | 1/4 [06:40<20:00, 400.11s/it]


	 - Train loss: 0.4859
	 - Validation Accuracy: 0.8079
	 - Validation Precision: 0.7884
	 - Validation Recall: 0.8366
	 - Validation Specificity: 0.7790



Epoch:  50%|█████     | 2/4 [13:19<13:18, 399.40s/it]


	 - Train loss: 0.3708
	 - Validation Accuracy: 0.8224
	 - Validation Precision: 0.8390
	 - Validation Recall: 0.7923
	 - Validation Specificity: 0.8517



Epoch:  75%|███████▌  | 3/4 [19:57<06:39, 399.17s/it]


	 - Train loss: 0.2877
	 - Validation Accuracy: 0.8189
	 - Validation Precision: 0.8469
	 - Validation Recall: 0.7754
	 - Validation Specificity: 0.8624



Epoch: 100%|██████████| 4/4 [26:36<00:00, 399.18s/it]


	 - Train loss: 0.2043
	 - Validation Accuracy: 0.8204
	 - Validation Precision: 0.8195
	 - Validation Recall: 0.8177
	 - Validation Specificity: 0.8227






In [None]:
new_sentence = 'bozo'

test_ids = []
test_attention_mask = []

encoding = preprocessing(new_sentence, tokenizer)

test_ids.append(encoding['input_ids'])
test_attention_mask.append(encoding['attention_mask'])
test_ids = torch.cat(test_ids, dim = 0)
test_attention_mask = torch.cat(test_attention_mask, dim = 0)

with torch.no_grad():
  output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

prediction = 'Direita' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Esquerda'

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)

Input Sentence:  bozo
Predicted Class:  Esquerda




In [None]:
model.save_pretrained('/content/drive/MyDrive/TCC II/30K-4-epochs-bert-base-uncased/model-updated')

In [None]:
train_idx
train = ",".join(map(str, train_idx));
train

'41274,45682,41323,19625,16165,11477,6991,36254,37756,44576,12207,47135,54970,6835,37646,59391,56143,17082,58886,33941,5871,59857,3660,20536,27570,43317,46798,7675,25845,12651,50080,8875,51577,47973,3919,6012,26855,40359,18503,34454,29783,16532,21676,8767,10651,50213,39872,53694,54052,51468,30361,52681,4291,35079,52154,23168,38769,8707,36256,24698,35871,1338,12478,3319,55784,20864,24809,28690,29833,34155,58282,23740,36298,24895,43403,46268,49785,49348,2890,31632,55481,51172,42084,17132,52886,35252,10831,24661,3490,49273,53098,11108,41832,24436,28684,34711,35097,56031,8051,32309,21765,11824,24803,7276,4211,50088,28455,28047,31393,4536,35300,12564,25759,57189,44862,58740,52540,46679,39276,34093,6427,35182,41258,36444,35186,58069,34804,36790,34267,42634,3351,31232,18730,4254,50248,39755,49680,22786,30763,35421,1924,53741,56052,12246,36276,36979,13998,4517,10601,3354,52096,23625,28481,39137,11959,37404,8509,17120,26159,2776,53572,52281,48806,59902,22437,21940,37812,22275,57336,43608,37474,

In [None]:
val_idx
val = ",".join(map(str, val_idx));
val

'39388,43120,58476,53368,49635,17647,27225,36465,2739,7808,3848,1857,47469,56552,18154,20296,48176,32107,49020,55668,45157,22551,1840,41576,13130,29785,26274,43618,53991,42572,42687,22092,45616,1408,34677,55544,21984,12245,57766,16730,38019,48979,23704,15226,23774,42638,56307,53337,35463,7389,21402,30847,25047,23621,27139,46592,29310,41425,31805,33347,44399,60145,56539,3509,31517,5617,59693,12487,11483,55050,23562,39618,25392,59231,9940,26997,16633,12045,6911,19985,35117,960,49172,42114,57117,7125,39930,34065,55018,21250,59075,26284,13060,27290,31487,35713,58517,54031,16411,58842,60942,51142,15217,52368,36723,6503,19409,37220,14986,52948,59149,60430,33718,2323,59578,38282,5193,14058,51581,24503,60284,57210,29736,5153,33752,21644,51276,36856,16220,26955,27088,27656,43800,37609,36154,2319,42741,2086,35119,29923,48498,60907,37091,38154,18985,31831,3452,646,14338,3506,2006,453,16808,38557,21919,26089,49397,40992,35815,13095,37164,39384,7210,38297,60693,441,44575,17055,7327,33492,11903,5422