In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

In [None]:
# Load the labelled posts that are tokenized and split into train/validation sets below.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive;
# no mount call is visible in this notebook — confirm.
posts_30K_each = pd.read_csv('/content/drive/MyDrive/TCC II/all_30k_processed_posts.csv')

In [None]:
# Preview the loaded frame (rich display of the cell's last expression).
posts_30K_each

Unnamed: 0.1,Unnamed: 0,created_time,id,pre_processed_message,message_min_processed,shares,status_type,full_picture,reactions_like,reactions_haha,reactions_wow,reactions_sad,reactions_angry,reactions_love,has_textual_message,author,label
0,17518,2020-06-08T16:20:29+0000,167637636622585_3560581307328184,exministro juiz sergio moro detonou conducao g...,o ex-ministro e juiz sergio moro detonou a con...,291,shared_story,https://external.fplu2-1.fna.fbcdn.net/safe_im...,836,92,16,36,11,4,True,left_all_posts,0
1,36123,2018-11-05T14:01:00+0000,219188754789679_2341230762585457,orlandeli,por orlandeli .,1,added_photos,https://scontent.fplu2-1.fna.fbcdn.net/v/t1.64...,13,1,0,0,0,3,True,left_all_posts,0
2,2233,2019-06-02T00:30:19+0000,371913239843447_864924640542302,audio julgamento bolsonaro tribunal militar jo...,audio do julgamento de bolsonaro no tribunal m...,437,added_video,https://scontent.fplu2-1.fna.fbcdn.net/v/t15.5...,201,18,21,1,1,1,True,left_all_posts,0
3,24570,2020-05-20T02:37:30+0000,292074710916413_1524289557744683,conheca plataforma enfrentamento pandemia coro...,conheca a plataforma para o enfrentamento da p...,617,added_video,https://scontent.fplu2-1.fna.fbcdn.net/v/t15.5...,591,7,7,289,19,23,True,left_all_posts,0
4,65984,2020-02-18T21:54:34+0000,127835925882_10157935816100883,forcas opostas fundo manutencao desenvolviment...,forcas opostas : o fundo de manutencao e desen...,85,shared_story,https://external.fplu2-1.fna.fbcdn.net/safe_im...,41,0,1,14,23,2,True,left_all_posts,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61056,3318,2020-07-15T20:01:40+0000,890711084350263_3241472082607473,bolsonaro ainda covid segundo novo exame,bolsonaro ainda esta com covid-19 segundo novo...,5,shared_story,https://external.fplu2-1.fna.fbcdn.net/safe_im...,277,3,0,38,0,3,True,right_all_posts,1
61057,10,2018-03-05T21:23:11+0000,890711084350263_1688888311199199,amanha julgamento hc lula ladrao stj transmiti...,e amanha o julgamento do hc do lula ladrao . s...,69,shared_story,https://external.fplu2-1.fna.fbcdn.net/safe_im...,296,6,5,0,17,10,True,right_all_posts,1
61058,33469,2019-05-07T21:14:44+0000,1965770023473808_2188151608163126,bolsonaro garante recursos colegios militares,bolsonaro garante recursos para colegios milit...,70,added_video,https://scontent.fplu2-1.fna.fbcdn.net/v/t15.1...,404,3,2,0,0,68,True,right_all_posts,1
61059,8196,2019-02-20T02:26:23+0000,550142935316937_847946782203216,presidente jair bolsonaro realidade petistas c...,nosso presidente jair bolsonaro . realidade qu...,214,added_photos,https://scontent.fplu2-1.fna.fbcdn.net/v/t1.64...,440,0,3,0,0,45,True,right_all_posts,1


In [None]:
# Model inputs: the pre-processed message of each post and its class label.
text = posts_30K_each['pre_processed_message'].values
labels = posts_30K_each['label'].values

In [None]:
# The checkpoint is a *cased* model, so the tokenizer must not lowercase its input:
# with do_lower_case=True BertTokenizer lowercases (and, by default, strips accents),
# which no longer matches the cased vocabulary the model was pre-trained with.
tokenizer = BertTokenizer.from_pretrained(
    'neuralmind/bert-large-portuguese-cased',
    do_lower_case = False
    )

In [None]:
# Per-sample encoder outputs are collected in these lists and later concatenated into tensors.
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer, max_length = 32):
  '''
  Encode one text into fixed-length model inputs.

  Args:
    input_text: the text to encode.
    tokenizer: object exposing `encode_plus` (a BertTokenizer in this notebook).
    max_length: number of tokens every encoding is padded/truncated to (default 32).

  Returns:
    The value returned by `tokenizer.encode_plus`; callers read its
    'input_ids' and 'attention_mask' entries.
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = max_length,
                        # `pad_to_max_length` is deprecated; this is its explicit replacement.
                        padding = 'max_length',
                        # Request truncation explicitly instead of relying on the implicit
                        # fallback (and its warning) triggered by passing only `max_length`.
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


# Encode every post, then file the ids and masks into the lists created above.
encoded_posts = [preprocessing(post_text, tokenizer) for post_text in text]
token_id.extend(encoded['input_ids'] for encoded in encoded_posts)
attention_masks.extend(encoded['attention_mask'] for encoded in encoded_posts)
del encoded_posts  # the per-post dicts are no longer needed

# Join the per-post tensors along dim 0 and turn the labels into a tensor.
token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
val_ratio = 0.2
batch_size = 16
split_seed = 42  # fixed so the train/validation partition is identical on every run

# Stratified split over row positions (not over the tensors themselves) so the same
# index arrays can be used to slice ids, masks and labels consistently.
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels,
    random_state = split_seed)

train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

# Training batches are drawn in random order; validation batches keep dataset order.
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [None]:
def b_tp(preds, labels):
  '''Returns True Positives (TP): number of samples predicted as class 1 whose label is also 1'''
  return sum(pred == 1 and label == 1 for pred, label in zip(preds, labels))

def b_fp(preds, labels):
  '''Returns False Positives (FP): number of samples predicted as class 1 whose label differs from the prediction'''
  return sum(pred == 1 and label != pred for pred, label in zip(preds, labels))

def b_tn(preds, labels):
  '''Returns True Negatives (TN): number of samples predicted as class 0 whose label is also 0'''
  return sum(pred == 0 and label == 0 for pred, label in zip(preds, labels))

def b_fn(preds, labels):
  '''Returns False Negatives (FN): number of samples predicted as class 0 whose label differs from the prediction'''
  return sum(pred == 0 and label != pred for pred, label in zip(preds, labels))

def b_metrics(preds, labels):
  '''
  Compute accuracy, precision, recall and specificity for binary predictions.

  Args:
    preds: 2-D array of per-class scores; the predicted class is the argmax over axis 1.
    labels: array of true class labels (flattened before comparison).

  Returns:
    Tuple (accuracy, precision, recall, specificity). Precision, recall and
    specificity are returned as the string 'nan' when their denominator is zero.
  '''
  predicted = np.argmax(preds, axis = 1).flatten()
  actual = labels.flatten()

  tp, tn = b_tp(predicted, actual), b_tn(predicted, actual)
  fp, fn = b_fp(predicted, actual), b_fn(predicted, actual)

  def ratio(numerator, denominator):
    # Undefined ratios are reported with the 'nan' string sentinel callers check for.
    if denominator > 0:
      return numerator / denominator
    return 'nan'

  b_accuracy = (tp + tn) / len(actual)
  return b_accuracy, ratio(tp, tp + fp), ratio(tp, tp + fn), ratio(tn, tn + fp)

In [None]:

# Resolve the device once, with a CPU fallback, instead of calling model.cuda()
# unconditionally (which fails on a runtime without a GPU).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pre-trained Portuguese BERT with a 2-class sequence-classification head.
model = BertForSequenceClassification.from_pretrained(
    'neuralmind/bert-large-portuguese-cased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# Move the model to its device before the optimizer captures the parameters.
model.to(device)

# NOTE(review): in the recorded run the train loss settles near 0.70 and validation
# accuracy stays around 0.50 with recall flipping between 0 and 1, i.e. the model
# predicts a single class. Consider a lower learning rate for this large checkpoint.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 5e-5,
                              eps = 1e-08
                              )

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


epochs = 4


def _format_metric(value):
    '''Render a metric for printing; b_metrics reports an undefined ratio as the string 'nan'.'''
    return 'NaN' if isinstance(value, str) else '{:.4f}'.format(value)


for _ in trange(epochs, desc = 'Epoch'):

    # ========== Training ==========

    model.train()

    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass (passing labels makes the model return the loss)
        train_output = model(b_input_ids,
                             token_type_ids = None,
                             attention_mask = b_input_mask,
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    model.eval()

    # Collect the predictions of the whole validation pass. Averaging per-batch
    # precision/recall/specificity (and dropping batches where they are undefined)
    # does not give the metric over the validation set, so the metrics are
    # computed once over all samples after the loop.
    epoch_logits = []
    epoch_label_ids = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            eval_output = model(b_input_ids,
                                token_type_ids = None,
                                attention_mask = b_input_mask)
        epoch_logits.append(eval_output.logits.detach().cpu().numpy())
        epoch_label_ids.append(b_labels.to('cpu').numpy())

    val_accuracy, val_precision, val_recall, val_specificity = b_metrics(
        np.concatenate(epoch_logits, axis = 0),
        np.concatenate(epoch_label_ids, axis = 0))

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(val_accuracy))
    print('\t - Validation Precision: ' + _format_metric(val_precision))
    print('\t - Validation Recall: ' + _format_metric(val_recall))
    print('\t - Validation Specificity: ' + _format_metric(val_specificity) + '\n')

Epoch:  25%|██▌       | 1/4 [23:08<1:09:25, 1388.40s/it]


	 - Train loss: 0.6052
	 - Validation Accuracy: 0.4982
	 - Validation Precision: NaN
	 - Validation Recall: 0.0000
	 - Validation Specificity: 1.0000



Epoch:  50%|█████     | 2/4 [46:15<46:15, 1387.81s/it]  


	 - Train loss: 0.7039
	 - Validation Accuracy: 0.4982
	 - Validation Precision: NaN
	 - Validation Recall: 0.0000
	 - Validation Specificity: 1.0000



Epoch:  75%|███████▌  | 3/4 [1:09:23<23:07, 1387.63s/it]


	 - Train loss: 0.7032
	 - Validation Accuracy: 0.5018
	 - Validation Precision: 0.5018
	 - Validation Recall: 1.0000
	 - Validation Specificity: 0.0000



Epoch: 100%|██████████| 4/4 [1:32:30<00:00, 1387.65s/it]


	 - Train loss: 0.7012
	 - Validation Accuracy: 0.4982
	 - Validation Precision: NaN
	 - Validation Recall: 0.0000
	 - Validation Specificity: 1.0000






In [None]:
new_sentence = 'bozo'

# Encode the sentence exactly like the training posts.
encoding = preprocessing(new_sentence, tokenizer)
test_ids = torch.cat([encoding['input_ids']], dim = 0)
test_attention_mask = torch.cat([encoding['attention_mask']], dim = 0)

# Inference only: no gradients are needed.
with torch.no_grad():
  output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

# Index of the highest logit; index 1 is reported as 'Direita', anything else as 'Esquerda'.
predicted_class = np.argmax(output.logits.cpu().numpy()).flatten().item()
if predicted_class == 1:
  prediction = 'Direita'
else:
  prediction = 'Esquerda'

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)

Input Sentence:  bozo
Predicted Class:  Esquerda




In [None]:
# Save the fine-tuned model to Drive.
model.save_pretrained('/content/drive/MyDrive/TCC II/30K-4-epochs-neuralmind-bert-large-portuguese-cased/model')

In [None]:
# Show the training row indices as one comma-separated string (a record of the split).
train = ",".join(str(row_idx) for row_idx in train_idx)
train

'36343,38944,35639,54008,57863,49696,50471,38712,46335,37317,22756,3294,18058,45302,48753,1867,45886,51219,9696,41685,26455,23707,17190,7741,24459,26913,29515,44419,35462,21785,17113,23258,14581,23949,39000,29710,54684,54083,34459,10509,45410,52417,48858,27052,20183,2394,57538,55953,27866,34657,22129,41289,18677,19761,41521,32212,36492,49464,28118,30214,18258,48136,25256,51046,53893,8662,47871,24530,41753,1833,45928,12672,30145,1071,25931,46986,31768,50022,6501,1974,49793,12385,54614,50930,37999,152,40097,42234,17612,34937,43036,19946,33340,57477,7415,4410,10347,24149,47826,29849,23958,29831,20906,37776,47549,26520,46371,3960,18987,29139,10654,37830,40411,17987,1673,30643,58979,28357,40503,21913,1658,30361,23437,23652,56333,32378,696,57212,14502,20827,8788,58288,55730,36851,31030,50989,60638,825,6976,46015,3720,59569,39642,36797,31358,29827,11405,3095,58566,36126,29701,23931,2914,46539,4660,12213,23039,44177,53036,26197,20525,21836,41436,2759,13027,59955,22326,21487,716,28389,41425,561

In [None]:
# Show the validation row indices as one comma-separated string (a record of the split).
val = ",".join(str(row_idx) for row_idx in val_idx)
val

'37851,4381,45694,30953,33270,41958,59621,45137,21291,1595,54201,43297,1234,52163,57714,24868,56035,34704,51066,19980,10762,60324,22486,22422,43774,42516,7822,14886,57560,52992,50429,27486,779,53187,54022,2155,40391,41110,28400,8941,11194,23871,59564,53666,45340,38819,47947,284,48554,50599,27981,51071,30709,51309,31374,40499,59109,40061,29451,36191,48786,21875,57781,40726,11623,48477,2403,25960,59367,48675,46406,46130,30662,15615,15810,19167,18114,56400,297,24458,7795,48988,58426,4502,29685,56241,52237,6215,59300,23677,7960,30653,33753,57908,53806,38117,13974,39336,58522,30889,34899,39049,20184,20102,32984,19107,45250,14567,3558,24990,61020,56681,33077,34964,50409,30009,34194,59923,569,27784,33354,27610,18628,25467,38302,27181,37519,37641,57699,18352,40834,20621,60823,40484,22359,29280,44764,58653,25489,32213,36324,30399,59814,3717,48064,40523,104,7538,36086,16399,15774,1271,41388,38548,35260,23351,35515,1543,24379,19151,20573,40745,3689,22570,8626,44564,11694,40765,45366,29903,58045,2

In [None]:
from transformers import AutoModel 

In [None]:
# Reload with the classification architecture so the fine-tuned `classifier` layer is restored.
# AutoModel builds a bare BertModel and discards classifier.weight / classifier.bias,
# which leaves no class logits to predict from.
# NOTE(review): this directory is named "...bert-base..." while the save cell in this
# notebook writes to "...bert-large..." — confirm this is the intended checkpoint.
model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/TCC II/30K-4-epochs-neuralmind-bert-base-portuguese-cased/model",local_files_only=True)


Some weights of the model checkpoint at /content/drive/MyDrive/TCC II/30K-4-epochs-neuralmind-bert-base-portuguese-cased/model were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Load the held-out posts that are tokenized into the test set below.
# NOTE(review): assumes Google Drive is mounted at /content/drive — confirm.
all_posts_df_tests = pd.read_csv('/content/drive/MyDrive/TCC II/all_30k_to_test_posts.csv')



In [None]:
# From here on `text` and `labels` refer to the test posts (the names are re-bound).
text = all_posts_df_tests['pre_processed_message'].values
labels = all_posts_df_tests['label'].values

In [None]:
# Reset the accumulators: per-sample encoder outputs of the test posts are collected here.
token_id = []
attention_masks = []

# NOTE: this re-defines the `preprocessing` function declared earlier in the notebook.
def preprocessing(input_text, tokenizer, max_length = 32):
  '''
  Encode one text into fixed-length model inputs.

  Args:
    input_text: the text to encode.
    tokenizer: object exposing `encode_plus` (a BertTokenizer in this notebook).
    max_length: number of tokens every encoding is padded/truncated to (default 32).

  Returns:
    The value returned by `tokenizer.encode_plus`; callers read its
    'input_ids' and 'attention_mask' entries.
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = max_length,
                        # `pad_to_max_length` is deprecated; this is its explicit replacement.
                        padding = 'max_length',
                        # Request truncation explicitly instead of relying on the implicit
                        # fallback (and its warning) triggered by passing only `max_length`.
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


# Encode every test post, then file the ids and masks into the lists created above.
encoded_posts = [preprocessing(post_text, tokenizer) for post_text in text]
token_id.extend(encoded['input_ids'] for encoded in encoded_posts)
attention_masks.extend(encoded['attention_mask'] for encoded in encoded_posts)
del encoded_posts  # the per-post dicts are no longer needed

# Join the per-post tensors along dim 0 and turn the labels into a tensor.
token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

In [None]:

batch_size = 16

# Every row of the test tensors, in their original order.
test_idx = list(range(len(labels)))

# Slice ids, masks and labels with the same index list and bundle them together.
test_set = TensorDataset(
    *(tensor[test_idx] for tensor in (token_id, attention_masks, labels))
)

# Sequential sampling keeps the batches in dataset order.
test_dataloader = DataLoader(test_set,
                             sampler = SequentialSampler(test_set),
                             batch_size = batch_size)

In [None]:

# Use the same CPU-fallback device as the rest of the notebook instead of an
# unconditional model.cuda(), which fails on a runtime without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # the reloaded model is only used for inference below

optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 5e-5,
                              eps = 1e-08
                              )

In [None]:
new_sentence = 'bozo'

# Encode the sentence exactly like the training posts.
encoding = preprocessing(new_sentence, tokenizer)
test_ids = encoding['input_ids']
test_attention_mask = encoding['attention_mask']

model.eval()
with torch.no_grad():
  output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

# The class must come from the classification logits. Taking the argmax over the
# flattened `last_hidden_state` yields a position inside the hidden activations,
# not a class id, so it only equals 1 by accident.
logits = getattr(output, 'logits', None)
if logits is None:
  raise TypeError('model output has no logits: load the checkpoint with '
                  'BertForSequenceClassification instead of AutoModel')

prediction = 'Direita' if logits.argmax(dim = -1).item() == 1 else 'Esquerda'

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)


Input Sentence:  bozo
Predicted Class:  Esquerda
