In [17]:
# !pip install sklearn torch datasets transformers

In [19]:
import pandas as pd

data = pd.read_csv("df_dataset.csv")

In [20]:
from functools import reduce
import numpy as np

# Embaralha e separa os dados em 10 partes
shuffled = data.sample(frac=1)
result = np.array_split(shuffled, 20)

# Pega 9 partes e reune novamente para formar o conjunto de treino
data_frames = []
for i in range(19):
    data_frames.append(result[i])
train_data = reduce(lambda  left,right: pd.merge(left,right,how='outer'), data_frames)

# Pega uma parte para formar o conjunto de validacao
# val_data = result[18]

# Pega a ultima parte para formar o conjunto de teste
test_data = result[19]

In [21]:
from datasets import Dataset, DatasetDict

print(train_data)
train_data = Dataset.from_pandas(train_data)
test_data = Dataset.from_pandas(test_data)

raw_datasets = DatasetDict({'train': train_data, 'test': test_data})

train_data
test_data
raw_datasets

      Unnamed: 0                docno  has_anger      origin  \
0           3924                27940          1  55chan/pol   
1           1916  1141767214728733056          0     twitter   
2           2748                59319          1  55chan/pol   
3           5516  1141761023944338944          0     twitter   
4           4795  1141764131931876992          0     twitter   
...          ...                  ...        ...         ...   
6884        7469  1141766371686211968          0     twitter   
6885        3701                60637          1  55chan/pol   
6886        3628                60686          1  55chan/pol   
6887        4459  1141783291533677056          0     twitter   
6888        3453                62493          1  55chan/pol   

                                                    txt  
0     eu até pensei em não te responder, mas você é ...  
1                                  NOMEPROPRIO original  
2     terminei com a única mulher que já me amou nes...  

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'docno', 'has_anger', 'origin', 'txt', '__index_level_0__'],
        num_rows: 6889
    })
    test: Dataset({
        features: ['Unnamed: 0', 'docno', 'has_anger', 'origin', 'txt', '__index_level_0__'],
        num_rows: 362
    })
})

In [2]:
# # Importação do Dataset

# from torch.utils.data.dataset import random_split
# from datasets import load_dataset

# raw_datasets = load_dataset('csv', data_files={'train': 'df_dataset.csv', 'test':'df_dataset_test.csv'})

# raw_datasets

Using custom data configuration default-269d05c058f9d497
Reusing dataset csv (/home/arthurn/.cache/huggingface/datasets/csv/default-269d05c058f9d497/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'docno', 'has_anger', 'origin', 'txt'],
        num_rows: 7251
    })
    test: Dataset({
        features: ['Unnamed: 0', 'docno', 'has_anger', 'origin', 'txt'],
        num_rows: 421
    })
})

In [22]:
# Tratamento básico dos dados

train_texts = raw_datasets['train']['txt']
train_labels = raw_datasets['train']['has_anger']
test_texts = raw_datasets['test']['txt']
test_labels = raw_datasets['test']['has_anger']

print("TrainTexts Length: ", len(train_texts))
print("TrainLabels Length: ", len(train_labels))
print("TestTexts Length: ", len(test_texts))
print("TestLabels Length: ", len(test_labels))

# Removendo elementos None no texto e nas labels
elements_none = []
for x in range(len(test_texts)):
    if (test_texts[x] == None):
        elements_none.append(x)

for index_none in sorted(elements_none, reverse=True):
    test_texts.pop(index_none)
    test_labels.pop(index_none)
    
print("TestTexts Post Processing Length: ", len(test_texts))
print("TestLabels Post Processing Length: ", len(test_labels))

TrainTexts Length:  6889
TrainLabels Length:  6889
TestTexts Length:  362
TestLabels Length:  362
TestTexts Post Processing Length:  362
TestLabels Post Processing Length:  362


In [23]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.04)

In [24]:
print("TrainTexts Length: ", len(train_texts))
print("TrainLabels Length: ", len(train_labels))
print("ValidationTexts Length: ", len(val_texts))
print("ValidationLabels Length: ", len(val_labels))

TrainTexts Length:  6613
TrainLabels Length:  6613
ValidationTexts Length:  276
ValidationLabels Length:  276


In [25]:
# Importando o Tokenizer

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

In [26]:
# Tokenização dos datasets

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

In [27]:
# Voltando os datasets tokenizados para instâncias da classe de Dataset

import torch

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

In [24]:
# print(val_dataset.__getitem__(0))

In [28]:
# Fine Tuning do Modelo

from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AdamW

device = 'cpu'

model = AutoModelForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', num_labels=2)
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(1):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.sum().backward() # loss.backward() # Tive que alterar, pois loss.backward() é, implicitamente, loss.backward(torch.Tensor([1]))
        # e no caso, deveria ser um vetor com mais de um elemento
        # https://discuss.pytorch.org/t/loss-backward-raises-error-grad-can-be-implicitly-created-only-for-scalar-outputs/12152
        optim.step()

model.eval()

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [29]:
# # Salvando o modelo Pre treinado

# model.save_pretrained("pretrained-model-bert-base-portuguese-cased-fine-tuning-1")

In [29]:
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import KFold
# from sklearn.metrics import classification_report

# kf = KFold(n_splits=10, shuffle=True)

# # print(val_texts, val_labels)

# for train_index, val_index in kf.split(val_texts, val_labels):
#     print(train_index)
#     print(val_index)
#     print()
# #     # splitting Dataframe (dataset not included)
# #     train_df = train_data.iloc[train_index]
# #     val_df = train_data.iloc[val_index]
# #     # Defining Model
# #     model = ClassificationModel('bert', 'bert-base-uncased') 
# #     # train the model
# #     model.train_model(train_df)
# #     # validate the model 
# #     result, model_outputs, wrong_predictions = model.eval_model(val_df, acc=accuracy_score)
# #     print(result['acc'])
# #     # append model score
# #     results.append(result['acc'])

# # print(cross_val_score(LogisticRegressionCV(random_state=42), x_train, y_train, cv=10, verbose=1, n_jobs=-1, scoring='recall').mean())


# print(classification_report([1, 0], [1, 1]))

[   0    1    2 ... 6958 6959 6960]
[   4    5   25   31   83   87  113  117  122  133  163  170  189  190
  194  196  226  228  234  255  256  262  265  268  271  273  279  287
  292  323  332  333  354  362  366  397  402  407  426  451  454  485
  501  525  532  556  558  571  577  585  598  601  605  616  622  623
  630  638  643  653  657  662  689  693  702  704  709  710  717  718
  719  722  727  733  737  744  745  757  769  789  807  811  817  829
  830  852  853  858  860  870  873  891  893  899  902  908  930  945
  952  958  962  965  969  974 1013 1014 1049 1057 1058 1069 1070 1088
 1089 1132 1140 1141 1142 1145 1155 1157 1185 1194 1206 1207 1221 1250
 1266 1268 1274 1278 1286 1303 1312 1314 1325 1328 1348 1351 1354 1366
 1385 1388 1389 1406 1414 1424 1429 1473 1480 1484 1485 1493 1504 1506
 1508 1509 1514 1520 1527 1533 1551 1556 1560 1576 1578 1583 1586 1591
 1592 1645 1652 1655 1665 1682 1690 1691 1703 1722 1724 1727 1733 1741
 1742 1750 1779 1781 1836 1872 1875 1877 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# import numpy as np

# np.concatenate(([1, 2, 3], [4, 5, 6])).reshape((2, 3))

ValueError: cannot reshape array of size 6 into shape (2,)

In [38]:
# Descobrindo acurácia do Modelo

from torch import nn

cm = {'true_positive': 0, 'true_negative': 0, 'false_positive': 0, 'false_negative': 0}
for i in range(len(val_texts)):
    tokenized_text = tokenizer(val_texts[i], truncation=True, padding=True, max_length=512)
    # print(entry)
    output = model(torch.tensor([tokenized_text.input_ids]))
    if (torch.argmax(output.logits, dim=-1) == val_labels[i]):
        if val_labels[i] == 1:
            cm['true_positive'] += 1
        else:
            cm['true_negative'] += 1
    else:
        if val_labels[i] == 0:
            cm['false_positive'] += 1
        else:
            cm['false_negative'] += 1
        
precision = cm['true_positive'] / (cm['true_positive'] + cm['false_positive'])
recall = cm['true_positive'] / (cm['true_positive'] + cm['false_negative'])
fscore = 2 * ((precision * recall) / (precision + recall))
acertos = cm['true_positive'] + cm['true_negative']
print(f'Acertos: {cm["true_positive"] + cm["true_negative"]}')
print(f'Total: {len(val_texts)}')
print(f'Acurácia: {(acertos / len(val_texts)) * 100:.4f}%')
print(f'Precision: {precision:.4f}%')
print(f'Recall: {recall:.4f}%')
print(f'Fscore: {fscore:.4f}%')

Acertos: 270
Total: 276
Acurácia: 97.8261%
Precision: 0.98%
Recall: 0.98%
Fscore: 0.98%


In [44]:
# Predição de textos específicos

text = 'Texto aqui'

tokenized_text = tokenizer(text, truncation=True, padding=True, max_length=512)

output = model(torch.tensor([tokenized_text.input_ids]))

print(torch.argmax(output.logits, dim=-1))

tensor([0])
