In [1]:
import copy
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import confusion_matrix

from get_embedded_data import get_data_tokenizer_MLP, split_data, MAPPING
from models import MLP


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Use the GPU when one is available; otherwise fall back to the CPU so that
# every later `.to(device)` call still works on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
from transformers import BertModel, ElectraModel, RobertaModel
from transformers import BertTokenizer, ElectraTokenizer, RobertaTokenizer


# Checkpoint identifiers of the three pretrained encoders loaded in this cell.
BERT_CHECKPOINT = 'bert-base-uncased'
ROBERTA_CHECKPOINT = 'roberta-base'
ELECTRA_CHECKPOINT = 'google/electra-small-discriminator'


def _load_encoder(tokenizer_cls, model_cls, checkpoint):
    """Load the tokenizer first, then the model, from the same pretrained checkpoint."""
    tokenizer = tokenizer_cls.from_pretrained(checkpoint)
    encoder = model_cls.from_pretrained(checkpoint)
    return tokenizer, encoder


bert_tokenizer, bert_model = _load_encoder(BertTokenizer, BertModel, BERT_CHECKPOINT)
robert_tokenizer, robert_model = _load_encoder(RobertaTokenizer, RobertaModel, ROBERTA_CHECKPOINT)
electra_tokenizer, electra_model = _load_encoder(ElectraTokenizer, ElectraModel, ELECTRA_CHECKPOINT)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def _percent(hits, count):
    """Return ``hits / count`` as a percentage rounded to 3 decimals (0.0 if ``count`` is 0)."""
    return round(100 * hits / count, 3) if count else 0.0


def train(max_epoch, network, train_dataloader, test_dataloader, optimizer, criterion, num_classes=9):
    """Train ``network``, keep the best checkpoint, and evaluate it on the test data.

    After every epoch the network is scored on ``test_dataloader``; whenever the
    accuracy strictly improves, a deep copy of the network is kept. Once all
    epochs are done, that best copy is evaluated again on ``test_dataloader`` to
    collect predictions, timing and per-class statistics.

    Args:
        max_epoch: Number of passes over ``train_dataloader``.
        network: ``torch.nn.Module`` to optimise. Batches are moved to the
            device of its first parameter (CPU if it has no parameters).
        train_dataloader: Iterable yielding ``(inputs, labels)`` batches.
        test_dataloader: Iterable yielding ``(inputs, labels)`` batches used for
            both per-epoch validation and the final evaluation.
        optimizer: Optimizer already bound to ``network``'s parameters.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        num_classes: Number of class indices ``0 .. num_classes - 1`` tracked in
            the per-class statistics. Defaults to 9.

    Returns:
        A tuple ``(best_network, final_accuracy, best_accuracy, all_labels,
        all_predictions, pr_rec_f1, pred_time, history)`` where

        * ``best_network`` is the best checkpoint, returned in eval mode (it is
          ``network`` itself if validation accuracy never rose above 0),
        * ``final_accuracy`` / ``best_accuracy`` are percentages rounded to 3
          decimals,
        * ``all_labels`` / ``all_predictions`` are flat lists of class indices
          from the final evaluation,
        * ``pr_rec_f1`` maps each class index to ``[precision, recall, f1]``,
          using ``-1`` where a score is undefined (zero denominator),
        * ``pred_time`` is the elapsed time of the final evaluation in seconds
          (non-negative),
        * ``history`` is a DataFrame with one row per epoch; its ``loss`` column
          is the mean training loss per batch.
    """
    # Follow the network's own device instead of relying on a notebook-level global.
    first_param = next(network.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    history = []
    best_network = network
    best_accuracy = 0

    for epoch in range(max_epoch):
        # ---- optimisation pass ----
        network.train()  # validation below switches to eval mode, so re-enable each epoch
        running_loss = 0.0
        batch_count = 0
        correct = 0
        total = 0
        for inputs, labels in train_dataloader:
            inputs, labels = inputs.to(run_device), labels.to(run_device)

            optimizer.zero_grad()
            outputs = network(torch.flatten(inputs.double(), 1))
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            _, predicted = torch.max(outputs.detach(), 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            running_loss += loss.item()
            batch_count += 1

        # ---- validation pass: no dropout / batch-norm updates, no autograd graph ----
        network.eval()
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for val_inputs, val_labels in test_dataloader:
                val_inputs, val_labels = val_inputs.to(run_device), val_labels.to(run_device)
                val_outputs = network(torch.flatten(val_inputs.double(), 1))
                _, val_predicted = torch.max(val_outputs, 1)
                val_total += val_labels.size(0)
                val_correct += (val_predicted == val_labels).sum().item()

        val_accuracy = _percent(val_correct, val_total)
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            best_network = copy.deepcopy(network)

        # Mean loss per batch (previously divided by an unrelated constant, 2000).
        train_loss = running_loss / batch_count if batch_count else 0.0
        history.append([epoch + 1, max_epoch, round(train_loss, 3), _percent(correct, total), val_accuracy])

    # ---- final evaluation of the best checkpoint ----
    # Per class: [true positives, false negatives, false positives].
    class_counts = {cls: [0, 0, 0] for cls in range(num_classes)}
    correct = 0
    total = 0
    all_labels = []
    all_predictions = []

    best_network.eval()
    start_time = time.perf_counter()
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            inputs, labels = inputs.to(run_device), labels.to(run_device)
            outputs = best_network(torch.flatten(inputs.double(), 1))
            _, predicted = torch.max(outputs, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

            for pr, lab in zip(predicted.tolist(), labels.tolist()):
                if pr == lab:
                    class_counts[pr][0] += 1      # TP
                else:
                    class_counts[lab][1] += 1     # FN for the true class
                    class_counts[pr][2] += 1      # FP for the predicted class
    # Elapsed = end - start (the operands used to be swapped, giving a negative time).
    pred_time = time.perf_counter() - start_time

    pr_rec_f1 = {}
    for cls, (tp, fn, fp) in class_counts.items():
        precision = -1 if tp + fp == 0 else tp / (tp + fp)
        recall = -1 if tp + fn == 0 else tp / (tp + fn)
        f1_score = -1 if tp + fn + fp == 0 else 2 * tp / (2 * tp + fn + fp)
        pr_rec_f1[cls] = [precision, recall, f1_score]

    final_accuracy = _percent(correct, total)
    history_df = pd.DataFrame(
        history,
        columns=["Epoch", "Max Epoch", "loss", "train data accuracy", "test data accuracy"],
    )

    return best_network, final_accuracy, best_accuracy, all_labels, all_predictions, pr_rec_f1, pred_time, history_df


In [5]:
# Experiment grid: every encoder x every set of removed authors x every loss weighting.
models_list = [("bert", bert_tokenizer, bert_model), ("roberta", robert_tokenizer, robert_model), ("electra", electra_tokenizer, electra_model)]
labels_to_delete_list = [[], ['spinoza', "hegel", "plato"], ['aristotle', 'freud', 'kant', 'nietzsche', 'sartre', 'schopenhauer']]

# Per-class weights handed to CrossEntropyLoss. Kept as a plain list as well,
# because its repr is embedded in the output file names below.
CLASS_WEIGHTS = [1, 1, 2, 3, 1.33, 1.33, 5, 1, 2.5]
weigth_list = [None, torch.tensor(CLASS_WEIGHTS, dtype=torch.double).to(device)]

# Create the output folders up front so a missing directory cannot crash the
# run after a full training pass has already completed.
os.makedirs("train_models", exist_ok=True)
os.makedirs("train_data", exist_ok=True)

full_data = []
for labels_to_delete in labels_to_delete_list:
    X_train, X_test, y_train, y_test = split_data("data_set.csv", "author", "quote", test_size=0.2, separator="@", mapping=MAPPING,
                                                labels_to_delete=labels_to_delete)
    for name, tokenizer, model in models_list:
        # The loaders are built once per encoder and reused for both weightings.
        train_dataloader, shape = get_data_tokenizer_MLP(batch=20*5, words=X_train, labels=y_train, device=device, tokenizer=tokenizer, model=model)
        # NOTE(review): `shape` is overwritten by the test-set call; presumably both calls return the same value - confirm.
        test_dataloader, shape = get_data_tokenizer_MLP(batch=1, words=X_test, labels=y_test, device=device, tokenizer=tokenizer, model=model)
        for weigths in weigth_list:
            network = MLP(shape*125, 125*2, 125, 9, dropout=0).to(device)
            criterion = nn.CrossEntropyLoss(weight=weigths)
            optimizer = optim.Adam(network.parameters())
            best_network, test_acc, best_accuracy, all_labels, all_predictions, pr_rec_f1, pred_time, train_data = train(max_epoch=20, network=network, train_dataloader=train_dataloader, test_dataloader=test_dataloader, optimizer=optimizer, criterion=criterion)

            # Printable form of the weighting used in file names and in the
            # summary table; a separate name keeps the loop variable intact.
            weigths_tag = [] if weigths is None else CLASS_WEIGHTS
            torch.save(best_network, f"train_models/{name}_{labels_to_delete}_{weigths_tag}.pt")
            train_data.to_csv(f"train_data/{name}_{labels_to_delete}_{weigths_tag}.csv")
            full_data.append([name, labels_to_delete, weigths_tag, test_acc, best_accuracy, all_labels, all_predictions, pr_rec_f1, pred_time])

full_data_df = pd.DataFrame(full_data, columns=["name", "labels_to_delete", "weigths", "test_acc", "best_accuracy", "all_labels", "all_predictions", "pr_rec_f1", "pred_time"])
full_data_df.to_csv("train_data/full_data.csv")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


['aristotle' 'freud' 'hegel' 'kant' 'nietzsche' 'plato' 'sartre'
 'schopenhauer' 'spinoza']


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [6]:
# cm = confusion_matrix([MAPPING[x] for x in all_labels], [MAPPING[x] for x in all_predictions])

# plt.figure(figsize=(10,7))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.title('Confusion Matrix')
# plt.show()