In [None]:
!pip install portalocker

Collecting portalocker
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.8.2


In [None]:
import torch
from   torchtext.datasets        import AG_NEWS
from   torchtext.data.functional import to_map_style_dataset
import matplotlib.pyplot         as plt
import seaborn                   as sbn

# IMPORT NECESSARI PER CREARE IL VOCABOLARIO
from torchtext.data              import get_tokenizer
from torchtext.vocab             import build_vocab_from_iterator

from torch.utils.data            import DataLoader
from torch                       import nn
from torch.nn                    import functional as F

# PER IL CALCOLO DELLA LOSS E DELL'ACCURACY
from sklearn.metrics            import accuracy_score
from tqdm                       import tqdm


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Lettura dati dal dataset AG_NEWS

Tale dataset contiene una collezione di articoli raccolti da più di 2000 fonti di notizie in più di un anno di attività. È stato costruito per scopi di ricerca in data mining (clustering, classification, ecc.), information retrieval (ranking, search, ecc.), XML, data compression, data streaming e qualsiasi altra attività non commerciale.

Le categorie presenti in tale dataset sono:
 - World
 - Sports
 - Business
 - Sci/Tech

In [None]:
# NOTE
# to_map_style_dataset converts an iterable-style dataset into a map-style dataset.
agnews_train, agnews_test = (
    to_map_style_dataset(split_iter) for split_iter in AG_NEWS(split=('train', 'test'))
)

**Contiamo quanti record ci sono all'interno del train e del test e calcoliamo la frequenza di ogni classe in entrambi.**

In [None]:
categories = ["World", "Sports", "Business", "Sci/Tech"]

# TRAIN: label of every record, then the distinct labels
train_labels       = [record[0] for record in agnews_train]
train_labels_nodup = set(train_labels)

# TEST: same for the test split
test_labels       = [record[0] for record in agnews_test]
test_labels_nodup = set(test_labels)

In [None]:
# Number of records in each split (one label was collected per record).
print("Dimensione del train: ", len(train_labels))
print("Dimensione del test: " , len(test_labels))

Domensione del train:  120000
Domensione del test:  7600


In [None]:
def freq_computing(labels_vet):
  """Count how many times each label occurs.

  Args:
    labels_vet: iterable of hashable labels.

  Returns:
    dict mapping each label to its number of occurrences, with keys in
    order of first appearance.
  """
  counts = {}
  for lbl in labels_vet:
    counts[lbl] = counts.get(lbl, 0) + 1
  return counts

# Per-class record counts for both splits.
train_labels_freq, test_labels_freq = (
    freq_computing(labels) for labels in (train_labels, test_labels)
)

print("Frequenza etichette del train: ", train_labels_freq)
print("Frequenza etichette del test: " , test_labels_freq)

Frequenza etichette del train:  {3: 30000, 4: 30000, 2: 30000, 1: 30000}
Frequenza etichette del test:  {3: 1900, 4: 1900, 2: 1900, 1: 1900}


Ciò che faremo adesso è definire il tokenizer (che suddivide i testi in parole) e, successivamente, implementare una funzione che, a partire dai testi di train e test, crea il vocabolario dei termini.

Nota:
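 - min_freq: frequenza minima che una parola deve avere per essere mantenuta nel vocabolario; qui vale 1, quindi vengono mantenute tutte le parole
 - specials: simboli speciali aggiunti al vocabolario; `<UNK>` viene poi impostato come indice di default, cioè l'indice restituito per i token non presenti nel vocabolario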

In [None]:
# Tokenizer used throughout the notebook to split a text into tokens.
tokenizer = get_tokenizer("basic_english")

def vocabulary_creation(dataset_list):
  """Yield the token list of every text in the given datasets.

  Args:
    dataset_list: iterable of datasets, each yielding (label, text) pairs.

  Yields:
    The result of `tokenizer(text)` for each text, one dataset after the other.
  """
  for dataset in dataset_list:
    yield from (tokenizer(text) for _, text in dataset)

# The vocabulary is built from the tokens of BOTH splits (train and test).
datasets = [agnews_train, agnews_test]
vocab    = build_vocab_from_iterator(
    vocabulary_creation(datasets),
    min_freq = 1,
    specials = ["<UNK>"]
)

# Tokens that are not in the vocabulary are mapped to the "<UNK>" index.
vocab.set_default_index(vocab["<UNK>"])
print("Dimensione del dizionario: ", len(vocab))

Dimensione del dizionario:  98635


In [None]:
# Tokenizer example: split a sentence into tokens, then look up their vocabulary indexes
tokens  = tokenizer("Hello how are you?, Welcome to Google Colaboratory")
indexes = vocab(tokens)

print("Tokens: ", tokens)
print("Indexes: ", indexes)

Tokens:  ['hello', 'how', 'are', 'you', '?', ',', 'welcome', 'to', 'google', 'colaboratory']
Indexes:  [12388, 355, 42, 164, 80, 3, 3298, 4, 202, 0]


INIZIO

CASO 1:
Trasformiamo i vari record del dataset in un vettore di 25 token indicizzati tramite il vocabolario creato.
Pertanto:
 - Se il record ha esattamente 25 token, la funzione trasforma il vettore di token in un vettore di indici (presenti nel vocabolario).
 - Se ne ha più di 25, tronca ai primi 25 e poi effettua l'operazione descritta nel punto precedente.
 - Se ne ha meno, converte i token in indici e poi allunga il vettore fino a 25 aggiungendo zeri in coda (nota: 0 è anche l'indice di `<UNK>`).

In [None]:
# Experiment 1: length (in tokens) every text is padded or truncated to
max_words = 25

def vectorize_batch(batch, max_words):
    """Collate (label, text) pairs into fixed-length tensors.

    Each text is tokenized, mapped to vocabulary indexes, cut to at most
    `max_words` indexes and right-padded with 0 up to `max_words`.

    Args:
        batch: sequence of (label, text) pairs, e.g. [(c1, t1), (c2, t2)].
        max_words: number of indexes kept per text.

    Returns:
        (X, Y): X is an int32 tensor with one row of `max_words` indexes per
        text; Y is the tensor of labels shifted by -1 so they start at 0
        instead of 1.
    """
    # zip(*batch) turns [(c1, t1), (c2, t2)] into (c1, c2), (t1, t2)
    labels, texts = zip(*batch)

    rows = []
    for text in texts:
        ids = vocab(tokenizer(text))[:max_words]       # truncate (no-op when shorter)
        rows.append(ids + [0] * (max_words - len(ids)))  # pad with zeros when shorter

    return torch.tensor(rows, dtype=torch.int32), torch.tensor(labels) - 1


# max_words is bound as a default argument when the loader is created. A plain
# closure would look the global up at call time, so rebinding max_words later
# (e.g. for another experiment) would silently change these loaders' padding.
train_loader = DataLoader(agnews_train, batch_size=1024, shuffle=True, pin_memory=True,
                          collate_fn=lambda batch, max_words=max_words: vectorize_batch(batch, max_words))
test_loader  = DataLoader(agnews_test , batch_size=1024, pin_memory=True,
                          collate_fn=lambda batch, max_words=max_words: vectorize_batch(batch, max_words))

CREAZIONE DELLA RETE

 - Embedding Layer:
L'Embedding Layer associa a ogni indice del vocabolario un vettore di numeri di lunghezza pari a `embed_len`.
Quando passeremo il batch (batch_size, max_tokens) in ingresso all'embedding layer, otterremo in uscita un tensore di dimensioni (batch_size, max_tokens, embed_len): a ogni token sarà associato il rispettivo embedding vector.

 - LSTM Layer
Prende in input gli embedding generati dal livello precedente. Quindi l'input di tale livello è (batch_size, max_tokens, embed_len), mentre l'output sarà (batch_size, max_tokens, hidden_dim). Al livello lineare finale viene passato solo l'output dell'ultimo passo temporale.


In [None]:
# Model hyper-parameters
embed_len  = 50   # size of each embedding vector
hidden_dim = 75   # size of the LSTM hidden state
n_layers   = 1    # number of stacked LSTM layers
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs end to end on machines without CUDA.
device     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class LSTMClassifier(nn.Module):
    """Embedding -> LSTM -> linear classifier over batches of token indexes.

    Layer sizes are read from the module-level settings `vocab`, `embed_len`,
    `hidden_dim`, `n_layers` and `categories`.
    """

    def __init__(self, device):
        """Create the layers.

        Args:
            device: device stored on the instance as `self.device`.
        """
        super().__init__()
        self.device          = device
        self.embedding_layer = nn.Embedding(num_embeddings=len(vocab), embedding_dim=embed_len)
        self.lstm            = nn.LSTM(input_size=embed_len, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True)
        self.linear          = nn.Linear(hidden_dim, len(categories))

    def forward(self, X_batch):
        """Return one score per category for every sequence in the batch.

        Args:
            X_batch: integer tensor of token indexes, (batch_size, max_tokens).

        Returns:
            Tensor (batch_size, len(categories)) computed from the LSTM output
            at the last time step.
        """
        embeddings   = self.embedding_layer(X_batch)
        # Build the initial state on the device the activations actually live
        # on, so it can never disagree with where the model was moved to.
        hidden_carry = self.init_hidden_carry(embeddings.device, X_batch)
        output, _    = self.lstm(embeddings, hidden_carry)
        return self.linear(output[:, -1])

    def init_hidden_carry(self, device, X_batch):
        """Return the initial (hidden, carry) LSTM state for a batch.

        The state is all zeros: a random state drawn on every call would make
        the predictions for the same input change from one call to the next.
        The tensors are allocated directly on `device`, avoiding a host-side
        allocation plus copy on every forward pass.

        Args:
            device: device on which the state tensors are created.
            X_batch: the input batch; only its length (batch size) is used.

        Returns:
            Tuple (hidden, carry), each of shape
            (n_layers, len(X_batch), hidden_dim).
        """
        shape  = (n_layers, len(X_batch), hidden_dim)
        hidden = torch.zeros(shape, device=device)
        carry  = torch.zeros(shape, device=device)
        return (hidden, carry)

# Instantiate the classifier; the bare name on the last line displays its layer summary.
lstm_classifier = LSTMClassifier(device)
lstm_classifier

LSTMClassifier(
  (embedding_layer): Embedding(98635, 50)
  (lstm): LSTM(50, 75, batch_first=True)
  (linear): Linear(in_features=75, out_features=4, bias=True)
)

**Definizione delle funzioni che verranno utilizzate per valutare la bontà del modello (loss e accuracy)**

In [None]:
def CalcValLossAndAccuracy(model, loss_fn, val_loader):
    """Print the validation loss and accuracy of `model` over `val_loader`.

    The model is switched to evaluation mode and gradient tracking is
    disabled for the whole pass, so no autograd graph is built; the model's
    previous train/eval mode is restored afterwards.

    Args:
        model: torch module mapping a batch X to per-class scores.
        loss_fn: callable `loss_fn(preds, Y)` returning a scalar tensor.
        val_loader: iterable of (X, Y) batches.

    Prints:
        "Valid Loss": the mean of the per-batch losses.
        "Valid Acc": the fraction of samples whose arg-max score equals the label.
    """
    was_training = model.training
    model.eval()

    Y_shuffled, Y_preds, losses = [], [], []
    try:
        with torch.no_grad():
            for X, Y in val_loader:
                preds = model(X)
                losses.append(loss_fn(preds, Y).item())

                Y_shuffled.append(Y)
                Y_preds.append(preds.argmax(dim=-1))
    finally:
        # Leave the model in the mode the caller had it in (e.g. training).
        model.train(was_training)

    Y_shuffled = torch.cat(Y_shuffled)
    Y_preds    = torch.cat(Y_preds)

    print("Valid Loss : {:.3f}".format(torch.tensor(losses).mean()))
    accuracy = torch.tensor(torch.sum(Y_preds == Y_shuffled).item() / len(Y_preds))
    print("Valid Acc  : {:.3f}".format(accuracy))


def TrainModel(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
    """Train `model` for `epochs` passes over `train_loader`.

    After each epoch the mean of the per-batch training losses is printed and
    `CalcValLossAndAccuracy` is called on `val_loader`.

    Args:
        model: module mapping a batch X to per-class scores.
        loss_fn: callable `loss_fn(preds, Y)` returning a scalar tensor.
        optimizer: optimizer updating the model's parameters.
        train_loader: iterable of (X, Y) training batches.
        val_loader: iterable of (X, Y) validation batches.
        epochs: number of passes over `train_loader`.
    """
    for _ in range(epochs):
        batch_losses = []
        for X, Y in tqdm(train_loader):
            loss = loss_fn(model(X), Y)   # forward pass and loss
            batch_losses.append(loss.item())

            optimizer.zero_grad()         # clear gradients from the previous step
            loss.backward()               # compute the gradients
            optimizer.step()              # update the network weights

        print("Train Loss : {:.3f}".format(torch.tensor(batch_losses).mean()))
        CalcValLossAndAccuracy(model, loss_fn, val_loader)

**Spostiamo i dati sulla GPU**

In [None]:
def to_device(data, device):
    """Move a tensor, or a (nested) list/tuple of tensors, to `device`.

    Args:
        data: an object with a `.to()` method, or a list/tuple of such
            objects (nested arbitrarily).
        device: target device.

    Returns:
        The moved object; list and tuple inputs both come back as lists.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    return [to_device(item, device) for item in data]


class DeviceDataLoader():
    """Wrap a data loader so each batch is moved to a device as it is iterated."""

    def __init__(self, dl, device):
        """Store the wrapped loader `dl` and the target `device`."""
        self.dl, self.device = dl, device

    def __iter__(self):
        """Yield every batch of the wrapped loader after `to_device`."""
        for batch in self.dl:
            yield to_device(batch, self.device)

    def __len__(self):
        """Return the number of batches in the wrapped loader."""
        return len(self.dl)

# Wrap both loaders so batches are moved to `device` while iterating, and move
# the model to the same device. The trailing ';' suppresses the cell output.
train_loader    = DeviceDataLoader(train_loader, device)
test_loader     = DeviceDataLoader(test_loader,  device)
lstm_classifier = to_device(lstm_classifier, device);

**Addestriamo**

In [None]:
from torch.optim import Adam

# Training settings for experiment 1
epochs        = 10
learning_rate = 1e-3
loss_fn       = nn.CrossEntropyLoss()
optimizer     = Adam(lstm_classifier.parameters(), lr=learning_rate)

# NOTE: the test loader is passed as the validation loader here.
TrainModel(lstm_classifier, loss_fn, optimizer, train_loader, test_loader, epochs)

100%|██████████| 118/118 [00:05<00:00, 19.85it/s]


Train Loss : 0.522
Valid Loss : 0.503
Valid Acc  : 0.816


100%|██████████| 118/118 [00:04<00:00, 23.87it/s]


Train Loss : 0.432
Valid Loss : 0.453
Valid Acc  : 0.837


100%|██████████| 118/118 [00:04<00:00, 23.72it/s]


Train Loss : 0.376
Valid Loss : 0.419
Valid Acc  : 0.854


100%|██████████| 118/118 [00:06<00:00, 19.51it/s]


Train Loss : 0.332
Valid Loss : 0.396
Valid Acc  : 0.860


100%|██████████| 118/118 [00:04<00:00, 23.84it/s]


Train Loss : 0.300
Valid Loss : 0.385
Valid Acc  : 0.868


100%|██████████| 118/118 [00:06<00:00, 19.25it/s]


Train Loss : 0.271
Valid Loss : 0.366
Valid Acc  : 0.869


100%|██████████| 118/118 [00:04<00:00, 24.29it/s]


Train Loss : 0.246
Valid Loss : 0.363
Valid Acc  : 0.873


100%|██████████| 118/118 [00:06<00:00, 19.15it/s]


Train Loss : 0.226
Valid Loss : 0.362
Valid Acc  : 0.875


100%|██████████| 118/118 [00:04<00:00, 24.90it/s]


Train Loss : 0.206
Valid Loss : 0.363
Valid Acc  : 0.878


100%|██████████| 118/118 [00:05<00:00, 22.74it/s]


Train Loss : 0.188
Valid Loss : 0.359
Valid Acc  : 0.882


In [None]:
def MakePredictions(model, loader):
    """Run `model` over every batch of `loader` and collect labels and predictions.

    Inference runs in evaluation mode with gradient tracking disabled, and
    only the arg-max class of each batch is kept, so neither autograd graphs
    nor the full score tensors accumulate across batches. The model's previous
    train/eval mode is restored afterwards.

    Args:
        model: torch module mapping a batch X to per-class scores.
        loader: iterable of (X, Y) batches.

    Returns:
        Tuple (Y_actual, Y_preds) of NumPy arrays: the labels as yielded by
        the loader, and the predicted class indexes, in the same order.
    """
    was_training = model.training
    model.eval()

    Y_shuffled, Y_preds = [], []
    try:
        with torch.no_grad():
            for X, Y in loader:
                Y_preds.append(model(X).argmax(dim=-1))
                Y_shuffled.append(Y)
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    Y_preds, Y_shuffled = torch.cat(Y_preds), torch.cat(Y_shuffled)
    return Y_shuffled.detach().cpu().numpy(), Y_preds.detach().cpu().numpy()

# True labels and predicted classes over the whole test loader.
Y_actual, Y_preds = MakePredictions(lstm_classifier, test_loader)

In [None]:
# Overall accuracy, per-class precision/recall/F1, and the confusion matrix
# (rows and columns follow the order of `categories`).
print("Test Accuracy : {}".format(accuracy_score(Y_actual, Y_preds)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds, target_names=categories))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds))

Test Accuracy : 0.8825

Classification Report : 
              precision    recall  f1-score   support

       World       0.91      0.88      0.89      1900
      Sports       0.93      0.95      0.94      1900
    Business       0.86      0.83      0.84      1900
    Sci/Tech       0.84      0.87      0.85      1900

    accuracy                           0.88      7600
   macro avg       0.88      0.88      0.88      7600
weighted avg       0.88      0.88      0.88      7600


Confusion Matrix : 
[[1665   67   79   89]
 [  31 1812   30   27]
 [  76   43 1582  199]
 [  63   32  157 1648]]


CASO 2:
 - max_words = 50

In [None]:
# Experiment 2: texts are padded or truncated to 50 tokens
max_words    = 50

# max_words is bound as a default argument when the loader is created, so a
# later rebinding of the global cannot change these loaders' padding length.
train_loader = DataLoader(agnews_train, batch_size=1024, shuffle=True, pin_memory=True,
                          collate_fn=lambda batch, max_words=max_words: vectorize_batch(batch, max_words))
test_loader  = DataLoader(agnews_test , batch_size=1024, pin_memory=True,
                          collate_fn=lambda batch, max_words=max_words: vectorize_batch(batch, max_words))

In [None]:
train_loader    = DeviceDataLoader(train_loader, device)
test_loader     = DeviceDataLoader(test_loader,  device)
# Start from a freshly initialised model. Reusing the instance trained in the
# previous experiment would keep training its already-fitted weights, so the
# two max_words settings could not be compared on equal terms.
lstm_classifier = to_device(LSTMClassifier(device), device);

In [None]:
# Training settings for experiment 2 (same values as experiment 1); a new
# optimizer is created over the current parameters of `lstm_classifier`.
epochs        = 10
learning_rate = 1e-3
loss_fn       = nn.CrossEntropyLoss()
optimizer     = Adam(lstm_classifier.parameters(), lr=learning_rate)

# NOTE: the test loader is passed as the validation loader here.
TrainModel(lstm_classifier, loss_fn, optimizer, train_loader, test_loader, epochs)

100%|██████████| 118/118 [00:05<00:00, 22.27it/s]


Train Loss : 0.224
Valid Loss : 0.346
Valid Acc  : 0.894


100%|██████████| 118/118 [00:06<00:00, 17.96it/s]


Train Loss : 0.181
Valid Loss : 0.333
Valid Acc  : 0.900


100%|██████████| 118/118 [00:05<00:00, 22.56it/s]


Train Loss : 0.160
Valid Loss : 0.339
Valid Acc  : 0.896


100%|██████████| 118/118 [00:06<00:00, 18.21it/s]


Train Loss : 0.145
Valid Loss : 0.335
Valid Acc  : 0.899


100%|██████████| 118/118 [00:05<00:00, 22.66it/s]


Train Loss : 0.133
Valid Loss : 0.326
Valid Acc  : 0.902


100%|██████████| 118/118 [00:06<00:00, 17.81it/s]


Train Loss : 0.122
Valid Loss : 0.339
Valid Acc  : 0.900


100%|██████████| 118/118 [00:05<00:00, 22.71it/s]


Train Loss : 0.112
Valid Loss : 0.342
Valid Acc  : 0.898


100%|██████████| 118/118 [00:06<00:00, 18.26it/s]


Train Loss : 0.106
Valid Loss : 0.363
Valid Acc  : 0.896


100%|██████████| 118/118 [00:05<00:00, 22.31it/s]


Train Loss : 0.097
Valid Loss : 0.351
Valid Acc  : 0.900


100%|██████████| 118/118 [00:06<00:00, 18.54it/s]


Train Loss : 0.089
Valid Loss : 0.368
Valid Acc  : 0.896


In [None]:
# Predictions over the test loader, then overall accuracy, per-class
# precision/recall/F1 and the confusion matrix (ordered as `categories`).
Y_actual, Y_preds = MakePredictions(lstm_classifier, test_loader)
print("Test Accuracy : {}".format(accuracy_score(Y_actual, Y_preds)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds, target_names=categories))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds))

Test Accuracy : 0.8963157894736842

Classification Report : 
              precision    recall  f1-score   support

       World       0.91      0.90      0.91      1900
      Sports       0.96      0.95      0.96      1900
    Business       0.84      0.88      0.86      1900
    Sci/Tech       0.87      0.86      0.87      1900

    accuracy                           0.90      7600
   macro avg       0.90      0.90      0.90      7600
weighted avg       0.90      0.90      0.90      7600


Confusion Matrix : 
[[1708   33   79   80]
 [  42 1802   36   20]
 [  71   19 1673  137]
 [  48   14  209 1629]]
