In [2]:
import random
import spacy
nlp = spacy.load("en_core_web_sm")
import torch
import torchtext
import pandas as pd


In [None]:
from transformers import AutoModelForSeq2SeqLM

# huggingface hub model id
model_id = "philschmid/flan-t5-xxl-sharded-fp16"

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")

Downloading config.json:   0%|          | 0.00/759 [00:00<?, ?B/s]

ImportError: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install accelerate`

In [3]:
BATCH_SIZE = 32
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

cuda


In [29]:
TEXT = torchtext.data.Field(init_token="<bos>", eos_token="<eos>", tokenize=lambda text: [token.text for token in nlp(text)], include_lengths=True, lower=True, batch_first=True)
LABEL = torchtext.data.LabelField(sequential=False, dtype=torch.float, batch_first=True)
fields = [('label', LABEL), ('text', TEXT)]

data = torchtext.data.TabularDataset(path="/home/moh7596/Dataset/SMS_ Spam_Ham_Prediction/sms_spam.csv",
                                     format="CSV",
                                     fields=fields,
                                     skip_header=True)

training_data, validation_data, testing_data = data.split([0.7, 0.15, 0.15], random_state=random.seed(2023))

training_iterator = torchtext.data.BucketIterator(dataset=training_data,
                                                  batch_size=BATCH_SIZE,
                                                  sort_key=lambda x: len(x.text),
                                                  device=DEVICE,
                                                  shuffle=True,
                                                  sort_within_batch=True,
                                                  sort=False)
                                                  
validation_iterator = torchtext.data.BucketIterator(dataset=validation_data,
                                                    batch_size=BATCH_SIZE,
                                                    sort_key=lambda x: len(x.text),
                                                    device=DEVICE,
                                                    shuffle=True,
                                                    sort_within_batch=True,
                                                    sort=False)

testubg_iterat = torchtext.data.BucketIterator(dataset=testing_data,
                                               batch_size=BATCH_SIZE,
                                               sort_key=lambda x: len(x.text),
                                               device=DEVICE,
                                               shuffle=True,
                                               sort_within_batch=True,
                                               sort=False)

print(vars(training_data.examples[0]))
print(vars(validation_data.examples[0]))
print(vars(testing_data.examples[0]))

{'label': 'ham', 'text': ['wat', 'makes', 'some', 'people', 'dearer', 'is', 'not', 'just', 'de', 'happiness', 'dat', 'u', 'feel', 'when', 'u', 'meet', 'them', 'but', 'de', 'pain', 'u', 'feel', 'when', 'u', 'miss', 'dem', '!', '!', '!']}
{'label': 'ham', 'text': ['sat', 'right', '?', 'okay', 'thanks', '...']}
{'label': 'ham', 'text': ['when', 'you', 'and', 'derek', 'done', 'with', 'class', '?']}


In [42]:
vectors = torchtext.vocab.Vectors(name="/home/moh7596/Dataset/WordEmbedding/glove.6B/glove.6B.300d.txt")

TEXT.build_vocab(training_data, min_freq=1)
LABEL.build_vocab(training_data)
print("Size of TEXT vocabulary: ", len(TEXT.vocab))

print(TEXT.vocab["<eos>"])

Size of TEXT vocabulary:  7816
3


In [55]:
class LSTM(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(LSTM, self).__init__()
        #self.embedding = torch.nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        #self.embedding.weight.data.copy_(vectors.vectors)
        self.embedding = torch.nn.Embedding.from_pretrained(vectors.vectors)
        self.embedding.weight.requires_grad = False
        self.lstm = torch.nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True, bidirectional=bidirectional, dropout=dropout)
        self.fc =  torch.nn.Linear(2*hidden_dim, output_dim)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, text, text_length):
        x = self.embedding(text)
        x = torch.nn.utils.rnn.pack_padded_sequence(x, text_length, batch_first=True)
        output, (h_n, c_n) = self.lstm(x)

        hidden = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim=1)
        hidden = self.fc(hidden)
        x = self.sigmoid(hidden)

        return x
        
        
vocab_size = len(TEXT.vocab)
embedding_dim = 300
hidden_dim = 64
output_dim = 1
n_layers = 2
bidirectional = True
dropout = 0

model = LSTM(vocab_size=vocab_size,
             embedding_dim=embedding_dim,
             hidden_dim=hidden_dim,
             output_dim=output_dim,
             n_layers=n_layers,
             bidirectional=bidirectional,
             dropout=dropout
             ).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = torch.nn.BCELoss().to(DEVICE)
model


LSTM(
  (embedding): Embedding(400000, 300)
  (lstm): LSTM(300, 64, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [56]:
def accuracy(y_hat, y_true):
    #round y_hat to the closest integer (y_hat is an array of float numbers ranging from 0 to 1, y_true is an array of integer numbers 0 or 1)
    return (torch.round(y_hat) == y_true).float().sum().item()


def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    batch_acc = 0
    num_data = 0

    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        ## That is why you we set "include_length=True", becasue this length is used in the bucket iterator and pack_padded_sequence
        text, text_length = batch.text
        text = text.to(DEVICE)
        text_length = text_length.to('cpu')
        y_predict = model(text, text_length).squeeze()
        loss = criterion(y_predict, batch.label)
        loss.backward()
        optimizer.step()

        acc = accuracy(y_predict, batch.label)

        batch_acc += acc
        epoch_loss += loss.item()
        num_data += y_predict.shape[0]

    return epoch_loss, batch_acc/num_data


def evaluation(model, iterator, criterion):
    num_data = 0
    test_loss = 0
    test_acc = 0
    model.eval()
    for batch in iterator:
        text, text_length = batch.text
        text_length = text_length.to('cpu')
        y_predict = model(text, text_length).squeeze()
        
        loss = criterion(y_predict, batch.label)
        acc = accuracy(y_predict, batch.label)
        test_loss += loss.item()
        test_acc += acc

        num_data += y_predict.shape[0]

    return test_loss, test_acc/num_data

In [57]:
EPOCH = 15
for epoch in range(EPOCH):
    train_loss, train_acc = train(model, training_iterator, optimizer, criterion)
    test_loss, test_acc = evaluation(model, testubg_iterat, criterion)

    print(f"EPOCH: %2d \t Train Loss: %.3f \t Test loss: %.3f \t Test Accuracy: %.3f" % (epoch, train_loss, test_loss, test_acc))


EPOCH:  0 	 Train Loss: 61.626 	 Test loss: 10.635 	 Test Accuracy: 0.859
EPOCH:  1 	 Train Loss: 46.873 	 Test loss: 10.315 	 Test Accuracy: 0.859
EPOCH:  2 	 Train Loss: 44.244 	 Test loss: 9.699 	 Test Accuracy: 0.859
EPOCH:  3 	 Train Loss: 36.302 	 Test loss: 6.562 	 Test Accuracy: 0.859
EPOCH:  4 	 Train Loss: 25.484 	 Test loss: 5.289 	 Test Accuracy: 0.932
EPOCH:  5 	 Train Loss: 18.486 	 Test loss: 4.431 	 Test Accuracy: 0.947
EPOCH:  6 	 Train Loss: 16.181 	 Test loss: 4.370 	 Test Accuracy: 0.947
EPOCH:  7 	 Train Loss: 12.852 	 Test loss: 4.058 	 Test Accuracy: 0.957
EPOCH:  8 	 Train Loss: 11.420 	 Test loss: 3.465 	 Test Accuracy: 0.963
EPOCH:  9 	 Train Loss: 10.955 	 Test loss: 4.997 	 Test Accuracy: 0.952
EPOCH: 10 	 Train Loss: 9.190 	 Test loss: 3.289 	 Test Accuracy: 0.963
EPOCH: 11 	 Train Loss: 7.974 	 Test loss: 3.861 	 Test Accuracy: 0.964
EPOCH: 12 	 Train Loss: 8.827 	 Test loss: 3.649 	 Test Accuracy: 0.963
EPOCH: 13 	 Train Loss: 7.121 	 Test loss: 3.913 	 T