In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5ForConditionalGeneration, T5Tokenizer

def load_config(max_length):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the pre-trained T5 model and tokenizer
    model_name = 't5-small'
    model = T5ForConditionalGeneration.from_pretrained(model_name, max_length = max_length)
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    
    return model, tokenizer, device


In [2]:
max_length = 128

data = pd.read_csv('../data_preprocess/datasets_combine.csv')
model, tokenizer, device = load_config(max_length)
train_df, val_df = train_test_split(data, test_size=0.1)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
def tokenize_and_mask(batch, tokenizer, max_length):
    tokenized_input = tokenizer(batch['sentence'], padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    labels = batch['label']
    return {
        'input_ids': tokenized_input['input_ids'].squeeze(0),
        'label': labels,
        'attention_mask': tokenized_input['attention_mask'].squeeze(0)
    }

# Dataset class
class TextDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        return {
            'input_ids': torch.tensor(item['input_ids'], dtype=torch.long).squeeze(0),
            'attention_mask': torch.tensor(item['attention_mask'], dtype=torch.long).squeeze(0),
            'label': torch.tensor(item['label'], dtype=torch.long).squeeze(0)
    }


train_df = train_df.apply(lambda x: tokenize_and_mask(x, tokenizer, max_length), axis=1)
val_df = val_df.apply(lambda x: tokenize_and_mask(x, tokenizer, max_length), axis=1) 

train_dataset = TextDataset(train_df)
val_dataset = TextDataset(val_df)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [4]:

# LSTM Classifier Model
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_dim):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        lstm_out, (hidden, _) = self.lstm(embedded)
        out = self.fc(hidden[-1])
        return out

def validate(model, val_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            _, predicted = torch.max(outputs, dim=1)
            correct_predictions += (predicted == labels).sum().item()

    avg_loss = total_loss / len(val_loader)
    accuracy = correct_predictions / len(val_loader.dataset)
    return avg_loss, accuracy


# Model hyperparameters
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = 2  # Adjust based on the number of target classes
num_epoch = 8

# Initialize T5 tokenizer
VOCAB_SIZE = tokenizer.vocab_size

# Initialize the model
model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, OUTPUT_DIM).to(device)

# Training setup
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epoch):  # Number of epochs
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)
        optimizer.zero_grad()
        predictions = model(input_ids)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
    val_loss, val_accuracy = validate(model, val_loader, criterion, device)
    print(f'Epoch: {epoch + 1}, Loss: {total_loss / len(train_loader)}')
    print(f'Val Loss: {val_loss}, Val Accuracy: {val_accuracy}')

# Save the model
torch.save(model.state_dict(), 'lstm_model.pth')

  'input_ids': torch.tensor(item['input_ids'], dtype=torch.long).squeeze(0),
  'attention_mask': torch.tensor(item['attention_mask'], dtype=torch.long).squeeze(0),


Epoch: 1, Loss: 0.5652549087934129
Val Loss: 0.24599706262350082, Val Accuracy: 0.8979721166032953
Epoch: 2, Loss: 0.17251003382366722
Val Loss: 0.12069351803511381, Val Accuracy: 0.9657794676806084
Epoch: 3, Loss: 0.06930512590396747
Val Loss: 0.08063274560496211, Val Accuracy: 0.9746514575411914
Epoch: 4, Loss: 0.03701489381172258
Val Loss: 0.06226297674234957, Val Accuracy: 0.9790874524714829
Epoch: 5, Loss: 0.028627749046348537
Val Loss: 0.08449332262622193, Val Accuracy: 0.9759188846641318
Epoch: 6, Loss: 0.01458167491495391
Val Loss: 0.06574166533420794, Val Accuracy: 0.9797211660329531
Epoch: 7, Loss: 0.00638283007457823
Val Loss: 0.11056686891126447, Val Accuracy: 0.9746514575411914
Epoch: 8, Loss: 0.0050408316095166125
Val Loss: 0.07858338020130759, Val Accuracy: 0.9816223067173637


In [15]:
# input a sentence and get the prediction
def predict(encoding, model=model, device=device):
    model.eval()
    with torch.no_grad():
        input_ids = encoding['input_ids'].flatten().to(device)
        outputs = model(input_ids)

        # normalize the scores to all positive and sum to 1
        outputs = torch.nn.functional.softmax(outputs, dim=0)

        return outputs

# record maxinmum token id and minimum token id
max_token_id = 0
min_token_id = 100000
for text in data['sentence']:
    encoding = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    for token_id in encoding['input_ids'].flatten():
        if token_id > max_token_id:
            max_token_id = token_id
        if token_id < min_token_id:
            min_token_id = token_id
    print(text)
    print(predict(encoding, model,device))

Claims she suffered catalogue of abuse at hands of Italian former partner
tensor([9.9970e-01, 2.9856e-04], device='cuda:0')
Six crew and 158 passengers evacuated from American Airlines flight
tensor([9.9973e-01, 2.7187e-04], device='cuda:0')
ISABELLA:O just but severe law!I had a brother, then
tensor([5.9676e-05, 9.9994e-01], device='cuda:0')
Prosecutors say the two claimed $340,000 intended for victims of the Gulf spill
tensor([9.9946e-01, 5.3655e-04], device='cuda:0')
Moyes was sacked by Premier League club in April after a mediocre season
tensor([9.9975e-01, 2.5308e-04], device='cuda:0')
She says going shopping is ‘like being Julia Roberts in Pretty Woman'China is the number one international market to visit Westfield London
tensor([9.9974e-01, 2.5961e-04], device='cuda:0')
House expected to debate bill Wednesday and vote on it Thursday
tensor([9.9972e-01, 2.7658e-04], device='cuda:0')
CATESBY:My lord!KING RICHARD III:Good news or bad, that thou comest in so bluntly?CATESBY:Bad news

In [16]:
max_token_id

tensor(31997)

In [17]:
min_token_id

tensor(0)

In [18]:
VOCAB_SIZE

32000