In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import BertForSequenceClassification, BertTokenizer, XLNetTokenizer, XLNetForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, f1_score
import os
import time
from classes import *

In [None]:
# Sanity check of the GPU setup: first whether torch can see a CUDA device,
# then the CUDA version this torch build was compiled against.
import torch

for cuda_info in (torch.cuda.is_available(), torch.version.cuda):
    print(cuda_info)

In [None]:
# Used to mark different versions of the models/results as this script is updated.
# The value is embedded in the names of the CSV files written at the end of the notebook.
version = 1

In [None]:
# Candidate checkpoint ids:
#   bert-base-uncased
#   distilbert-base-uncased
#   xlnet-base-uncased
#   roberta-base
#   distilroberta-base
#
# NOTE(review): the Hugging Face hub normally publishes XLNet as
# 'xlnet-base-cased'; confirm that 'xlnet-base-uncased' actually resolves
# (hub or local directory) before running the full grid search.

# List which models to train (they are trained in this order)
model_list = [
    'xlnet-base-uncased',
    'bert-base-uncased',
    'distilbert-base-uncased',
    'roberta-base',
    'distilroberta-base',
]

In [None]:
# Load the data.
# Every line is expected to look like "<prefix>__label__<L><text>", where <L> is a
# single character holding the label and <text> is the review.
LABEL_MARKER = '__label__'


def load_labelled_lines(path):
    """Read one labelled text file and return ``(texts, labels)``.

    The file is streamed line by line instead of being materialised with
    ``readlines()``, so only the parsed lists are kept in memory.

    Args:
        path: Path of a UTF-8 text file with one labelled example per line.

    Returns:
        A tuple ``(texts, labels)`` of two equally long lists of ``str``.
        ``labels`` holds the single character that follows the label marker;
        ``texts`` holds the rest of the line with surrounding whitespace
        (separator space, trailing newline) removed.

    Raises:
        ValueError: If a non-blank line has no label marker or nothing after it.
    """
    texts, labels = [], []
    with open(path, 'r', encoding='utf-8') as file:
        for line_no, line in enumerate(tqdm(file, desc=path), start=1):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            # partition() splits on the FIRST marker only, so a review that itself
            # contains '__label__' is kept whole instead of being truncated.
            _, marker, payload = line.partition(LABEL_MARKER)
            if not marker or not payload.strip():
                raise ValueError(f'{path}:{line_no}: expected "{LABEL_MARKER}<label> <text>", got {line!r}')
            labels.append(payload[0])
            # Strip the separator space and the newline so they do not end up as
            # extra tokens for whitespace-sensitive tokenizers.
            texts.append(payload[1:].strip())
    return texts, labels


train_raw, train_label_raw = load_labelled_lines('../input/train.ft.txt')
test_raw, test_label_raw = load_labelled_lines('../input/test.ft.txt')

In [None]:
# Wrap the raw text/label lists in DataFrames and recode the labels.
# Raw labels 1 and 2 are mapped onto the class indices 0 and 1.
target_map = {1:0, 2:1}

train = pd.DataFrame({'text': train_raw, 'target': train_label_raw})
test = pd.DataFrame({'text': test_raw, 'target': test_label_raw})

for frame in (train, test):
    # The labels are cast to int first so that they match the integer keys
    # of target_map.
    frame['target'] = frame['target'].astype(int).replace(target_map)

In [None]:
# Class moved to classes.py.
# The string literal below is NOT executed: it only keeps the previous in-notebook
# definition of amazon_dataset for reference. The class used further down is the
# one brought in by `from classes import *`.
'''class amazon_dataset(Dataset):
    def __init__(self, encoded_data, labels):
        self.input_ids = encoded_data["input_ids"]
        self.attention_mask = encoded_data["attention_mask"]
        self.labels = torch.tensor(labels, dtype=torch.long)  # Convert labels to tensor

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx]
        }'''

In [None]:
# Number of training examples to use; the test subset is one eighth of this size.
train_len = 1000
test_len = int(train_len/8)

# Take the leading rows of each frame and convert them to plain Python lists.
train_text = train['text'].iloc[:train_len].tolist()
train_targets = train['target'].iloc[:train_len].tolist()
test_text = test['text'].iloc[:test_len].tolist()
test_targets = test['target'].iloc[:test_len].tolist()

In [None]:
# Two-bin histogram of the training targets, i.e. one bar per class.
fig, ax = plt.subplots()
ax.hist(train_targets, bins=2)
plt.show()

In [None]:
# Grid search: for every model in `model_list` and every (lr, dropout) pair in
# `param_grid`, fine-tune a FRESH copy of the pre-trained model, measure
# train/test accuracy, and save the fine-tuned model and tokenizer to disk.
#
# Outputs left in the notebook namespace:
#   results    - {run index: {'model', 'Learning Rate', 'Dropout Probability',
#                             'Train Accuracy', 'test Accuracy'}}
#   time_dict  - {model id: mean training time in seconds over its grid points}
#   model_name - name of the last configuration that was trained
results = {}
time_dict = {}

epochs = 1
batch_size = 32

# Set params for grid search
param_grid = {
    'lr': [5e-5, 4e-5, 3e-5, 2e-5],
    'dropout_rate': [0.1, 0.2, 0.3],
    # Add more hyperparameters as needed
}

total_models_per_model = len(param_grid['lr']) * len(param_grid['dropout_rate'])

# model id -> (tokenizer class, model class, config field holding the dropout probability).
# NOTE(review): the dropout field names are the main dropout setting of each
# architecture's config; confirm these are the knobs the grid search is meant to vary.
MODEL_REGISTRY = {
    'bert-base-uncased': (BertTokenizer, BertForSequenceClassification, 'hidden_dropout_prob'),
    'distilbert-base-uncased': (DistilBertTokenizer, DistilBertForSequenceClassification, 'dropout'),
    'xlnet-base-uncased': (XLNetTokenizer, XLNetForSequenceClassification, 'dropout'),
    # Also accepted so that model_list can use the cased XLNet checkpoint id.
    'xlnet-base-cased': (XLNetTokenizer, XLNetForSequenceClassification, 'dropout'),
    'roberta-base': (RobertaTokenizer, RobertaForSequenceClassification, 'hidden_dropout_prob'),
    'distilroberta-base': (RobertaTokenizer, RobertaForSequenceClassification, 'hidden_dropout_prob'),
}


def _encode(tokenizer, texts):
    """Tokenize `texts` into padded/truncated (max 128 tokens) PyTorch tensors."""
    return tokenizer.batch_encode_plus(texts, add_special_tokens=True, truncation=True, padding=True,
                                       return_tensors='pt', max_length=128, return_attention_mask=True)


def _train(model, dataloader, optimiser, device, num_epochs):
    """Run `num_epochs` passes over `dataloader`, stepping `optimiser` on the model's own loss."""
    model.train()
    for _ in range(num_epochs):
        for batch in dataloader:
            optimiser.zero_grad()
            outputs = model(batch['input_ids'].to(device),
                            attention_mask=batch['attention_mask'].to(device),
                            labels=batch['labels'].to(device))
            outputs.loss.backward()
            optimiser.step()


def _predict(model, dataloader, device):
    """Return `(predictions, true_labels)` as flat numpy arrays for every example in `dataloader`."""
    model.eval()
    preds, true = [], []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['input_ids'].to(device),
                           attention_mask=batch['attention_mask'].to(device)).logits
            preds.append(torch.argmax(logits, dim=-1).cpu().numpy())
            true.append(batch['labels'].cpu().numpy())
    return np.concatenate(preds), np.concatenate(true)


# Set the processor to the best available option
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using: {device}')

# Running index used as the key into `results` (named run_id so the builtin id() is not shadowed).
run_id = 0

for model_id in model_list:

    print(model_id)

    if model_id not in MODEL_REGISTRY:
        raise ValueError(f'Model not recognised: {model_id}')
    tokenizer_cls, model_cls, dropout_field = MODEL_REGISTRY[model_id]

    # The tokenizer and the encoded data depend only on the model id, so they are
    # built once per model rather than once per grid point.
    tokenizer = tokenizer_cls.from_pretrained(model_id)
    train_dataset = amazon_dataset(_encode(tokenizer, train_text), train_targets)
    test_dataset = amazon_dataset(_encode(tokenizer, test_text), test_targets)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    time_list = []

    with tqdm(total=total_models_per_model) as progress_bar:
        for lr in param_grid['lr']:
            for dropout_rate in param_grid['dropout_rate']:

                # Release the previous grid point's model/optimiser before loading the
                # next one so that two models never sit on the device at the same time.
                model = optimiser = None
                if device.type == 'cuda':
                    torch.cuda.empty_cache()

                # Start every grid point from the pre-trained weights. Reusing one model
                # across grid points would let each run continue from the previous run's
                # fine-tuned weights and make the comparison meaningless. The dropout
                # value is passed as a config override so that it actually takes effect.
                model = model_cls.from_pretrained(model_id, num_labels=2, **{dropout_field: dropout_rate})
                model.to(device)

                optimiser = AdamW(params=model.parameters(), lr=lr)

                # Only the training itself is timed (not loading or evaluation).
                start_time = time.time()
                _train(model, train_dataloader, optimiser, device, epochs)
                time_list.append(time.time() - start_time)

                # Get the training and testing accuracies
                train_flat_preds, train_flat_true = _predict(model, train_dataloader, device)
                test_flat_preds, test_flat_true = _predict(model, test_dataloader, device)
                train_accuracy = accuracy_score(train_flat_true, train_flat_preds)
                test_accuracy = accuracy_score(test_flat_true, test_flat_preds)

                print(f'lr={lr}, batch_size={batch_size} dropout={dropout_rate}, train accuracy={train_accuracy}, test accuracy={test_accuracy}')

                model_name = f'{model_id}_{lr}_{batch_size}_{dropout_rate}'

                # Save the results to a dictionary
                results[run_id] = {'model':model_id, 'Learning Rate': lr, 'Dropout Probability': dropout_rate, 'Train Accuracy': train_accuracy, 'test Accuracy': test_accuracy}

                # Create directory for models
                output_dir = f'../modelsV2/{model_name}'
                os.makedirs(output_dir, exist_ok=True)

                # Save models
                model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)

                # Track progress of the grid search
                progress_bar.update(1)
                run_id += 1

    # Mean training time per grid point for this model
    time_dict[model_id] = sum(time_list)/len(time_list)

In [None]:
# model_name is reassigned on every grid-search iteration, so at this point it
# holds the name of the last configuration that was trained and saved.
print(model_name)

In [None]:
# Save all results to dataframes.
# results is {run index: {field: value}}, so results_df has one COLUMN per run and
# one row per field (use results_df.T for one row per run).
results_df = pd.DataFrame(results)
# time_dict maps model id -> a single float. A dict of scalars needs an explicit
# index, otherwise pandas raises "If using all scalar values, you must pass an index".
# The result is one column per model and a single row with the mean training time,
# mirroring the orientation of results_df.
time_df = pd.DataFrame(time_dict, index=['mean_train_seconds'])

In [None]:
# Save dataframes as CSVs; the file names carry the notebook `version`.
results_dir = 'results/base'
# to_csv does not create missing directories, so make sure the target exists
# instead of failing only after the whole grid search has finished.
os.makedirs(results_dir, exist_ok=True)
results_df.to_csv(f'{results_dir}/resultsV{version}.csv')
time_df.to_csv(f'{results_dir}/timeV{version}.csv')