## Notebook for exploring finetuning using Flair

In [1]:
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import tqdm
import csv
from torch.optim import AdamW
from collections import Counter

import logging
# set the logging level to INFO
logging.basicConfig(level=logging.INFO)

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from flair.datasets import ClassificationCorpus
from flair.embeddings import (
    FlairEmbeddings,
    DocumentRNNEmbeddings,
    TransformerWordEmbeddings,
    TransformerDocumentEmbeddings
)
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

import flair
flair.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {flair.device}")


Using device: cpu


## Data Preparation

In [2]:
class DataProcessor:
    '''Data processor to ensure datasets adhere to Flair-compatible format.'''
    
    def __init__(self, output_dir: str):
        """
        initializes the DataProcessor class with the output directory.
        args:
            output_dir (str): Directory to save processed files.
        """
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def read_imdb(self, data_dir: str, is_train: bool) -> list:
        """
        reads IMDb dataset and returns it in Flair-compatible format.
        args:
            data_dir (str): Path to the IMDb dataset directory.
            is_train (bool): If True, process training data; else, process test data.
        returns:
            list: List of sentences with labels in Flair format.
        """
        data_folder = 'train' if is_train else 'test'
        data = []

        for label_folder in ['neg', 'pos']:
            path = os.path.join(data_dir, data_folder, label_folder)
            label = '__label__NEG' if label_folder == 'neg' else '__label__POS'

            for file in os.listdir(path):
                with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
                    text = f.read().replace('\n', ' ')
                    data.append(f"{label} {text}")

        return data

    def read_csv(self, csv_path: str, label_column_index: int, text_column_index: int) -> list:
        """
        reads a CSV file and returns data in Flair-compatible format.
        args:
            csv_path (str): Path to the CSV file.
            label_column_index (int): Index of the label column.
            text_column_index (int): Index of the text column.
        returns:
            list: List of sentences with labels in Flair format.
        """
        # Read CSV while skipping the header row
        try:
            df = pd.read_csv(csv_path, header=0, encoding="utf-8")
        except Exception as e:
            raise ValueError(f"Error reading CSV file: {e}")

        # Map labels and clean text
        label_mapping = {0: "NEG", 1: "POS"}
        data = []
        for index, row in df.iterrows():
            try:
                # Map label and clean text
                label = label_mapping[int(row[label_column_index])]
                text = str(row[text_column_index]).replace("\n", " ")
                data.append(f"__label__{label} {text}")
            except (KeyError, ValueError):
                raise ValueError(
                    f"Invalid data found in row: {row}. Expected label values: {list(label_mapping.keys())}."
                )
        return data


    def save_data(self, data_list: list, file_name: str):
        """
        saves a list of sentences to a file in the output directory.
        args:
            data_list (list): List of sentences to save.
            file_name (str): Name of the output file.
        """
        output_path = os.path.join(self.output_dir, file_name)
        with open(output_path, 'w', encoding='utf-8') as f:
            for item in data_list:
                f.write(f"{item}\n")

        print(f"Saved data to {output_path}")

    def split_data(self, data: list, dev_size: float = 0.1, random_state: int = 42) -> tuple:
        """
        splits data into training and validation sets.
        args:
            data (list): Full dataset.
            dev_size (float): Proportion of data to use for validation.
            random_state (int): Random seed for reproducibility.
        returns:
            tuple: Training and validation sets.
        """
        train_data, dev_data = train_test_split(data, test_size=dev_size, random_state=random_state)
        return train_data, dev_data

In [3]:
# initialize DataProcessor
data_dir = "../data/aclImdb"

try:
    # ensure output directory exists
    flair_data_folder = "../data/flair_data"
    os.makedirs(flair_data_folder, exist_ok=True)
    
    print(f"Directory ensured and flair path set to: {flair_data_folder}")

except Exception as e:
    print(f"An error occurred while setting up the Flair data folder: {e}")

Directory ensured and flair path set to: ../data/flair_data


In [4]:
processor = DataProcessor(output_dir=flair_data_folder)

# process IMDb dataset
train_data = processor.read_imdb(data_dir, is_train=True)
test_data = processor.read_imdb(data_dir, is_train=False)

# split training data into training and validation sets
train_data, dev_data = processor.split_data(train_data, dev_size=0.1)

# save the datasets
processor.save_data(train_data, 'train.txt')
processor.save_data(dev_data, 'dev.txt')
processor.save_data(test_data, 'test.txt')

Saved data to ../data/flair_data/train.txt
Saved data to ../data/flair_data/dev.txt
Saved data to ../data/flair_data/test.txt


Creating corpus for training

In [6]:
corpus_folder = Path(flair_data_folder)

# create training corpus
training_corpus = ClassificationCorpus(
    corpus_folder,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
    label_type='sentiment'
)

# print statistics
print(f"Number of training sentences: {len(training_corpus.train)}")
print(f"Number of validation sentences: {len(training_corpus.dev)}")
print(f"Number of test sentences: {len(training_corpus.test)}")

2024-12-11 21:18:33,691 Reading data from ../data/flair_data
2024-12-11 21:18:33,693 Train: ../data/flair_data/train.txt
2024-12-11 21:18:33,693 Dev: ../data/flair_data/dev.txt
2024-12-11 21:18:33,694 Test: ../data/flair_data/test.txt
2024-12-11 21:18:34,189 Initialized corpus ../data/flair_data (label type name is 'sentiment')
Number of training sentences: 22500
Number of validation sentences: 2500
Number of test sentences: 25000


Create label dictionary

In [8]:
label_dict = training_corpus.make_label_dictionary(label_type='sentiment')
print(label_dict)

2024-12-11 21:19:43,757 Computing label dictionary. Progress:


0it [00:00, ?it/s]
22500it [00:39, 571.64it/s]

2024-12-11 21:20:23,183 Dictionary created for label 'sentiment' with 2 values: POS (seen 11298 times), NEG (seen 11202 times)
Dictionary with 2 tags: POS, NEG





## Training and Evaluation

Flair’s ModelTrainer not only logs the training process with evaluation against the dev.txt dataset at each epoch but also identifies the best-performing model based on the F1 score. At the end of all epochs, this best model is automatically evaluated against the test.txt dataset specified in the ClassificationCorpus, providing validation for the training results.

However, the evaluate_model function was created to allow evaluation of the best model against any input dataset, especially in scenarios where training is terminated early due to extended runtime or observed performance plateau. Additionally, this function was employed to evaluate the test_data_movie.csv dataset provided later on, ensuring flexibility in validating the model on unseen or supplementary datasets.

### Stacked Embeddings: Flair and Distilbert-base-uncased

In [None]:
# initialise different embeddings
flair_forward_embedding = FlairEmbeddings('news-forward')
flair_backward_embedding = FlairEmbeddings('news-backward')
distilbert_transformer_word_embeddings = TransformerWordEmbeddings('distilbert-base-uncased')

# list of embeddings
flair_distilbert_embeddings = [
    flair_forward_embedding,
    flair_backward_embedding,
    distilbert_transformer_word_embeddings,
]

# create document embeddings from stacked embeddings
flair_distilbert_document_embeddings = DocumentRNNEmbeddings(
    embeddings=flair_distilbert_embeddings,
    hidden_size=256,
    reproject_words=True,
    reproject_words_dimension=256,
)

In [None]:
# initialise text classifier model
flair_distilbert_classifier = TextClassifier(
    flair_distilbert_document_embeddings,
    label_dictionary=label_dict,
    label_type='sentiment'
)

In [None]:
# Determine embeddings_storage_mode based on GPU availability
embeddings_storage_mode = 'gpu' if torch.cuda.is_available() else 'none'

# initialise the trainer
flair_distilbert_trainer = ModelTrainer(flair_distilbert_classifier, training_corpus)

# fine-tune the model
flair_distilbert_trainer.fine_tune(
    base_path='../flair_models/flair_distilbert_model',     # directory to save the model and logs
    learning_rate=5e-5,                                     
    mini_batch_size=16,                                     
    max_epochs=10,                                         
    embeddings_storage_mode=embeddings_storage_mode,     
    optimizer=AdamW,                                        
    save_final_model=True,                                  # save the final model
    save_model_each_k_epochs=1,                             # save model checkpoint every epoch
    create_file_logs=True,                                  # save logs to a file
    create_loss_file=True,                                  # save loss values to a file
)

### Stacked Embeddings: Flair and RoBERTa-large

In [None]:
# initialise different embeddings
flair_forward_embedding = FlairEmbeddings('news-forward')
flair_backward_embedding = FlairEmbeddings('news-backward')
roberta_transformer_word_embeddings = TransformerWordEmbeddings('roberta-large')

# list of embeddings
flair_roberta_embeddings = [
    flair_forward_embedding,
    flair_backward_embedding,
    roberta_transformer_word_embeddings,
]

# create document embeddings from stacked embeddings
flair_roberta_document_embeddings = DocumentRNNEmbeddings(
    embeddings=flair_roberta_embeddings,
    hidden_size=256,
    reproject_words=True,
    reproject_words_dimension=256,
)

In [None]:
# initialise text classifier model
flair_roberta_classifier = TextClassifier(
    flair_roberta_document_embeddings,
    label_dictionary=label_dict,
    label_type='sentiment'
)

In [None]:
# Determine embeddings_storage_mode based on GPU availability
embeddings_storage_mode = 'gpu' if torch.cuda.is_available() else 'none'

# initialise the trainer
flair_roberta_trainer = ModelTrainer(flair_roberta_classifier, training_corpus)

# fine-tune the model
flair_roberta_trainer.fine_tune(
    base_path='../flair_models/flair_roberta_model',        # directory to save the model and logs
    learning_rate=5e-5,                                     
    mini_batch_size=2,                                      
    max_epochs=5,                                           
    embeddings_storage_mode=embeddings_storage_mode,     
    optimizer=AdamW,                                        
    save_final_model=True,                                  # save the final model
    save_model_each_k_epochs=1,                             # save model checkpoint every epoch
    create_file_logs=True,                                  # save logs to a file
    create_loss_file=True,                                  # save loss values to a file
)

### Flair embeddings

In [None]:
# initialise different embeddings
flair_forward_embedding = FlairEmbeddings("news-forward")
flair_backward_embedding = FlairEmbeddings("news-backward")

# list of embeddings
flair_embeddings = [
    flair_forward_embedding,
    flair_backward_embedding,
]

# create document embeddings from stacked embeddings
flair_document_embeddings = DocumentRNNEmbeddings(
    embeddings=flair_embeddings,
    hidden_size=256,
    reproject_words=True,
    reproject_words_dimension=256,
)

In [None]:
# initialise text classifier model
flair_classifier = TextClassifier(
    embeddings=flair_document_embeddings,
    label_dictionary=label_dict,
    label_type="sentiment",
)

In [None]:
# Determine embeddings_storage_mode based on GPU availability
embeddings_storage_mode = 'gpu' if torch.cuda.is_available() else 'none'

# initialise the trainer
flair_trainer = ModelTrainer(flair_classifier, training_corpus)

# fine-tune the model
flair_trainer.fine_tune(
    base_path='../flair_models/flair_model',        # directory to save the model and logs
    learning_rate=5e-5,                             
    mini_batch_size=8,                              
    max_epochs=5,                                   
    embeddings_storage_mode=embeddings_storage_mode,     
    optimizer=AdamW,                              
    save_final_model=True,                          # save the final model
    save_model_each_k_epochs=1,                     # save model checkpoint every epoch
    create_file_logs=True,                          # save logs to a file
    create_loss_file=True,                          # save loss values to a file
)

### Further exploration: RoBERTa

While our team initially explored RoBERTa fine-tuning without incorporating Flair, we aim to further experiment and compare the results of fine-tuning RoBERTa both with and without the Flair framework. This comparison seeks to evaluate the impact of Flair’s architecture, including its ability to leverage additional embeddings and optional RNN layers, on the overall performance and computational efficiency of RoBERTa-based models.

In [None]:
# initialize transformer embeddings
roberta_document_transformer_embedding = TransformerDocumentEmbeddings(
    model="roberta-base",  
    fine_tune=True,       
    layers="-1", 
)

In [None]:
# Update classifier with transformer embedding
roberta_document_transformer_classifier = TextClassifier(
    embeddings=roberta_document_transformer_embedding, 
    label_dictionary=label_dict, 
    label_type="sentiment").to(flair.device)

In [None]:
# Determine embeddings_storage_mode based on GPU availability
embeddings_storage_mode = 'gpu' if torch.cuda.is_available() else 'none'

# initialise the trainer
roberta_document_transformer_trainer = ModelTrainer(roberta_document_transformer_classifier, training_corpus)

# fine-tune the model
roberta_document_transformer_trainer.fine_tune(
    base_path='../flair_models/roberta_model',      # directory to save the model and logs
    learning_rate=5e-5,                             
    mini_batch_size=8,                              
    max_epochs=10,                                  
    embeddings_storage_mode=embeddings_storage_mode,     
    optimizer=AdamW,                                
    save_final_model=True,                          # save the final model
    save_model_each_k_epochs=1,                     # save model checkpoint every epoch
    create_file_logs=True,                          # save logs to a file
    create_loss_file=True,                          # save loss values to a file
)

## Evaluation

In [None]:
def evaluate_model(classifier, test_dataset, output_csv_path:str="../src/outputs/predictions.csv") -> dict:
    """
    evaluate a Flair classifier on a given test dataset, with verification and debugging steps.
    
    args:
        classifier (TextClassifier): The trained Flair classifier.
        test_dataset (Dataset): The test dataset.
        output_csv_path (str, optional): Path to save the CSV file containing true and predicted labels for each sentence.
    
    returns:
        dict: Dictionary containing evaluation metrics namely accuracy, precision, recall, and F1 score.
    """
    true_labels = []
    predicted_labels = []
    sentences = []

    # check device compatibility
    classifier.to(flair.device)
    test_dataset.to(flair.device)
    
    # label mapping
    label_mapping = {"NEG": 0, "POS": 1}

    # iterate over the test dataset with tqdm progress bar
    for sentence in tqdm.tqdm(test_dataset, desc="Evaluating", leave=True):
        sentences.append(sentence.to_plain_string())

        # true label
        true_label = sentence.get_label("sentiment").value
        if true_label not in label_mapping:
            print(f"Skipping sentence with unexpected true label: {true_label}")
            continue
        true_labels.append(true_label)

        # predicted label
        classifier.predict(sentence)
        predicted_label = sentence.labels[0].value
        if predicted_label not in label_mapping:
            print(f"Skipping sentence with unexpected predicted label: {predicted_label}")
            continue
        predicted_labels.append(predicted_label)

    # map labels to numeric values for sklearn
    try:
        true_labels_mapped = [label_mapping[label] for label in true_labels]
        predicted_labels_mapped = [label_mapping[label] for label in predicted_labels]
    except KeyError as e:
        print(f"Label mapping error: {e}. Ensure all labels are in {label_mapping}.")
        return {}

    # write predictions to a CSV file
    with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Sentence", "Actual", "Predicted"])  # Header
        for sentence, true_label, predicted_label in zip(sentences, true_labels, predicted_labels):
            writer.writerow([sentence, true_label, predicted_label])
    print(f"Predictions saved to {output_csv_path}")

    accuracy = accuracy_score(true_labels_mapped, predicted_labels_mapped)
    precision = precision_score(true_labels_mapped, predicted_labels_mapped, pos_label=1, zero_division=0)
    recall = recall_score(true_labels_mapped, predicted_labels_mapped, pos_label=1, zero_division=0)
    f1 = f1_score(true_labels_mapped, predicted_labels_mapped, pos_label=1, zero_division=0)

    classification_rep = classification_report(
        true_labels_mapped,
        predicted_labels_mapped,
        target_names=["NEG", "POS"]
    )

    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nClassification Report:")
    print(classification_rep)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "classification_report": classification_rep,
    }

### Optional: Evaluation on test.txt

In [None]:
# path to the best model
flair_distilbert_model_path = "../flair_models/flair_distilbert_model/best-model.pt"

# load the trained model
trained_flair_distilbert_classifier = TextClassifier.load(flair_distilbert_model_path)

# evaluate the model
flair_distilbert_results = evaluate_model(trained_flair_distilbert_classifier, training_corpus.test, output_csv_path="../src/outputs/flair_distilbert_model_test.csv")

In [None]:
# path to the best model
flair_roberta_model_path = "../flair_models/flair_roberta_model/best-model.pt"

# load the trained model
trained_flair_roberta_classifier = TextClassifier.load(flair_roberta_model_path)

# evaluate the model
flair_roberta_results = evaluate_model(trained_flair_roberta_classifier, training_corpus.test, output_csv_path="../src/outputs/flair_roberta_model_test.csv")

In [None]:
# path to the best model
flair_model_path = "../flair_models/flair_model/best-model.pt"

# load the trained model
trained_flair_classifier = TextClassifier.load(flair_model_path)

# evaluate the model
flair_results = evaluate_model(trained_flair_classifier, training_corpus.test, output_csv_path="../src/outputs/flair_model_test.csv")

In [None]:
# path to the best model
roberta_model_path = "../flair_models/roberta_model/best-model.pt"

# load the trained model
trained_roberta_classifier = TextClassifier.load(roberta_model_path)

# evaluate the model
roberta_results = evaluate_model(trained_roberta_classifier, training_corpus.test, output_csv_path="../src/outputs/roberta_model_test.csv")

### Evaluation with new test data

In [None]:
processor = DataProcessor(output_dir=flair_data_folder)

In [9]:
# path to the CSV file
csv_path = "../data/test_data_movie.csv"

# process CSV for evaluation
evaluation_data = processor.read_csv(csv_path, label_column_index=1, text_column_index=0)

# save evaluation dataset
processor.save_data(evaluation_data, 'evaluation.txt')

Saved data to ../data/flair_data/evaluation.txt


In [10]:
# create evaluation corpus
evaluation_corpus = ClassificationCorpus(
    corpus_folder,
    test_file='evaluation.txt',
    label_type='sentiment'
)

# double check statistics
print(f"Number of evaluation sentences: {len(evaluation_corpus.test)}")

2024-12-11 22:55:48,800 Reading data from ../data/flair_data
2024-12-11 22:55:48,801 Train: ../data/flair_data/train_small.txt
2024-12-11 22:55:48,802 Dev: ../data/flair_data/dev_small.txt
2024-12-11 22:55:48,803 Test: ../data/flair_data/evaluation.txt
2024-12-11 22:55:49,161 Initialized corpus ../data/flair_data (label type name is 'sentiment')
Number of evaluation sentences: 40000


In [None]:
# path to the best model
flair_distilbert_model_path = "../flair_models/flair_distilbert_model/best-model.pt"

# load the trained model
trained_flair_distilbert_classifier = TextClassifier.load(flair_distilbert_model_path)

# evaluate the model
flair_distilbert_evaluation_results = evaluate_model(trained_flair_distilbert_classifier, evaluation_corpus.test, output_csv_path="../src/outputs/flair_distilbert_model_evaluation.csv")

In [None]:
# path to the best model
flair_roberta_model_path = "../flair_models/flair_roberta_model/best-model.pt"

# load the trained model
trained_flair_roberta_classifier = TextClassifier.load(flair_roberta_model_path)

# evaluate the model
flair_roberta_evaluation_results = evaluate_model(trained_flair_roberta_classifier, evaluation_corpus.test, output_csv_path="../src/outputs/flair_roberta_model_evaluation.csv")

In [None]:
# path to the best model
flair_model_path = "../flair_models/flair_model/best-model.pt"

# load the trained model
trained_flair_classifier = TextClassifier.load(flair_model_path)

# evaluate the model
flair_evaluation_results = evaluate_model(trained_flair_classifier, evaluation_corpus.test, output_csv_path="../src/outputs/flair_model_evaluation.csv")

In [None]:
# path to the best model
roberta_model_path = "../flair_models/roberta_model/best-model.pt"

# load the trained model
trained_roberta_classifier = TextClassifier.load(roberta_model_path)

# evaluate the model
roberta_evaluation_results = evaluate_model(trained_roberta_classifier, evaluation_corpus.test, output_csv_path="../src/outputs/roberta_model_evaluation.csv")