In [None]:
!pip install torch pandas scikit-learn transformers sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cach

In [None]:
import torch

# Report the basic specs of the active CUDA device, or state that none was found.
if not torch.cuda.is_available():
    print("No CUDA GPU available.")
else:
    gpu_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(gpu_id)

    # total_memory is divided by 1024**3 to display it in GB.
    print(f"GPU Name: {gpu_properties.name}")
    print(f"Total Memory (GB): {gpu_properties.total_memory / (1024 ** 3):.2f}")
    print(f"Multiprocessors: {gpu_properties.multi_processor_count}")

GPU Name: Tesla T4
Total Memory (GB): 14.75
Multiprocessors: 40


In [None]:
# Mount Google Drive at /content/drive so the data files can be read from it.
# NOTE(review): google.colab is a Colab-specific module; skip this cell when running locally.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory prefix for every file this notebook reads or writes.
# Despite the name it is a filesystem path, not a URL. File names are appended to it
# by plain string concatenation, so it must end with a slash.
# Please change this when running locally.
base_url = '/content/drive/My Drive/colab-data/'

# Dataset Analysis

In [None]:
import pandas as pd
import numpy as np
from transformers import RobertaTokenizer

In [None]:
# Read the training pairs and replace any missing values with empty strings.
df = pd.read_csv(base_url + 'train.csv').fillna("")

In [None]:
# df.head(3)

In [None]:
# Check data distribution: per-label summary statistics of the remaining columns,
# which shows how many rows each label has.
# This might affect the loss function choice
df.groupby("label").describe()

Unnamed: 0_level_0,text_1,text_1,text_1,text_1,text_2,text_2,text_2,text_2
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,15000,14260,,10,15000,14274,,17
1,15000,14051,,10,15000,13990,,22


In [None]:
# Load the pretrained 'roberta-base' tokenizer; it is used below to measure token lengths.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Check the token-length distribution (mean, 75th and 95th percentile) of both text columns.
# This informs the max sequence length chosen for fine-tuning, and therefore GPU memory usage.

# Function to calculate token lengths
def calculate_token_lengths(df, column_name, max_length=512, text_tokenizer=None):
    """
    Count the tokens of every text in one DataFrame column.

    Args:
        df (pd.DataFrame): Frame holding the texts.
        column_name (str): Name of the column to measure.
        max_length (int): Texts are truncated to this many tokens, so no
            reported length exceeds it.
        text_tokenizer: Object exposing ``encode(text, add_special_tokens=...,
            truncation=..., max_length=...)``. When omitted, the notebook-level
            ``tokenizer`` is used, which keeps existing calls working.

    Returns:
        pd.Series: Token count per row (special tokens included), aligned with
        the index of ``df``.
    """
    # Resolve the tokenizer once instead of looking it up for every row.
    active_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer

    def token_count(text):
        token_ids = active_tokenizer.encode(
            text, add_special_tokens=True, truncation=True, max_length=max_length
        )
        return len(token_ids)

    return df[column_name].apply(token_count)

# Store the token length of each text next to the raw data.
df['text_1_token_length'] = calculate_token_lengths(df, 'text_1')
df['text_2_token_length'] = calculate_token_lengths(df, 'text_2')

# Percentiles reported for each column.
percentiles = [75, 95]

# Mean and percentiles per column.
average_token_length_text_1 = df['text_1_token_length'].mean()
text_1_percentiles = np.percentile(df['text_1_token_length'], percentiles)

average_token_length_text_2 = df['text_2_token_length'].mean()
text_2_percentiles = np.percentile(df['text_2_token_length'], percentiles)

# Print the statistics, text_1 first and then text_2.
for column, mean_length, (p75, p95) in (
    ('text_1', average_token_length_text_1, text_1_percentiles),
    ('text_2', average_token_length_text_2, text_2_percentiles),
):
    print(f"Average token length for '{column}': {mean_length}")
    print(f"75th percentile token length for '{column}': {p75}")
    print(f"95th percentile token length for '{column}': {p95}")

Average token length for 'text_1': 143.37356666666668
75th percentile token length for 'text_1': 188.0
95th percentile token length for 'text_1': 303.0
Average token length for 'text_2': 142.60423333333333
75th percentile token length for 'text_2': 188.0
95th percentile token length for 'text_2': 299.0


# Model Training

## Setup

In [None]:
import torch
import pandas as pd

from torch import nn
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer, InputExample, models, losses
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import evaluation

In [None]:
# Import the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import InputExample
from torch.utils.data import Dataset

# Define a PyTorch Dataset for authorship verification tasks
class AuthorshipVerificationDataset(Dataset):
    """
    Thin Dataset wrapper around an in-memory list of authorship-verification examples.

    The examples are stored exactly as given, and indexing hands them back unchanged.
    """

    def __init__(self, examples):
        """
        Keep a reference to the examples.

        Args:
            examples (list): The examples to serve; the notebook passes InputExample objects.
        """
        self.examples = examples

    def __len__(self):
        """Number of stored examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """
        Look up one example by position.

        Args:
            idx (int): Position of the example in the underlying list.

        Returns:
            The example stored at ``idx``.
        """
        return self.examples[idx]

# Data preparation steps

# Read the training CSV and replace missing values with empty strings.
data_file = base_url + 'train.csv'
dataframe = pd.read_csv(data_file).fillna("")

# One InputExample per row: the two texts plus the label as a float.
examples = [
    InputExample(texts=[text_1, text_2], label=float(label))
    for text_1, text_2, label in zip(dataframe['text_1'], dataframe['text_2'], dataframe['label'])
]

# 70% for training, 30% for validation (fixed seed so the split is reproducible).
# A separate test split is not carved out here.
train_examples, val_examples = train_test_split(examples, test_size=0.3, random_state=42)

# Wrap each split in a Dataset.
train_dataset = AuthorshipVerificationDataset(train_examples)
val_dataset = AuthorshipVerificationDataset(val_examples)

# Flat lists of the validation texts and labels, consumed by the evaluators.
val_texts1 = [pair.texts[0] for pair in val_examples]
val_texts2 = [pair.texts[1] for pair in val_examples]
val_labels = [pair.label for pair in val_examples]

In [None]:
# Define a custom collate function for the DataLoader
def custom_collate_fn(batch):
    """
    Turn a list of examples into (first texts, second texts, label tensor).

    Args:
        batch (list): Examples exposing ``.texts`` (indexable, at least two
                      entries) and ``.label``, e.g. InputExample objects.

    Returns:
        tuple: ``(texts1, texts2, labels)`` where ``texts1`` and ``texts2`` are
               lists in batch order and ``labels`` is a float tensor with one
               entry per example.
    """
    first_texts, second_texts, targets = [], [], []
    for example in batch:
        first_texts.append(example.texts[0])
        second_texts.append(example.texts[1])
        targets.append(example.label)

    label_tensor = torch.tensor(targets, dtype=torch.float)
    return first_texts, second_texts, label_tensor

# Options shared by every DataLoader in this notebook.
loader_options = dict(batch_size=16, collate_fn=custom_collate_fn)

# Only the training loader is shuffled; validation keeps the dataset order.
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_dataloader = DataLoader(val_dataset, **loader_options)

## Model Setup and Training

In [None]:
# Token-level encoder: pretrained roberta-base, turning tokens into embeddings.
# Sequences longer than 256 tokens are truncated; raise max_seq_length if GPU memory allows.
word_embedding_model = models.Transformer('roberta-base', max_seq_length=256)

# Pooling collapses the token embeddings into one fixed-size vector per text
# (mean pooling by default).
token_embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(token_embedding_dim)

# Dense + ReLU head that projects the pooled vector down to 256 dimensions.
pooled_embedding_dim = pooling_model.get_sentence_embedding_dimension()
dense_model = models.Dense(
    in_features=pooled_embedding_dim,
    out_features=256,
    activation_function=nn.ReLU(),
)

# Chain Transformer -> Pooling -> Dense into a single model producing 256-d embeddings.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Loss function. Several were tried and CosineSimilarityLoss gave the best results.
# Documentation: https://www.sbert.net/docs/package_reference/losses.html
# Alternatives that were tested:
#   losses.ContrastiveLoss(model)
#   losses.OnlineContrastiveLoss(model=model,
#                                distance_metric=losses.SiameseDistanceMetric.COSINE_DISTANCE,
#                                margin=0.5)   # focuses on hard examples
train_loss = losses.CosineSimilarityLoss(model)

# Number of epochs recommended in the original BERT paper.
total_epochs = 4

# Warm up over the first 10% of the training steps.
# The learning rate is left at the library default.
total_train_steps = total_epochs * len(train_dataloader)
warmup_s = int(0.1 * total_train_steps)

# Validation evaluators.
# Documentation: https://www.sbert.net/docs/package_reference/evaluation.html
embedding_evaluator = evaluation.EmbeddingSimilarityEvaluator(val_texts1, val_texts2, val_labels)
binary_evaluator = evaluation.BinaryClassificationEvaluator(val_texts1, val_texts2, val_labels)


def _last_evaluator_score(scores):
    """Select the score of the last evaluator in the sequence."""
    return scores[-1]


# Run both evaluators in order; the model is optimized with respect to the score
# of the last one (the binary classification evaluator).
combined_evaluator = evaluation.SequentialEvaluator(
    [embedding_evaluator, binary_evaluator],
    main_score_function=_last_evaluator_score,
)

In [None]:
# Directory that receives the training output.
training_output_dir = base_url + 'roberta/output/training_output'

# Train, running the combined evaluator every 500 steps.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=total_epochs,
    warmup_steps=warmup_s,
    evaluator=combined_evaluator,
    evaluation_steps=500,
    output_path=training_output_dir,
)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1313 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1313 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1313 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1313 [00:00<?, ?it/s]

## Evaluation

In [None]:
# Read the dev set and replace missing values with empty strings.
new_data_file = base_url + 'dev.csv'
new_dataframe = pd.read_csv(new_data_file).fillna("")

# One InputExample per row: the two texts plus the label as a float.
new_examples = [
    InputExample(texts=[text_1, text_2], label=float(label))
    for text_1, text_2, label in zip(
        new_dataframe['text_1'], new_dataframe['text_2'], new_dataframe['label']
    )
]

# Dataset and (unshuffled) DataLoader used for evaluation and prediction.
new_dataset = AuthorshipVerificationDataset(new_examples)
new_dataloader = DataLoader(new_dataset, batch_size=16, collate_fn=custom_collate_fn)

In [None]:
from sklearn.metrics import matthews_corrcoef, roc_auc_score, confusion_matrix, classification_report, accuracy_score

def evaluate_model(model, dataloader, threshold=0.5):
    """
    Evaluate the SentenceTransformer model on a given dataset and print the metrics.

    Each pair of texts is scored with the cosine similarity of its two
    embeddings. Pairs scoring above ``threshold`` are predicted as label 1.

    Args:
        model (SentenceTransformer): The model to evaluate.
        dataloader (DataLoader): Yields ``(texts1, texts2, labels)`` batches,
            where ``labels`` is a tensor of binary (0/1) labels.
        threshold (float): Cosine-similarity cut-off for predicting 1.

    Returns:
        None. All metrics are printed.

    Raises:
        ValueError: If the dataloader yields no examples.
    """
    model.eval()
    scores = []
    labels = []

    # Disable gradient calculations for efficiency and safety during inference.
    with torch.no_grad():
        for texts1, texts2, batch_labels in dataloader:
            # Encode the pairs of texts to get their embeddings.
            embeddings1 = model.encode(texts1, convert_to_tensor=True)
            embeddings2 = model.encode(texts2, convert_to_tensor=True)

            # Keep the raw cosine similarities; thresholding happens once, after the loop.
            cosine_scores = torch.nn.functional.cosine_similarity(embeddings1, embeddings2)
            scores.extend(cosine_scores.tolist())
            labels.extend(int(label) for label in batch_labels.tolist())

    if not labels:
        raise ValueError("evaluate_model received an empty dataloader; nothing to evaluate.")

    # Threshold the cosine scores to obtain binary predictions (0 or 1).
    predictions = [int(score > threshold) for score in scores]

    # Compute classification metrics.
    mcc = matthews_corrcoef(labels, predictions)

    # ROC-AUC is a ranking metric, so it must see the continuous scores rather than
    # the thresholded predictions. It is undefined when only one class is present.
    if len(set(labels)) > 1:
        roc_auc = roc_auc_score(labels, scores)
    else:
        roc_auc = float("nan")

    # Passing labels=[0, 1] forces a 2x2 matrix even if a class is missing,
    # so the unpacking into TN, FP, FN, TP always succeeds.
    tn, fp, fn, tp = confusion_matrix(labels, predictions, labels=[0, 1]).ravel()

    # Specificity and false positive rate are undefined without negative examples.
    negatives = tn + fp
    specificity = tn / negatives if negatives else float("nan")
    false_positive_rate = fp / negatives if negatives else float("nan")

    # Print all metrics
    print(f"Matthew's Correlation Coefficient: {mcc}")
    print(f"ROC-AUC Score: {roc_auc}")
    print(f"Specificity: {specificity}")
    print(f"False Positive Rate: {false_positive_rate}")

    report = classification_report(
        labels,
        predictions,
        labels=[0, 1],
        target_names=['Different Authors', 'Same Authors'],
        zero_division=0,
    )
    print("Classification Report:\n", report)

# Evaluate the model on the dev-set DataLoader; the metrics are printed below the cell.
evaluate_model(model, new_dataloader)


Matthew's Correlation Coefficient: 0.6204283713329909
ROC-AUC Score: 0.8097692202306276
Specificity: 0.8377383740381399
False Positive Rate: 0.16226162596186017
Classification Report:
                    precision    recall  f1-score   support

Different Authors       0.79      0.84      0.81      2989
     Same Authors       0.83      0.78      0.80      3011

         accuracy                           0.81      6000
        macro avg       0.81      0.81      0.81      6000
     weighted avg       0.81      0.81      0.81      6000



In [None]:
# Predict a binary label for every dev-set pair and write the labels to disk.
model.eval()

# Pairs whose cosine similarity exceeds this value are predicted as 1.
threshold = 0.5

predictions = []
with torch.no_grad():
    for texts1, texts2, _ in new_dataloader:
        embeddings1 = model.encode(texts1, convert_to_tensor=True)
        embeddings2 = model.encode(texts2, convert_to_tensor=True)
        cosine_scores = torch.nn.functional.cosine_similarity(embeddings1, embeddings2)
        batch_predictions = (cosine_scores > threshold).int()
        predictions += batch_predictions.tolist()

# One 'prediction' column, one row per pair, in dataloader order.
predictions_df = pd.DataFrame({'prediction': predictions})
predictions_df.to_csv(base_url+'dev_predictions_rober.csv', index=False)