In [None]:
!pip install torch pandas scikit-learn transformers sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cach

# Setup and Model Loading

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import matthews_corrcoef, roc_auc_score, confusion_matrix, classification_report, accuracy_score

In [None]:
# Mount Google Drive at /content/drive so the files referenced below are reachable.
# NOTE(review): google.colab is presumably only importable inside Colab — skip this
# cell when running locally.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder for the CSV files and the saved model. Other cells build paths by plain
# string concatenation (base_url + 'name'), so keep the trailing slash.
# Please change this when running locally
base_url = '/content/drive/My Drive/colab-data/'

In [None]:
# Map-style PyTorch Dataset used for the authorship verification task
class AuthorshipVerificationDataset(Dataset):
    """
    Thin wrapper that exposes an in-memory list of examples as a PyTorch Dataset.

    Every example holds a pair of texts together with a binary label that says
    whether both texts were written by the same author.
    """

    def __init__(self, examples):
        """
        Keep a reference to the examples backing this dataset.

        Args:
            examples (list): InputExample objects, one per text pair.
        """
        self.examples = examples

    def __getitem__(self, idx):
        """
        Look up a single example by position.

        Args:
            idx (int): Position of the wanted example.

        Returns:
            InputExample: The example stored at that position.
        """
        return self.examples[idx]

    def __len__(self):
        """
        Report how many examples the dataset holds.
        """
        return len(self.examples)


def custom_collate_fn(batch):
    """
    Collate a list of text-pair examples into parallel lists for the DataLoader.

    The two texts of every example are split into two lists that stay aligned
    by position. When the examples carry a label, the labels are additionally
    packed into a float tensor.

    Args:
        batch (list): Examples exposing 'texts' (a two-element list of strings)
                      and, optionally, 'label'.

    Returns:
        tuple: (texts1, texts2, labels) when labels are present, where labels
               is a float tensor; otherwise just (texts1, texts2).
    """
    # Split every pair into its left and right text in a single pass
    first_texts, second_texts = [], []
    for example in batch:
        first_texts.append(example.texts[0])
        second_texts.append(example.texts[1])

    # Only the first example is inspected to decide whether the batch is labelled
    if not hasattr(batch[0], 'label'):
        return first_texts, second_texts

    label_values = [example.label for example in batch]
    return first_texts, second_texts, torch.tensor(label_values, dtype=torch.float)


In [None]:
# Load the saved model
# The model directory is resolved under base_url, so base_url must already point at the
# folder that contains 'roberta/output/training_output'.
model_path = base_url+'roberta/output/training_output'
model = SentenceTransformer(model_path)

# Evaluation

In [None]:
def load_dataset(filepath):
    """
    Load a dataset from a specified file path and prepare it for model training or evaluation.
    Assumes the file is in CSV format and contains text pair columns 'text_1' and 'text_2',
    along with an optional 'label' column for supervised learning.

    Missing texts are replaced with empty strings and every text is converted to str.
    When the 'label' column is absent, or a row has no label, the label defaults to 0.0.

    Args:
        filepath (str): The path to the CSV file containing the dataset.

    Returns:
        AuthorshipVerificationDataset: A dataset object containing preprocessed examples.

    Raises:
        KeyError: If the CSV has no 'text_1' or 'text_2' column.
    """
    dataframe = pd.read_csv(filepath)

    # Fill only the text columns (and without inplace mutation): filling the whole frame
    # with "" would also turn a missing label into "", which float() cannot parse.
    texts_1 = dataframe['text_1'].fillna("").astype(str)
    texts_2 = dataframe['text_2'].fillna("").astype(str)

    if 'label' in dataframe.columns:
        labels = dataframe['label'].fillna(0).astype(float)
    else:
        # Unlabelled data (e.g. a test split): use the same default as a missing value
        labels = [0.0] * len(dataframe)

    # Iterating the columns directly avoids the per-row Series that iterrows() builds
    examples = [InputExample(texts=[text_1, text_2], label=float(label))
                for text_1, text_2, label in zip(texts_1, texts_2, labels)]
    return AuthorshipVerificationDataset(examples)

def predict_and_save(model, dataloader, filepath, threshold=0.5):
    """
    Predict using a pre-trained SentenceTransformer model and save the predictions to a CSV file.
    The function calculates cosine similarities between embeddings of text pairs and applies
    a threshold to determine the binary classification outcomes.

    Args:
        model (SentenceTransformer): The pre-trained model to use for predictions.
        dataloader (DataLoader): The DataLoader providing batches of data for prediction.
            Each batch is (texts1, texts2) or (texts1, texts2, labels); labels are ignored.
        filepath (str): The path where the prediction results CSV will be saved.
        threshold (float): Cosine similarity strictly above this value is predicted as 1,
            anything else as 0. Defaults to 0.5.

    Returns:
        None. The predictions are written to `filepath` as a single 'prediction' column.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in dataloader:
            # A labelled batch carries a third element that is not needed for prediction
            texts1, texts2 = batch[:2]

            # Encode the text pairs to get their embeddings
            embeddings1 = model.encode(texts1, convert_to_tensor=True)
            embeddings2 = model.encode(texts2, convert_to_tensor=True)
            # Calculate cosine similarities between pairs of embeddings
            cosine_scores = torch.nn.functional.cosine_similarity(embeddings1, embeddings2)
            # Apply the threshold to determine binary outcomes
            batch_predictions = (cosine_scores > threshold).type(torch.int)
            predictions.extend(batch_predictions.tolist())

    # Save the predictions to a CSV file
    predictions_df = pd.DataFrame(predictions, columns=['prediction'])
    predictions_df.to_csv(filepath, index=False)


## Test data

In [None]:
# Predict test.csv
# The input CSV is read from base_url; the predictions are written to
# 'test_predictions.csv' relative to the current working directory.
test_dataset = load_dataset(base_url+'test.csv')
test_dataloader = DataLoader(test_dataset, batch_size=16, collate_fn=custom_collate_fn)
predict_and_save(model, test_dataloader, 'test_predictions.csv')

## Dev data
Evaluate on the labelled dev set to confirm that the saved model was loaded correctly.

In [None]:
def evaluate_model(model, dataloader, threshold=0.5):
    """
    Evaluate the SentenceTransformer model on a given labelled dataset and print the metrics.

    Text pairs are scored with the cosine similarity of their embeddings. The thresholded
    scores feed the classification metrics (MCC, specificity, false positive rate,
    classification report); ROC-AUC is computed from the raw, non-thresholded scores.

    Args:
        model (SentenceTransformer): The model to evaluate.
        dataloader (DataLoader): A DataLoader containing the dataset for evaluation.
            Every batch must be (texts1, texts2, labels); labels are expected to be 0/1.
        threshold (float): Cosine similarity strictly above this value is predicted as 1,
            anything else as 0. Defaults to 0.5.

    Returns:
        None
    """
    model.eval()
    scores = []
    predictions = []
    labels = []

    # Disable gradient calculations for efficiency and safety during inference.
    with torch.no_grad():
        for texts1, texts2, batch_labels in dataloader:
            # Encode the pairs of texts to get their embeddings.
            embeddings1 = model.encode(texts1, convert_to_tensor=True)
            embeddings2 = model.encode(texts2, convert_to_tensor=True)

            # Calculate the cosine similarity between pairs of embeddings.
            cosine_scores = torch.nn.functional.cosine_similarity(embeddings1, embeddings2)

            # Keep the raw scores (needed for ROC-AUC) next to the binary predictions.
            scores.extend(cosine_scores.tolist())
            batch_predictions = (cosine_scores > threshold).type(torch.int)
            predictions.extend(batch_predictions.tolist())
            # Labels arrive as a float tensor; store them as ints so they match [0, 1] below.
            labels.extend(int(label) for label in batch_labels.tolist())

    # Compute classification metrics
    mcc = matthews_corrcoef(labels, predictions)
    # ROC-AUC needs continuous scores: with hard 0/1 predictions it collapses to balanced
    # accuracy. It is undefined when only one class is present, so report NaN in that case.
    roc_auc = roc_auc_score(labels, scores) if len(set(labels)) > 1 else float('nan')

    # Calculate confusion matrix to find TN, FP, FN, TP.
    # Passing labels=[0, 1] guarantees a 2x2 matrix even if one class never occurs.
    tn, fp, _fn, _tp = confusion_matrix(labels, predictions, labels=[0, 1]).ravel()

    # Calculate specificity and false positive rate (undefined without negative examples)
    negatives = tn + fp
    specificity = tn / negatives if negatives else float('nan')
    false_positive_rate = fp / negatives if negatives else float('nan')

    # Print all metrics
    print(f"Matthew's Correlation Coefficient: {mcc}")
    print(f"ROC-AUC Score: {roc_auc}")
    print(f"Specificity: {specificity}")
    print(f"False Positive Rate: {false_positive_rate}")

    report = classification_report(labels, predictions, labels=[0, 1],
                                   target_names=['Different Authors', 'Same Authors'])
    print("Classification Report:\n", report)


In [None]:
# Evaluate dev.csv with labels
# dev.csv is read from base_url; evaluate_model prints its metrics as cell output.
dev_dataset = load_dataset(base_url+'dev.csv')
dev_dataloader = DataLoader(dev_dataset, batch_size=16, collate_fn=custom_collate_fn)
evaluate_model(model, dev_dataloader)
# Optional: uncomment the next line to also write the dev predictions to a CSV file.
# predict_and_save(model, dev_dataloader, 'dev_predictions.csv')

Matthew's Correlation Coefficient: 0.6204283713329909
ROC-AUC Score: 0.8097692202306276
Specificity: 0.8377383740381399
False Positive Rate: 0.16226162596186017
Classification Report:
                    precision    recall  f1-score   support

Different Authors       0.79      0.84      0.81      2989
     Same Authors       0.83      0.78      0.80      3011

         accuracy                           0.81      6000
        macro avg       0.81      0.81      0.81      6000
     weighted avg       0.81      0.81      0.81      6000

