<a href="https://colab.research.google.com/github/Jagoda222/LoLa---group-8/blob/Jagoda/calculate_measure.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Measure calculation and sampling the data (700 triplets)

In [4]:
!pip install datasets==2.9.0



In [5]:
import os

import numpy as np
import pandas as pd
import spacy
from datasets import load_dataset, load_metric, concatenate_datasets
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback



In [6]:
def process_snli_dataset_with_measures(file_path, sample_size=700, num_bins=7, measures=None, seed=None):
    """
    Sample SNLI triplets proportionally across difficulty bins, one measure at a time.

    For every measure the function scores each row, averages the score per
    ``triplet_nr``, splits the averages into ``num_bins`` equal-width bins,
    draws triplets from each bin in proportion to the bin's size, and returns
    the sampled rows ordered by increasing and by decreasing triplet average.

    Args:
        file_path (str): Path to the CSV file containing the dataset. The file
            must provide the columns ``premise``, ``hypothesis`` and ``triplet_nr``.
        sample_size (int): The number of triplets to sample from the dataset. Default is 700.
            If a bin holds fewer triplets than requested, all of them are taken,
            so fewer than ``sample_size`` triplets can be returned.
        num_bins (int): Number of equal-width bins for dividing measure values. Default is 7.
        measures (list): Functions that take one row and return a number.
            ``None`` is treated as an empty list.
        seed (int, optional): Seed for the triplet sampling. When ``None`` the
            global ``np.random`` state is used, so a preceding
            ``np.random.seed(...)`` call still controls the draw.

    Returns:
        dict: ``{"measure_<k>": {"increasing": DataFrame, "decreasing": DataFrame}}``
        with one entry per measure, ``k`` counting from 1.
    """

    # Step 1: Load dataset
    data = pd.read_csv(file_path)
    print(f"Dataset loaded: {len(data)} rows")

    # Missing sentences become empty strings so measure functions always get str.
    data['premise'] = data['premise'].fillna("").astype(str)
    data['hypothesis'] = data['hypothesis'].fillna("").astype(str)

    measures = list(measures) if measures is not None else []

    # A dedicated generator makes the draw reproducible without touching global state.
    rng = np.random.default_rng(seed) if seed is not None else np.random

    # Store results
    result = {}

    # Step 2: Process for each measure separately
    for idx, measure_func in enumerate(measures):
        measure_name = f"measure_{idx+1}"

        # Calculate the measure for each row
        data[measure_name] = data.apply(measure_func, axis=1)

        # Step 3: Calculate triplet-level averages for the current measure
        triplet_avg = data.groupby('triplet_nr')[measure_name].mean().reset_index(name='triplet_avg')

        # Step 4: Bin triplets based on the current measure's triplet average
        lowest = triplet_avg['triplet_avg'].min()
        highest = triplet_avg['triplet_avg'].max()
        if lowest == highest:
            # Identical averages would give duplicate bin edges, which pd.cut rejects.
            triplet_avg['range_bin'] = 0
        else:
            bin_edges = np.linspace(lowest, highest, num_bins + 1)
            triplet_avg['range_bin'] = pd.cut(
                triplet_avg['triplet_avg'], bins=bin_edges, labels=False, include_lowest=True
            )

        # Step 5: Calculate how many triplets to sample from each bin
        bin_distribution = triplet_avg['range_bin'].value_counts().sort_index()
        print(f"\nDistribution of triplets across bins for {measure_name}:\n{bin_distribution}")

        expected_per_bin = bin_distribution / bin_distribution.sum() * sample_size
        triplets_per_bin = expected_per_bin.astype(int)

        # Flooring loses a few slots; hand them to the bins with the largest remainder.
        while triplets_per_bin.sum() < sample_size:
            triplets_per_bin.loc[(expected_per_bin - triplets_per_bin).idxmax()] += 1

        while triplets_per_bin.sum() > sample_size:
            triplets_per_bin.loc[(expected_per_bin - triplets_per_bin).idxmin()] -= 1

        # Step 6: Sample triplets proportionally from each bin
        sampled_triplet_ids = []
        for bin_id, sample_count in triplets_per_bin.items():
            if sample_count <= 0:
                continue
            triplets_in_bin = triplet_avg.loc[triplet_avg['range_bin'] == bin_id, 'triplet_nr'].values
            take = min(sample_count, len(triplets_in_bin))
            sampled_triplet_ids.extend(rng.choice(triplets_in_bin, size=take, replace=False))

        # Selecting by id also works when nothing was sampled (yields an empty frame).
        final_sample = data[data['triplet_nr'].isin(sampled_triplet_ids)].reset_index(drop=True)

        # Step 7: Merge back the triplet averages
        final_sample = final_sample.merge(triplet_avg[['triplet_nr', 'triplet_avg']], on='triplet_nr')

        # Step 8: Sort by increasing and decreasing order.
        # triplet_nr is the secondary key so that triplets sharing the same
        # average stay contiguous instead of having their rows interleaved.
        sort_keys = ['triplet_avg', 'triplet_nr']
        final_sample_increasing = final_sample.sort_values(by=sort_keys).reset_index(drop=True)
        final_sample_decreasing = final_sample.sort_values(
            by=sort_keys, ascending=[False, True]
        ).reset_index(drop=True)

        # Store results for this measure
        result[measure_name] = {
            'increasing': final_sample_increasing,
            'decreasing': final_sample_decreasing
        }

    print(f"Processed {sample_size} triplets and returned ordered DataFrames for each measure.")
    return result




In [7]:
# Both models are created once at module level; measure_1 reads these globals
# for every row, which avoids reloading them inside the function.
model = SentenceTransformer("all-MiniLM-L6-v2")  # sentence embeddings for the cosine similarity
nlp = spacy.load("en_core_web_sm")  # spaCy pipeline: POS tags, dependencies, noun chunks, entities

def measure_1(row, min_divergence=None, max_divergence=None):
    """Score a premise/hypothesis pair by the semantic divergence of their role words.

    Each sentence is reduced to a string of role-like words (verbs, subject and
    object noun chunks, location entities, time expressions) taken from the
    spaCy parse. This is a dependency/NER heuristic rather than a trained
    semantic-role labeller. Both strings are embedded with the global sentence
    model and the score is ``1 - cosine_similarity``.

    Args:
        row: Mapping-like row providing ``'premise'`` and ``'hypothesis'`` strings.
        min_divergence (float, optional): Lower bound for min-max normalisation.
        max_divergence (float, optional): Upper bound for min-max normalisation.
            Normalisation is applied only when both bounds are given.

    Returns:
        float: The divergence (higher means harder). When both bounds are given
        it is min-max normalised; equal bounds yield ``0.0``.
    """

    def extract_roles(sentence):
        """Return the sentence's role words joined into one space-separated string."""
        doc = nlp(sentence)

        # Verbs (actions)
        verbs = [token.text for token in doc if token.pos_ == 'VERB']

        # Subjects (agents)
        subjects = [chunk.text for chunk in doc.noun_chunks if chunk.root.dep_ == 'nsubj']

        # Objects (patients)
        objects = [chunk.text for chunk in doc.noun_chunks if chunk.root.dep_ == 'dobj']

        # Locations (using Named Entity Recognition).
        # The OntoNotes-style scheme of en_core_web_sm labels facilities 'FAC';
        # 'FACILITY' is kept only for pipelines that use the long form.
        location_labels = {'GPE', 'LOC', 'FAC', 'FACILITY'}
        locations = [ent.text for ent in doc.ents if ent.label_ in location_labels]

        # Time expressions (adverbs of time or temporal phrases)
        time_expressions = [
            token.text for token in doc
            if token.dep_ == 'npadvmod' or token.ent_type_ == 'DATE'
        ]

        # Concatenate all roles into a single context string
        return " ".join(verbs + subjects + objects + locations + time_expressions)

    def srl_similarity(premise, hypothesis):
        """Return 1 - cosine similarity between the two role-string embeddings."""
        # Extract roles for premise and hypothesis
        context1 = extract_roles(premise)
        context2 = extract_roles(hypothesis)

        # Compute embeddings
        embedding1 = model.encode(context1, convert_to_tensor=True)
        embedding2 = model.encode(context2, convert_to_tensor=True)

        # Compute cosine similarity
        similarity_score = util.pytorch_cos_sim(embedding1, embedding2).item()

        # Convert similarity to divergence: higher divergence -> higher difficulty
        return 1 - similarity_score

    # Calculate semantic divergence for the current row
    divergence = srl_similarity(row['premise'], row['hypothesis'])

    # Normalize divergence if min and max values are provided
    if min_divergence is not None and max_divergence is not None:
        span = max_divergence - min_divergence
        if span == 0:
            # A degenerate range carries no ordering information; avoid dividing by zero.
            return 0.0
        divergence = (divergence - min_divergence) / span

    # The difficulty score is the (optionally normalised) divergence itself.
    return divergence


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Usage
INPUT_CSV = "/content/sampled_snli_10000.csv"
OUTPUT_DIR = "/content"
SAMPLING_SEED = 42

# The triplet sampling is random; seed NumPy's global generator so that
# re-running the notebook writes the same samples.
np.random.seed(SAMPLING_SEED)

file_path = INPUT_CSV
result = process_snli_dataset_with_measures(file_path, sample_size=700, num_bins=7, measures=[measure_1])

# Save the increasing and decreasing DataFrames for each measure
for measure_name, dataframes in result.items():
    increasing_path = f"{OUTPUT_DIR}/{measure_name}_increasing.csv"
    decreasing_path = f"{OUTPUT_DIR}/{measure_name}_decreasing.csv"

    # Save to CSV
    dataframes['increasing'].to_csv(increasing_path, index=False)
    dataframes['decreasing'].to_csv(decreasing_path, index=False)

    print(f"Saved {measure_name} DataFrames to CSV:")
    print(increasing_path)
    print(decreasing_path)


Dataset loaded: 30000 rows


# Running models on the sampled data

In [None]:

# META Variables
# It is good to have a dedicated directory for saving model checkpoints (e.g., on Google Drive).
MODEL_DIR = 'model_checkpoints'
MODEL_CHECKPOINT = "microsoft/deberta-v3-small"
BATCH_SIZE = 16
DATA_DIRECTORY = 'data/data_samples'
# NOTE(review): SNLI has three gold classes (0, 1, 2). The 4-way head of the
# original configuration is kept unchanged here; confirm whether 3 was intended.
NUM_LABELS = 4
SHUFFLE_SEED = 100

# Set DATA_DIRECTORY to the folder that holds your orderings. Every CSV file in
# it is expected to provide at least these columns:
#
#   premise   hypothesis   label
#   ...       ...          ...
#
# Each file is trained on three times, every time starting from a fresh copy of
# the pretrained checkpoint:
#   baseline        - rows shuffled with a fixed seed
#   curriculum      - rows in the order stored in the file
#   anti-curriculum - rows in reversed order
#
# The validation accuracies are written to evaluations.csv in the format:
#
#   ordering              baseline   curriculum   anti-curriculum
#   sentence_length.csv   0.84       0.82         0.80
#
# NOTE(review): Trainer draws training batches with a random sampler by default,
# so the stored row order may not be the order the model sees. Verify against
# the installed transformers version before interpreting curriculum results.

# Define the column names of the result table
columns = ['ordering', 'baseline', 'curriculum', 'anti-curriculum']

metric = load_metric('glue', "mnli")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


# https://huggingface.co/transformers/preprocessing.html
def preprocess_function(d):
    """Tokenize premise/hypothesis pairs with truncation enabled."""
    return tokenizer(d['premise'], d['hypothesis'], truncation=True)


def build_classifier():
    """Return a fresh classifier loaded from the pretrained checkpoint.

    Passed to Trainer as ``model_init`` so that every run starts from the same
    pretrained weights instead of continuing from the previous run. The name
    ``model`` is deliberately not rebound in this cell, because the measure
    cell uses that global for the sentence-embedding model.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=NUM_LABELS)
    # NOTE(review): kept from the original cell; confirm it is needed for this architecture.
    classifier.config.pad_token_id = classifier.config.eos_token_id
    return classifier


def compute_metrics(eval_pred):
    """Turn logits into class predictions and score them with the loaded metric."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


def build_training_args():
    """Return the TrainingArguments shared by all runs."""
    return TrainingArguments(
        output_dir=MODEL_DIR,                    # Directory to save model checkpoints
        evaluation_strategy="steps",             # Evaluates the model at regular intervals (defined by eval_steps)
        eval_steps=200,                          # Number of steps between evaluations
        save_steps=200,                          # Number of steps between saving checkpoints
        logging_steps=100,                       # Number of steps between logging metrics
        per_device_train_batch_size=BATCH_SIZE,  # Number of samples per training batch per device
        per_device_eval_batch_size=BATCH_SIZE,   # Number of samples per evaluation batch per device
        learning_rate=5.1e-05,                   # Initial learning rate for the optimizer
        num_train_epochs=4,                      # Total number of training epochs
        weight_decay=0.0074,                     # L2 weight regularization to prevent overfitting
        warmup_steps=211,                        # Number of warmup steps for the learning rate scheduler
        save_total_limit=2,                      # Limits the number of saved checkpoints to save disk space
        load_best_model_at_end=True,             # Automatically loads the best checkpoint after training
        metric_for_best_model="accuracy",        # Metric used to determine the best model during evaluation
        greater_is_better=True,                  # Indicates whether a higher metric value is better
        logging_dir="./logs",                    # Directory to save TensorBoard logs
        fp16=True,                               # Enables mixed precision training for faster computation
        lr_scheduler_type="cosine",              # Uses cosine learning rate decay for smoother transitions
    )


# Validation gets encoded once, outside the loop.
# SNLI marks examples without a gold label with -1; they must be removed
# because -1 is not a valid class index for the loss or the accuracy.
snli_validation = load_dataset("snli", split="validation").filter(lambda example: example["label"] >= 0)
encoded_snli_validation = snli_validation.map(preprocess_function, batched=True, load_from_cache_file=True)

# Sorted so that the rows of the result table come out in a stable order.
data_files = sorted(f for f in os.listdir(DATA_DIRECTORY) if f.endswith('.csv'))

evaluation_rows = []

for file_name in data_files:

    file_path = os.path.join(DATA_DIRECTORY, file_name)
    snli_data = load_dataset("csv", data_files=file_path)

    snli_data = snli_data["train"].select_columns(["premise", "hypothesis", "label"])
    snli_data_reversed = snli_data.select(range(len(snli_data) - 1, -1, -1))
    snli_data_random_shuffle = snli_data.shuffle(seed=SHUFFLE_SEED)

    # Keys match the result-table columns.
    orderings = [
        ('baseline', snli_data_random_shuffle),
        ('curriculum', snli_data),
        ('anti-curriculum', snli_data_reversed),
    ]

    new_row = {'ordering': file_name}

    for ordering_name, ordered_data in orderings:

        encoded_snli_data = ordered_data.map(preprocess_function, batched=True, load_from_cache_file=True)

        trainer = Trainer(
            model_init=build_classifier,  # fresh pretrained model for every run
            args=build_training_args(),
            train_dataset=encoded_snli_data,
            eval_dataset=encoded_snli_validation,
            # You could use "test" here but it will be cheating then
            # to select the model checkpoint which gets highest score on test
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        # Evaluate the model on validation set
        eval_results = trainer.evaluate()
        new_row[ordering_name] = eval_results["eval_accuracy"]

    # collect the evaluations for the table
    evaluation_rows.append(new_row)

out_df = pd.DataFrame(evaluation_rows, columns=columns)
out_df.to_csv("evaluations.csv", index=False)