<a href="https://colab.research.google.com/github/Jagoda222/LoLa---group-8/blob/Irene/calculate_measure.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Measure calculation and sampling the data (700 triplets)

In [15]:
!pip install datasets==2.9.0
!pip install syllapy


Collecting syllapy
  Downloading syllapy-0.7.2-py3-none-any.whl.metadata (854 bytes)
Downloading syllapy-0.7.2-py3-none-any.whl (24 kB)
Installing collected packages: syllapy
Successfully installed syllapy-0.7.2


In [24]:
import pandas as pd
import numpy as np
from datasets import load_dataset, load_metric, concatenate_datasets
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
import numpy as np
import pandas as pd
import os
import syllapy


In [19]:
def process_snli_dataset_with_measures(file_path, sample_size=700, num_bins=7, measures=None):
    """
    Process an SNLI dataset for curriculum learning with one measure at a time and proportional sampling.

    Args:
        file_path (str): Path to the CSV file containing the dataset.
        sample_size (int): The number of triplets to sample from the dataset. Default is 700.
        num_bins (int): Number of bins for dividing measure values. Default is 7.
        measures (list): List of functions to calculate complexity measures.

    Returns:
        dict: Dictionary containing two DataFrames (increasing and decreasing) for each measure.
    """

    # Step 1: Load dataset
    data = pd.read_csv(file_path)
    print(f"Dataset loaded: {len(data)} rows")

    data['premise'] = data['premise'].fillna("").astype(str)
    data['hypothesis'] = data['hypothesis'].fillna("").astype(str)

    # Store results
    result = {}

    # Step 2: Process for each measure separately
    for idx, measure_func in enumerate(measures):
        measure_name = f"measure_{idx+1}"

        # Calculate the measure for each row
        data[measure_name] = data.apply(measure_func, axis=1)

        # Step 3: Calculate triplet-level averages for the current measure
        triplet_avg = data.groupby('triplet_nr')[measure_name].mean().reset_index(name='triplet_avg')

        # Step 4: Bin triplets based on the current measure's triplet average
        bin_edges = np.linspace(triplet_avg['triplet_avg'].min(), triplet_avg['triplet_avg'].max(), num_bins + 1)
        triplet_avg['range_bin'] = pd.cut(triplet_avg['triplet_avg'], bins=bin_edges, labels=False, include_lowest=True)

        # Step 5: Calculate how many triplets to sample from each bin
        bin_distribution = triplet_avg['range_bin'].value_counts().sort_index()
        print(f"\nDistribution of triplets across bins for {measure_name}:\n{bin_distribution}")

        triplets_per_bin = (bin_distribution / bin_distribution.sum() * sample_size).astype(int)

        # Adjust sample size if needed
        while triplets_per_bin.sum() < sample_size:
            residuals = (bin_distribution / bin_distribution.sum() * sample_size) - triplets_per_bin
            triplets_per_bin[residuals.idxmax()] += 1

        while triplets_per_bin.sum() > sample_size:
            residuals = (bin_distribution / bin_distribution.sum() * sample_size) - triplets_per_bin
            triplets_per_bin[residuals.idxmin()] -= 1

        # Step 6: Sample triplets proportionally from each bin
        sampled_triplets = []
        for bin_id, sample_count in triplets_per_bin.items():
            if sample_count > 0:
                triplets_in_bin = triplet_avg[triplet_avg['range_bin'] == bin_id]['triplet_nr'].values
                sampled_triplet_ids = np.random.choice(triplets_in_bin, size=min(sample_count, len(triplets_in_bin)), replace=False)
                sampled_triplets.append(data[data['triplet_nr'].isin(sampled_triplet_ids)])

        # Combine sampled triplets into a single DataFrame
        final_sample = pd.concat(sampled_triplets).reset_index(drop=True)

        # Step 7: Merge back the triplet averages
        final_sample = final_sample.merge(triplet_avg[['triplet_nr', 'triplet_avg']], on='triplet_nr')

        # Step 8: Sort by increasing and decreasing order
        final_sample_increasing = final_sample.sort_values(by='triplet_avg').reset_index(drop=True)
        final_sample_decreasing = final_sample.sort_values(by='triplet_avg', ascending=False).reset_index(drop=True)

        # Store results for this measure
        result[measure_name] = {
            'increasing': final_sample_increasing,
            'decreasing': final_sample_decreasing
        }

    print(f"Processed {sample_size} triplets and returned ordered DataFrames for each measure.")
    return result




In [8]:


def measure_2(row):
    """Complexity measure: difference in lengths of premise and hypothesis."""
    return abs(len(row['premise']) - len(row['hypothesis']))

def measure_3(row):
    """Complexity measure: number of unique words in the hypothesis."""
    return len(set(row['hypothesis'].split()))



In [26]:
def measure_1(row):
    def count_words(sentence):
        return len(sentence.split())
    def l_row(row):
        word_count = count_words(row['premise']) + count_words(row['hypothesis'])
        return word_count
    def c_row(row):
        """Readability measure using sentence length and word complexity."""
        avg_word_length = (word_length(row['premise']) + word_length(row['hypothesis'])) / 2
        readability = 0.39 * (l_row(row) / 100) + 11.8 * (avg_word_length / 100) - 15.59
        print("c (readability measure) done")
        return readability
    def word_length(sentence):
        words = sentence.split()
        total_length = sum(len(word) for word in words)
        return total_length / len(words) if len(words) > 0 else 0

    li = l_row(row)
    ci = c_row(row)

    combined_measure = li + ci
    return combined_measure

In [25]:
# Usage
file_path = "/content/sampled_snli_10000.csv"
result = process_snli_dataset_with_measures(file_path, sample_size=700, num_bins=7, measures=[measure_1])

# Step to save increasing and decreasing DataFrames for each measure
for measure_name, dataframes in result.items():
    # Extract increasing and decreasing DataFrames
    increasing_df = dataframes['increasing']
    decreasing_df = dataframes['decreasing']

    # Save to CSV
    increasing_df.to_csv(f"/content/{measure_name}_increasing.csv", index=False)
    decreasing_df.to_csv(f"/content/{measure_name}_decreasing.csv", index=False)

    print(f"Saved {measure_name} DataFrames to CSV:")
    print(f"/content/{measure_name}_increasing.csv")
    print(f"/content/{measure_name}_decreasing.csv")


[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
lrc (combined measure) calculation done
c (readability measure) done
lrc (combined measure) calculation done
c (readability measure) done
lrc (combined measure) calculation done
c (readability measure) done
lrc (combined measure) calculation done
c (readability measure) done
lrc (combined measure) calculation done
c (readability measure) done
lrc (combined measure) calculation done
c (readability measure) done
lrc (combined measure) calculation done
c (readability measure) done
lrc (combined measure) calculation done
c (readability measure) done
lrc (combined measure) calculation done
c (readability measure) done
lrc (combined measure) calculation done
c (readability measure) done
lrc (combined measure) calculation done
c (readability measure) done
lrc (combined measure) calculation done
c (readability measure) done
lrc (combined measure) calculation done
c (readability measure) done
lrc (combined measure) c

#Running models on the sampled data

In [None]:

# META Variables
# it is good to have certain directories for saving model checkpoints (e.g., on google drive)
MODEL_DIR = 'model_checkpoints'
MODEL_CHECKPOINT = "microsoft/deberta-v3-small"
BATCH_SIZE = 16
DATA_DIRECTORY = 'data/data_samples'

""" Change the name of the DATA_DIRECTORY to the folder where your orderings are:
it expects the data in the following format :

premise hypothesis label
...      ...        ...

The code automatically reverses the order of the data, and calculates the accuracy for this ordering.
It also does the random shuffle for the baseline measurement (and calculates accuracy of course).

The output will be a file of the format:

name_of_metric  baseline    curriculum  anti-curriculum

sentence_length     0.84           0.82            0.80
etc.
"""

# SNLI data needs to be cleaned as it contains -1s as a label
#for k in snli_data:
    #snli_data[k] = snli_data[k].filter( lambda prob: prob['label'] >= 0 )


# Define the column names
columns = ['ordering', 'baseline', 'curriculum', 'anti-curriculum']
# Create an empty DataFrame with the specified columns
out_df = pd.DataFrame(columns=columns)

metric = load_metric('glue', "mnli")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


# https://huggingface.co/transformers/preprocessing.html
def preprocess_function(d):
    return tokenizer(d['premise'], d['hypothesis'], truncation=True)


model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=4)
model.config.pad_token_id = model.config.eos_token_id

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


# validation gets encoded outside the loop
snli_validation = load_dataset("snli", split="validation")
encoded_snli_validation = snli_validation.map(preprocess_function, batched=True, load_from_cache_file=True)

data_files = [f for f in os.listdir(DATA_DIRECTORY) if f.endswith('.csv')]

for file_name in data_files:

    file_path = os.path.join(DATA_DIRECTORY, file_name )
    snli_data = load_dataset("csv", data_files=file_path)

    snli_data = snli_data["train"].select_columns(["premise", "hypothesis", "label"])
    snli_data_reversed = snli_data.select(range(len(snli_data)-1, -1, -1))
    snli_data_random_shuffle = snli_data.shuffle(seed=100)


    new_row = [file_name]

    for data in [snli_data_random_shuffle, snli_data, snli_data_reversed]:

        encoded_snli_data = data.map(preprocess_function, batched=True, load_from_cache_file=True)

        #instantiate the model
        args = TrainingArguments(
        output_dir=MODEL_DIR,             # Directory to save model checkpoints
        evaluation_strategy="steps",      # Evaluates the model at regular intervals (defined by eval_steps)
        eval_steps=200,                   # Number of steps between evaluations
        save_steps=200,                   # Number of steps between saving checkpoints
        logging_steps=100,                # Number of steps between logging metrics
        per_device_train_batch_size=16, # Number of samples per training batch per device
        per_device_eval_batch_size=16,  # Number of samples per evaluation batch per device
        learning_rate=5.1e-05,               # Initial learning rate for the optimizer
        num_train_epochs= 4,               # Total number of training epochs
        weight_decay=0.0074,                # L2 weight regularization to prevent overfitting
        warmup_steps = 211,                 # Number of warmup steps for the learning rate scheduler
        save_total_limit=2,               # Limits the number of saved checkpoints to save disk space
        load_best_model_at_end=True,      # Automatically loads the best checkpoint after training
        metric_for_best_model="accuracy", # Metric used to determine the best model during evaluation
        greater_is_better=True,           # Indicates whether a higher metric value is better
        logging_dir="./logs",             # Directory to save TensorBoard logs
        fp16=True,                        # Enables mixed precision training for faster computation
        lr_scheduler_type= "cosine",       # Uses cosine learning rate decay for smoother transitions

        )
        trainer = Trainer(
            model,
            args,
            train_dataset=encoded_snli_data,
            eval_dataset=encoded_snli_validation,
            # You could use "test" here but it will be cheating then
            # to select the model checkpoint which gets highest score on test
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
            )

        trainer.train()


        # Evaluate the model on validation set
        eval_results = trainer.evaluate()
        new_row.append(eval_results["eval_accuracy"])
        eval_results

    # append the evaluations to the table
    out_df.loc[len(out_df)] = new_row

out_df.to_csv("evaluations.csv", index=False)