<a href="https://colab.research.google.com/github/Jagoda222/LoLa---group-8/blob/main/calculate_measure.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Measure calculation and data sampling (700 triplets)

In [None]:
!pip install -U datasets
!pip install evaluate
!pip install spacy
!python -m spacy download en_core_web_md
!pip install syllapy
#!pip install stanza

In [None]:
!pip install --upgrade pandas

In [1]:
import pandas as pd
#from datasets import load_dataset, load_metric, concatenate_datasets
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
import numpy as np
import pandas as pd
import os
import numpy as np
from sentence_transformers import SentenceTransformer, util
import spacy
from sentence_transformers import SentenceTransformer, util
import torch
import nltk
import syllapy

nlp = spacy.load("en_core_web_md")

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'sentence_transformers'

## Function for sampling based on the measure

In [23]:
def process_snli_dataset_with_measures(file_path, sample_size=700, num_bins=7, measures=None, random_state=None):
    """
    Sample SNLI triplets for curriculum learning, one complexity measure at a time.

    For every measure the rows are scored, the scores are averaged per
    ``triplet_nr``, the triplets are split into ``num_bins`` equal-width bins over
    the observed range of averages, and triplets are drawn from each bin in
    proportion to the bin's share of all triplets. The sampled rows are returned
    in increasing order, decreasing order, and as a shuffled baseline.

    Args:
        file_path (str): Path to the CSV file containing the dataset (passed to
            ``pd.read_csv``). The file must provide the columns ``premise``,
            ``hypothesis`` and ``triplet_nr``.
        sample_size (int): The number of triplets to sample from the dataset. Default is 700.
        num_bins (int): Number of bins for dividing measure values. Default is 7.
        measures (list): List of functions to calculate complexity measures. Each
            function receives one row and returns a number; its ``__name__`` is
            used as the column name and as the key in the result. ``None`` is
            treated as an empty list.
        random_state (int, optional): Seed for drawing triplets from the bins.
            With ``None`` (default) NumPy's global random state is used, so the
            draw differs between runs unless that state was seeded by the caller.

    Returns:
        dict: Maps each measure name to a dict with the DataFrames
        ``'increasing'``, ``'decreasing'`` and ``'baseline'``. Every DataFrame
        carries the per-triplet average in the column ``triplet_avg``.
    """

    # ``measures=None`` is the declared default; treat it as "no measures" rather
    # than failing later with "'NoneType' object is not iterable".
    measures = [] if measures is None else list(measures)

    # A dedicated seeded generator is only created on request, so existing calls
    # keep drawing from NumPy's global state exactly as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Step 1: Load dataset
    data = pd.read_csv(file_path)
    print(f"Dataset loaded: {len(data)} rows")

    # Missing sentences become empty strings so the measure functions always get str input
    data['premise'] = data['premise'].fillna("").astype(str)
    data['hypothesis'] = data['hypothesis'].fillna("").astype(str)

    # Store results
    result = {}

    # Step 2: Process for each measure separately
    for measure_func in measures:
        measure_name = measure_func.__name__

        # Calculate the measure for each row (the column stays on ``data`` for later measures)
        data[measure_name] = data.apply(measure_func, axis=1)

        # Step 3: Calculate triplet-level averages for the current measure
        triplet_avg = data.groupby('triplet_nr')[measure_name].mean().reset_index(name='triplet_avg')

        # Step 4: Bin triplets based on the current measure's triplet average
        lowest = triplet_avg['triplet_avg'].min()
        highest = triplet_avg['triplet_avg'].max()
        if lowest == highest:
            # A constant measure would give identical bin edges, which pd.cut
            # rejects; every triplet then simply belongs to one bin.
            triplet_avg['range_bin'] = 0
        else:
            bin_edges = np.linspace(lowest, highest, num_bins + 1)
            triplet_avg['range_bin'] = pd.cut(triplet_avg['triplet_avg'], bins=bin_edges, labels=False, include_lowest=True)

        # Step 5: Calculate how many triplets to sample from each bin
        bin_distribution = triplet_avg['range_bin'].value_counts().sort_index()
        print(f"\nDistribution of triplets across bins for {measure_name}:\n{bin_distribution}")

        # Exact (fractional) share per bin; truncating it can leave the total short
        expected_per_bin = bin_distribution / bin_distribution.sum() * sample_size
        triplets_per_bin = expected_per_bin.astype(int)

        # Hand the missing draws to the bins that lost the most through truncation
        while triplets_per_bin.sum() < sample_size:
            residuals = expected_per_bin - triplets_per_bin
            triplets_per_bin.loc[residuals.idxmax()] += 1

        while triplets_per_bin.sum() > sample_size:
            residuals = expected_per_bin - triplets_per_bin
            triplets_per_bin.loc[residuals.idxmin()] -= 1

        # Step 6: Sample triplets proportionally from each bin
        sampled_triplets = []
        for bin_id, sample_count in triplets_per_bin.items():
            if sample_count > 0:
                triplets_in_bin = triplet_avg.loc[triplet_avg['range_bin'] == bin_id, 'triplet_nr'].values
                # Never ask for more triplets than the bin holds (sampling is without replacement)
                draw_size = min(sample_count, len(triplets_in_bin))
                sampled_triplet_ids = rng.choice(triplets_in_bin, size=draw_size, replace=False)
                sampled_triplets.append(data[data['triplet_nr'].isin(sampled_triplet_ids)])

        # Combine sampled triplets into a single DataFrame
        final_sample = pd.concat(sampled_triplets).reset_index(drop=True)

        # Step 7: Merge back the triplet averages
        final_sample = final_sample.merge(triplet_avg[['triplet_nr', 'triplet_avg']], on='triplet_nr')

        # Step 8: Sort by increasing and decreasing order; the baseline is a fixed-seed row shuffle
        final_sample_increasing = final_sample.sort_values(by='triplet_avg').reset_index(drop=True)
        final_sample_decreasing = final_sample.sort_values(by='triplet_avg', ascending=False).reset_index(drop=True)
        baseline = final_sample.sample(frac=1, random_state=100)

        # Store results for this measure
        result[measure_name] = {
            'increasing': final_sample_increasing,
            'decreasing': final_sample_decreasing,
            'baseline': baseline
        }

    print(f"Processed {sample_size} triplets and returned ordered DataFrames for each measure.")
    return result


## Measure 1


In [18]:

# Read the sampled SNLI rows (update the path if the file lives elsewhere)
file_path = '/content/sampled_snli_10000.csv'
df = pd.read_csv(file_path)

# Whitespace-token count per sentence; str() lets non-string cells pass through split()
df['premise_length'] = df['premise'].map(lambda sentence: len(str(sentence).split()))
df['hypothesis_length'] = df['hypothesis'].map(lambda sentence: len(str(sentence).split()))

# Longest sentence found in either column
max_length = df[['premise_length', 'hypothesis_length']].max().max()

print("Maximum sentence length:", max_length)


Maximum sentence length: 78


In [19]:
def measure_1(row, max_length=78, weights=(0.5, 0.3, 0.2)):
    """Calculate combined complexity score using syntactic overlap, lexical diversity, and sentence length.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the string entries
            ``'premise'`` and ``'hypothesis'``.
        max_length (int): Divisor used to normalise the combined word count.
            Defaults to 78, the maximum sentence length reported for the sampled
            dataset. Because the premise and hypothesis counts are added up, the
            normalised length can exceed 1.
        weights (tuple): Weights for the syntactic difference, the lexical
            diversity score and the normalised length, in that order.

    Returns:
        float: Weighted sum of the three components; larger means more complex.
    """

    # Split each sentence once and reuse the word lists for all three components
    premise_words = row['premise'].split()
    hypothesis_words = row['hypothesis'].split()

    def longest_common_subsequence(X, Y):
        """Calculate the length of the longest common subsequence between two word lists."""
        m, n = len(X), len(Y)
        dp = [[0] * (n + 1) for _ in range(m + 1)]

        # Fill DP table
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if X[i - 1] == Y[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1] + 1
                else:
                    dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
        return dp[m][n]

    # Step 1: Calculate syntactic overlap (higher difference -> more complex)
    longer_sentence = max(len(premise_words), len(hypothesis_words))
    if longer_sentence > 0:
        overlap = longest_common_subsequence(premise_words, hypothesis_words) / longer_sentence
    else:
        # Both sentences are empty: report zero overlap instead of dividing by
        # zero, matching the zero fallback of the lexical component below.
        overlap = 0
    syntactic_diff = 1 - overlap

    # Step 2: Calculate lexical diversity comparison (higher diversity -> more complex)
    shared_unique_words = len(set(premise_words) & set(hypothesis_words))
    combined_length = len(premise_words) + len(hypothesis_words)
    shared_ratio = shared_unique_words / combined_length if combined_length > 0 else 0
    lexical_diversity_score = 1 - shared_ratio

    # Step 3: Calculate normalized sentence length
    normalized_length = combined_length / max_length

    # Step 4: Combine using weights
    w1, w2, w3 = weights
    complexity_score = (w1 * syntactic_diff) + (w2 * lexical_diversity_score) + (w3 * normalized_length)

    return complexity_score

### Syntactic Tree Depth Measure

In [2]:
import spacy

def compute_tree_depth(text):
    """
    Return the largest dependency depth among the tokens of ``text``.

    The text is parsed with the module-level ``nlp`` pipeline. A token's depth is
    the number of head links followed until a token that is its own head is
    reached. Text that produces no tokens has depth 0.
    """
    parsed = nlp(text)
    if len(parsed) == 0:
        return 0

    deepest = 0
    for node in parsed:
        hops = 0
        # Climb from the token towards the root, counting one hop per head link
        while not (node == node.head):
            node = node.head
            hops += 1
        deepest = max(deepest, hops)
    return deepest

def tree_depth_premise(row):
    """
    Dependency-tree depth of the premise stored in ``row``.

    Reads ``row["premise"]`` and hands it to ``compute_tree_depth``.
    """
    premise_text = row["premise"]
    return compute_tree_depth(premise_text)

def tree_depth_hypothesis(row):
    """
    Dependency-tree depth of the hypothesis stored in ``row``.

    Reads ``row["hypothesis"]`` and hands it to ``compute_tree_depth``.
    """
    hypothesis_text = row["hypothesis"]
    return compute_tree_depth(hypothesis_text)

### Sentence Length Measure

In [3]:
def length_calc(text):
    """
    Count the whitespace-separated tokens in ``text``.
    """
    tokens = text.split()
    return len(tokens)

def sentence_length_premise(row):
    """
    Word count of the premise stored in ``row`` (via ``length_calc``).
    """
    premise_text = row["premise"]
    return length_calc(premise_text)

def sentence_length_hypothesis(row):
    """
    Word count of the hypothesis stored in ``row`` (via ``length_calc``).
    """
    hypothesis_text = row["hypothesis"]
    return length_calc(hypothesis_text)

def sentence_length_combined(row):
    """
    Total word count of a row: premise length plus hypothesis length.
    """
    premise_words = sentence_length_premise(row)
    hypothesis_words = sentence_length_hypothesis(row)
    return premise_words + hypothesis_words


### Flesch-Kincaid Measure

In [5]:
def flesch_kincaid_calc(text):
    """
    Compute the Flesch-Kincaid Grade Level for a given text.

    Sentences and tokens come from NLTK's tokenizers, syllables from
    ``syllapy.count``. Tokens without any letter or digit (punctuation marks,
    which the word tokenizer emits as separate tokens) are not counted as words,
    so they neither inflate the words-per-sentence term nor dilute the
    syllables-per-word term.

    NOTE(review): the NLTK tokenizers need their tokenizer data to be available;
    no download step is visible in this notebook - confirm it is installed.

    Args:
        text (str): Text to score.

    Returns:
        float: Grade level rounded to two decimals. Text without any word is
        scored as one word in one sentence with zero syllables.
    """
    sentences = nltk.sent_tokenize(text)
    # Keep only tokens that contain at least one alphanumeric character
    words = [token for token in nltk.word_tokenize(text) if any(ch.isalnum() for ch in token)]

    # Clamp to 1 so empty input cannot cause a division by zero
    total_words = max(len(words), 1)
    total_sentences = max(len(sentences), 1)
    total_syllables = sum(syllapy.count(word) for word in words)

    # Flesch-Kincaid Grade Level formula
    grade_level = 0.39 * (total_words / total_sentences) + 11.8 * (total_syllables / total_words) - 15.59
    return round(grade_level, 2)

def flesch_kincaid_premise(row):
    """
    Flesch-Kincaid Grade Level of the premise stored in ``row``.
    """
    premise_text = row["premise"]
    return flesch_kincaid_calc(premise_text)

def flesch_kincaid_hypothesis(row):
    """
    Flesch-Kincaid Grade Level of the hypothesis stored in ``row``.
    """
    hypothesis_text = row["hypothesis"]
    return flesch_kincaid_calc(hypothesis_text)

def flesch_kincaid_combined(row):
    """
    Sum of the Flesch-Kincaid Grade Levels of a row's premise and hypothesis.
    """
    premise_grade = flesch_kincaid_premise(row)
    hypothesis_grade = flesch_kincaid_hypothesis(row)
    return premise_grade + hypothesis_grade

In [24]:
# Usage: sample 700 triplets per measure and write the three orderings to CSV
file_path = "/content/sampled_snli_10000.csv"
measure_list = [measure_1, tree_depth_premise, tree_depth_hypothesis, sentence_length_premise,
                sentence_length_hypothesis, sentence_length_combined, flesch_kincaid_premise,
                flesch_kincaid_hypothesis, flesch_kincaid_combined
                ]
result = process_snli_dataset_with_measures(file_path, sample_size=700, num_bins=7, measures=measure_list)

# Step to save increasing, decreasing, and random baseline DataFrames for each measure
for measure_name, dataframes in result.items():
    # Extract increasing, decreasing, and random baseline DataFrames
    increasing_df = dataframes['increasing']
    decreasing_df = dataframes['decreasing']
    random_baseline_df = dataframes['baseline']

    # Build each output path once so the log lines always name the files actually written
    increasing_path = f"/content/{measure_name}_increasing.csv"
    decreasing_path = f"/content/{measure_name}_decreasing.csv"
    baseline_path = f"/content/{measure_name}_baseline.csv"

    # Save to CSV
    increasing_df.to_csv(increasing_path, index=False)
    decreasing_df.to_csv(decreasing_path, index=False)
    random_baseline_df.to_csv(baseline_path, index=False)

    print(f"Saved {measure_name} DataFrames to CSV:")
    print(increasing_path)
    print(decreasing_path)
    print(baseline_path)


Dataset loaded: 30000 rows

Distribution of triplets across bins for measure_1:
range_bin
0     135
1     138
2     699
3    2468
4    4641
5    1910
6       9
Name: count, dtype: int64
Processed 700 triplets and returned ordered DataFrames for each measure.
Saved measure_1 DataFrames to CSV:
/content/measure_1_increasing.csv
/content/measure_1_decreasing.csv
/content/measure_1_random_baseline.csv


# Running models on the sampled data

In [12]:
import os
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
from evaluate import load # Import load instead of load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:

# Run configuration: checkpoint directory, base model, batch size and the sampled CSV to train on
MODEL_DIR = 'model_checkpoints'
MODEL_CHECKPOINT = "microsoft/deberta-v3-small"
BATCH_SIZE = 16
# One of the CSV files written by the sampling cell; point this at an
# *_increasing.csv or *_decreasing.csv file to train on an ordered sample instead
FILE_TO_TRAIN = "/content/measure_1_baseline.csv"

# Load the evaluation dataset from SNLI and keep only examples whose label is >= 0
# (presumably this drops examples without a gold label - confirm against the dataset card)
snli = load_dataset("snli")
snli = snli.filter(lambda example: example["label"] >= 0)  # Filter labels directly

# The sampled CSV is read through the generic "csv" loader and exposed as its "train" split
train_dataset = load_dataset("csv", data_files=FILE_TO_TRAIN)["train"]

# Tokenizer belonging to the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def preprocess_function(examples):
    """Tokenize the premise/hypothesis pairs of a batch with the module-level ``tokenizer``."""
    premises = examples['premise']
    hypotheses = examples['hypothesis']
    return tokenizer(premises, hypotheses, truncation=True)

# Preprocess both training and validation datasets
# (preprocess_function is applied batch-wise; the SNLI mapping is allowed to reuse a cached result)
encoded_train_dataset = train_dataset.map(preprocess_function, batched=True)
encoded_snli = snli.map(preprocess_function, batched=True, load_from_cache_file=True)

# Training arguments
# eval_steps and save_steps are both 200, so evaluation and checkpointing are requested at the same interval.
# fp16=True presumably requires a GPU runtime - confirm before running on CPU.
args = TrainingArguments(
    output_dir=MODEL_DIR,
    evaluation_strategy="steps",  # Evaluate periodically during training
    eval_steps=200,  # Frequency of evaluation during training
    save_steps=200,  # Save model checkpoints periodically
    logging_steps=100,  # Log training metrics periodically
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=5.1e-05,
    num_train_epochs=4,
    weight_decay=0.0074,
    warmup_steps=211,
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_dir="./logs",
    fp16=True,  # Enable mixed precision training
    lr_scheduler_type="cosine"
)

# Initialize model with a 3-way classification head on top of the base checkpoint
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=3)

# Metric object used by compute_metrics, loaded via `evaluate.load` with the 'glue' / "mnli" configuration
metric = load('glue', "mnli")


def compute_metrics(eval_pred):
    """Score an evaluation pass: take the arg-max class per example and compare it with the references using ``metric``."""
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

# Trainer setup: train on the sampled CSV, evaluate on the SNLI validation split
# NOTE(review): the Trainer's default training sampler is believed to shuffle examples,
# in which case the row order of an *_increasing / *_decreasing file would not be the
# order the model sees - confirm before running the ordered curricula.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_snli["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Save final model and tokenizer side by side under MODEL_DIR/final_model
trainer.save_model(os.path.join(MODEL_DIR, "final_model"))
tokenizer.save_pretrained(os.path.join(MODEL_DIR, "final_model"))

print(f"Training completed. Model and tokenizer saved to: {os.path.join(MODEL_DIR, 'final_model')}")

Generating train split: 0 examples [00:00, ? examples/s]



Map:   0%|          | 0/2100 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy
200,0.7396,0.540333,0.81203
400,0.3076,0.518411,0.83987
