<a href="https://colab.research.google.com/github/Jagoda222/LoLa---group-8/blob/main/calculate_measure.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np

In [1]:
def process_snli_dataset_with_measures(file_path, sample_size=700, num_bins=7, measures=None, random_state=None):
    """
    Build curriculum-ordered samples of an SNLI-style dataset, one per complexity measure.

    For every measure the function scores each row, averages the scores per
    triplet (rows sharing a ``triplet_nr``), splits the range of triplet
    averages into ``num_bins`` equal-width bins, samples whole triplets from
    the bins in proportion to each bin's size, and returns the sampled rows
    sorted by triplet average in both directions.

    Args:
        file_path (str): Path to a CSV file with at least the columns
            ``triplet_nr``, ``premise`` and ``hypothesis``.
        sample_size (int): Number of triplets to sample per measure. Default is 700.
        num_bins (int): Number of equal-width bins over the triplet averages. Default is 7.
        measures (list): Callables that receive one row and return a numeric
            score. ``None`` (the default) is treated as an empty list.
        random_state (int | None): Seed for the triplet sampling. With ``None``
            (the default) the global NumPy random state is used.

    Returns:
        dict: Maps ``"measure_<i>"`` (1-based position in ``measures``) to a dict
        with the keys ``'increasing'`` and ``'decreasing'``, each holding the
        sampled rows plus a ``triplet_avg`` column, sorted accordingly. Score
        columns are added to one shared frame, so the frames of later measures
        also contain the score columns of earlier measures.
    """

    # Step 1: Load dataset
    data = pd.read_csv(file_path)
    print(f"Dataset loaded: {len(data)} rows")

    # Empty CSV cells are read as NaN; normalise them to "" so that
    # string-based measures can safely call len() / split().
    data['premise'] = data['premise'].fillna("").astype(str)
    data['hypothesis'] = data['hypothesis'].fillna("").astype(str)

    # The signature defaults to None; iterating over it would raise a TypeError.
    if measures is None:
        measures = []

    # Seeded generator when requested, otherwise the global NumPy state.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Store results
    result = {}

    # Step 2: Process for each measure separately
    for idx, measure_func in enumerate(measures):
        measure_name = f"measure_{idx+1}"

        # Calculate the measure for each row
        data[measure_name] = data.apply(measure_func, axis=1)

        # Step 3: Calculate triplet-level averages for the current measure
        triplet_avg = data.groupby('triplet_nr')[measure_name].mean().reset_index(name='triplet_avg')

        # Step 4: Bin triplets based on the current measure's triplet average
        lowest = triplet_avg['triplet_avg'].min()
        highest = triplet_avg['triplet_avg'].max()
        if lowest == highest:
            # All averages are identical: linspace would yield duplicate edges,
            # which pd.cut rejects, so place every triplet in a single bin.
            triplet_avg['range_bin'] = 0
        else:
            bin_edges = np.linspace(lowest, highest, num_bins + 1)
            triplet_avg['range_bin'] = pd.cut(
                triplet_avg['triplet_avg'], bins=bin_edges, labels=False, include_lowest=True
            )

        # Step 5: Calculate how many triplets to sample from each bin
        bin_distribution = triplet_avg['range_bin'].value_counts().sort_index()
        print(f"\nDistribution of triplets across bins for {measure_name}:\n{bin_distribution}")

        # Ideal (fractional) share of the sample per bin; truncated to integers below.
        expected_per_bin = bin_distribution / bin_distribution.sum() * sample_size
        triplets_per_bin = expected_per_bin.astype(int)

        # Truncation can leave the total short of sample_size: hand the missing
        # triplets to the bins that lost the largest fraction.
        while triplets_per_bin.sum() < sample_size:
            residuals = expected_per_bin - triplets_per_bin
            triplets_per_bin.loc[residuals.idxmax()] += 1

        while triplets_per_bin.sum() > sample_size:
            residuals = expected_per_bin - triplets_per_bin
            triplets_per_bin.loc[residuals.idxmin()] -= 1

        # Step 6: Sample triplets proportionally from each bin
        sampled_triplets = []
        for bin_id, sample_count in triplets_per_bin.items():
            if sample_count > 0:
                triplets_in_bin = triplet_avg.loc[triplet_avg['range_bin'] == bin_id, 'triplet_nr'].values
                # A bin cannot supply more triplets than it contains.
                take = min(sample_count, len(triplets_in_bin))
                sampled_triplet_ids = rng.choice(triplets_in_bin, size=take, replace=False)
                sampled_triplets.append(data[data['triplet_nr'].isin(sampled_triplet_ids)])

        # Combine sampled triplets into a single DataFrame; pd.concat rejects an
        # empty list, so fall back to an empty frame with the same columns.
        if sampled_triplets:
            final_sample = pd.concat(sampled_triplets).reset_index(drop=True)
        else:
            final_sample = data.iloc[0:0].reset_index(drop=True)

        # Step 7: Merge back the triplet averages
        final_sample = final_sample.merge(triplet_avg[['triplet_nr', 'triplet_avg']], on='triplet_nr')

        # Step 8: Sort by increasing and decreasing order. triplet_nr is a
        # secondary key so that triplets with equal averages are not interleaved.
        final_sample_increasing = final_sample.sort_values(
            by=['triplet_avg', 'triplet_nr'], ascending=[True, True]
        ).reset_index(drop=True)
        final_sample_decreasing = final_sample.sort_values(
            by=['triplet_avg', 'triplet_nr'], ascending=[False, True]
        ).reset_index(drop=True)

        # Store results for this measure
        result[measure_name] = {
            'increasing': final_sample_increasing,
            'decreasing': final_sample_decreasing
        }

    print(f"Processed {sample_size} triplets and returned ordered DataFrames for each measure.")
    return result




In [8]:
# Example Measures
def measure_1(row):
    """Return the combined character count of the row's premise and hypothesis."""
    # Both fields are measured with len(), i.e. in characters rather than words.
    return sum(len(row[field]) for field in ('premise', 'hypothesis'))

def measure_2(row):
    """Return the absolute gap between the premise and hypothesis character counts."""
    premise_len = len(row['premise'])
    hypothesis_len = len(row['hypothesis'])
    # Subtract the shorter from the longer so the result is never negative.
    if premise_len >= hypothesis_len:
        return premise_len - hypothesis_len
    return hypothesis_len - premise_len

def measure_3(row):
    """Return how many distinct whitespace-separated tokens the hypothesis contains."""
    # Tokens are compared verbatim, so case and punctuation differences count as distinct.
    distinct_tokens = {token for token in row['hypothesis'].split()}
    return len(distinct_tokens)



In [9]:
# Usage: build the curriculum samples for the three example measures
file_path = "/content/sampled_snli_10000.csv"
result = process_snli_dataset_with_measures(
    file_path,
    sample_size=700,
    num_bins=7,
    measures=[measure_1, measure_2, measure_3],
)

# Write both orderings of every measure to their own CSV file
for measure_name, dataframes in result.items():
    # Destination files, one per sort direction
    increasing_path = f"/content/{measure_name}_increasing.csv"
    decreasing_path = f"/content/{measure_name}_decreasing.csv"

    # Pull the two orderings out of the result entry
    increasing_df = dataframes['increasing']
    decreasing_df = dataframes['decreasing']

    # Save without the positional index column
    increasing_df.to_csv(increasing_path, index=False)
    decreasing_df.to_csv(decreasing_path, index=False)

    print(f"Saved {measure_name} DataFrames to CSV:")
    print(increasing_path)
    print(decreasing_path)


Dataset loaded: 30000 rows

Distribution of triplets across bins for measure_1:
range_bin
0    3856
1    5145
2     891
3      90
4      15
5       1
6       2
Name: count, dtype: int64

Distribution of triplets across bins for measure_2:
range_bin
0    8260
1    1576
2     133
3      26
4       3
6       2
Name: count, dtype: int64

Distribution of triplets across bins for measure_3:
range_bin
0     575
1    5559
2    3162
3     606
4      85
5       9
6       4
Name: count, dtype: int64
Processed 700 triplets and returned ordered DataFrames for each measure.
Saved measure_1 DataFrames to CSV:
/content/measure_1_increasing.csv
/content/measure_1_decreasing.csv
Saved measure_2 DataFrames to CSV:
/content/measure_2_increasing.csv
/content/measure_2_decreasing.csv
Saved measure_3 DataFrames to CSV:
/content/measure_3_increasing.csv
/content/measure_3_decreasing.csv
