<a href="https://colab.research.google.com/github/Jagoda222/LoLa---group-8/blob/main/calculate_measure.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook processes the SNLI dataset for a CL project. The key steps are:



1. Load Dataset: Read the dataset from a CSV file.
2. Compute Measure Function: Calculate the chosen complexity (example: the sum of the lengths of the premise and hypothesis.)
3. Sample Triplets Function: Divide data into bins (deafult = 7), Calculate distributions, Determine how many triplets to sample from each bin, Sample tri
4. Apply avg values of the measure from each triplet
5. Order Triplets: Arrange the sampled triplets by increasing and decreasing complexity.





In [1]:
import pandas as pd
import numpy as np

In [16]:
def process_snli_dataset_with_measures(file_path, sample_size=700, num_bins=7):
    """
    Process an SNLI dataset for curriculum learning with nested measures and proportional sampling.

    Args:
        file_path (str): Path to the CSV file containing the dataset.
        sample_size (int): The number of triplets to sample from the dataset. Default is 700.
        num_bins (int): Number of bins for dividing measure_1 values. Default is 7.

    Returns:
        tuple: Two DataFrames - one ordered by increasing average measure_1 and one by decreasing average measure_1.
    """
    # Step 1:
    data = pd.read_csv(file_path)
    print(f"Dataset loaded: {len(data)} rows")

    data['premise'] = data['premise'].fillna("").astype(str)
    data['hypothesis'] = data['hypothesis'].fillna("").astype(str)

    # Step 2: CHANGE THE MEASURE
    def measure_1(row):
        """Example complexity measure: sum of lengths of premise and hypothesis."""
        return len(row['premise']) + len(row['hypothesis'])

    data['measure_1'] = data.apply(measure_1, axis=1)

    # Step 3:
    def sample_triplets(data, sample_size, num_bins):

        # Step 3.1: Divide measure_1 into ranges (bins)
        bin_edges = np.linspace(data['measure_1'].min(), data['measure_1'].max(), num_bins + 1)
        data['range_bin'] = pd.cut(data['measure_1'], bins=bin_edges, labels=False, include_lowest=True)

        # Step 3.2: Calculate bin distributions
        bin_distribution = data.groupby('range_bin')['triplet_nr'].nunique()
        print(f"Distribution of triplets across bins:\n{bin_distribution}")

        # Step 3.3: Determine how many triplets to sample from each bin
        total_triplets = data['triplet_nr'].nunique()
        triplets_per_bin = (bin_distribution / total_triplets * sample_size).astype(int)

        # Step 3.4: Sample triplets proportionally from each bin
        sampled_triplets = []
        for bin_id, sample_count in triplets_per_bin.items():
            if sample_count > 0:
                triplets_in_bin = data[data['range_bin'] == bin_id]['triplet_nr'].unique()
                sampled_triplet_ids = np.random.choice(triplets_in_bin, size=min(sample_count, len(triplets_in_bin)), replace=False)
                sampled_triplets.append(data[data['triplet_nr'].isin(sampled_triplet_ids)])

        # Combine sampled triplets
        sampled_data = pd.concat(sampled_triplets).reset_index(drop=True)
        return sampled_data

    final_sample = sample_triplets(data, sample_size, num_bins)

    # Step 4:
    triplet_avg = final_sample.groupby('triplet_nr')['measure_1'].mean().reset_index(name='triplet_avg_measure_1')

    # Merge back to keep triplet-level averages
    final_sample = final_sample.merge(triplet_avg, on='triplet_nr')
    final_sample_increasing = final_sample.sort_values(by='triplet_avg_measure_1').reset_index(drop=True)
    final_sample_decreasing = final_sample.sort_values(by='triplet_avg_measure_1', ascending=False).reset_index(drop=True)

    # Print bin distributions for the final samples
    increasing_bins = final_sample_increasing.groupby('range_bin')['triplet_nr'].nunique()
    print(f"Distribution of bins in increasing order sample:\n{increasing_bins}")
    print(f"Sampled {sample_size} triplets and returned two ordered DataFrames.")

    return final_sample_increasing, final_sample_decreasing


In [18]:
# Test the function with the basic measure
sample_increasing, sample_decreasing = process_snli_dataset_with_measures(file_path='/content/sampled_snli_10000.csv')

# Display the outputs
print("Sample ordered by increasing triplet average:")
print(sample_increasing.head())

#print("\nSample ordered by decreasing triplet average:")
#print(sample_decreasing.head())


Dataset loaded: 30000 rows
Distribution of triplets across bins:
range_bin
0    5495
1    6368
2    1362
3     179
4      26
5       5
6       4
Name: triplet_nr, dtype: int64
Distribution of bins in increasing order sample:
range_bin
0    538
1    657
2    171
3     26
4      3
5      1
Name: triplet_nr, dtype: int64
Sampled 700 triplets and returned two ordered DataFrames.
Sample ordered by increasing triplet average:
              premise                   hypothesis  label  triplet_nr  \
0   Three black dogs.  three dogs miss their owner      1      174253   
1   Three black dogs.          three dogs together      0      174253   
2   Three black dogs.            girl plays hockey      2      174253   
3  A man fly fishing.            A man is fishing.      0      135787   
4  A man fly fishing.    A man is cooking a steak.      2      135787   

   measure_1  range_bin  triplet_avg_measure_1  
0         44          0                   38.0  
1         36          0                