In [1]:
import torch
from os import path as op
import os
import numpy as np
from collections import Counter
import pandas as pd

In [2]:
!pip install syllapy  # Install the syllapy library if needed
import syllapy

Collecting syllapy
  Downloading syllapy-0.7.2-py3-none-any.whl.metadata (854 bytes)
Downloading syllapy-0.7.2-py3-none-any.whl (24 kB)
Installing collected packages: syllapy
Successfully installed syllapy-0.7.2


In [16]:
def count_words(sentence):
        return len(sentence.split())
def l_row(row):
    word_count = count_words(row['premise']) + count_words(row['hypothesis'])
    return word_count

def c_row(row, total_rows):
    """Readability measure using sentence length and word complexity."""
    avg_sentence_length = l_row(row)/total_rows
    avg_word_length = (word_length(row['premise']) + word_length(row['hypothesis'])) / l_row(row)
    readability = 0.39 * (avg_sentence_length/ 100) + 11.8 * (avg_word_length / 100) - 15.59
    return readability

def word_length(sentence):
    words = sentence.split()
    total_length = sum(len(word) for word in words)
    return total_length / len(words) if len(words) > 0 else 0

def sentence_length(row): # Change the argument from df to row
    """sentence length"""
    return l_row(row)

def readability(row, total_rows): # Change the argument from df to row
    """Final measure combining normalized sentence length and readability."""
    return c_row(row, total_rows)

def normalize(series):
    """Min-max normalization to scale values between 0 and 1."""
    min_val = np.min(series)
    max_val = np.max(series)
    return (series - min_val) / (max_val - min_val) if max_val > min_val else series

In [17]:
def sample_triplets(data, sample_size, num_bins):
    # Step 3.1: Divide measure_1 into ranges (bins)
    bin_edges = np.linspace(data['measure_1'].min(), data['measure_1'].max(), num_bins + 1)
    data['range_bin'] = pd.cut(data['measure_1'], bins=bin_edges, labels=False, include_lowest=True)

    # Step 3.2: Calculate bin distributions
    bin_distribution = data.groupby('range_bin')['triplet_nr'].nunique()
    print(f"Distribution of triplets across bins:\n{bin_distribution}")

    # Step 3.3: Determine how many triplets to sample from each bin
    total_triplets = data['triplet_nr'].nunique()
    triplets_per_bin = (bin_distribution / bin_distribution.sum() * 700).astype(int)

    # Ensure the sum matches exactly 700
    while triplets_per_bin.sum() < 700:
        residuals = (bin_distribution / bin_distribution.sum() * 700) - triplets_per_bin
        triplets_per_bin[residuals.idxmax()] += 1

    while triplets_per_bin.sum() > 700:
        residuals = (bin_distribution / bin_distribution.sum() * 700) - triplets_per_bin
        triplets_per_bin[residuals.idxmin()] -= 1


    # Step 3.4: Sample triplets proportionally from each bin
    sampled_triplets = []
    for bin_id, sample_count in triplets_per_bin.items():
        if sample_count > 0:
            triplets_in_bin = data[data['range_bin'] == bin_id]['triplet_nr'].unique()
            sampled_triplet_ids = np.random.choice(triplets_in_bin, size=min(sample_count, len(triplets_in_bin)), replace=False)
            sampled_triplets.append(data[data['triplet_nr'].isin(sampled_triplet_ids)])

    # Combine sampled triplets
    sampled_data = pd.concat(sampled_triplets).reset_index(drop=True)

    return sampled_data

In [20]:
def process_snli_dataset_with_measures(file_path, sample_size=700, num_bins=7):
    """
    Process an SNLI dataset for curriculum learning with nested measures and proportional sampling.

    Args:
        file_path (str): Path to the CSV file containing the dataset.
        sample_size (int): The number of triplets to sample from the dataset. Default is 700.
        num_bins (int): Number of bins for dividing measure_1 values. Default is 7.

    Returns:
        tuple: Two DataFrames - one ordered by increasing average measure_1 and one by decreasing average measure_1.
    """
    # Step 1:
    data = pd.read_csv(file_path)
    print(f"Dataset loaded: {len(data)} rows")

    data['premise'] = data['premise'].fillna("").astype(str)
    data['hypothesis'] = data['hypothesis'].fillna("").astype(str)

    print("step 1 done")

    # Step 2: CHANGE THE MEASURE
    total_rows = len(data)
    sentence_l= list(data.apply(sentence_length, axis=1))
    read = list(data.apply(lambda row: readability(row, total_rows), axis=1))
    # Normalize both measures
    normalized_length = normalize(sentence_l)
    normalized_readability = normalize( read )

    data['measure_1'] = list([l + r for l, r in zip(normalized_length, normalized_readability)])

    print("step 2 done")
    #Step 3

    final_sample = sample_triplets(data, sample_size, num_bins)

    print("step 3 done")
    # Step 4:
    triplet_avg = final_sample.groupby('triplet_nr')['measure_1'].mean().reset_index(name='triplet_avg_measure_1')

    # Merge back to keep triplet-level averages
    final_sample = final_sample.merge(triplet_avg, on='triplet_nr')
    final_sample_increasing = final_sample.sort_values(by='triplet_avg_measure_1').reset_index(drop=True)
    final_sample_decreasing = final_sample.sort_values(by='triplet_avg_measure_1', ascending=False).reset_index(drop=True)

    print("step 4 done")
    # Print bin distributions for the final samples
    increasing_bins = final_sample_increasing.groupby('range_bin')['triplet_nr'].nunique()
    print(f"Distribution of bins in increasing order sample:\n{increasing_bins}")
    print(f"Sampled {sample_size} triplets and returned two ordered DataFrames.")

    return final_sample_increasing, final_sample_decreasing


In [21]:
# Test the function with the basic measure
sample_increasing, sample_decreasing = process_snli_dataset_with_measures(file_path='/content/sampled_snli_10000.csv')

# Display the outputs
print("Sample ordered by increasing triplet average:")
print(sample_increasing.head())

#print("\nSample ordered by decreasing triplet average:")
#print(sample_decreasing.head())
sample_increasing.to_csv('sample_increasing.csv', index=False)
#sample_decreasing.to_csv('sample_decreasing.csv', index=False)

Dataset loaded: 30000 rows
step 1 done
step 2 done
Distribution of triplets across bins:
range_bin
0    8031
1    4275
2     429
3      62
4      10
5       3
6       3
Name: triplet_nr, dtype: int64
step 3 done
step 4 done
Distribution of bins in increasing order sample:
range_bin
0    562
1    385
2     45
3      5
4      1
Name: triplet_nr, dtype: int64
Sampled 700 triplets and returned two ordered DataFrames.
Sample ordered by increasing triplet average:
                           premise                         hypothesis  label  \
0  A girl rows a boat from a dock.          A boy plays on a jet ski.      2   
1  A girl rows a boat from a dock.               A girl in a rowboat.      0   
2  A girl rows a boat from a dock.  A girl rows out to meet a friend.      1   
3              A boy on a scooter.               The boy is on a bike      2   
4              A boy on a scooter.                     A boy outside.      0   

   triplet_nr  measure_1  range_bin  triplet_avg_measure