**Creating Train, Test, and Validation Sets Using Stratified Sampling**

In [None]:
from transformers import M2M100Tokenizer
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# Location of the parallel-corpus CSV file (replace with your file path)
file_path = "/content/34.871k.csv"

# Pretrained M2M100 tokenizer; used below to measure sentence length in tokens
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B")

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Replace missing sentences in both language columns with empty strings so
# that every cell holds text before it is tokenized
text_columns = ['HINDI', 'MUNDARI']
df[text_columns] = df[text_columns].fillna("")

# Function to count the number of tokens in a sentence
def count_tokens(sentence):
    """Return how many tokens the module-level tokenizer yields for *sentence*.

    The input is converted with ``str()`` first, so non-string values are
    tokenized via their string representation.
    """
    return len(tokenizer.tokenize(str(sentence)))

# Split configuration, named once so the clustering and the split loop cannot
# drift apart. The test fraction is applied to what is left of a cluster
# AFTER its validation rows are removed, i.e. roughly 4.5% of the cluster.
N_CLUSTERS = 15
VALIDATION_FRAC = 0.10
TEST_FRAC_OF_REMAINDER = 0.05
SPLIT_SEED = 42

# Token counts per sentence for both languages
df['Token_Count_Hindi'] = df['HINDI'].apply(count_tokens)
df['Token_Count_Mundari'] = df['MUNDARI'].apply(count_tokens)

# Row-wise mean of the two token counts; this is the stratification feature
df['Average_Token_Count'] = df[['Token_Count_Hindi', 'Token_Count_Mundari']].mean(axis=1)

# Sort by average token count and renumber the index 0..n-1
df_sorted = df.sort_values(by='Average_Token_Count').reset_index(drop=True)

# Cluster the rows on the single length feature; each cluster is one stratum
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=SPLIT_SEED)
df_sorted['Cluster'] = kmeans.fit_predict(df_sorted[['Average_Token_Count']])

# Collect the per-cluster pieces in lists and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop recopies the data on
# every iteration, and seeding it with an empty pd.DataFrame() relies on
# concat-with-empty-frame dtype handling that pandas has deprecated.
validation_parts = []
test_parts = []
train_parts = []

# groupby visits the cluster labels that actually occur, in ascending order,
# so the loop no longer needs its own copy of the cluster count.
for cluster, cluster_data in df_sorted.groupby('Cluster'):
    # VALIDATION_FRAC of the cluster -> validation
    validation_samples = cluster_data.sample(frac=VALIDATION_FRAC, random_state=SPLIT_SEED)
    remaining_after_val = cluster_data.drop(validation_samples.index)

    # TEST_FRAC_OF_REMAINDER of what is left -> test; everything else -> train
    test_samples = remaining_after_val.sample(frac=TEST_FRAC_OF_REMAINDER, random_state=SPLIT_SEED)
    train_samples = remaining_after_val.drop(test_samples.index)

    validation_parts.append(validation_samples)
    test_parts.append(test_samples)
    train_parts.append(train_samples)

validation_data = pd.concat(validation_parts)
test_data = pd.concat(test_parts)
train_data = pd.concat(train_parts)

# Drop helper columns before saving
cols_to_drop = ['Token_Count_Hindi', 'Token_Count_Mundari', 'Average_Token_Count', 'Cluster']
validation_data = validation_data.drop(columns=cols_to_drop)
test_data = test_data.drop(columns=cols_to_drop)
train_data = train_data.drop(columns=cols_to_drop)

# Save the splits to Excel files (written to the current working directory)
test_data.to_excel("test_data.xlsx", index=False)
train_data.to_excel("train_data.xlsx", index=False)
validation_data.to_excel("validation_data.xlsx", index=False)
