In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


# Load the full dataset from a tab-separated file.
df = pd.read_csv("mmlu_pro_stem_w_filtered_numerical_maj_w_entropyphi4.tsv", sep="\t")


# Hold out 10% of the rows as the test set (fixed seed keeps the split reproducible).
train_valid_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
# Keep the path in one place so the log message always names the file actually written
# (the message previously reported 'test.tsv', which is not the file created here).
test_path = "test_combined_masj.tsv"
test_df.to_csv(test_path, sep="\t", index=False)
print(f"Тестовый датасет: {len(test_df)} примеров сохранён в '{test_path}'.")

# Order the remaining rows from highest to lowest complexity score.
train_valid_df = train_valid_df.sort_values(by="masj_num_complexity", ascending=False)
N = len(train_valid_df)
print(f"Всего обучающих+валидационных примеров: {N}")

def split_and_save_data(
    df,
    complexity_col='masj_num_complexity',
    thresholds=((None, 0.4, 'easy'), (0.4, 0.6, 'middle'), (0.6, None, 'hard')),
    test_size=0.1,
    random_state=42,
    output_prefix=""
    ):
    """
    Split rows into complexity buckets and save each bucket as train/valid TSVs.

    For every ``(lower, upper, suffix)`` entry in ``thresholds`` the rows whose
    ``complexity_col`` value lies in the half-open interval ``[lower, upper)``
    are selected, split with ``train_test_split`` and written to
    ``{output_prefix}train_df_{suffix}.tsv`` and
    ``{output_prefix}valid_df_{suffix}.tsv`` (tab-separated, without the index).

    Rows that match no interval (including rows whose complexity value is
    missing, since comparisons with NaN are false) are not written anywhere.

    Args:
        df: DataFrame containing ``complexity_col``.
        complexity_col: Name of the numeric column used for bucketing.
        thresholds: Iterable of ``(lower, upper, suffix)`` triples. ``lower``
            set to ``None`` means no lower bound; ``upper`` set to ``None``
            means no upper bound.
        test_size: Passed to ``train_test_split`` as the validation share.
        random_state: Seed passed to ``train_test_split``.
        output_prefix: String prepended to every output file name.

    Raises:
        KeyError: If ``complexity_col`` is not a column of ``df``.
        ValueError: If a bucket cannot be split (for example, it is empty);
            the message names the offending bucket.
    """
    # Fail before touching the disk if the bucketing column is absent.
    if complexity_col not in df.columns:
        raise KeyError(f"Column '{complexity_col}' not found in the DataFrame")

    scores = df[complexity_col]

    # Compute every split first so that a failure in a later bucket does not
    # leave files from earlier buckets half-written on disk.
    pending = []
    for lower, upper, suffix in thresholds:
        if lower is None:
            bucket = df[scores < upper]
        elif upper is None:
            bucket = df[scores >= lower]
        else:
            bucket = df[(scores >= lower) & (scores < upper)]

        bucket = bucket.reset_index(drop=True)

        try:
            train, valid = train_test_split(
                bucket, test_size=test_size, random_state=random_state
            )
        except ValueError as err:
            # Re-raise with the bucket name; the underlying message alone
            # does not say which interval was too small to split.
            raise ValueError(
                f"Cannot split bucket '{suffix}' ({len(bucket)} rows) "
                f"with test_size={test_size}: {err}"
            ) from err

        pending.append((suffix, train, valid))

    for suffix, train, valid in pending:
        train.to_csv(f"{output_prefix}train_df_{suffix}.tsv", sep='\t', index=False)
        valid.to_csv(f"{output_prefix}valid_df_{suffix}.tsv", sep='\t', index=False)

# Write per-bucket train/validation TSVs for the rows not held out for testing,
# using the function's default thresholds and no file-name prefix.
split_and_save_data(df=train_valid_df, complexity_col="masj_num_complexity")