# Train/Val dataset
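Create 5-fold stratified cross-validation splits of the training set, one independent fold assignment per target (`nova_group` and `nutriscore`).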

In [11]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold

In [13]:
# Load the training dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which prevents mixed-type columns.
# NOTE(review): the saved output of this cell echoes the read_csv line the way
# Python warnings are rendered -- presumably a DtypeWarning; confirm, and
# ideally pin the affected columns with an explicit `dtype=` mapping.
df_train = pd.read_csv("../gen/data_def_train.csv", low_memory=False)

print("Training Dataset size:", len(df_train))
df_train.head()

Training Dataset size: 206068


  df_train = pd.read_csv("../gen/data_def_train.csv")


Unnamed: 0,barcode,product_name,nutriscore,nova_group,mapped_category,labels_string,n_labels_total,labels_1_intrinsic,n_1_intrinsic,labels_2_extrinsic,...,n_3_packaging,brands,countries,n_additives,n_ingredients,main_mapped_category,label_tier,stratify_raw,stratify,split
0,554004509,Pain de mie sans gluten,c,4,3.1 Bread,NO_GLUTEN|NO_PRESERVATIVES,2,NO_GLUTEN|NO_PRESERVATIVES,2,,...,0,Genius,france,2.0,22.0,3. Cereals & Starches,1,3. Cereals & Starches | 4,3. Cereals & Starches | 4,train
1,2000002466,2 MINI BAGUETTES SANS GLUTEN,c,4,3.1 Bread,CROSSED_GRAIN|CROSSED_GRAIN_DZG|EU_ORGANIC_LOG...,5,NO_GLUTEN,1,CROSSED_GRAIN|CROSSED_GRAIN_DZG|EU_ORGANIC_LOG...,...,0,Nature & Cie,france,2.0,15.0,3. Cereals & Starches,2,3. Cereals & Starches | 4,3. Cereals & Starches | 4,train
2,2000003197,Véritable Andouille de Guémené,e,4,5.1 Processed meat,SMOKED_BEECH_WOOD,1,SMOKED_BEECH_WOOD,1,,...,0,Amand Terroir,france,2.0,10.0,"5. Meat, Fish & Eggs",1,"5. Meat, Fish & Eggs | 4","5. Meat, Fish & Eggs | 4",train
3,2000010775,Escalope soja et blé,b,3,5.4 Meat,EVU_VEGETARIAN|VEGETARIAN,2,,0,EVU_VEGETARIAN|VEGETARIAN,...,0,Herta,france,0.0,32.0,"5. Meat, Fish & Eggs",1,"5. Meat, Fish & Eggs | 3","5. Meat, Fish & Eggs | 3",train
4,8112100281,Pure Sesame Oil,c,2,6.1 Fats & oils,NON_GMO,1,,0,NON_GMO,...,0,"Taiwan Sunlife Corp., Foreway",united-states|world,0.0,1.0,6. Fats & Sauces,1,6. Fats & Sauces | 2,6. Fats & Sauces | 2,train


In [20]:
# Stratified 5-fold: one independent fold assignment (1..n_splits) per target.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
targets = ["nova_group", "nutriscore"]

fold_columns = {}
for target in targets:
    # -1 marks "not assigned yet"; every row must be overwritten below.
    folds = pd.Series(-1, index=df_train.index, dtype="int64")

    # cv.split yields *positional* indices, so write them back with .iloc
    # rather than relying on the frame having a default RangeIndex.
    for fold, (_, val_idx) in enumerate(cv.split(df_train, df_train[target]), start=1):
        folds.iloc[val_idx] = fold

    # Each row belongs to exactly one validation fold; fail loudly otherwise.
    assert (folds >= 1).all(), f"rows without a fold assignment for {target}"

    # Store plain values so attaching the column never depends on index alignment.
    fold_columns[f"{target}_fold"] = folds.to_numpy()

# assign() overwrites existing *_fold columns, so re-running this cell is
# idempotent (pd.concat would append a duplicate copy on every re-run).
df_train = df_train.assign(**fold_columns)

In [22]:
# Show, for every target, the class shares inside each validation fold so the
# stratification can be checked by eye (all folds should look alike).
for target in targets:
    print(f"\n=== Stratified 5-Fold distributions for {target} ===")
    fold_col = f"{target}_fold"

    # One column of class shares per fold. Building the table with a single
    # concat outer-joins the class labels, so a class that is missing from one
    # fold is kept (as NaN, then 0) instead of being dropped silently.
    dist_df = pd.concat(
        {
            f"Fold {i}": (
                df_train.loc[df_train[fold_col] == i, target]
                .value_counts(normalize=True)
                .round(3)
            )
            for i in range(1, cv.n_splits + 1)  # fold ids are 1-based
        },
        axis=1,
    ).fillna(0.0)

    # sort order: numeric (1 -> 4) or alphabetic (A -> E)
    if target == "nova_group":
        dist_df = dist_df.sort_index(key=lambda x: x.astype(int))
    else:
        dist_df = dist_df.sort_index(key=lambda x: x.str.upper())

    # add summary column (average share across folds)
    dist_df["Mean"] = dist_df.mean(axis=1).round(3)

    print(dist_df.to_string())


=== Stratified 5-Fold distributions for nova_group ===
            Fold 1  Fold 2  Fold 3  Fold 4  Fold 5   Mean
nova_group                                               
1            0.147   0.147   0.147   0.146   0.146  0.147
2            0.041   0.041   0.041   0.041   0.041  0.041
3            0.257   0.257   0.257   0.257   0.257  0.257
4            0.555   0.555   0.555   0.555   0.555  0.555

=== Stratified 5-Fold distributions for nutriscore ===
            Fold 1  Fold 2  Fold 3  Fold 4  Fold 5   Mean
nutriscore                                               
a            0.159   0.159   0.159   0.159   0.159  0.159
b            0.123   0.123   0.123   0.123   0.123  0.123
c            0.229   0.229   0.229   0.229   0.229  0.229
d            0.236   0.236   0.236   0.236   0.236  0.236
e            0.253   0.253   0.253   0.253   0.253  0.253


In [24]:
# Summary of the training frame: size, leading columns, a preview, and the
# number of rows that landed in each fold for every target.
n_rows = len(df_train)
leading_columns = ", ".join(df_train.columns[:10])

print("=== Dataset Overview ===")
print(f"Total rows in training dataset: {n_rows:,}")
print(f"Columns: {leading_columns} ...")
print("\nFirst 10 rows:")
display(df_train.head(10))

print("\n=== Fold sizes ===")
for target in targets:
    print(f"\nTarget: {target}")
    # Rows per fold id, ordered by fold id.
    rows_per_fold = df_train[f"{target}_fold"].value_counts().sort_index()
    for fold_id, n_in_fold in rows_per_fold.items():
        share_pct = n_in_fold / n_rows * 100
        print(f"  Fold {fold_id}: {n_in_fold:,} rows ({share_pct:.2f} %)")
    print(f"  Mean per fold: {rows_per_fold.mean():,.0f} rows\n")

=== Dataset Overview ===
Total rows in training dataset: 206,068
Columns: barcode, product_name, nutriscore, nova_group, mapped_category, labels_string, n_labels_total, labels_1_intrinsic, n_1_intrinsic, labels_2_extrinsic ...

First 10 rows:


Unnamed: 0,barcode,product_name,nutriscore,nova_group,mapped_category,labels_string,n_labels_total,labels_1_intrinsic,n_1_intrinsic,labels_2_extrinsic,...,countries,n_additives,n_ingredients,main_mapped_category,label_tier,stratify_raw,stratify,split,nova_group_fold,nutriscore_fold
0,554004509,Pain de mie sans gluten,c,4,3.1 Bread,NO_GLUTEN|NO_PRESERVATIVES,2,NO_GLUTEN|NO_PRESERVATIVES,2,,...,france,2.0,22.0,3. Cereals & Starches,1,3. Cereals & Starches | 4,3. Cereals & Starches | 4,train,4,3
1,2000002466,2 MINI BAGUETTES SANS GLUTEN,c,4,3.1 Bread,CROSSED_GRAIN|CROSSED_GRAIN_DZG|EU_ORGANIC_LOG...,5,NO_GLUTEN,1,CROSSED_GRAIN|CROSSED_GRAIN_DZG|EU_ORGANIC_LOG...,...,france,2.0,15.0,3. Cereals & Starches,2,3. Cereals & Starches | 4,3. Cereals & Starches | 4,train,4,3
2,2000003197,Véritable Andouille de Guémené,e,4,5.1 Processed meat,SMOKED_BEECH_WOOD,1,SMOKED_BEECH_WOOD,1,,...,france,2.0,10.0,"5. Meat, Fish & Eggs",1,"5. Meat, Fish & Eggs | 4","5. Meat, Fish & Eggs | 4",train,1,1
3,2000010775,Escalope soja et blé,b,3,5.4 Meat,EVU_VEGETARIAN|VEGETARIAN,2,,0,EVU_VEGETARIAN|VEGETARIAN,...,france,0.0,32.0,"5. Meat, Fish & Eggs",1,"5. Meat, Fish & Eggs | 3","5. Meat, Fish & Eggs | 3",train,1,3
4,8112100281,Pure Sesame Oil,c,2,6.1 Fats & oils,NON_GMO,1,,0,NON_GMO,...,united-states|world,0.0,1.0,6. Fats & Sauces,1,6. Fats & Sauces | 2,6. Fats & Sauces | 2,train,4,4
5,8112100298,Pure Sesame Oil,c,2,6.1 Fats & oils,NON_GMO,1,,0,NON_GMO,...,united-states|world,0.0,1.0,6. Fats & Sauces,1,6. Fats & Sauces | 2,6. Fats & Sauces | 2,train,1,1
6,8274000061,Original Ginger Beer,e,3,1.3 Sweetened beverages,NON_GMO,1,,0,NON_GMO,...,united-states|world,0.0,8.0,1. Beverages,1,1. Beverages | 3,1. Beverages | 3,train,5,4
7,8274000078,Ginger Beer,e,3,1.3 Sweetened beverages,NO_GLUTEN|NO_PRESERVATIVES|NON_GMO,3,NO_GLUTEN|NO_PRESERVATIVES,2,NON_GMO,...,france|united-states|world,0.0,8.0,1. Beverages,2,1. Beverages | 3,1. Beverages | 3,train,2,5
8,8295660770,Organic Balsamic Vinegar of Modena,b,1,6.2 Dressings and sauces,EU_ORGANIC_LOGO|NO_PRESERVATIVES|NON_GMO|USDA_...,4,NO_PRESERVATIVES,1,EU_ORGANIC_LOGO|NON_GMO|USDA_ORGANIC,...,united-states|world,0.0,2.0,6. Fats & Sauces,2,6. Fats & Sauces | 1,6. Fats & Sauces | 1,train,5,1
9,8295661074,Balsamic Vinegar Of Modena Gold Eagle,d,1,6.2 Dressings and sauces,NON_GMO,1,,0,NON_GMO,...,united-states|world,0.0,3.0,6. Fats & Sauces,1,6. Fats & Sauces | 1,6. Fats & Sauces | 1,train,3,4



=== Fold sizes ===

Target: nova_group
  Fold 1: 41,214 rows (20.00 %)
  Fold 2: 41,214 rows (20.00 %)
  Fold 3: 41,214 rows (20.00 %)
  Fold 4: 41,213 rows (20.00 %)
  Fold 5: 41,213 rows (20.00 %)
  Mean per fold: 41,214 rows


Target: nutriscore
  Fold 1: 41,214 rows (20.00 %)
  Fold 2: 41,214 rows (20.00 %)
  Fold 3: 41,214 rows (20.00 %)
  Fold 4: 41,213 rows (20.00 %)
  Fold 5: 41,213 rows (20.00 %)
  Mean per fold: 41,214 rows



In [28]:
# Collect fold size stats for all targets: one summary row per target.
fold_summary = []

for target in targets:
    # Rows per fold id, ordered by fold id.
    fold_sizes = df_train[f"{target}_fold"].value_counts().sort_index()
    # Keep full precision here; values are rounded only when reported, so the
    # coefficient of variation is not computed from already-rounded inputs.
    stats = fold_sizes.describe().to_dict()

    fold_summary.append({
        "target": target,
        "total_rows": len(df_train),
        "mean_fold_size": round(stats["mean"], 2),
        "std_fold_size": round(stats["std"], 2),
        "min_fold_size": round(stats["min"], 2),
        "max_fold_size": round(stats["max"], 2),
        # coefficient of variation of the fold sizes, in percent
        "cv_percent": round(stats["std"] / stats["mean"] * 100, 3),
    })

fold_summary_df = pd.DataFrame(fold_summary)

print("\n=== Fold Size Summary ===")
print(fold_summary_df.to_string(index=False))


=== Fold Size Summary ===
    target  total_rows  mean_fold_size  std_fold_size  min_fold_size  max_fold_size  cv_percent
nova_group      206068         41213.6           0.55        41213.0        41214.0       0.001
nutriscore      206068         41213.6           0.55        41213.0        41214.0       0.001


In [30]:
# Write the training frame to CSV without the row index.
df_train.to_csv(
    path_or_buf="../gen/data_def_train_folds.csv",
    index=False,
)