# Data Augmentation avec Pipeline et GAN PyTorch Custom

In [1]:
import sys, os  
root = os.path.abspath(os.path.join('..'))  
if root not in sys.path:  
    sys.path.insert(0, root)

In [2]:
!pip install -r ../requirements.txt



## 2 Imports et configuration

In [3]:
import numpy as np
import pandas as pd
import torch
from scipy.stats import ks_2samp, chi2_contingency
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from nutrition_recommender.pipeline import Pipeline
from nutrition_recommender.tabular_gan import TabularGANAugmentor

In [4]:
CSV       = '../resources/Dataset_INCA2/Nomenclature_3.csv'
KG_TTL    = '../resources/kg/food_kg.ttl'
SHAPES    = '../resources/kg/shapes.ttl'

OUTPUT_DIR = '../data_augmented'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Colonnes à traiter
numeric_cols     = ['codgr','sougr','codal']
categorical_cols = ['libgr','libsougr','libal']
NUM_AUG = 1000


## 3 Exécuter le Pipeline (prétraitement + GAN + SHACL)



In [6]:
# %%
pipeline = Pipeline(CSV, KG_TTL, SHAPES)
real_df, synth_df = pipeline.run(numeric_cols, categorical_cols, NUM_AUG)


Epoch 10/100 | Loss_D: 0.1431 | Loss_G: 2.1013 | Checkpoint saved
Epoch 20/100 | Loss_D: 0.0991 | Loss_G: 2.8015 | Checkpoint saved
Epoch 30/100 | Loss_D: 0.2013 | Loss_G: 3.3110 | Checkpoint saved
Epoch 40/100 | Loss_D: 0.6542 | Loss_G: 1.1783 | Checkpoint saved
Epoch 50/100 | Loss_D: 0.4125 | Loss_G: 3.4984 | Checkpoint saved
Epoch 60/100 | Loss_D: 0.2521 | Loss_G: 3.6968 | Checkpoint saved
Epoch 70/100 | Loss_D: 0.8221 | Loss_G: 1.5243 | Checkpoint saved
Epoch 80/100 | Loss_D: 0.5306 | Loss_G: 1.5864 | Checkpoint saved
Epoch 90/100 | Loss_D: 0.8160 | Loss_G: 2.1585 | Checkpoint saved
Epoch 100/100 | Loss_D: 1.3152 | Loss_G: 1.0040 | Checkpoint saved
SHACL validation passed


### Décodage des données synthétiques (retour en texte d’origine)Décodage des synthétiques

In [7]:
synth_df.columns = ['codgr_enc','libgr_enc','sougr_enc','libsougr_enc','codal_enc','libal_enc']


In [8]:
arr_num = pipeline.prep.scaler.inverse_transform(synth_df[['codgr_enc','sougr_enc','codal_enc']].values)
df_num  = pd.DataFrame(np.rint(arr_num).astype(int), columns=numeric_cols)


In [9]:
df_num = pd.DataFrame(np.rint(arr_num).astype(int), columns=numeric_cols)


In [10]:
# b) Inverse-encoder catégoriel
df_cat = pd.DataFrame()
for enc_col, col in zip(['libgr_enc','libsougr_enc','libal_enc'], categorical_cols):
    le = pipeline.prep.encoders[col]
    codes = np.rint(synth_df[enc_col]).astype(int)
    codes = np.clip(codes, 0, len(le.classes_)-1)
    df_cat[col] = le.inverse_transform(codes)

## 5 Assemblage

In [11]:

df_synth_decoded = pd.concat([df_num, df_cat], axis=1)[
    ['codgr','libgr','sougr','libsougr','codal','libal']
]
print("Synthétique décodé :", df_synth_decoded.shape)
df_synth_decoded.head()


Synthétique décodé : (1000, 6)


Unnamed: 0,codgr,libgr,sougr,libsougr,codal,libal
0,-14,eaux,43,haricots verts et petits pois (légumes potagers),105408,lardon nature cuit
1,-18,glaces et desserts glacés,16,jambons et charcuteries en pièces,132064,limonade ou clear lime sucrée type seven'up ou...
2,-6,volaille et gibier,-226,lait en bouteille ou en brique,232048,lait fermenté au bifidus nature au lait entier...
3,-12,oeufs et dérivés,-50,lait aromatisé,137218,margarine au tournesol en barquette 80% m.g. t...
4,-7,glaces et desserts glacés,4,haricots verts et petits pois (légumes potagers),140874,lait demi-écrémé pasteurisé


## 4 Aperçu des résultats


In [12]:
print("→ Données prétraitées (réel) :", real_df.shape)
print(real_df.head(), end='\n\n')

print("→ Données augmentées (synthétique) :", synth_df.shape)
print(synth_df.head())


→ Données prétraitées (réel) : (1343, 6)
      codgr  libgr     sougr  libsougr     codal  libal
0 -1.863677     29 -0.455396        78 -0.902609    869
1 -1.863677     29 -0.455396        78 -0.902483    880
2 -1.863677     29 -0.455396        78 -0.902147    873
3 -1.863677     29 -0.455396        78 -0.898448    875
4 -1.863677     29 -0.455396        78 -0.898027    872

→ Données augmentées (synthétique) : (1000, 6)
   codgr_enc  libgr_enc  sougr_enc  libsougr_enc  codal_enc   libal_enc
0  -3.100465  17.478338   0.753921     59.738865   3.233730  705.994873
1  -3.492562  22.276657  -0.037905     61.900635   4.354136  717.485352
2  -2.423512  45.784931  -6.976610     65.585815   8.556771  688.325989
3  -2.994895  28.376635  -1.914335     63.862293   4.570794  750.691467
4  -2.519884  22.359928  -0.358653     59.580616   4.724477  678.744385


## 5 Sauvegarde des jeux de données


In [None]:
real_path  = os.path.join(OUTPUT_DIR, 'INCA2_real_preprocessed.csv')
synth_path = os.path.join(OUTPUT_DIR, f'INCA2_synthetic_{NUM_AUG}.csv')

real_df .to_csv(real_path,  index=False, sep=';')
synth_df.to_csv(synth_path, index=False, sep=';')

print(f"Données réelles sauvegardées : {real_path}")
print(f"Données synthétiques sauvegardées : {synth_path}")


Données réelles sauvegardées : ../data_augmented\INCA2_real_preprocessed.csv
Données synthétiques sauvegardées : ../data_augmented\INCA2_synthetic_1000.csv


In [14]:
decoded_path = os.path.join(OUTPUT_DIR, 'INCA2_synthetic_decoded2.csv')
df_synth_decoded.to_csv(decoded_path, index=False, sep=';')
print("Décodé enregistré :", decoded_path)

Décodé enregistré : ../data_augmented\INCA2_synthetic_decoded2.csv


## 6 Évaluation statistique du GAN

### 6.1 KS-test pour chaque colonne numérique

In [15]:
print("=== KS-Test (numériques) ===")
for c in numeric_cols:
    stat, p = ks_2samp(real_df[c], df_synth_decoded[c])
    print(f"{c}: KS={stat:.3f}, p-val={p:.3f}")


=== KS-Test (numériques) ===
codgr: KS=0.890, p-val=0.000
sougr: KS=0.532, p-val=0.000
codal: KS=0.966, p-val=0.000


### 6.2 χ²-test pour chaque colonne catégorielle

In [16]:

print("\n=== χ²-Test (catégorielles) ===")
for c in categorical_cols:
    real_counts  = real_df[c].value_counts()
    synth_counts = df_synth_decoded[c].value_counts()
    table = pd.concat([real_counts, synth_counts], axis=1).fillna(0).astype(int)
    chi2, p, _, _ = chi2_contingency(table)
    print(f"{c}: χ²={chi2:.1f}, p-val={p:.3f}")


=== χ²-Test (catégorielles) ===
libgr: χ²=2343.0, p-val=0.000
libsougr: χ²=2343.0, p-val=0.000
libal: χ²=2343.0, p-val=0.000


### 6.3 Différence de corrélation

In [17]:

corr_real  = real_df[numeric_cols].corr().values
corr_synth = df_synth_decoded[numeric_cols].corr().values
corr_diff  = np.linalg.norm(corr_real - corr_synth)
print(f"\n||Corr_real – Corr_synth||_F = {corr_diff:.3f}")


||Corr_real – Corr_synth||_F = 2.136


## 8 Checkpoints du GAN

In [19]:
print("Checkpoints disponibles dans :", pipeline.gan.checkpoint_dir)
print(os.listdir(pipeline.gan.checkpoint_dir))

Checkpoints disponibles dans : checkpoints
['discriminator_epoch10.pth', 'discriminator_epoch100.pth', 'discriminator_epoch20.pth', 'discriminator_epoch30.pth', 'discriminator_epoch40.pth', 'discriminator_epoch50.pth', 'discriminator_epoch60.pth', 'discriminator_epoch70.pth', 'discriminator_epoch80.pth', 'discriminator_epoch90.pth', 'generator_epoch10.pth', 'generator_epoch100.pth', 'generator_epoch20.pth', 'generator_epoch30.pth', 'generator_epoch40.pth', 'generator_epoch50.pth', 'generator_epoch60.pth', 'generator_epoch70.pth', 'generator_epoch80.pth', 'generator_epoch90.pth']
