# GAN Data Augmentation 

In [12]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
import os


## 1) CHARGEMENT & PRETRAITEMENT DU DATASET INCA2

In [3]:
csv_file = "../resources/Dataset_INCA2/Nomenclature_3.csv"

df = pd.read_csv(csv_file, sep=';', encoding="windows-1252")

print("Aperçu du dataset initial :")
print(df.head())

Aperçu du dataset initial :
   codgr                       libgr  sougr libsougr  codal  \
0      1  pain et panification sèche      1     pain   7001   
1      1  pain et panification sèche      1     pain   7004   
2      1  pain et panification sèche      1     pain   7012   
3      1  pain et panification sèche      1     pain   7100   
4      1  pain et panification sèche      1     pain   7110   

                                     libal  
0                            pain baguette  
1                       pain grillé maison  
2  pain courant français boule à la levure  
3                  pain de campagne ou bis  
4       pain complet ou intégral artisanal  


In [4]:
numeric_cols = ['codgr', 'sougr', 'codal']  
cat_cols = ['libgr', 'libsougr', 'libal']  


### Encodage des colonnes catégorielles


In [5]:
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le

### Rassemblement des données encodées dans un tableau NumPy

In [6]:
all_cols = numeric_cols + cat_cols
data_array = df[all_cols].values.astype(np.float32)

print("\nAperçu du dataset encodé :")
print(data_array[:5])
print("Shape =", data_array.shape)


Aperçu du dataset encodé :
[[1.000e+00 1.000e+00 7.001e+03 2.900e+01 7.800e+01 8.690e+02]
 [1.000e+00 1.000e+00 7.004e+03 2.900e+01 7.800e+01 8.800e+02]
 [1.000e+00 1.000e+00 7.012e+03 2.900e+01 7.800e+01 8.730e+02]
 [1.000e+00 1.000e+00 7.100e+03 2.900e+01 7.800e+01 8.750e+02]
 [1.000e+00 1.000e+00 7.110e+03 2.900e+01 7.800e+01 8.720e+02]]
Shape = (1343, 6)



# Colonnes : 
# - `nomen`  : c'est Nomenclature pour chaque profil
# - `nojour` : jour d’enregistrement (1–7)  
# - `numlig` : numéro de ligne dans le carnet (int)  
# - `codal`  : code de l’aliment (int)  


In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from nutrition_recommender.tabular_gan import TabularGANAugmentor


In [14]:
import importlib
import nutrition_recommender.tabular_gan as tg
importlib.reload(tg)
from nutrition_recommender.tabular_gan import TabularGANAugmentor


In [15]:

CSV = '../resources/Dataset_INCA2/Table_conso.csv'  
OUTPUT_DIR = '../data_augmented'
os.makedirs(OUTPUT_DIR, exist_ok=True)

cols = ['nomen','nojour','numlig','codal']
NUM_SYNTH = 50000  


In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


cols = ['codgr','sougr','numlig','nojour','codal']
df = pd.read_csv(CSV, sep=';', encoding='windows-1252', usecols=cols)


print("NaN par colonne avant nettoyage :")
print(df.isnull().sum())


df = df.dropna(subset=cols).reset_index(drop=True)


df = df.astype(float)


scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df),
    columns=cols
)


df_scaled = df_scaled.clip(-3.0, 3.0)


print("NaN après nettoyage :", df_scaled.isnull().sum().sum())
print("Inf après nettoyage :", np.isinf(df_scaled.values).sum())
assert not df_scaled.isnull().any().any(), "Il y a encore des NaN"
assert not np.isinf(df_scaled.values).any(),       "Il y a encore des Inf"

df_scaled.head()


NaN par colonne avant nettoyage :
numlig    1
nojour    0
codgr     0
sougr     0
codal     0
dtype: int64
NaN après nettoyage : 0
Inf après nettoyage : 0


Unnamed: 0,codgr,sougr,numlig,nojour,codal
0,-1.74623,1.060549,0.953376,1.809129,-0.338286
1,-1.746223,1.060549,0.623221,-0.578367,0.224435
2,-1.746217,1.060549,-1.770401,-0.578367,-0.823728
3,-1.746211,1.060549,-0.614859,1.809129,-0.404419
4,-1.746205,1.060549,-0.119627,-0.578367,0.005137


In [17]:

LATENT_DIM          = 16
HIDDEN_DIM          = 64
EPOCHS              = 100
BATCH_SIZE          = 256
CHECKPOINT_DIR      = os.path.join(OUTPUT_DIR, 'checkpoints')
CHECKPOINT_INTERVAL = 20

os.makedirs(CHECKPOINT_DIR, exist_ok=True)


In [None]:
gan = TabularGANAugmentor(
    latent_dim=16,
    hidden_dim=64,
    epochs=100,
    batch_size=256,
    checkpoint_dir=os.path.join(OUTPUT_DIR,'checkpoints'),
    checkpoint_interval=1   
)
gan.fit(df_scaled)


In [None]:
 
synth_scaled = gan.sample(NUM_SYNTH).values  
print("Synth_scaled shape:", synth_scaled.shape)


🔹 Synth_scaled shape: (50000, 5)


In [61]:

synth_inv = scaler.inverse_transform(synth_scaled)  

synth_int = np.rint(synth_inv).astype(int)



synth_int[:, cols.index('nojour')] = np.clip(
    synth_int[:, cols.index('nojour')], 1, 7
)


df_synth = pd.DataFrame(synth_int, columns=cols)  
print(" Synth_int shape:", df_synth.shape)
df_synth.head()


 Synth_int shape: (50000, 5)


Unnamed: 0,codgr,sougr,numlig,nojour,codal
0,402106,4,29,3,17506
1,347866,1,36,1,17772
2,235581,6,2,1,7527
3,380118,6,24,1,91710
4,546510,1,12,2,11943


In [None]:
from pathlib import Path


OUTPUT_DIR = Path('../data_augmented')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


out_path = OUTPUT_DIR / 'INCA2_synth_5cols.csv'
out_csv  = out_path.as_posix()   


df_synth.to_csv(out_csv, sep=';', index=False, encoding='utf-8')

print("→ Données synthétiques sauvegardées dans", out_csv)


→ Données synthétiques sauvegardées dans ../data_augmented/INCA2_synth_5cols.csv


In [None]:

df_synth = gan.sample(len(df_scaled))

df_synth.columns = df_scaled.columns


In [46]:
from scipy.stats import ks_2samp

print("=== KS-Test entre réel et synthétique (sur les données normalisées) ===")
for c in df_scaled.columns:
    stat, pval = ks_2samp(df_scaled[c], df_synth[c])
    print(f"Col '{c}': KS-stat={stat:.3f}, p-val={pval:.3e}")


=== KS-Test entre réel et synthétique (sur les données normalisées) ===
Col 'codgr': KS-stat=0.031, p-val=5.266e-231
Col 'sougr': KS-stat=0.130, p-val=0.000e+00
Col 'numlig': KS-stat=0.115, p-val=0.000e+00
Col 'nojour': KS-stat=0.350, p-val=0.000e+00
Col 'codal': KS-stat=0.132, p-val=0.000e+00


In [None]:

import os
import pandas as pd
import numpy as np
from ctgan import CTGAN
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from scipy.stats import ks_2samp, chi2_contingency
from pathlib import Path



In [22]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.stats import ks_2samp, chi2_contingency
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.optim as optim


CSV = "resources/Dataset_INCA2/Table_conso.csv"   
df = pd.read_csv(CSV, sep=';', usecols=['nomen','codal','nojour'], encoding='latin1')
print("✅ Chargées :", df.shape)


le = LabelEncoder()
df['nomen_enc'] = le.fit_transform(df['nomen'].astype(str))


num_cols = ['nomen_enc','codal','nojour']
df_num  = df[num_cols].astype(float)


scaler      = StandardScaler()
df_scaled   = pd.DataFrame(scaler.fit_transform(df_num), columns=num_cols)

✅ Chargées : (541526, 3)


In [23]:

gan = TabularGANAugmentor(epochs=100, batch_size=256, checkpoint_interval=20)
gan.fit(df_scaled)

Epoch 20/100 | Loss_D: 0.6627 | Loss_G: 0.7393
Epoch 40/100 | Loss_D: 0.6143 | Loss_G: 0.8359
Epoch 60/100 | Loss_D: 0.5923 | Loss_G: 0.8905
Epoch 80/100 | Loss_D: 0.5820 | Loss_G: 0.9151
Epoch 100/100 | Loss_D: 0.5783 | Loss_G: 0.9194


In [24]:
N_SYN = len(df_scaled)
syn_scaled = gan.sample(N_SYN)

In [26]:
synth_inv = scaler.inverse_transform(syn_scaled)
synth_int = np.rint(synth_inv).astype(int)

i_nom = num_cols.index('nomen_enc')

n_cls = len(le.classes_)
synth_int[:, i_nom] = np.clip(synth_int[:, i_nom], 0, n_cls-1)

i_noj = num_cols.index('nojour')
synth_int[:, i_noj] = np.clip(synth_int[:, i_noj], 1, 7)

df_syn = pd.DataFrame(synth_int, columns=num_cols)
df_syn['nomen']  = le.inverse_transform(df_syn['nomen_enc'])
df_syn['codal']  = df_syn['codal'].astype(int)
df_syn['nojour'] = df_syn['nojour'].astype(int)
df_syn = df_syn[['nomen','codal','nojour']]


In [27]:

OUT = "../data_augmented/INCA2_synth_3cols2.csv"
os.makedirs(os.path.dirname(OUT), exist_ok=True)
df_syn.to_csv(OUT, index=False, sep=';')
print("→ Synthétiques enregistrées dans", OUT)

→ Synthétiques enregistrées dans ../data_augmented/INCA2_synth_3cols2.csv


In [28]:

print("\n=== KS-tests ===")
for c in ['codal','nojour']:
    stat,p = ks_2samp(df[c], df_syn[c])
    print(f"{c:6s} : KS={stat:.3f}, p-val={p:.3e}")


ct = pd.crosstab(df['nomen'], df_syn['nomen'])
χ2, p, *_ = chi2_contingency(ct, correction=False)
print(f"\nnomen : χ²={χ2:.1f}, p-val={p:.3e}")

X = pd.concat([df[['codal','nojour']], df_syn[['codal','nojour']]], ignore_index=True)
y = np.array([0]*len(df) + [1]*len(df_syn))
clf = LogisticRegression(max_iter=500)
score = cross_val_score(clf, X, y, cv=5, scoring='roc_auc')
print(f"\nROC AUC = {score.mean():.3f} ± {score.std():.3f}")


r_real  = df[['codal','nojour']].corr().values
r_syn   = df_syn[['codal','nojour']].corr().values
fro = np.linalg.norm(r_real - r_syn)
print(f"\n||Corr_real – Corr_syn||_F = {fro:.3f}")


=== KS-tests ===
codal  : KS=0.175, p-val=0.000e+00
nojour : KS=0.110, p-val=0.000e+00

nomen : χ²=16313456.4, p-val=6.768e-01

ROC AUC = 0.535 ± 0.003

||Corr_real – Corr_syn||_F = 0.036


## Evaluation : 

In [40]:
# %% 1) Imports
import os
import pandas as pd

# %% 2) Chemins
REAL_PATH  = "resources/Dataset_INCA2/Table_conso.csv"
SYN_PATH   = "../data_augmented/INCA2_synth_3cols2.csv"

# %% 3) Chargement
real = pd.read_csv(
    REAL_PATH,
    sep=';',
    usecols=['nomen','codal','nojour'],
    encoding='windows-1252',
    low_memory=False
)
syn  = pd.read_csv(
    SYN_PATH,
    sep=';',
    usecols=['nomen','codal','nojour'],
    encoding='windows-1252',
    low_memory=False
)

# %% 4) GroupBy pour compter les occurrences
grp_real = (
    real
    .groupby(['nomen','codal','nojour'])
    .size()
    .reset_index(name='count_real')
)
grp_syn = (
    syn
    .groupby(['nomen','codal','nojour'])
    .size()
    .reset_index(name='count_synth')
)

# %% 5) Merge et remplissage des zéros
eval_df = pd.merge(
    grp_real, grp_syn,
    on=['nomen','codal','nojour'],
    how='outer'
).fillna(0)

# %% 6) Diagnostic au niveau de chaque nourriture (nomen)
#     Moyennes des counts par nomen
summary = (
    eval_df
    .groupby('nomen')
    .agg(
        mean_real  = ('count_real' , 'mean'),
        mean_synth = ('count_synth', 'mean'),
        std_real   = ('count_real' , 'std'),
        std_synth  = ('count_synth', 'std'),
    )
    .reset_index()
)
# Ajouter ratio et écart absolu
summary['ratio']    = summary['mean_synth'] / summary['mean_real'].replace(0, pd.NA)
summary['diff_mean']= summary['mean_synth'] - summary['mean_real']

# %% 7) Affichage des premiers résultats
pd.set_option('display.float_format', '{:.2f}'.format)
print("=== Comparatif des moyennes de fréquences par 'nomen' ===")
print(summary.head(10))

# %% 8) Quelques indicateurs globaux
overall = {
    'overall_mean_real' : summary['mean_real'].mean(),
    'overall_mean_synth': summary['mean_synth'].mean(),
    'overall_ratio'     : summary['mean_synth'].sum() / summary['mean_real'].sum()
}
print("\n=== Indicateurs globaux ===")
for k,v in overall.items():
    print(f"{k:20s}: {v:.2f}")


=== Comparatif des moyennes de fréquences par 'nomen' ===
    nomen  mean_real  mean_synth  std_real  std_synth  ratio  diff_mean
0  110006       1.35        0.00      0.68       0.00   0.00      -1.35
1  110007       1.27        0.00      0.81       0.00   0.00      -1.27
2  110020       1.08        0.00      0.30       0.00   0.00      -1.08
3  110021       1.38        0.00      1.03       0.00   0.00      -1.38
4  110025       1.26        0.00      0.50       0.00   0.00      -1.26
5  110034       1.20        0.00      0.58       0.00   0.00      -1.20
6  110046       1.17        0.00      0.44       0.00   0.00      -1.17
7  110057       1.44        0.00      0.97       0.00   0.00      -1.44
8  110067       1.18        0.00      0.56       0.00   0.00      -1.18
9  110071       1.62        0.00      1.25       0.00   0.00      -1.62

=== Indicateurs globaux ===
overall_mean_real   : 0.56
overall_mean_synth  : 0.54
overall_ratio       : 0.95


In [None]:
import os
import pandas as pd
import openai

## Il faut utiliser API OpenAI pour recevoir les recommandations de Gpt-3.5
## Il faut un compte OpenAI et une clé API
#openai.api_key = ""

IN_SYNTH = "../data_augmented/INCA2_synth_3cols2.csv"
df_synth = pd.read_csv(IN_SYNTH, sep=';')
print("Synthétique :", df_synth.shape)
df_synth.head(5)


def classify_profile(menu_items: list[str]) -> str:
    """
    Renvoie un profil nutritionnel pour un menu donné.
    Choix : végétarien, obèse, diabétique, sain.
    """
    messages = [
        {"role": "system", "content": "Vous êtes un expert en nutrition."},
        {"role": "user",   "content":
            f"Pour ce menu quotidien : {menu_items}\n"
            "Quel est le profil le plus adapté parmi : végétarien, obèse, diabétique, sain ? "
            "Répondez par un seul mot."}
    ]
    resp = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0
    )
    return resp.choices[0].message.content.strip()

def recommend(profile: str, menu_items: list[str]) -> str:
    """
    Propose une recommandation adaptée au profil et au menu.
    """
    messages = [
        {"role": "system", "content": "Vous êtes nutritionniste et diététicien."},
        {"role": "user",   "content":
            f"Profil patient : {profile}.\n"
            f"Menu consommé : {menu_items}\n"
            "Donnez une phrase de recommandation nutritionnelle adaptée."}
    ]
    resp = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0.7,
        max_tokens=80
    )
    return resp.choices[0].message.content.strip()

print("\n--- Tests rapides ---")
for idx, row in df_synth.head(5).iterrows():
    menu   = [str(row['codal'])]
    profil = classify_profile(menu)
    reco   = recommend(profil, menu)
    print(f"\nÉchantillon #{idx}")
    print(" Menu codal :", menu)
    print(" Profil     :", profil)
    print(" Reco       :", reco)


df_eval = []
for (codal, nojour), grp in df_synth.groupby(['codal','nojour']):
    menu = [str(grp.iloc[0]['codal'])]
    prof = classify_profile(menu)
    df_eval.append({'codal': codal, 'nojour': nojour, 'profil': prof})
df_eval = pd.DataFrame(df_eval)
print("\nRépartition des profils détectés :")
print(df_eval['profil'].value_counts())


OUT = "../data_augmented/INCA2_synth_3cols2_with_profiles.csv"
df_out = df_synth.merge(df_eval, on=['codal','nojour'], how='left')
os.makedirs(os.path.dirname(OUT), exist_ok=True)
df_out.to_csv(OUT, index=False, sep=';')
print("\n→ Données enrichies enregistrées dans", OUT)


Synthétique : (541526, 3)

--- Tests rapides ---

Échantillon #0
 Menu codal : ['18297']
 Profil     : Sain
 Reco       : Il est important de maintenir une alimentation équilibrée pour rester en bonne santé. Assurez-vous d'inclure une variété d'aliments dans vos repas, en privilégiant les fruits, les légumes, les protéines maigres et les céréales complètes. Pensez également à rester bien hydraté en

Échantillon #1
 Menu codal : ['17155']
 Profil     : Sain
 Reco       : Il est important de maintenir une alimentation équilibrée pour rester en bonne santé. Assurez-vous de consommer une variété d'aliments issus des différents groupes alimentaires pour obtenir tous les nutriments essentiels dont votre corps a besoin. N'oubliez pas de rester hydraté en buvant suffisamment d'eau

Échantillon #2
 Menu codal : ['29395']
 Profil     : Sain
 Reco       : Il est important de maintenir une alimentation équilibrée et variée pour rester en bonne santé. Pensez à inclure une diversité d'aliments tels 

KeyboardInterrupt: 