In [5]:
import pandas as pd

In [6]:
breast = pd.read_csv(r"C:\Projet_filrouge\oncobio_decision_analytics\Data\Interim\SEER Breast Cancer Dataset .csv")
lung = pd.read_csv(r"C:\Projet_filrouge\oncobio_decision_analytics\Data\Interim\survey lung cancer.csv")

breast.info()
lung.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     4024 non-null   int64  
 1   Race                    4024 non-null   object 
 2   Marital Status          4024 non-null   object 
 3   Unnamed: 3              0 non-null      float64
 4   T Stage                 4024 non-null   object 
 5   N Stage                 4024 non-null   object 
 6   6th Stage               4024 non-null   object 
 7   Grade                   4024 non-null   object 
 8   A Stage                 4024 non-null   object 
 9   Tumor Size              4024 non-null   int64  
 10  Estrogen Status         4024 non-null   object 
 11  Progesterone Status     4024 non-null   object 
 12  Regional Node Examined  4024 non-null   int64  
 13  Reginol Node Positive   4024 non-null   int64  
 14  Survival Months         4024 non-null   

In [7]:
breast.columns

Index(['Age', 'Race ', 'Marital Status', 'Unnamed: 3', 'T Stage ', 'N Stage',
       '6th Stage', 'Grade', 'A Stage', 'Tumor Size', 'Estrogen Status',
       'Progesterone Status', 'Regional Node Examined',
       'Reginol Node Positive', 'Survival Months', 'Status'],
      dtype='object')

In [8]:
lung.columns

Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN', 'LUNG_CANCER'],
      dtype='object')

In [9]:
lung = lung.rename(columns={
    "AGE": "Age",
    "GENDER": "Gender"
})

lung["cancer_type"] = "Lung"


In [10]:
import numpy as np

# Score symptomes
symptoms_cols = [
    "WHEEZING", "COUGHING", "SHORTNESS OF BREATH",
    "CHEST PAIN", "SWALLOWING DIFFICULTY"
]

lung["symptom_score"] = lung[symptoms_cols].sum(axis=1)

# Définition stade
conditions = [
    lung["symptom_score"] <= 1,
    lung["symptom_score"] == 2,
    lung["symptom_score"] == 3,
    lung["symptom_score"] >= 4
]

choices = ["I", "II", "III", "IV"]

lung["stage"] = np.select(conditions, choices)

In [11]:
def simulate_survival(stage):
    if stage == "I":
        return np.random.normal(60, 10)
    elif stage == "II":
        return np.random.normal(36, 8)
    elif stage == "III":
        return np.random.normal(18, 6)
    else:
        return np.random.normal(8, 3)

lung["survival_months"] = lung["stage"].apply(simulate_survival)

lung["survival_months"] = lung["survival_months"].clip(lower=1)

In [12]:
lung["event"] = (lung["survival_months"] < 36).astype(int)

In [13]:
lung["patient_id"] = range(1, len(lung)+1)

In [15]:
lung_clean = lung[[
    "patient_id",
    "cancer_type",
    "Age",
    "Gender",
    "stage",
    "survival_months",
    "event"
]]
lung_clean.to_csv(r"C:\Projet_filrouge\oncobio_decision_analytics\Data\Processed\lung_cancer_clean.csv", index=False)

In [16]:
import sys
!{sys.executable} -m pip install -U sdv

Collecting sdv
  Downloading sdv-1.34.0-py3-none-any.whl (200 kB)
Installing collected packages: sdv
  Attempting uninstall: sdv
    Found existing installation: sdv 1.33.1
    Uninstalling sdv-1.33.1:
      Successfully uninstalled sdv-1.33.1
Successfully installed sdv-1.34.0


In [None]:
import os
import numpy as np
import pandas as pd

from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

# 1) Charger le dataset réel (CSV conseillé)
path = r"C:\Projet_filrouge\oncobio_decision_analytics\Data\Interim\lung_cancer.xls"
lung_real = pd.read_csv(path)

# 2) Nettoyage minimal
lung_real = lung_real.dropna(axis=1, how="all").copy()
lung_real.columns = [c.strip().replace(" ", "_") for c in lung_real.columns]

# 3) Identifier colonnes clés 
# time (survie)
time_col = next((c for c in lung_real.columns if c.lower() in ["time", "survival_time", "survival_months", "survtime"]), None)
# event/status (décès vs censuré)
event_col = next((c for c in lung_real.columns if c.lower() in ["status", "event", "cens", "censored", "death"]), None)

print("time_col =", time_col)
print("event_col =", event_col)

# 4) Metadata auto
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(lung_real)

# Forcer event/sex en catégoriel si détecté en numérique 
for c in lung_real.columns:
    if c.lower() in ["sex", "gender", "status", "event", "cens", "censored", "lung_cancer"]:
        metadata.update_column(c, sdtype="categorical")

# 5) Fit SDV (sans constraints)
synth = GaussianCopulaSynthesizer(
    metadata,
    enforce_min_max_values=True,
    enforce_rounding=True
)

synth.fit(lung_real)

# 6) Générer 1000 lignes synthétiques
lung_synth = synth.sample(num_rows=1000)

# 7) Post-traitement contraintes (safe)
if time_col and time_col in lung_synth.columns:
    lung_synth[time_col] = pd.to_numeric(lung_synth[time_col], errors="coerce").fillna(1)
    lung_synth[time_col] = lung_synth[time_col].round().clip(lower=1)

if event_col and event_col in lung_synth.columns:
    # Si event doit être {0,1}
    lung_synth[event_col] = pd.to_numeric(lung_synth[event_col], errors="coerce")
    # si déjà 0/1 ok, sinon on “binarise”
    lung_synth[event_col] = (lung_synth[event_col].fillna(0) >= 1).astype(int)

# 8) Marquer synth vs réel + concat (optionnel)
lung_real["is_synthetic"] = 0
lung_synth["is_synthetic"] = 1
lung_all = pd.concat([lung_real, lung_synth], ignore_index=True)

# 9) Export
out_path = r"C:\Projet_filrouge\oncobio_decision_analytics\Data\Processed\ncctg_lung_1000_sdv.csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
lung_all.to_csv(out_path, index=False)

print("Saved:", out_path, "shape:", lung_all.shape)
lung_all.head()


time_col = TIME
event_col = None



The 'SingleTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



Saved: C:\Projet_filrouge\oncobio_decision_analytics\Data\Processed\ncctg_lung_1000_sdv.csv shape: (1456, 9)


Unnamed: 0,ID,TIME,Y,age,sex,ecog,karnoPH,karnoPAT,is_synthetic
0,1,0,0,74,M,1,90,100,0
1,1,306,1,74,M,1,90,100,0
2,2,0,0,68,M,0,90,90,0
3,2,455,1,68,M,0,90,90,0
4,3,0,0,56,M,0,90,90,0


In [8]:
# Compare distributions sur quelques colonnes
num_cols = lung_real.select_dtypes(include=[np.number]).columns.tolist()

comp = pd.DataFrame({
    "real_mean": lung_real[num_cols].mean(numeric_only=True),
    "synth_mean": lung_synth[num_cols].mean(numeric_only=True),
    "real_median": lung_real[num_cols].median(numeric_only=True),
    "synth_median": lung_synth[num_cols].median(numeric_only=True)
}).round(2)

comp

Unnamed: 0,real_mean,synth_mean,real_median,synth_median
ID,114.5,114.48,114.5,114.0
TIME,152.62,62.54,2.5,5.0
Y,0.36,0.41,0.0,0.0
age,62.45,62.72,63.0,63.0
ecog,0.95,1.02,1.0,1.0
karnoPH,81.93,81.47,80.0,80.0
karnoPAT,79.96,78.89,80.0,80.0
is_synthetic,0.0,1.0,0.0,1.0


In [9]:
from sdv.evaluation.single_table import evaluate_quality

report = evaluate_quality(
    real_data=lung_real.drop(columns=["is_synthetic"], errors="ignore"),
    synthetic_data=lung_synth.drop(columns=["is_synthetic"], errors="ignore"),
    metadata=metadata
)
report.get_score()

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 8/8 [00:00<00:00, 143.41it/s]|
Column Shapes Score: 89.42%

(2/2) Evaluating Column Pair Trends: |██████████| 28/28 [00:00<00:00, 107.96it/s]|
Column Pair Trends Score: 70.5%

Overall Score (Average): 79.96%



0.7996080827067669

In [10]:
print(lung_real.columns.tolist())
print(lung_real.head(3))

['ID', 'TIME', 'Y', 'age', 'sex', 'ecog', 'karnoPH', 'karnoPAT', 'is_synthetic']
   ID  TIME  Y  age sex  ecog  karnoPH  karnoPAT  is_synthetic
0   1     0  0   74   M     1       90       100             0
1   1   306  1   74   M     1       90       100             0
2   2     0  0   68   M     0       90        90             0
