## Importations

In [78]:
import pandas as pd 
import numpy as np
import os
from pathlib import Path
import json

### Configuration des chemins et création du dossier de sortie des datasets

In [79]:
BASE_PATH=Path(r"C:\Users\Infinix\Desktop\Projet fil rouge\data\raw")
OUTPUT_PATH=Path(r"C:\Users\Infinix\Desktop\Projet fil rouge\data\processed")
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

In [80]:
print("ÉTAPE 1 : Chargement du dataset principal...")

# Dataset principal avec symptômes
disease_symptom_path = BASE_PATH / "Disease Symptom" / "dataset.csv"

df_main = pd.read_csv(disease_symptom_path)
print(f" Dataset principal chargé: {df_main.shape[0]} lignes, {df_main.shape[1]} colonnes")
print(f"   Maladies uniques: {df_main['Disease'].nunique() }")


ÉTAPE 1 : Chargement du dataset principal...
 Dataset principal chargé: 4920 lignes, 18 colonnes
   Maladies uniques: 41


In [81]:
print("\n Aperçu du dataset principal:")
print(df_main.head(3))


 Aperçu du dataset principal:
            Disease   Symptom_1              Symptom_2              Symptom_3  \
0  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
1  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   
2  Fungal infection     itching   nodal_skin_eruptions    dischromic _patches   

              Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0   dischromic _patches       NaN       NaN       NaN       NaN       NaN   
1                   NaN       NaN       NaN       NaN       NaN       NaN   
2                   NaN       NaN       NaN       NaN       NaN       NaN   

  Symptom_10 Symptom_11 Symptom_12 Symptom_13 Symptom_14 Symptom_15  \
0        NaN        NaN        NaN        NaN        NaN        NaN   
1        NaN        NaN        NaN        NaN        NaN        NaN   
2        NaN        NaN        NaN        NaN        NaN        NaN   

  Symptom_16 Symptom_17  
0        NaN        NaN 

### Charger les datasets supplémentaires

Dataset des descriptions

In [82]:
try:
    desc_path = BASE_PATH / "Disease Symptom" / "symptom_Description.csv"
    df_descriptions = pd.read_csv(desc_path)
    print(f"Descriptions chargées: {df_descriptions.shape[0]} maladies")
except Exception as e:
    print(f"Descriptions non trouvées: {e}")
    df_descriptions = None

Descriptions chargées: 41 maladies


Dataset des précautions

In [83]:
try:
    prec_path = BASE_PATH / "Disease Symptom" / "symptom_precaution.csv"
    df_precautions = pd.read_csv(prec_path)
    print(f" Précautions chargées: {df_precautions.shape[0]} maladies")
except Exception as e:
    print(f" Précautions non trouvées: {e}")
    df_precautions = None

 Précautions chargées: 41 maladies


Dataset severity

In [84]:
try:
    sev_path = BASE_PATH / "Disease Symptom" / "Symptom-severity.csv"
    df_severity = pd.read_csv(sev_path)
    print(f"Sévérité chargée: {df_severity.shape[0]} symptômes")
except Exception as e:
    print(f"Sévérité non trouvée: {e}")
    df_severity = None

Sévérité chargée: 133 symptômes


In [85]:
print(df_main.columns.tolist())

['Disease', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9', 'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14', 'Symptom_15', 'Symptom_16', 'Symptom_17']


## Analyse exploratoire des données


Identifier la colonne de maladie

In [86]:
disease_column = df_main.columns[0]  #Première colonne
print(f"   Colonne maladie: {disease_column}")

   Colonne maladie: Disease


Compter les maladies uniques

In [87]:
unique_diseases=df_main[disease_column].unique()
print(f"Nombre de maladies: {len(unique_diseases)}")

Nombre de maladies: 41


Distribution des maladies

In [88]:
disease_counts = df_main[disease_column].value_counts()
print(f"Top 10 maladies les plus fréquentes:")
for disease, count in disease_counts.head(10).items():
    print(f"   • {disease}: {count} cas")

Top 10 maladies les plus fréquentes:
   • Fungal infection: 120 cas
   • Allergy: 120 cas
   • GERD: 120 cas
   • Chronic cholestasis: 120 cas
   • Drug Reaction: 120 cas
   • Peptic ulcer diseae: 120 cas
   • AIDS: 120 cas
   • Diabetes : 120 cas
   • Gastroenteritis: 120 cas
   • Bronchial Asthma: 120 cas


Analyser les symptomes

In [89]:
symptom_columns = [col for col in df_main.columns if 'Symptom' in col]
print(f"Colonnes de symptômes: {len(symptom_columns)}")
print(f"   {symptom_columns[:5]}... (montrant 5 premières)")

Colonnes de symptômes: 17
   ['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Symptom_5']... (montrant 5 premières)


Nettoyage des données

In [90]:
df_clean=df_main.copy()
df_clean['Disease']=df_clean['Disease'].str.strip()


Remplacer les valeurs manquantes

In [91]:
for col in symptom_columns:
    df_clean[col]=df_clean[col].fillna('')
    df_clean[col]=df_clean[col].str.strip()
print(f"   Lignes après nettoyage: {df_clean.shape[0]}")

   Lignes après nettoyage: 4920


Ajouter un ID patient unique

In [92]:
df_clean['Patient_ID']=np.arange(1, df_clean.shape[0]+1)

Compter le nombre de symptômes par patient

In [93]:
def count_symptoms(row):
  symptoms_cols=[col for col in df_clean.columns if 'Symptom in col']
  count=0
  for col in symptoms_cols:
    if row[col] and str(row[col]).strip():
      count+=1
  return count
df_clean['Symptom_Count'] = df_clean.apply(count_symptoms, axis=1)

Les id des patients se sont ajoutés, et le nombre des symptomes est calculé.

In [94]:
print(f"   Moyenne de symptômes par patient: {df_clean['Symptom_Count'].mean():.2f}")

   Moyenne de symptômes par patient: 9.45


Créer une colonne avec tous les symptomes concaténés.

In [95]:
def concatenate_symptoms(row):
    symptom_cols = [col for col in df_clean.columns if 'Symptom' in col]
    symptoms = []
    for col in symptom_cols:
        if row[col] and str(row[col]).strip():
            symptoms.append(str(row[col]).strip())
    return ', '.join(symptoms)
df_clean['All_Symptoms'] = df_clean.apply(concatenate_symptoms, axis=1)
print("Symptômes concaténés créés")
print(df_clean[['Patient_ID', 'All_Symptoms']].head(3))

Symptômes concaténés créés
   Patient_ID                                       All_Symptoms
0           1  itching, skin_rash, nodal_skin_eruptions, disc...
1           2  skin_rash, nodal_skin_eruptions, dischromic _p...
2           3  itching, nodal_skin_eruptions, dischromic _pat...


### Ajout des métadonnées

Traitement du dataset df_precautions

In [96]:
precaution_cols = [col for col in df_precautions.columns if 'Precaution' in col]
def concatenate_precautions(row):
    precautions = []
    for col in precaution_cols:
        if row[col] and str(row[col]).strip():
            precautions.append(str(row[col]).strip())
    return ', '.join(precautions)
df_precautions['All_Precautions'] = df_precautions.apply(concatenate_precautions, axis=1)
print("Précautions concaténés créés")
print(df_precautions['All_Precautions'].head(3))

Précautions concaténés créés
0    stop irritation, consult nearest hospital, sto...
1    Consult nearest hospital, avoid oily food, avo...
2    apply calamine, cover area with bandage, nan, ...
Name: All_Precautions, dtype: object


Traitement du dataset df_severity

In [97]:
df_severity['Symptom']=df_severity['Symptom'].str.strip()

#### Merger les métadonnées

In [98]:
df_enriched=df_clean.merge(df_descriptions, on='Disease',how='left')
df_enriched=df_enriched.merge(df_precautions[['Disease','All_Precautions']], on='Disease',how='left')

In [100]:
def calculate_severity_score(row):
    symptom_cols = [col for col in df_clean.columns if 'Symptom' in col]
    total_severity = 0
    symptom_count = 0
    
    for col in symptom_cols:
        symptom = row[col]
        if symptom and str(symptom).strip():
            # Chercher la sévérité de ce symptôme
            severity_row = df_severity[df_severity['Symptom'].str.lower() == str(symptom).lower()]
            if not severity_row.empty:
                total_severity += severity_row['weight'].values[0]
                symptom_count += 1
    
    return total_severity / symptom_count if symptom_count > 0 else 0

df_enriched['Severity_Score'] = df_enriched.apply(calculate_severity_score, axis=1)
print("Score de sévérité calculé")
print(f"Dataset enrichi: {df_enriched.shape[0]} lignes, {df_enriched.shape[1]} colonnes")


Score de sévérité calculé
Dataset enrichi: 4920 lignes, 24 colonnes


In [102]:
df_enriched['Severity_Score'].head()

0    2.666667
1    3.500000
2    2.500000
3    2.000000
4    2.666667
Name: Severity_Score, dtype: float64

### Création de différents versions des datasets

Version 1 : Dataset complet enrichi

In [103]:
output_full = OUTPUT_PATH / "medical_dataset_full.csv"
df_enriched.to_csv(output_full, index=False)
print(f"Dataset complet sauvegardé: {output_full}")
print(f"   Taille: {df_enriched.shape}")


Dataset complet sauvegardé: C:\Users\Infinix\Desktop\Projet fil rouge\data\processed\medical_dataset_full.csv
   Taille: (4920, 24)


Version 2 : Dataset pour ML (colonnes essentielles)

In [104]:
ml_columns = ['Patient_ID', 'All_Symptoms', 'Symptom_Count', 
              'Severity_Score', 'Disease']
df_ml = df_enriched[ml_columns].copy()
output_ml = OUTPUT_PATH / "medical_dataset_ml.csv"
df_ml.to_csv(output_ml, index=False)
print(f"Dataset ML sauvegardé: {output_ml}")
print(f"   Taille: {df_ml.shape}")

Dataset ML sauvegardé: C:\Users\Infinix\Desktop\Projet fil rouge\data\processed\medical_dataset_ml.csv
   Taille: (4920, 5)


Version 3 : Dataset avec symptômes individuels (format original nettoyé)

In [105]:
output_symptoms = OUTPUT_PATH / "medical_dataset_symptoms.csv"
symptom_disease_cols = symptom_columns + ['Disease', 'Patient_ID']
df_symptoms = df_enriched[symptom_disease_cols].copy()
df_symptoms.to_csv(output_symptoms, index=False)
print(f"Dataset symptômes sauvegardé: {output_symptoms}")

Dataset symptômes sauvegardé: C:\Users\Infinix\Desktop\Projet fil rouge\data\processed\medical_dataset_symptoms.csv


Version 4 : Métadonnées des maladies

In [106]:
disease_metadata = df_enriched[['Disease', 'Description', 'All_Precautions']].drop_duplicates()
output_metadata = OUTPUT_PATH / "disease_metadata.csv"
disease_metadata.to_csv(output_metadata, index=False)
print(f"Métadonnées maladies sauvegardées: {output_metadata}")

Métadonnées maladies sauvegardées: C:\Users\Infinix\Desktop\Projet fil rouge\data\processed\disease_metadata.csv


In [107]:
disease_metadata.head()

Unnamed: 0,Disease,Description,All_Precautions
0,Fungal infection,"In humans, fungal infections occur when an inv...","bath twice, use detol or neem in bathing water..."
10,Allergy,An allergy is an immune system response to a f...,"apply calamine, cover area with bandage, nan, ..."
20,GERD,"Gastroesophageal reflux disease, or GERD, is a...","avoid fatty spicy food, avoid lying down after..."
30,Chronic cholestasis,"Chronic cholestatic diseases, whether occurrin...","cold baths, anti itch medicine, consult doctor..."
40,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"stop irritation, consult nearest hospital, sto..."


### GÉNÉRER STATISTIQUES

In [109]:
stats = {
    'dataset_info': {
        'total_patients': int(df_enriched.shape[0]),
        'total_diseases': int(df_enriched['Disease'].nunique()),
        'total_unique_symptoms': len(df_severity),
        'avg_symptoms_per_patient': float(df_enriched['Symptom_Count'].mean()),
        'max_symptoms_per_patient': int(df_enriched['Symptom_Count'].max()),
        'min_symptoms_per_patient': int(df_enriched['Symptom_Count'].min()),
        'avg_severity_score': float(df_enriched['Severity_Score'].mean())
    },
    'disease_distribution': df_enriched['Disease'].value_counts().to_dict(),
    'top_10_diseases': df_enriched['Disease'].value_counts().head(10).to_dict(),
    'columns_full_dataset': df_enriched.columns.tolist(),
    'columns_ml_dataset': df_ml.columns.tolist(),
    'data_quality': {
        'missing_values': int(df_enriched.isnull().sum().sum()),
        'duplicate_rows': int(df_enriched.duplicated().sum()),
        'completeness_rate': float((1 - df_enriched.isnull().sum().sum() / 
                                    (df_enriched.shape[0] * df_enriched.shape[1])) * 100)
    }
}
print("Statistiques du dataset:")
print(stats)


Statistiques du dataset:
{'dataset_info': {'total_patients': 4920, 'total_diseases': 41, 'total_unique_symptoms': 133, 'avg_symptoms_per_patient': 9.448780487804878, 'max_symptoms_per_patient': 19, 'min_symptoms_per_patient': 5, 'avg_severity_score': 4.130582342394574}, 'disease_distribution': {'Fungal infection': 120, 'Allergy': 120, 'GERD': 120, 'Chronic cholestasis': 120, 'Drug Reaction': 120, 'Peptic ulcer diseae': 120, 'AIDS': 120, 'Diabetes': 120, 'Gastroenteritis': 120, 'Bronchial Asthma': 120, 'Hypertension': 120, 'Migraine': 120, 'Cervical spondylosis': 120, 'Paralysis (brain hemorrhage)': 120, 'Jaundice': 120, 'Malaria': 120, 'Chicken pox': 120, 'Dengue': 120, 'Typhoid': 120, 'hepatitis A': 120, 'Hepatitis B': 120, 'Hepatitis C': 120, 'Hepatitis D': 120, 'Hepatitis E': 120, 'Alcoholic hepatitis': 120, 'Tuberculosis': 120, 'Common Cold': 120, 'Pneumonia': 120, 'Dimorphic hemmorhoids(piles)': 120, 'Heart attack': 120, 'Varicose veins': 120, 'Hypothyroidism': 120, 'Hyperthyroidi

Sauvegarder les statistiques

In [110]:
stats_file = OUTPUT_PATH / "dataset_statistics.json"
with open(stats_file, 'w', encoding='utf-8') as f:
    json.dump(stats, f, indent=2, ensure_ascii=False)
print(f"Statistiques sauvegardées: {stats_file}")

Statistiques sauvegardées: C:\Users\Infinix\Desktop\Projet fil rouge\data\processed\dataset_statistics.json
