In [None]:
#!/usr/bin/env python3
# test_integrity.py

import os
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv

def _redact_uri(uri):
    """Return *uri* with the password of a ``user:password@`` part masked.

    Values without a scheme, without credentials, or without a password
    (including ``None``) are returned unchanged.
    """
    if not uri:
        return uri
    scheme, sep, rest = uri.partition("://")
    if not sep:
        return uri
    # Credentials can only live in the authority part (before the first "/").
    authority, slash, tail = rest.partition("/")
    userinfo, at, hosts = authority.rpartition("@")
    if not at:
        return uri
    user, colon, _password = userinfo.partition(":")
    if not colon:
        return uri
    return f"{scheme}://{user}:***@{hosts}{slash}{tail}"


def _require(condition, message):
    """Raise ``AssertionError(message)`` unless *condition* is true.

    Used instead of the ``assert`` statement so that the integrity checks
    still run when Python is started with ``-O``.
    """
    if not condition:
        raise AssertionError(message)


def main():
    """Compare a CSV file with a MongoDB collection and fail on mismatch.

    Configuration is read from the environment (after loading ``.env``):
    ``MONGO_URI``, ``MONGO_DB``, ``MONGO_COLLECTION`` and ``CSV_PATH``
    (default ``data/healthcare_dataset.csv``).

    The CSV and the collection (without ``_id``) are loaded into DataFrames
    and compared on: row count, set of column names, per-column count of
    missing values, and number of duplicated rows. Column dtypes are printed
    for information only.

    Raises:
        AssertionError: on the first check that does not match.
    """
    # 1. Load .env; override=True lets it replace already-set variables.
    load_dotenv(override=True)

    # 2. Read the configuration from the environment.
    mongo_uri = os.getenv("MONGO_URI")
    db_name = os.getenv("MONGO_DB")
    coll_name = os.getenv("MONGO_COLLECTION")
    csv_path = os.getenv("CSV_PATH", "data/healthcare_dataset.csv")

    # 3. Echo the configuration for verification. The URI may embed a
    #    password, so it is masked before being printed.
    print("üîå Mongo URI     :", _redact_uri(mongo_uri))
    print("üìÅ Database     :", db_name)
    print("üìÇ Collection   :", coll_name)
    print("üìÑ CSV Path     :", csv_path)
    print()

    # 4. Load the CSV.
    df_csv = pd.read_csv(csv_path)

    # 5. Load the MongoDB documents. The client is used as a context manager
    #    so the connection is closed even if the query fails.
    with MongoClient(mongo_uri) as client:
        collection = client[db_name][coll_name]
        docs = list(collection.find({}, {"_id": 0}))
    df_mongo = pd.DataFrame(docs)

    # 6. Checks.
    print("=== V√âRIFICATIONS D'INT√âGRIT√â ===")

    # 6.1 Row count.
    n_csv = len(df_csv)
    n_mongo = len(df_mongo)
    print(f"‚Ä¢ Lignes CSV    : {n_csv}")
    print(f"‚Ä¢ Documents DB  : {n_mongo}")
    _require(
        n_csv == n_mongo,
        f"‚ùå Incoh√©rence count : CSV={n_csv}, Mongo={n_mongo}",
    )

    # 6.2 Column names (order is ignored).
    cols_csv = set(df_csv.columns)
    cols_mongo = set(df_mongo.columns)
    print(f"‚Ä¢ Colonnes CSV  : {cols_csv}")
    print(f"‚Ä¢ Colonnes DB   : {cols_mongo}")
    _require(
        cols_csv == cols_mongo,
        f"‚ùå Colonnes diff : {cols_csv ^ cols_mongo}",
    )

    # 6.3 Data types: printed only, not enforced.
    dtypes_csv = df_csv.dtypes.to_dict()
    dtypes_mongo = df_mongo.dtypes.to_dict()
    print("‚Ä¢ Types CSV     :", dtypes_csv)
    print("‚Ä¢ Types DB      :", dtypes_mongo)

    # 6.4 Missing values per column.
    na_csv = df_csv.isnull().sum().to_dict()
    na_mongo = df_mongo.isnull().sum().to_dict()
    print("‚Ä¢ Manquants CSV :", na_csv)
    print("‚Ä¢ Manquants DB  :", na_mongo)
    _require(na_csv == na_mongo, "‚ùå Diff√©rence dans les valeurs manquantes")

    # 6.5 Duplicates; keep=False counts every row that has a duplicate.
    dup_csv = int(df_csv.duplicated(keep=False).sum())
    dup_mongo = int(df_mongo.duplicated(keep=False).sum())
    print(f"‚Ä¢ Doublons CSV  : {dup_csv}")
    print(f"‚Ä¢ Doublons DB   : {dup_mongo}")
    _require(
        dup_csv == dup_mongo,
        f"‚ùå Doublons diff : CSV={dup_csv}, DB={dup_mongo}",
    )

    print("\n‚úÖ Tous les tests d‚Äôint√©grit√© sont pass√©s avec succ√®s.")

# Run the integrity checks only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


AssertionError: Incohérence : CSV = 55500, MongoDB = 444000