### Importation et variables

In [366]:
import polars as pl
from pymongo import MongoClient

In [367]:
FILE_PATH = '.\data\healthcare_dataset.csv'

In [368]:
df = pl.read_csv(FILE_PATH, infer_schema=False)

### Nettoyage et exploration

In [369]:
# Verifie l'absence d'empty string
df.filter(
    pl.any_horizontal(pl.col("*") == "")
)

Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str


In [370]:
df

Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""Bobby JacksOn""","""30""","""Male""","""B-""","""Cancer""","""2024-01-31""","""Matthew Smith""","""Sons and Miller""","""Blue Cross""","""18856.281305978155""","""328""","""Urgent""","""2024-02-02""","""Paracetamol""","""Normal"""
"""LesLie TErRy""","""62""","""Male""","""A+""","""Obesity""","""2019-08-20""","""Samantha Davies""","""Kim Inc""","""Medicare""","""33643.327286577885""","""265""","""Emergency""","""2019-08-26""","""Ibuprofen""","""Inconclusive"""
"""DaNnY sMitH""","""76""","""Female""","""A-""","""Obesity""","""2022-09-22""","""Tiffany Mitchell""","""Cook PLC""","""Aetna""","""27955.096078842456""","""205""","""Emergency""","""2022-10-07""","""Aspirin""","""Normal"""
"""andrEw waTtS""","""28""","""Female""","""O+""","""Diabetes""","""2020-11-18""","""Kevin Wells""","""Hernandez Rogers and Vang,""","""Medicare""","""37909.78240987528""","""450""","""Elective""","""2020-12-18""","""Ibuprofen""","""Abnormal"""
"""adrIENNE bEll""","""43""","""Female""","""AB+""","""Cancer""","""2022-09-19""","""Kathleen Hanna""","""White-White""","""Aetna""","""14238.317813937623""","""458""","""Urgent""","""2022-10-09""","""Penicillin""","""Abnormal"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""eLIZABeTH jaCkSOn""","""42""","""Female""","""O+""","""Asthma""","""2020-08-16""","""Joshua Jarvis""","""Jones-Thompson""","""Blue Cross""","""2650.7149523295766""","""417""","""Elective""","""2020-09-15""","""Penicillin""","""Abnormal"""
"""KYle pEREz""","""61""","""Female""","""AB-""","""Obesity""","""2020-01-23""","""Taylor Sullivan""","""Tucker-Moyer""","""Cigna""","""31457.797306905242""","""316""","""Elective""","""2020-02-01""","""Aspirin""","""Normal"""
"""HEATher WaNG""","""38""","""Female""","""B+""","""Hypertension""","""2020-07-13""","""Joe Jacobs DVM""","""and Mahoney Johnson Vasquez,""","""UnitedHealthcare""","""27620.764717427424""","""347""","""Urgent""","""2020-08-10""","""Ibuprofen""","""Abnormal"""
"""JENniFER JOneS""","""43""","""Male""","""O-""","""Arthritis""","""2019-05-25""","""Kimberly Curry""","""Jackson Todd and Castro,""","""Medicare""","""32451.092358488677""","""321""","""Elective""","""2019-05-31""","""Ibuprofen""","""Abnormal"""


In [371]:
# Standardisation du nom des colonnes
df = df.rename({
    col : col.lower().replace(' ','_') for col in df.columns
})

In [372]:
# Changement de la case de la colonne name qui est chaotique
df = (
    df.with_columns(
        pl.col('name')
        .str.to_lowercase()
        .str.to_titlecase()
        .alias('name')
    )
)

In [373]:
# Vérificaiton des lignes dupliqués

nb_duplicate = df.is_duplicated().sum()

if nb_duplicate == 0:
    print("No duplicates found in dataframe")
    
else:
    print(f"Duplicates found in dataframe :{nb_duplicate}")
    
    df = df.unique()

Duplicates found in dataframe :1068


In [374]:
df.is_duplicated().sum()

0

In [375]:
# Test pour les types
try:
    df.get_column('age').cast(pl.Int16)
    df.get_column('room_number').cast(pl.Int16)
    
    df.get_column('date_of_admission').str.strptime(pl.Date, "%Y-%m-%d")
    df.get_column('discharge_date').str.strptime(pl.Date, "%Y-%m-%d")

except Exception as e:
    print(f"Error : {e}")

In [376]:
# Test pour les valeurs manquantes

null_count = df.null_count().sum_horizontal().sum()

if null_count != 0:
    raise ValueError(f"Error : Dataset contains {null_count} missing values")

In [377]:
(
    df.group_by(
        pl.col(['name','age','gender','blood_type'])
    )
    .len('count')
    .sort(by='count',descending=True)
    .filter(
        pl.col('count') > 1
    )
)

name,age,gender,blood_type,count
str,str,str,str,u32
"""Eileen Moore""","""34""","""Female""","""B-""",2
"""Scott Martinez""","""65""","""Female""","""B-""",2
"""Matthew Gonzalez""","""44""","""Female""","""B-""",2
"""Christopher Young""","""60""","""Male""","""A-""",2
"""John Smith""","""45""","""Male""","""AB-""",2
…,…,…,…,…
"""Stephanie Garcia""","""74""","""Female""","""AB+""",2
"""James Miller""","""46""","""Male""","""A+""",2
"""Mary Hill""","""37""","""Female""","""O+""",2
"""Angela Brown""","""67""","""Male""","""O+""",2


In [378]:
# Création d'un identifiant unique pour chaque patient avec la fonction hash
df = df.with_columns(
    pl.concat_str(
        pl.col('name'),
        pl.col('age'),
        pl.col('gender'),
        pl.col('blood_type'))
    .hash()
    .cast(pl.String)
    .alias('patient_id')   
    )

In [379]:
# Création d'un identifiant unique pour chaque admission en combinant l'id et la date d'admission
df = df.with_columns(
    pl.concat_str(
        pl.col('patient_id'),
        pl.lit('_'),
        pl.col('date_of_admission')
    )
    .alias('admission_id')
)

In [380]:
df = df.select(
    ['admission_id', 'patient_id'] + [col for col in df.columns if col not in ['admission_id', 'patient_id']]
)

In [381]:
df.select('patient_id').is_duplicated().sum()

44

In [382]:
df

admission_id,patient_id,name,age,gender,blood_type,medical_condition,date_of_admission,doctor,hospital,insurance_provider,billing_amount,room_number,admission_type,discharge_date,medication,test_results
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""8433417931432884664_2021-04-24""","""8433417931432884664""","""Henry Neal""","""82""","""Female""","""O+""","""Hypertension""","""2021-04-24""","""Kelly Davis""","""Group Sandoval""","""Cigna""","""32338.708161436014""","""472""","""Elective""","""2021-05-07""","""Ibuprofen""","""Normal"""
"""7521455188759353624_2020-04-06""","""7521455188759353624""","""Tiffany Malone""","""82""","""Male""","""AB-""","""Asthma""","""2020-04-06""","""Victoria Ramirez""","""Inc Cole""","""UnitedHealthcare""","""4361.503519046726""","""330""","""Urgent""","""2020-04-20""","""Paracetamol""","""Inconclusive"""
"""6428175190370289725_2020-05-30""","""6428175190370289725""","""Curtis Diaz""","""39""","""Female""","""A-""","""Hypertension""","""2020-05-30""","""Connie Hurst""","""and Flores Smith Williams,""","""Cigna""","""3376.178974428809""","""178""","""Elective""","""2020-06-07""","""Paracetamol""","""Inconclusive"""
"""15728787616282845145_2023-04-2…","""15728787616282845145""","""Spencer Acosta""","""48""","""Male""","""O+""","""Diabetes""","""2023-04-25""","""Morgan Wood""","""Marshall Group""","""Cigna""","""49789.41212286746""","""225""","""Elective""","""2023-05-17""","""Lipitor""","""Abnormal"""
"""16814315902648283424_2019-10-1…","""16814315902648283424""","""Andrew Holmes""","""41""","""Male""","""AB+""","""Obesity""","""2019-10-18""","""Robert Taylor""","""Griffin-Hill""","""Blue Cross""","""8965.080233503651""","""279""","""Urgent""","""2019-10-19""","""Ibuprofen""","""Inconclusive"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""12923919152423543318_2023-01-1…","""12923919152423543318""","""Richard Robinson""","""80""","""Female""","""AB+""","""Obesity""","""2023-01-14""","""Monica Parker""","""Frank-Archer""","""Cigna""","""35061.63422557754""","""290""","""Urgent""","""2023-01-30""","""Penicillin""","""Normal"""
"""2413357748097446722_2024-03-07""","""2413357748097446722""","""Timothy Maxwell""","""42""","""Male""","""B-""","""Cancer""","""2024-03-07""","""Fernando Adkins""","""Sons and Medina""","""UnitedHealthcare""","""38231.22230209906""","""215""","""Elective""","""2024-03-20""","""Lipitor""","""Normal"""
"""11421101681965404328_2021-12-1…","""11421101681965404328""","""Christopher Lee Md""","""78""","""Female""","""O+""","""Asthma""","""2021-12-12""","""Philip Duncan""","""Brady-Greene""","""UnitedHealthcare""","""5628.173873356757""","""268""","""Urgent""","""2022-01-03""","""Ibuprofen""","""Abnormal"""
"""15016733377835969274_2022-04-0…","""15016733377835969274""","""Chris Gregory""","""30""","""Male""","""B-""","""Obesity""","""2022-04-02""","""Lisa Spencer""","""Wagner-Young""","""Medicare""","""26869.08463023003""","""448""","""Urgent""","""2022-04-29""","""Paracetamol""","""Abnormal"""


### MongoDB

In [383]:
client = MongoClient("mongodb://localhost:27017/")

In [384]:
client.list_database_names()

['admin', 'config', 'local']

In [385]:
data_dicts = df.to_dicts()

In [386]:
data_dicts

[{'admission_id': '8433417931432884664_2021-04-24',
  'patient_id': '8433417931432884664',
  'name': 'Henry Neal',
  'age': '82',
  'gender': 'Female',
  'blood_type': 'O+',
  'medical_condition': 'Hypertension',
  'date_of_admission': '2021-04-24',
  'doctor': 'Kelly Davis',
  'hospital': 'Group Sandoval',
  'insurance_provider': 'Cigna',
  'billing_amount': '32338.708161436014',
  'room_number': '472',
  'admission_type': 'Elective',
  'discharge_date': '2021-05-07',
  'medication': 'Ibuprofen',
  'test_results': 'Normal'},
 {'admission_id': '7521455188759353624_2020-04-06',
  'patient_id': '7521455188759353624',
  'name': 'Tiffany Malone',
  'age': '82',
  'gender': 'Male',
  'blood_type': 'AB-',
  'medical_condition': 'Asthma',
  'date_of_admission': '2020-04-06',
  'doctor': 'Victoria Ramirez',
  'hospital': 'Inc Cole',
  'insurance_provider': 'UnitedHealthcare',
  'billing_amount': '4361.503519046726',
  'room_number': '330',
  'admission_type': 'Urgent',
  'discharge_date': '202

In [387]:
db = client.test_database

In [388]:
collection = db.test_collection

In [389]:
data_dicts[0]

{'admission_id': '8433417931432884664_2021-04-24',
 'patient_id': '8433417931432884664',
 'name': 'Henry Neal',
 'age': '82',
 'gender': 'Female',
 'blood_type': 'O+',
 'medical_condition': 'Hypertension',
 'date_of_admission': '2021-04-24',
 'doctor': 'Kelly Davis',
 'hospital': 'Group Sandoval',
 'insurance_provider': 'Cigna',
 'billing_amount': '32338.708161436014',
 'room_number': '472',
 'admission_type': 'Elective',
 'discharge_date': '2021-05-07',
 'medication': 'Ibuprofen',
 'test_results': 'Normal'}

In [391]:
collection.insert_many(data_dicts)

InsertManyResult([ObjectId('675db6ae3fc86dd92c08cc34'), ObjectId('675db6ae3fc86dd92c08cc35'), ObjectId('675db6ae3fc86dd92c08cc36'), ObjectId('675db6ae3fc86dd92c08cc37'), ObjectId('675db6ae3fc86dd92c08cc38'), ObjectId('675db6ae3fc86dd92c08cc39'), ObjectId('675db6ae3fc86dd92c08cc3a'), ObjectId('675db6ae3fc86dd92c08cc3b'), ObjectId('675db6ae3fc86dd92c08cc3c'), ObjectId('675db6ae3fc86dd92c08cc3d'), ObjectId('675db6ae3fc86dd92c08cc3e'), ObjectId('675db6ae3fc86dd92c08cc3f'), ObjectId('675db6ae3fc86dd92c08cc40'), ObjectId('675db6ae3fc86dd92c08cc41'), ObjectId('675db6ae3fc86dd92c08cc42'), ObjectId('675db6ae3fc86dd92c08cc43'), ObjectId('675db6ae3fc86dd92c08cc44'), ObjectId('675db6ae3fc86dd92c08cc45'), ObjectId('675db6ae3fc86dd92c08cc46'), ObjectId('675db6ae3fc86dd92c08cc47'), ObjectId('675db6ae3fc86dd92c08cc48'), ObjectId('675db6ae3fc86dd92c08cc49'), ObjectId('675db6ae3fc86dd92c08cc4a'), ObjectId('675db6ae3fc86dd92c08cc4b'), ObjectId('675db6ae3fc86dd92c08cc4c'), ObjectId('675db6ae3fc86dd92c08cc

In [392]:
collection.count_documents({})

54966

In [None]:
db.collection.createIndex({ "admission_id": 1 }, { "unique": true })

In [393]:
db.list_collection_names()

['test_collection']

In [394]:
client.list_database_names()

['admin', 'config', 'local', 'test_database']

In [395]:
result = collection.find({'name':'Jordan Smith'})

In [396]:
for documents in result:
    print(documents)

{'_id': ObjectId('675db6ae3fc86dd92c08d35e'), 'admission_id': '4117191452299311280_2022-07-07', 'patient_id': '4117191452299311280', 'name': 'Jordan Smith', 'age': '76', 'gender': 'Female', 'blood_type': 'B+', 'medical_condition': 'Arthritis', 'date_of_admission': '2022-07-07', 'doctor': 'Peter Ball', 'hospital': 'Smith Ltd', 'insurance_provider': 'Medicare', 'billing_amount': '36459.351659161744', 'room_number': '367', 'admission_type': 'Urgent', 'discharge_date': '2022-07-08', 'medication': 'Penicillin', 'test_results': 'Abnormal'}
{'_id': ObjectId('675db6ae3fc86dd92c08e473'), 'admission_id': '3717347461634749901_2023-12-02', 'patient_id': '3717347461634749901', 'name': 'Jordan Smith', 'age': '24', 'gender': 'Male', 'blood_type': 'A-', 'medical_condition': 'Cancer', 'date_of_admission': '2023-12-02', 'doctor': 'Christopher Lawson', 'hospital': 'Day-Robinson', 'insurance_provider': 'Blue Cross', 'billing_amount': '44294.3809131703', 'room_number': '170', 'admission_type': 'Emergency',

In [397]:
client.drop_database('test_database')