In [14]:
import pandas as pd

# Load the data
data_df = pd.read_csv(r'C:\rum-dashboard\notebook\data\data.csv')
eml_df = pd.read_csv(r'C:\rum-dashboard\notebook\data\EML_data.csv')
antibiotics_df = pd.read_csv(r'C:\rum-dashboard\notebook\data\antibiotics_data.csv')

# Display the first few rows of each dataset
print("Patient Data:")
print(data_df.head())

print("\nEssential Medicines List (EML):")
print(eml_df.head())

print("\nAntibiotics Data:")
print(antibiotics_df.head())

Patient Data:
   Sr.No. Schedule Date    NHIA No.     Patient No.           Patient Name  \
0       1    07-01-2024  22089700.0  ER-A27-AAB9856   JOHN  BAPTIST  ASARE   
1       2    07-01-2024         NaN  ER-A27-AAB9857  JOSEPH TETTEH TURKSON   
2       3    07-01-2024         NaN  ER-A27-AAB9857  JOSEPH TETTEH TURKSON   
3       4    07-01-2024  46853023.0  ER-A27-AAB9858           VERA   YAABA   
4       5    07-01-2024  46853023.0  ER-A27-AAB9858           VERA   YAABA   

                      Locality Contact No.         Age  Gender     Modality  \
0  ABURI-NEAR METHODIST SCHOOL  0243864670   6 Year(s)    Male          NaN   
1                 ABURI-AW17-4  0242974440  30 Year(s)    Male  GENERAL OPD   
2                 ABURI-AW17-4  0242974440  30 Year(s)    Male          NaN   
3   AKROPONG-NEAR ROYAL SCHOOL  0551876146  30 Year(s)  Female  GENERAL OPD   
4   AKROPONG-NEAR ROYAL SCHOOL  0551876146  30 Year(s)  Female          NaN   

   ... Unnamed: 42 Unnamed: 43 Unnamed: 44

In [15]:
# Step 2.1: Remove Unnecessary Columns
# Drop columns that are completely empty or irrelevant
data_df = data_df.dropna(axis=1, how='all')
antibiotics_df = antibiotics_df.dropna(axis=1, how='all')

# Step 2.2: Handle Missing Values
# Filling or dropping missing values based on context
# For demonstration, we'll fill NA in categorical data with 'Unknown' and numeric data with 0
data_df.fillna({'Modality': 'Unknown', 'NHIA No.': 'Unknown'}, inplace=True)
data_df.fillna(0, inplace=True)  # Replace missing numerical values with 0

# Step 2.3: Normalize Data Formats
# Example: Standardize gender column
data_df['Gender'] = data_df['Gender'].str.capitalize()  # Ensures consistency (e.g., 'Male', 'Female')

# Ensure date format is consistent
data_df['Schedule Date'] = pd.to_datetime(data_df['Schedule Date'], format='%d-%m-%Y')

# Step 2.4: Prepare Data for Integration
# Standardizing medicine names might involve trimming whitespace and converting to lowercase
eml_df['Name of Drug'] = eml_df['Name of Drug'].str.strip().str.lower()
data_df['Medicine Prescribed'] = data_df['Medicine Prescribed'].str.strip().str.lower()


# Let's check the cleaned data
print("Cleaned Patient Data:")
print(data_df.head())

print("\nCleaned Essential Medicines List (EML):")
print(eml_df.head())

print("\nCleaned Antibiotics Data:")
print(antibiotics_df.head())


Cleaned Patient Data:
   Sr.No. Schedule Date    NHIA No.     Patient No.           Patient Name  \
0       1    2024-01-07  22089700.0  ER-A27-AAB9856   JOHN  BAPTIST  ASARE   
1       2    2024-01-07     Unknown  ER-A27-AAB9857  JOSEPH TETTEH TURKSON   
2       3    2024-01-07     Unknown  ER-A27-AAB9857  JOSEPH TETTEH TURKSON   
3       4    2024-01-07  46853023.0  ER-A27-AAB9858           VERA   YAABA   
4       5    2024-01-07  46853023.0  ER-A27-AAB9858           VERA   YAABA   

                      Locality Contact No.         Age  Gender     Modality  \
0  ABURI-NEAR METHODIST SCHOOL  0243864670   6 Year(s)    Male      Unknown   
1                 ABURI-AW17-4  0242974440  30 Year(s)    Male  GENERAL OPD   
2                 ABURI-AW17-4  0242974440  30 Year(s)    Male      Unknown   
3   AKROPONG-NEAR ROYAL SCHOOL  0551876146  30 Year(s)  Female  GENERAL OPD   
4   AKROPONG-NEAR ROYAL SCHOOL  0551876146  30 Year(s)  Female      Unknown   

   ...                     Princip

## EML Pre-processing
-In eml_df,combine the Name of Drug and the Formulation from their respective columns into a Compound string contain the name of the drug and it's formulation and store the output in a newly created column called eml_drug_name_and_formulation


In [20]:
# Function to combine the Name of Drug and Formulation into a compound string
def create_compound_name(drug_name, formulation):
    # Handle non-string types (e.g., NaN values) by converting them to empty strings
    if isinstance(drug_name, str):
        drug_name = drug_name.strip().lower()
    else:
        drug_name = ''

    if isinstance(formulation, str):
        formulation = formulation.strip().lower()
    else:
        formulation = ''

    return f"{drug_name} {formulation}".strip()

# Apply the function to create the 'eml_drug_name_and_formulation' column
eml_df['eml_drug_name_and_formulation'] = eml_df.apply(
    lambda row: create_compound_name(row['Name of Drug'], row['Formulation']), axis=1
)

# Optional: Display the first few rows to verify
print(eml_df[['Name of Drug', 'Formulation', 'eml_drug_name_and_formulation']].head())

# Save the processed EML data to a new CSV file
eml_df.to_csv(r'C:\rum-dashboard\notebook\data\processed_EML_data.csv', index=False)


  Name of Drug Formulation eml_drug_name_and_formulation
0   adrenaline   Injection          adrenaline injection
1   adrenaline   Injection          adrenaline injection
2     atropine   Injection            atropine injection
3     atropine   Injection            atropine injection
4  bupivacaine   Injection         bupivacaine injection


## Preparing data_df for analysis.

###Pseudo code
1 - Extract Name of drug, and formulation from the Medicines prescribed column
2 - For each name of drug and  formulation extracted, join the two strings to form a compound name made up of the name of the prescribed drug and it's formulation.
3 - Some patients have multiple drugs prescribed per encounter. Each drug entry is separated by a comma. Extract all the drugs prescribed per patient according to points 1 and 2 and store the resulting list of drugs extracted as a list of medications separated by a comma in a newly created column called "extracted drugs. This will serve as a list of prescribed medications for each patient which will be used in our analysis later.

note that Each drug entry in Medicines prescribed column is in the format below:

"Name of drug [ Name of drug  | Strength | Formulation | Frequency | Duration  ]

write the code to help me do the above

In [21]:
import re
import pandas as pd

# Function to extract the name of the drug and its formulation
def extract_drug_and_formulation(prescription):
    match = re.match(r"(.*?)\s*\[\s*(.*?)\s*\|\s*(.*?)\s*\|\s*(.*?)\s*\|", prescription)
    if match:
        # Extract drug name from group(1)
        drug_name_group1 = match.group(1).strip().lower()
        
        # If drug name in group(1) is "giving set", skip this entry
        if "giving set" in drug_name_group1:
            return None
        
        # Proceed to extract drug name from group(2) and formulation from group(4)
        drug_name = match.group(2).strip().lower()
        formulation = match.group(4).strip().lower()
        
        return f"{drug_name} {formulation}"
    return None

# Function to process the entire 'Medicines Prescribed' column for each patient
def process_prescriptions(prescriptions):
    if pd.isna(prescriptions):
        return ''
    
    # Split the prescriptions by comma to get individual drugs
    extracted_drugs_with_formulation = []
    for prescription in prescriptions.split(','):
        compound_name = extract_drug_and_formulation(prescription)
        if compound_name:
            extracted_drugs_with_formulation.append(compound_name)
    
    # Join the extracted drug names into a single string separated by commas
    return ', '.join(extracted_drugs_with_formulation)

# Apply the function to create the 'extracted_drugs' column
data_df['extracted_drugs_with_formulation'] = data_df['Medicine Prescribed'].apply(process_prescriptions)

# Save the processed DataFrame to a CSV for inspection
output_file_path = r'C:\rum-dashboard\notebook\data\processed_patient_data.csv'
data_df.to_csv(output_file_path, index=False)

print(f"Data has been processed and saved to {output_file_path}")


Data has been processed and saved to C:\rum-dashboard\notebook\data\processed_patient_data.csv


## Preprocessing Antibiotics Data

In [22]:
# Load the data from the uploaded file
file_path = r'C:\rum-dashboard\notebook\data\antibiotics_data.csv'
antibiotics_df = pd.read_csv(file_path)

# Convert the 'Antibiotic' column to lowercase and remove duplicates
antibiotics_df['Antibiotic'] = antibiotics_df['Antibiotic'].str.lower()
processed_antibiotics_df = antibiotics_df.drop_duplicates(subset=['Antibiotic'])

# Save the processed data to a new CSV file
output_file_path = r'C:\rum-dashboard\notebook\data\processed_antibiotics_data.csv'
processed_antibiotics_df.to_csv(output_file_path, index=False)

output_file_path


'C:\\rum-dashboard\\notebook\\data\\processed_antibiotics_data.csv'

## Extracting encounter statistics into new columns

## total no of drugs prescribed pseudocode
- count the number of individual drug entries separated by commas in extracted_drugs column and store the output in a newly created column named "total_no_of_drugs_prescribed"

In [23]:
#Add total number of drugs prescribed column

# Load the processed patient data
file_path = r'C:\rum-dashboard\notebook\data\processed_patient_data.csv'
data_df = pd.read_csv(file_path)

# Function to count the number of drugs prescribed in the 'extracted_drugs' column
def count_drugs(drug_list):
    if pd.isna(drug_list) or drug_list.strip() == '':
        return 0
    return len(drug_list.split(','))

# Apply the function to create the 'total_no_of_drugs_prescribed' column
data_df['total_no_of_drugs_prescribed'] = data_df['extracted_drugs_with_formulation'].apply(count_drugs)

data_df

Unnamed: 0,Sr.No.,Schedule Date,NHIA No.,Patient No.,Patient Name,Locality,Contact No.,Age,Gender,Modality,...,Pregnant Patient,NHIA Patient,Medicine Prescribed,Medicine Dispensed,Occupation,Date of Admission,Date of Discharge,Cost of Treatment (GHS ),extracted_drugs_with_formulation,total_no_of_drugs_prescribed
0,1,2024-01-07,22089700.0,ER-A27-AAB9856,JOHN BAPTIST ASARE,ABURI-NEAR METHODIST SCHOOL,0243864670,6 Year(s),Male,Unknown,...,No,Insured,,0,CHILD,07-01-2024,07-01-2024,30.0,,0
1,2,2024-01-07,Unknown,ER-A27-AAB9857,JOSEPH TETTEH TURKSON,ABURI-AW17-4,0242974440,30 Year(s),Male,GENERAL OPD,...,No,Non-Insured,omeprazole [ omeprazole | 40 mg | injection |...,Omeprazole [ Omeprazole | 40 mg | Injection ]...,Driver,07-01-2024,07-01-2024,20.0,"omeprazole injection, paracetamol injection, h...",6
2,3,2024-01-07,Unknown,ER-A27-AAB9857,JOSEPH TETTEH TURKSON,ABURI-AW17-4,0242974440,30 Year(s),Male,Unknown,...,No,Non-Insured,,0,Driver,07-01-2024,07-01-2024,45.0,,0
3,4,2024-01-07,46853023.0,ER-A27-AAB9858,VERA YAABA,AKROPONG-NEAR ROYAL SCHOOL,0551876146,30 Year(s),Female,GENERAL OPD,...,No,Insured,omeprazole [ omeprazole | 40 mg | injection |...,Omeprazole [ Omeprazole | 40 mg | Injection ]...,Trader,07-01-2024,07-01-2024,0.0,"omeprazole injection, paracetamol injection, a...",8
4,5,2024-01-07,46853023.0,ER-A27-AAB9858,VERA YAABA,AKROPONG-NEAR ROYAL SCHOOL,0551876146,30 Year(s),Female,Unknown,...,No,Insured,,0,Trader,07-01-2024,07-01-2024,40.0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31212,2901,2024-03-22,46779495.0,ER-A27-AAC6158,EBENEZER ATTA ASIEDU,AKROPONG,0593055028,22 Year(s),Male,GENERAL OPD,...,No,Insured,ciprofloxacin [ ciprofloxacin | 250 mg | tabl...,Ciprofloxacin [ Ciprofloxacin | 250 mg | Tabl...,STUDENT,22-03-2024,22-03-2024,0.0,"ciprofloxacin tablet, paracetamol tablet",2
31213,2902,2024-03-22,46779495.0,ER-A27-AAC6158,EBENEZER ATTA ASIEDU,AKROPONG,0593055028,22 Year(s),Male,Unknown,...,No,Insured,,0,STUDENT,22-03-2024,22-03-2024,40.0,,0
31214,2903,2024-03-22,13740840.0,ER-A27-AAC5278,JEPHTER CHOKPUL DONKOR,MAMPONG,0541044468,31 Year(s),Male,GENERAL OPD,...,No,Insured,,0,Public Servant,22-03-2024,22-03-2024,0.0,,0
31215,2904,2024-03-22,39937965.0,ER-A27-AAB4372,AFUA APREM,AMANFO N40,0593340813,36 Year(s),Female,GENERAL OPD,...,No,Insured,,0,SELF EMPLOYED,22-03-2024,22-03-2024,0.0,,0


## Add total number of drugs prescribed on EML column
Psuedo Code
- For each  drug item in the extracted_drugs_with_formulation column  in data_df,search through the drugs in eml_drug_name_and_formulation column found in eml_df to check if it is  present.
- Count the number of drug items  in extracted_drugs_with_formulation for each row in data_df that are also in eml_drug_name_and_formulation in eml_df and store the value in a newly created column in data_df called "no_of_drugs_on_eml"

In [25]:
import pandas as pd

# Assuming eml_df and data_df have already been loaded
# Example: eml_df = pd.read_csv('path_to_your/eml_data.csv')
# Example: data_df = pd.read_csv('path_to_your/processed_patient_data.csv')

# Step 1: Convert the eml_drug_name_and_formulation column in eml_df into a set for efficient look-up
eml_drug_set = set(eml_df['eml_drug_name_and_formulation'].str.lower().unique())

# Step 2: Define a function to count the number of drugs on the EML
def count_drugs_on_eml(drug_list, eml_set):
    if pd.isna(drug_list) or drug_list.strip() == '':
        return 0, [], []  # No drugs to match
    
    drugs = [drug.strip().lower() for drug in drug_list.split(',')]
    matched = [drug for drug in drugs if drug in eml_set]
    unmatched = [drug for drug in drugs if drug not in eml_set]
    
    return len(matched), matched, unmatched

# Step 3: Apply the function to create the 'no_of_drugs_on_eml' column
data_df[['no_of_drugs_on_eml', 'matched_drugs', 'unmatched_drugs']] = data_df['extracted_drugs_with_formulation'].apply(
    lambda x: pd.Series(count_drugs_on_eml(x, eml_drug_set))
)

# Step 4: Output the list of matched and unmatched drugs
# The lists of matched and unmatched drugs are now in 'matched_drugs' and 'unmatched_drugs' columns in data_df

# Save the processed DataFrame to a CSV for inspection (optional)
output_file_path = r'C:\rum-dashboard\notebook\data\updated_patient_data.csv'
data_df.to_csv(output_file_path, index=False)

print(f"Data has been processed and saved to {output_file_path}")

# Optional: Display the first few rows to verify
print(data_df[['extracted_drugs_with_formulation', 'no_of_drugs_on_eml', 'matched_drugs', 'unmatched_drugs']].head())


Data has been processed and saved to C:\rum-dashboard\notebook\data\updated_patient_data.csv
                    extracted_drugs_with_formulation  no_of_drugs_on_eml  \
0                                                NaN                   0   
1  omeprazole injection, paracetamol injection, h...                   3   
2                                                NaN                   0   
3  omeprazole injection, paracetamol injection, a...                   3   
4                                                NaN                   0   

                                       matched_drugs  \
0                                                 []   
1  [paracetamol injection, omeprazole tablet, par...   
2                                                 []   
3  [paracetamol injection, amlodipine tablet, ome...   
4                                                 []   

                                     unmatched_drugs  
0                                                 []  
1  

## NB
- Create a dictionary pair of unmatched drugs with their proper nomenclature in the EML
- replace all instances of the drug name in question in extracted_drugs_with_formulation in data_df with the corresponding proper nomenclature found in the 
-run the Check for no of drugs on eml again and update the values in no_of_drugs_on_eml in data_df
