In [1]:
import pandas as pd
import camelot

## Initializing Antibiotics Dataframe


In [2]:
# Source file for the antibiotics reference table (relative to the working directory)
csv_file_path = 'data/antibiotics.csv'

# Load and tidy in a single pipeline:
#   1. read the CSV with pandas' default settings
#   2. drop columns that contain no data at all
#   3. drop the "Unnamed: N" columns pandas creates for header-less CSV columns
#   4. sanitize column names to make them suitable for the SQL database
antibiotics_df = (
    pd.read_csv(csv_file_path)
    .dropna(axis=1, how='all')
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .rename(
        columns={
            "ATC code": "ATC_code",
            "Listed on EML/EMLc 2023": "Listed_on_EML_EMLc_2023",
        }
    )
)

# Validation: any remaining null cell means the data is not ready to use
if not antibiotics_df.isnull().values.any():
    print('Dataframe cleaned')
else:
    raise ValueError("CSV file contains null values. Please clean the data before inserting.")



Dataframe cleaned


In [3]:
# Display antibiotics_df as this cell's output for a visual check of its contents.
antibiotics_df

Unnamed: 0,Antibiotic,Class,ATC_code,Category,Listed_on_EML_EMLc_2023
0,Amikacin,Aminoglycosides,J01GB06,Access,Yes
1,Amoxicillin,Penicillins,J01CA04,Access,Yes
2,Amoxicillin/clavulanic-acid,Beta-lactam/beta-lactamase-inhibitor,J01CR02,Access,Yes
3,Ampicillin,Penicillins,J01CA01,Access,Yes
4,Ampicillin/sulbactam,Beta-lactam/beta-lactamase-inhibitor,J01CR01,Access,No
...,...,...,...,...,...
252,Trimethoprim,Trimethoprim-derivatives,J01EA01,Access,Yes
253,Troleandomycin,Macrolides,J01FA08,Watch,No
254,Trovafloxacin,Fluoroquinolones,J01MA13,Watch,No
255,Vancomycin_IV,Glycopeptides,J01XA01,Watch,Yes


## Initializing EML Database

In [16]:
# Paths are relative to the notebook's working directory (the same convention
# used for 'data/antibiotics.csv'), so the notebook is not tied to one machine.
pdf_path = 'data/EML.pdf'
eml_csv_path = 'data/EML.csv'

# Tables before Table 14 are skipped. Indices are 0-based, so Table 14 is index 13.
first_table_index = 13

# Extract every table camelot detects across all pages of the document.
all_tables = camelot.read_pdf(pdf_path, pages="all")

# Collect the DataFrame of each table from Table 14 onwards.
tables_list = [
    all_tables[i].df for i in range(first_table_index, len(all_tables))
]

# Fail with an explicit message rather than pd.concat's generic
# "No objects to concatenate" when the PDF yielded too few tables.
if not tables_list:
    raise ValueError(
        f"Expected more than {first_table_index} tables in {pdf_path!r}, "
        f"but camelot extracted only {len(all_tables)}."
    )

# Combine all the selected tables into a single DataFrame with a fresh index.
EML_df = pd.concat(tables_list, ignore_index=True)

# Show the first rows for verification.
print(EML_df.head())

# Persist the raw extraction so it can be cleaned further outside the notebook.
EML_df.to_csv(eml_csv_path, index=False)


                                                   0          1  \
0  Name of Drug\nFormulation\nStrength\nLevel \nN...              
1                        1.1    General Anaesthetics              
2                           Adrenaline (Epinephrine)  Injection   
3                           Adrenaline (Epinephrine)  Injection   
4                                           Atropine  Injection   

                                  2   3  4 5  
0                                             
1                                             
2               1 mg /ml \n(1:1000)   M  R    
3  100 \nmicrogram/ \nml (1:10 000)   M  R    
4                        0.6 mg/ ml  B2  R    


NB: The final EML_cleaned is further processed manually, validated, and saved as EML_data.csv. Final datasets:
- antibotics_data.csv
- EML_data.csv (missing data entered manually, plus other validations, before saving to this file)
- data.csv
