# **Data Processing**

# Import Library

In [1]:
import pandas as pd

# Load the Dataset

In [2]:
# DataFrame with data about medications and side effects
df_medications = pd.read_csv('../data/raw/medicine-dataset.csv', low_memory=False)

# Print the original DataFrame
df_medications

Unnamed: 0,id,name,substitute0,substitute1,substitute2,substitute3,substitute4,sideEffect0,sideEffect1,sideEffect2,...,sideEffect41,use0,use1,use2,use3,use4,Chemical Class,Habit Forming,Therapeutic Class,Action Class
0,1,augmentin 625 duo tablet,Penciclav 500 mg/125 mg Tablet,Moxikind-CV 625 Tablet,Moxiforce-CV 625 Tablet,Fightox 625 Tablet,Novamox CV 625mg Tablet,Vomiting,Nausea,Diarrhea,...,,Treatment of Bacterial infections,,,,,,No,ANTI INFECTIVES,
1,2,azithral 500 tablet,Zithrocare 500mg Tablet,Azax 500 Tablet,Zady 500 Tablet,Cazithro 500mg Tablet,Trulimax 500mg Tablet,Vomiting,Nausea,Abdominal pain,...,,Treatment of Bacterial infections,,,,,Macrolides,No,ANTI INFECTIVES,Macrolides
2,3,ascoril ls syrup,Solvin LS Syrup,Ambrodil-LX Syrup,Zerotuss XP Syrup,Capex LS Syrup,Broxum LS Syrup,Nausea,Vomiting,Diarrhea,...,,Treatment of Cough with mucus,,,,,,No,RESPIRATORY,
3,4,allegra 120mg tablet,Lcfex Tablet,Etofex 120mg Tablet,Nexofex 120mg Tablet,Fexise 120mg Tablet,Histafree 120 Tablet,Headache,Drowsiness,Dizziness,...,,Treatment of Sneezing and runny nose due to al...,Treatment of Allergic conditions,,,,Diphenylmethane Derivative,No,RESPIRATORY,H1 Antihistaminics (second Generation)
4,5,avil 25 tablet,Eralet 25mg Tablet,,,,,Sleepiness,Dryness in mouth,,...,,Treatment of Allergic conditions,,,,,Pyridines Derivatives,No,RESPIRATORY,H1 Antihistaminics (First Generation)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248213,248214,zestrain 100mg/325mg/250mg tablet mr,Aclotec MR 100mg/325mg/250mg Tablet,SAMONEC MR TABLET,Uniclof 100mg/325mg/250mg Tablet MR,Macito 100mg/325mg/250mg Tablet MR,Rumatin 100mg/325mg/250mg Tablet MR,Nausea,Vomiting,Heartburn,...,,Treatment of Muscular pain,,,,,,No,PAIN ANALGESICS,
248214,248215,zoxinace 200mg tablet sr,Algic 200mg Tablet SR,Topnac 200mg Tablet SR,Kindac 200mg Tablet SR,Ultranac 200mg Tablet SR,Bignac 200mg Tablet SR,Dizziness,Indigestion,Nausea,...,,Pain relief,,,,,Dichlorobenzenes Derivative,No,PAIN ANALGESICS,NSAID's- Non-Selective COX 1&2 Inhibitors (ace...
248215,248216,zivex 25mg tablet,HD Zine 25mg Tablet,Hydrocas 25mg Tablet,Hyzox 25 Tablet,Hydil 25mg Tablet,Zyzine 25mg Tablet,Sedation,Nausea,Vomiting,...,,Treatment of Anxiety,Treatment of Skin conditions with inflammation...,,,,Piperazine Derivative,No,RESPIRATORY,H1 Antihistaminics (First Generation)
248216,248217,zi fast 500mg injection,Zycin 500mg Injection,Aziwok 500mg Injection,Azirow 500mg Injection,Toracin 500mg Injection,Azymed 500mg Injection,"Injection site reactions (pain, swelling, redn...",Vomiting,Nausea,...,,Treatment of Bacterial infections,,,,,Macrolides,No,ANTI INFECTIVES,Macrolides


# Processing Steps

## Standardize Column Names and Values

In [3]:
# Standardize column names
renamed_cols = dict()
for col in df_medications.columns:
    renamed_cols[col] = col.lower().replace(' ', '_')

# Rename columns
df_medications = df_medications.rename(columns=renamed_cols)

# Convert columns values to lowercase
df_medications = df_medications \
    .map(lambda col: col.lower() if isinstance(col, str) else col)

## Remove Original ID and Duplicated Rows

In [4]:
# Remove the id column and duplicated rows
df_medications = df_medications.drop(columns=['id']).drop_duplicates()

## Set Medications (`name`) as Primary Key

In [5]:
# Columns of interest for the processed DataFrame
cols_of_interest = [
    col for col in df_medications.columns \
    if (not col.startswith('substitute') and col != 'name')
]

# Columns to iterate over (substitute medications)
cols_to_iterate = [
    col for col in df_medications.columns if col.startswith('substitute')
]

# Create a DataFrame to store the processed data
df_processed_medications = df_medications.copy() \
    .drop(columns=cols_to_iterate)

# Iterate over the substitute columns
for col in cols_to_iterate:
    # Cut the DataFrame according to the column in question
    cols_to_keep = [col] + cols_of_interest
    df_cut = df_medications[cols_to_keep].copy() \
        .rename(columns={col: 'name'}) \
        .dropna(subset='name') \
        .drop_duplicates()
    
    # Concat the data in the processed DataFrame
    df_processed_medications = pd.concat(
        objs=[df_processed_medications, df_cut],
        ignore_index=True,
    )

# Remove duplicated rows
df_processed_medications = df_processed_medications.drop_duplicates()

## Concatenate Side Effects and Uses into Lists

In [6]:
# Columns to concatenate
cols_to_concatenate = {
    'sideeffect': 'side_effects',
    'use': 'uses',
}

# Iterate over the columns to concatenate
for col_prefix in cols_to_concatenate.keys():
    # Columns to be concatenated
    old_cols = [
        col for col in df_processed_medications.columns \
        if col.startswith(col_prefix)
    ]

    # Combine the column values of each row into one list
    col_new_name = cols_to_concatenate[col_prefix]
    df_processed_medications[col_new_name] = df_processed_medications \
        [old_cols] \
        .apply(lambda row: sorted(row.dropna().astype(str).tolist()), axis=1)
    
    # Remove the original columns
    df_processed_medications = df_processed_medications.drop(columns=old_cols)

## Remove Duplicated Data for Medications

In [7]:
# Explode the side effects list
df_processed_medications = df_processed_medications \
    .explode(column='side_effects')

# Explode the uses list
df_processed_medications = df_processed_medications \
    .explode(column='uses')

# Remove duplicated rows
df_processed_medications = df_processed_medications.drop_duplicates()

# Concatenate all column values for each medication
df_processed_medications = df_processed_medications \
    .groupby('name') \
    .agg(
        side_effects = pd.NamedAgg(column='side_effects', aggfunc=set),
        uses = pd.NamedAgg(column='uses', aggfunc=set),
        chemical_class = pd.NamedAgg(column='chemical_class', aggfunc=set),
        habit_forming = pd.NamedAgg(column='habit_forming', aggfunc=set),
        therapeutic_class = pd.NamedAgg(column='therapeutic_class', aggfunc=set),
        action_class = pd.NamedAgg(column='action_class', aggfunc=set),
    ) \
    .reset_index()

# Processed Dataset

In [8]:
# Store the processed dataset
df_processed_medications.to_csv('../data/processed/medicine-dataset.csv', index=False)

In [9]:
# Print the processed dataset
df_processed_medications

Unnamed: 0,name,side_effects,uses,chemical_class,habit_forming,therapeutic_class,action_class
0,1 al plus 5mg/120mg capsule,"{sleepiness, restlessness, headache, dryness i...",{ sneezing and runny nose due to allergies},{nan},{no},{respiratory},{nan}
1,1 nvp tablet,"{sleepiness, constipation, dryness in mouth, f...",{treatment of nausea and vomiting in pregnancy},{nan},{no},{gastro intestinal},{nan}
2,1-al 10 tablet,"{sleepiness, headache, dryness in mouth, nasop...",{treatment of allergic conditions},{piperazine derivatives},{no},{respiratory},{h1 antihistaminics (second generation)}
3,1-al m syrup,"{sleepiness, skin rash, rash, headache, diarrh...",{treatment of sneezing and runny nose due to a...,{nan},{no},{respiratory},{nan}
4,1-al syrup,"{sleepiness, constipation, headache, dryness i...",{treatment of allergic conditions},{piperazine derivatives},{no},{respiratory},{h1 antihistaminics (second generation)}
...,...,...,...,...,...,...,...
230601,zyxtil 500mg tablet,"{rash, diarrhea, allergic reaction, increased ...",{treatment of bacterial infections},{intermediate spectrum {second generation ceph...,{no},{anti infectives},{cephalosporins: 2nd generation}
230602,zyzer syrup,"{sleepiness, constipation, drowsiness, blurred...",{ appetite stimulant},{nan},{no},{vitamins minerals nutrients},{nan}
230603,zyzine 25mg tablet,"{constipation, upset stomach, nausea, vomiting...","{treatment of anxiety, treatment of skin condi...",{piperazine derivative},{no},{respiratory},{h1 antihistaminics (first generation)}
230604,zyzolide 600mg tablet,"{decreased blood cells (red cells, white cells...",{treatment of severe bacterial infections},{oxazolidinone derivative},{no},{anti infectives},{oxazolidinone}
