# **Data Processing**

# Import Library

In [1]:
import pandas as pd

In [2]:
# DataFrame with data about medications and side effects
df_medications = pd.read_csv('../data/raw/medicine-dataset.csv', low_memory=False)

# Print the original DataFrame
df_medications

Unnamed: 0,id,name,substitute0,substitute1,substitute2,substitute3,substitute4,sideEffect0,sideEffect1,sideEffect2,...,sideEffect41,use0,use1,use2,use3,use4,Chemical Class,Habit Forming,Therapeutic Class,Action Class
0,1,augmentin 625 duo tablet,Penciclav 500 mg/125 mg Tablet,Moxikind-CV 625 Tablet,Moxiforce-CV 625 Tablet,Fightox 625 Tablet,Novamox CV 625mg Tablet,Vomiting,Nausea,Diarrhea,...,,Treatment of Bacterial infections,,,,,,No,ANTI INFECTIVES,
1,2,azithral 500 tablet,Zithrocare 500mg Tablet,Azax 500 Tablet,Zady 500 Tablet,Cazithro 500mg Tablet,Trulimax 500mg Tablet,Vomiting,Nausea,Abdominal pain,...,,Treatment of Bacterial infections,,,,,Macrolides,No,ANTI INFECTIVES,Macrolides
2,3,ascoril ls syrup,Solvin LS Syrup,Ambrodil-LX Syrup,Zerotuss XP Syrup,Capex LS Syrup,Broxum LS Syrup,Nausea,Vomiting,Diarrhea,...,,Treatment of Cough with mucus,,,,,,No,RESPIRATORY,
3,4,allegra 120mg tablet,Lcfex Tablet,Etofex 120mg Tablet,Nexofex 120mg Tablet,Fexise 120mg Tablet,Histafree 120 Tablet,Headache,Drowsiness,Dizziness,...,,Treatment of Sneezing and runny nose due to al...,Treatment of Allergic conditions,,,,Diphenylmethane Derivative,No,RESPIRATORY,H1 Antihistaminics (second Generation)
4,5,avil 25 tablet,Eralet 25mg Tablet,,,,,Sleepiness,Dryness in mouth,,...,,Treatment of Allergic conditions,,,,,Pyridines Derivatives,No,RESPIRATORY,H1 Antihistaminics (First Generation)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248213,248214,zestrain 100mg/325mg/250mg tablet mr,Aclotec MR 100mg/325mg/250mg Tablet,SAMONEC MR TABLET,Uniclof 100mg/325mg/250mg Tablet MR,Macito 100mg/325mg/250mg Tablet MR,Rumatin 100mg/325mg/250mg Tablet MR,Nausea,Vomiting,Heartburn,...,,Treatment of Muscular pain,,,,,,No,PAIN ANALGESICS,
248214,248215,zoxinace 200mg tablet sr,Algic 200mg Tablet SR,Topnac 200mg Tablet SR,Kindac 200mg Tablet SR,Ultranac 200mg Tablet SR,Bignac 200mg Tablet SR,Dizziness,Indigestion,Nausea,...,,Pain relief,,,,,Dichlorobenzenes Derivative,No,PAIN ANALGESICS,NSAID's- Non-Selective COX 1&2 Inhibitors (ace...
248215,248216,zivex 25mg tablet,HD Zine 25mg Tablet,Hydrocas 25mg Tablet,Hyzox 25 Tablet,Hydil 25mg Tablet,Zyzine 25mg Tablet,Sedation,Nausea,Vomiting,...,,Treatment of Anxiety,Treatment of Skin conditions with inflammation...,,,,Piperazine Derivative,No,RESPIRATORY,H1 Antihistaminics (First Generation)
248216,248217,zi fast 500mg injection,Zycin 500mg Injection,Aziwok 500mg Injection,Azirow 500mg Injection,Toracin 500mg Injection,Azymed 500mg Injection,"Injection site reactions (pain, swelling, redn...",Vomiting,Nausea,...,,Treatment of Bacterial infections,,,,,Macrolides,No,ANTI INFECTIVES,Macrolides


In [3]:
# Standardize column names
renamed_cols = dict()
for col in df_medications.columns:
    renamed_cols[col] = col.lower().replace(' ', '_')

# Convert columns values to lowercase
df_medications = df_medications \
    .applymap(lambda x: x.lower() if isinstance(x, str) else x)

# Remove the id column, rename columns, and remove duplicated rows
df_medications = df_medications \
    .drop(columns=['id']) \
    .rename(columns=renamed_cols) \
    .drop_duplicates()

In [4]:
# Columns of interest for the processed DataFrame
cols_of_interest = [
    col for col in df_medications.columns \
    if (not col.startswith('substitute') and col != 'name')
]

# Columns to iterate over (substitute medications)
cols_to_iterate = [
    col for col in df_medications.columns if col.startswith('substitute')
]

# Create a DataFrame to store the processed data
df_processed_medications = df_medications.copy() \
    .drop(columns=cols_to_iterate)

# Iterate over the substitute columns
for col in cols_to_iterate:
    # Cut the DataFrame according to the column in question
    cols_to_keep = [col] + cols_of_interest
    df_cut = df_medications[cols_to_keep].copy() \
        .rename(columns={col: 'name'}) \
        .dropna(subset='name') \
        .drop_duplicates()
    
    # Concat the data in the processed DataFrame
    df_processed_medications = pd.concat(
        objs=[df_processed_medications, df_cut],
        ignore_index=True
    )

# Remove duplicated rows
df_processed_medications = df_processed_medications.drop_duplicates()

In [5]:
# Combine the side effects of each row into one list
effect_cols = [col for col in df_processed_medications.columns if col.startswith('sideeffect')]
df_processed_medications['side_effects'] = df_processed_medications[effect_cols] \
    .apply(
        lambda row: sorted(
            [x for x in row.dropna().tolist() if str(x).strip() != '']
        ),
        axis=1
    )
df_processed_medications = df_processed_medications.drop(columns=effect_cols)

# Combine the uses of each row into one list
use_cols = [col for col in df_processed_medications.columns if col.startswith('use')]
df_processed_medications['uses'] = df_processed_medications[use_cols] \
    .apply(
        lambda row: sorted(
            [x for x in row.dropna().tolist() if str(x).strip() != '']
        ),
        axis=1
    )
df_processed_medications = df_processed_medications.drop(columns=use_cols)

In [6]:
df_processed_medications.sort_values(by='name').reset_index(drop=True)

Unnamed: 0,name,chemical_class,habit_forming,therapeutic_class,action_class,side_effects,uses
0,1 al plus 5mg/120mg capsule,,no,respiratory,,"[dryness in mouth, headache, nausea, restlessn...",[ sneezing and runny nose due to allergies]
1,1 nvp tablet,,no,gastro intestinal,,"[constipation, dizziness, dryness in mouth, fa...",[treatment of nausea and vomiting in pregnancy]
2,1-al 10 tablet,piperazine derivatives,no,respiratory,h1 antihistaminics (second generation),"[dryness in mouth, fatigue, headache, nasophar...",[treatment of allergic conditions]
3,1-al m syrup,,no,respiratory,,"[diarrhea, dryness in mouth, fatigue, headache...","[treatment of allergic skin conditions, treatm..."
4,1-al syrup,piperazine derivatives,no,respiratory,h1 antihistaminics (second generation),"[constipation, dizziness, dryness in mouth, fa...",[treatment of allergic conditions]
...,...,...,...,...,...,...,...
231014,zyxtil 500mg tablet,intermediate spectrum {second generation cepha...,no,anti infectives,cephalosporins: 2nd generation,"[allergic reaction, diarrhea, increased liver ...",[treatment of bacterial infections]
231015,zyzer syrup,,no,vitamins minerals nutrients,,"[blurred vision, constipation, drowsiness, dry...",[ appetite stimulant]
231016,zyzine 25mg tablet,piperazine derivative,no,respiratory,h1 antihistaminics (first generation),"[constipation, nausea, sedation, upset stomach...","[treatment of anxiety, treatment of skin condi..."
231017,zyzolide 600mg tablet,oxazolidinone derivative,no,anti infectives,oxazolidinone,"[decreased blood cells (red cells, white cells...",[treatment of severe bacterial infections]


In [7]:
df_processed_medications

Unnamed: 0,name,chemical_class,habit_forming,therapeutic_class,action_class,side_effects,uses
0,augmentin 625 duo tablet,,no,anti infectives,,"[diarrhea, nausea, vomiting]",[treatment of bacterial infections]
1,azithral 500 tablet,macrolides,no,anti infectives,macrolides,"[abdominal pain, diarrhea, nausea, vomiting]",[treatment of bacterial infections]
2,ascoril ls syrup,,no,respiratory,,"[allergic reaction, diarrhea, dizziness, heada...",[treatment of cough with mucus]
3,allegra 120mg tablet,diphenylmethane derivative,no,respiratory,h1 antihistaminics (second generation),"[dizziness, drowsiness, headache, nausea]","[treatment of allergic conditions, treatment o..."
4,avil 25 tablet,pyridines derivatives,no,respiratory,h1 antihistaminics (first generation),"[dryness in mouth, sleepiness]",[treatment of allergic conditions]
...,...,...,...,...,...,...,...
298598,simzopox 400mg tablet,nucleoside analog,no,anti infectives,antiviral (non-hiv) drugs,"[diarrhea, dizziness, fatigue, fever, headache...","[ chickenpox, genital herpes infection, herp..."
298609,pisi injection,enolic acid derivatives,no,pain analgesics,nsaid's- non-selective cox 1&2 inhibitors (eno...,"[dizziness, headache, injection site reactions...",[ pain relief]
298613,fevedot 125mg oral suspension,p-aminophenol derivative,no,pain analgesics,analgesic & antipyretic-pcm,"[indigestion, nausea, stomach pain, vomiting]","[ pain relief, treatment of fever]"
298615,acizide 125mg injection,broad spectrum (third & fourth generation ceph...,no,anti infectives,cephalosporins: 3 generation,"[allergic reaction, diarrhea, injection site r...",[ bacterial infections]


In [8]:
df_processed_medications.explode(column='side_effects')

Unnamed: 0,name,chemical_class,habit_forming,therapeutic_class,action_class,side_effects,uses
0,augmentin 625 duo tablet,,no,anti infectives,,diarrhea,[treatment of bacterial infections]
0,augmentin 625 duo tablet,,no,anti infectives,,nausea,[treatment of bacterial infections]
0,augmentin 625 duo tablet,,no,anti infectives,,vomiting,[treatment of bacterial infections]
1,azithral 500 tablet,macrolides,no,anti infectives,macrolides,abdominal pain,[treatment of bacterial infections]
1,azithral 500 tablet,macrolides,no,anti infectives,macrolides,diarrhea,[treatment of bacterial infections]
...,...,...,...,...,...,...,...
298617,setnther-ab injection,sesquiterpene lactones,no,anti malarials,antimalarial- artemisinin and derivatives,abdominal pain,[ malaria]
298617,setnther-ab injection,sesquiterpene lactones,no,anti malarials,antimalarial- artemisinin and derivatives,headache,[ malaria]
298617,setnther-ab injection,sesquiterpene lactones,no,anti malarials,antimalarial- artemisinin and derivatives,"injection site reactions (pain, swelling, redn...",[ malaria]
298617,setnther-ab injection,sesquiterpene lactones,no,anti malarials,antimalarial- artemisinin and derivatives,nausea,[ malaria]


In [9]:
df_processed_medications.explode(column='uses')

Unnamed: 0,name,chemical_class,habit_forming,therapeutic_class,action_class,side_effects,uses
0,augmentin 625 duo tablet,,no,anti infectives,,"[diarrhea, nausea, vomiting]",treatment of bacterial infections
1,azithral 500 tablet,macrolides,no,anti infectives,macrolides,"[abdominal pain, diarrhea, nausea, vomiting]",treatment of bacterial infections
2,ascoril ls syrup,,no,respiratory,,"[allergic reaction, diarrhea, dizziness, heada...",treatment of cough with mucus
3,allegra 120mg tablet,diphenylmethane derivative,no,respiratory,h1 antihistaminics (second generation),"[dizziness, drowsiness, headache, nausea]",treatment of allergic conditions
3,allegra 120mg tablet,diphenylmethane derivative,no,respiratory,h1 antihistaminics (second generation),"[dizziness, drowsiness, headache, nausea]",treatment of sneezing and runny nose due to al...
...,...,...,...,...,...,...,...
298609,pisi injection,enolic acid derivatives,no,pain analgesics,nsaid's- non-selective cox 1&2 inhibitors (eno...,"[dizziness, headache, injection site reactions...",pain relief
298613,fevedot 125mg oral suspension,p-aminophenol derivative,no,pain analgesics,analgesic & antipyretic-pcm,"[indigestion, nausea, stomach pain, vomiting]",pain relief
298613,fevedot 125mg oral suspension,p-aminophenol derivative,no,pain analgesics,analgesic & antipyretic-pcm,"[indigestion, nausea, stomach pain, vomiting]",treatment of fever
298615,acizide 125mg injection,broad spectrum (third & fourth generation ceph...,no,anti infectives,cephalosporins: 3 generation,"[allergic reaction, diarrhea, injection site r...",bacterial infections
