In [71]:
# import dependencies
import ast
import json
import os

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import ListedColormap
from scipy import stats

In [72]:
# Fetch the Kaggle dataset through kagglehub and report the path it returns.
dataset_handle = "shayanhusain/drug-food-interactions-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /Users/mikayla/.cache/kagglehub/datasets/shayanhusain/drug-food-interactions-dataset/versions/1


In [73]:
# The download location is a directory; build the path to the one JSON file used here.
dataset_filename = 'Drug to Food Interactions Dataset.json'
json_file = os.path.join(path, dataset_filename)

In [74]:
# Sanity check before loading: show the resolved path and whether it exists on disk.
json_file_present = os.path.exists(json_file)
print("Looking for file:", json_file)
print("File exists?", json_file_present)

Looking for file: /Users/mikayla/.cache/kagglehub/datasets/shayanhusain/drug-food-interactions-dataset/versions/1/Drug to Food Interactions Dataset.json
File exists? True


In [75]:
# Load the raw records from the JSON file.
# Fail loudly when the file is missing: skipping the load silently would leave
# `data` undefined (or, in a used kernel, still bound to a stale value) and the
# following cells would fail or run on the wrong input.
if not os.path.exists(json_file):
    raise FileNotFoundError(f"Dataset file not found: {json_file}")

# Read as UTF-8 explicitly so decoding does not depend on the platform's default encoding.
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)
print("Successfully loaded JSON file!")
print(type(data))

Successfully loaded JSON file!
<class 'list'>


In [76]:
# Wrap the loaded JSON records (a Python list, per the type printed on load) in a DataFrame.
# Note: this rebinds `data` from the raw list to the DataFrame.
data = pd.DataFrame(data)

In [77]:
# Preview the first rows to check the columns produced from the JSON records.
data.head()

Unnamed: 0,name,reference,food_interactions
0,Lepirudin,"Knox C, Wilson M, Klinger CM, et al. DrugBank ...",[Avoid herbs and supplements with anticoagulan...
1,Bivalirudin,"Knox C, Wilson M, Klinger CM, et al. DrugBank ...","[Avoid echinacea., Avoid herbs and supplements..."
2,Peginterferon alfa-2a,"Knox C, Wilson M, Klinger CM, et al. DrugBank ...",[Drink plenty of fluids.]
3,Alteplase,"Knox C, Wilson M, Klinger CM, et al. DrugBank ...",[Avoid herbs and supplements with anticoagulan...
4,Interferon alfa-n1,"Knox C, Wilson M, Klinger CM, et al. DrugBank ...",[Avoid alcohol.]


In [78]:
# The citation column is not needed for this analysis, so leave it out.
data = data.drop(columns=['reference'])

In [79]:
# Keep each drug's original row position as a column; it identifies the source
# drug once interactions are exploded into separate rows. Columns get rearranged later.
data = data.assign(drug_index=data.index)

In [80]:
# Cast every food_interactions value to str, which is the type the cleaning functions expect.
# NOTE(review): the values are lists at this point, so the cast stores each list's printed
# form (brackets, quotes and separating commas included), as the later preview of this
# column shows. Downstream cleaning has to cope with that text form.
data['food_interactions'] = data['food_interactions'].astype(str)

In [81]:
# cleans those strings to eventually get each interaction as own record
def cleanInteractions(text):
    if not isinstance(text, str):
        return ""
    text = text.replace("St. John's Wort", "St John's Wort")
    text = text.replace(']', '').replace('[', '')
    text = text.strip()
    return text

In [82]:
# extracts specifically interactions and nothing else
def extractInteractions(text):
    if isinstance(text, str):
        sentences = text.split('.')
        return [s.strip() for s in sentences
            if s.strip() and not s.strip().lower().startswith('examples include')]
        
    return []            

In [83]:
# Build both derived columns in one step: cleaned_text from the raw text, then
# interaction (a list of sentences per drug) from cleaned_text. The second
# keyword can read the column created by the first within the same assign call.
data = data.assign(
    cleaned_text=lambda frame: frame['food_interactions'].apply(cleanInteractions),
    interaction=lambda frame: frame['cleaned_text'].apply(extractInteractions),
)

In [84]:
# Give every extracted interaction its own row, then keep only rows whose
# interaction text is longer than 2 characters and renumber the index.
data = (
    data.explode('interaction', ignore_index=True)
        .loc[lambda frame: frame['interaction'].str.len() > 2]
        .reset_index(drop=True)
)

In [85]:
# Preview the exploded frame: one row per drug/interaction pair.
data.head()

Unnamed: 0,name,food_interactions,drug_index,cleaned_text,interaction
0,Lepirudin,['Avoid herbs and supplements with anticoagula...,0,'Avoid herbs and supplements with anticoagulan...,'Avoid herbs and supplements with anticoagulan...
1,Bivalirudin,"['Avoid echinacea.', 'Avoid herbs and suppleme...",1,"'Avoid echinacea.', 'Avoid herbs and supplemen...",'Avoid echinacea
2,Bivalirudin,"['Avoid echinacea.', 'Avoid herbs and suppleme...",1,"'Avoid echinacea.', 'Avoid herbs and supplemen...","', 'Avoid herbs and supplements with anticoagu..."
3,Peginterferon alfa-2a,['Drink plenty of fluids.'],2,'Drink plenty of fluids.','Drink plenty of fluids
4,Alteplase,['Avoid herbs and supplements with anticoagula...,3,'Avoid herbs and supplements with anticoagulan...,'Avoid herbs and supplements with anticoagulan...


In [88]:
# cleaned_text was only an intermediate for building the interaction column;
# remove it and write the result to disk.
# NOTE(review): to_csv is called without index=False, so the row index is
# presumably written as an extra leading column - confirm that is wanted.
data = data.drop(columns=['cleaned_text'])
data.to_csv('drug_to_food_interactions.csv')