In [2]:
# import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from scipy import stats
import kagglehub
import json
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fetch the dataset through kagglehub and keep the returned value in `path`,
# then print it so the location of the files on the local machine is visible.
path = kagglehub.dataset_download("shayanhusain/drug-food-interactions-dataset")

print("Path to dataset files:", path)

Path to dataset files: /Users/mikayla/.cache/kagglehub/datasets/shayanhusain/drug-food-interactions-dataset/versions/1


In [4]:
# The download location is a directory, so build the full path to the single
# JSON file inside it that this notebook works with.
json_file = os.path.join(path, 'Drug to Food Interactions Dataset.json')

In [5]:
# Sanity check before loading: show the resolved path and whether it exists on disk.
print("Looking for file:", json_file)
print("File exists?", os.path.exists(json_file))

Looking for file: /Users/mikayla/.cache/kagglehub/datasets/shayanhusain/drug-food-interactions-dataset/versions/1/Drug to Food Interactions Dataset.json
File exists? True


In [6]:
# Load the raw JSON records into `data`.
# Fail loudly when the file is missing: silently skipping the load would leave
# `data` undefined (fresh kernel) or stale (re-run), which only surfaces later
# as a confusing error or, worse, as an analysis of old data.
if not os.path.exists(json_file):
    raise FileNotFoundError(f"Dataset file not found: {json_file}")

# JSON text is UTF-8 by specification, so do not depend on the platform's
# default encoding when opening the file.
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)
print("Successfully loaded JSON file!")
print(type(data))

Successfully loaded JSON file!
<class 'list'>


In [7]:
# Wrap the loaded JSON (a list, per the type printed when it was loaded) in a
# DataFrame. Note that this rebinds `data` from the raw list to the DataFrame.
data = pd.DataFrame(data)

In [8]:
# preview the first five rows of the freshly built frame
data.head(n=5)

Unnamed: 0,name,reference,food_interactions
0,Lepirudin,"Knox C, Wilson M, Klinger CM, et al. DrugBank ...",[Avoid herbs and supplements with anticoagulan...
1,Bivalirudin,"Knox C, Wilson M, Klinger CM, et al. DrugBank ...","[Avoid echinacea., Avoid herbs and supplements..."
2,Peginterferon alfa-2a,"Knox C, Wilson M, Klinger CM, et al. DrugBank ...",[Drink plenty of fluids.]
3,Alteplase,"Knox C, Wilson M, Klinger CM, et al. DrugBank ...",[Avoid herbs and supplements with anticoagulan...
4,Interferon alfa-n1,"Knox C, Wilson M, Klinger CM, et al. DrugBank ...",[Avoid alcohol.]


In [9]:
# the citation column is not needed for this analysis, so remove it
data = data.drop(columns=['reference'])

In [10]:
# keep each drug's original row number as a column so rows can still be tied
# back to their drug once interactions are exploded; columns are rearranged later
data = data.assign(drug_index=data.index)

In [11]:
# Converts (rather than merely verifies) every value in the column to its
# string form so the string-based cleaning helpers can process it.
# NOTE(review): the head() preview shows bracketed lists in this column; if the
# cells are Python lists, this produces their repr, brackets and quotes
# included -- confirm that is intended.
data['food_interactions'] = data['food_interactions'].astype(str)

In [12]:
def cleanInteractions(text):
    """Tidy one stringified interaction list before it is split into sentences.

    Steps, in order:
      1. "St. John's Wort" is rewritten as "St John's Wort" (presumably so a
         later split on '.' does not cut the name in two).
      2. Every '[' and ']' character is removed.
      3. Leading and trailing whitespace is stripped.

    Args:
        text: the raw value to clean.

    Returns:
        The cleaned string, or "" when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    without_abbrev_period = text.replace("St. John's Wort", "St John's Wort")
    # delete both bracket characters in a single pass
    without_brackets = without_abbrev_period.translate(str.maketrans('', '', '[]'))
    return without_brackets.strip()

In [13]:
def extractInteractions(text):
    """Break cleaned text into individual interaction sentences.

    The text is split on every '.' character; each piece is whitespace-stripped,
    and pieces that are empty or that begin (case-insensitively) with
    'examples include' are discarded. Because the split is on every period,
    text such as decimals or abbreviations is split as well.

    Args:
        text: the cleaned text to split.

    Returns:
        A list of the remaining pieces, or an empty list when ``text`` is not
        a ``str``.
    """
    if not isinstance(text, str):
        return []

    interactions = []
    for piece in text.split('.'):
        sentence = piece.strip()
        if not sentence:
            continue
        if sentence.lower().startswith('examples include'):
            continue
        interactions.append(sentence)
    return interactions

In [14]:
# first tidy the stringified lists, then split each one into a list of sentences
data['cleaned_text'] = data['food_interactions'].map(cleanInteractions)
data['interaction'] = data['cleaned_text'].map(extractInteractions)

In [15]:
# give every extracted interaction its own row, keep only interactions longer
# than two characters, and renumber the rows afterwards
data = (
    data.explode('interaction')
    .reset_index(drop=True)
    .loc[lambda frame: frame['interaction'].str.len() > 2]
    .reset_index(drop=True)
)

In [16]:
# the intermediate text columns have served their purpose; remove both at once
data = data.drop(columns=['cleaned_text', 'food_interactions'])

In [17]:
def cleanStrings(text):
    """Remove leftover list punctuation from one interaction string.

    The interaction strings still carry residue from the stringified list
    around them: quote characters, commas and spaces at either end (for
    example ``', 'Avoid alcohol``). This strips any run of those characters
    from both ends and collapses internal runs of whitespace to single spaces.

    Fixes relative to the previous version:
      * trailing residue is now removed from the END of the string (the old
        loop sliced from the front and emptied any string ending in a quote);
      * short or empty strings no longer raise IndexError;
      * text is no longer cut when a quote merely happens to sit at index 1
        or 2 of an otherwise clean string.

    Args:
        text: the value to clean.

    Returns:
        The cleaned string. Non-string values (including None and NaN) are
        returned unchanged.
    """
    # A str is never "missing", so the type check alone covers None/NaN and
    # avoids evaluating pd.isna on list-like values.
    if not isinstance(text, str):
        return text

    # quote, comma and whitespace characters left behind by the list repr
    residue_chars = "\"', \t\r\n"
    text = text.strip().strip(residue_chars)

    return ' '.join(text.split())
# run the string clean-up over every interaction
data['interaction'] = data['interaction'].map(cleanStrings)

In [18]:
# preview the first five rows after cleaning
data.head(n=5)

Unnamed: 0,name,drug_index,interaction
0,Lepirudin,0,Avoid herbs and supplements with anticoagulant...
1,Bivalirudin,1,Avoid echinacea
2,Bivalirudin,1,Avoid herbs and supplements with anticoagulant...
3,Peginterferon alfa-2a,2,Drink plenty of fluids
4,Alteplase,3,Avoid herbs and supplements with anticoagulant...


In [19]:
# persist the cleaned interactions; the row index is written out as well
# (it becomes the first, unnamed column of the CSV)
data.to_csv('interactions.csv', index=True)

In [None]:
# TODO: see interactions with similar words/phrases


In [20]:
# fold less common phrasings into their more common equivalent so there are
# fewer distinct interactions to analyse (whole-value matches only)
data['interaction'] = data['interaction'].replace(
    {'The absorption is unaffected by food': 'Take with or without food'})
