In [None]:
# import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from scipy import stats
import kagglehub
import json
import os
import re

In [None]:
# Download the dataset through kagglehub and print the local path it returns,
# so the reader can see where the files ended up on this machine.
path = kagglehub.dataset_download("shayanhusain/drug-food-interactions-dataset")

print("Path to dataset files:", path)

In [None]:
# The downloaded dataset is a directory, so build the path to the single JSON
# file inside it that this notebook works with.
json_file = os.path.join(path, 'Drug to Food Interactions Dataset.json')

In [None]:
# Sanity check before loading: show the resolved file path and whether it
# actually exists on disk.
print("Looking for file:", json_file)
print("File exists?", os.path.exists(json_file))

In [None]:
# loading data
# Fail loudly when the file is missing: silently skipping the load would leave
# `data` undefined and only surface later as a confusing NameError.
if not os.path.exists(json_file):
    raise FileNotFoundError(f"Dataset file not found: {json_file}")

# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)
print("Successfully loaded JSON file!")
print(type(data))

In [None]:
# Turn the parsed JSON into a pandas DataFrame.
# NOTE(review): this rebinds `data`, so the raw parsed object is no longer
# reachable under that name after this cell runs.
data = pd.DataFrame(data)

In [None]:
# Preview the first rows of the freshly built frame.
data.head()

In [None]:
# The source of each record is not needed for this analysis, so remove the
# 'reference' column.
data = data.drop(columns=['reference'])

In [None]:
# Keep each row's current index as a regular column: it still identifies the
# originating record once interactions are exploded into several rows.
# (will rearrange columns later)
data = data.assign(drug_index=data.index)

In [None]:
# Convert every value of the column to str (this converts values, it does not
# merely check them) so the string-cleaning helpers below can process it.
data = data.assign(food_interactions=data['food_interactions'].astype(str))

In [None]:
# cleans those strings to eventually get each interaction as own record
def cleanInteractions(text):
    """Normalise one raw interactions string before it is split into sentences.

    Args:
        text: The raw value to clean.

    Returns:
        The cleaned string, or "" when ``text`` is not a str. Cleaning
        rewrites "St. John's Wort" without its period (so a later split on
        '.' does not cut the name in half), removes every '[' and ']'
        character, and trims surrounding whitespace.
    """
    if isinstance(text, str):
        # Protect the abbreviation's period from sentence splitting.
        protected = text.replace("St. John's Wort", "St John's Wort")
        # Delete all square brackets in a single pass.
        unbracketed = protected.translate(str.maketrans('', '', '[]'))
        return unbracketed.strip()
    return ""

In [None]:
# extracts specifically interactions and nothing else
def extractInteractions(text):
    """Split a cleaned interactions string into individual sentences.

    Args:
        text: The string to split on '.' characters.

    Returns:
        A list of whitespace-trimmed, non-empty pieces, skipping any piece
        that starts with "examples include" (case-insensitive). Returns an
        empty list when ``text`` is not a str.
    """
    if not isinstance(text, str):
        return []

    kept = []
    for piece in text.split('.'):
        sentence = piece.strip()
        if not sentence:
            continue
        # "Examples include ..." fragments are illustrations, not interactions.
        if sentence.lower().startswith('examples include'):
            continue
        kept.append(sentence)
    return kept

In [None]:
# First normalise the raw text, then split the normalised text into a list of
# interaction sentences per row.
data = (
    data
    .assign(cleaned_text=lambda frame: frame['food_interactions'].apply(cleanInteractions))
    .assign(interaction=lambda frame: frame['cleaned_text'].apply(extractInteractions))
)

In [None]:
# Give every extracted interaction its own row, keep only entries longer than
# two characters, and finish with a clean 0..n-1 index.
data = (
    data
    .explode('interaction', ignore_index=True)
    .loc[lambda frame: frame['interaction'].str.len() > 2]
    .reset_index(drop=True)
)

In [None]:
# The intermediate text columns have served their purpose once each
# interaction has its own row, so remove both in one call.
data = data.drop(columns=['cleaned_text', 'food_interactions'])

In [None]:
# cleaning the strings up more
def cleanStrings(text):
    """Trim separator debris from both ends of one interaction string.

    Leading and trailing quote characters, commas and whitespace are removed,
    and every internal run of whitespace is collapsed to a single space.
    Characters in the middle of the string (including quotes) are left alone.

    NOTE(review): the debris presumably comes from list-style text such as
    "'a', 'b'" having been split on '.' -- confirm against the raw data.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string (possibly empty). Any non-str value, such as
        NaN or None, is returned unchanged.
    """
    # A str is never NA, so this single check also covers NaN/None inputs.
    if not isinstance(text, str):
        return text

    # Quote, comma and whitespace characters to peel off both ends. Using
    # str.strip with a character set trims each end independently, cannot
    # index past the end of a short string, and never touches the interior.
    edge_junk = "'\"," + " \t\n\r\x0b\x0c"
    text = text.strip(edge_junk)

    # Collapse internal whitespace runs (spaces, tabs, newlines) to one space.
    return ' '.join(text.split())
# Run the per-string cleanup over every interaction.
data = data.assign(interaction=data['interaction'].apply(cleanStrings))

In [None]:
# Preview the first rows after cleaning.
data.head()

In [None]:
#data.to_csv('interactions.csv')

In [None]:
# Fold a less common wording into the more common one so there are fewer
# distinct interactions to analyse. Only exact whole-value matches change.
data['interaction'] = data['interaction'].replace(
    {'The absorption is unaffected by food': 'Take with or without food'}
)


In [None]:
# beginning the tokenization process using interaction column
def tokenize(text):
    """Return the unique lowercase word tokens of ``text``.

    Args:
        text: The string to tokenize.

    Returns:
        A list of distinct tokens in order of first appearance. A token is a
        maximal run of word characters (regex ``\\w+``), matched after
        lowercasing. Returns an empty list when ``text`` is not a str.
    """
    # Non-string values (e.g. NaN) have no .lower(); treat them as token-free.
    if not isinstance(text, str):
        return []

    # `\w+` matches whole words; a bare `\w` between word boundaries would
    # only ever match one-character words.
    words = re.findall(r'\b\w+\b', text.lower())

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is identical from run to run (set iteration order is not).
    return list(dict.fromkeys(words))
# Store each interaction's token list in a new 'tokens' column.
data = data.assign(tokens=data['interaction'].apply(tokenize))