# Download file:

In [37]:
# !pip install openpyxl
# !pip freeze > ../requirements.txt

In [38]:
from __future__ import annotations
import os, json, time, shutil
from typing import Iterable, List, Dict, Any, Optional
import requests
import pandas as pd
from dotenv import load_dotenv

load_dotenv()  # Loads .env variables (e.g., SPOONACULAR_API_KEY)

True

In [39]:
# Folder that receives every downloaded / exported file. The relative path is
# resolved against the current working directory at the time this cell runs.
DATA_DIR = os.path.abspath("../data")
os.makedirs(DATA_DIR, exist_ok=True)

# Choose the upstream API you want to use ("spoonacular" or "themealdb").
# `or` is used instead of a getenv default so that a blank or whitespace-only
# `RECIPE_API=` entry also falls back to "themealdb"; os.getenv would return
# "" for it and no API would be selected.
API = (os.getenv("RECIPE_API") or "").strip().lower() or "themealdb"

SPOONACULAR_API_KEY = os.getenv("SPOONACULAR_API_KEY")  # put in .env

# Cuisine names passed one by one to the Spoonacular search.
CUISINES = [
    "African","Asian","American","British","Cajun","Caribbean","Chinese",
    "Eastern European","European","French","German","Greek","Indian","Irish",
    "Italian","Japanese","Jewish","Korean","Latin American","Mediterranean",
    "Mexican","Middle Eastern","Nordic","Southern","Spanish","Thai","Vietnamese"
]

# TheMealDB is queried once per first letter of the meal name.
MEALDB_FIRST_LETTERS = list("abcdefghijklmnopqrstuvwxyz")

In [40]:
def reset_kagglehub_dataset(dataset_name: str) -> None:
    """Remove the cached kagglehub directory for one dataset (for a clean re-download).

    The cache root is taken from the ``KAGGLEHUB_CACHE`` environment variable
    when it is set, otherwise ``~/.cache/kagglehub`` is used.

    Args:
        dataset_name: Dataset handle in ``owner/name`` form.

    Raises:
        ValueError: If the handle is empty or would resolve to the cache root
            itself or to a location outside of it. Without this guard an empty
            handle would delete the cache of *every* dataset.
    """
    handle = (dataset_name or "").strip().strip("/")
    if not handle:
        raise ValueError("dataset_name must be a non-empty 'owner/name' handle")

    cache_root = os.environ.get("KAGGLEHUB_CACHE") or os.path.join(
        os.path.expanduser("~"), ".cache", "kagglehub"
    )
    cache_dir = os.path.abspath(os.path.join(cache_root, "datasets"))
    # abspath also normalises separators and collapses any ".." segments.
    dataset_dir = os.path.abspath(os.path.join(cache_dir, *handle.split("/")))

    # Refuse to delete anything that is not strictly inside the datasets cache.
    inside_cache = os.path.commonpath([cache_dir, dataset_dir]) == cache_dir
    if dataset_dir == cache_dir or not inside_cache:
        raise ValueError(f"Refusing to remove path outside the dataset cache: {dataset_dir}")

    if os.path.exists(dataset_dir):
        shutil.rmtree(dataset_dir)
        print(f"Removed dataset cache: {dataset_dir}")
    else:
        print(f"No cache found for: {dataset_dir}")

def download_and_move_kaggle_file(
    out_path: str,
    kaggle_dataset: str,
    kaggle_filename: str,
    replace_cache: bool = False
) -> bool:
    """Download a file from a Kaggle dataset (via kagglehub) and move it to out_path.

    Args:
        out_path: Destination file path. Missing parent folders are created and
            an existing file at this path is replaced.
        kaggle_dataset: Dataset handle in ``owner/name`` form.
        kaggle_filename: Name of the file inside the downloaded dataset folder.
        replace_cache: When True, the dataset's kagglehub cache is removed
            first so the download starts from scratch.

    Returns:
        True when the file was moved to ``out_path``; False when the dataset
        folder does not contain ``kaggle_filename``.
    """
    # Imported here because only this function needs kagglehub.
    import kagglehub

    if replace_cache:
        reset_kagglehub_dataset(kaggle_dataset)

    kaggle_dir = kagglehub.dataset_download(kaggle_dataset)
    src = os.path.join(kaggle_dir, kaggle_filename)

    if not os.path.exists(src):
        print(f"[WARN] {kaggle_filename} not found in {kaggle_dir}")
        return False

    # A bare file name has an empty dirname, and os.makedirs("") raises.
    parent_dir = os.path.dirname(out_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    if os.path.exists(out_path):
        os.remove(out_path)
        print(f"Removed existing file: {out_path}")

    # The file is moved (not copied), so the kagglehub folder no longer holds it.
    # NOTE(review): presumably why callers pass replace_cache=True - confirm.
    shutil.move(src, out_path)
    print(f"Saved {kaggle_filename} -> {out_path}")
    return True

# Kaggle sources to pull, one entry per file:
#   (dataset handle, name of the file inside that dataset)
KAGGLE_PATHS = [
    (
        "alaknandaa/recipes-data-by-cuisine",
        "all_cuisines.xlsx",
    ),
    (
        "wilmerarltstrmberg/recipe-dataset-over-2m",
        "recipes_data.csv",
    ),
    (
        "snehallokesh31096/recipe",
        "recipes_82k.csv",
    ),
    (
        "mfarazf/cuisine-dataset",
        "New file.csv",
    ),
    (
        "sarthak71/food-recipes",
        "food_recipes.csv",
    ),
    (
        "ajitrajput/foodrecipes",
        "recipes.csv",
    ),
]

def safe_read_df(path: str) -> Optional[pd.DataFrame]:
    """Load a CSV or Excel file into a DataFrame without ever raising.

    The reader is picked from the (case-insensitive) file extension. An
    unsupported extension or any error while reading is reported on stdout
    and results in ``None``.
    """
    try:
        lowered = path.lower()
        if lowered.endswith(".csv"):
            reader = pd.read_csv
        elif lowered.endswith((".xlsx", ".xls")):
            reader = pd.read_excel
        else:
            print(f"[WARN] Unsupported file type: {path}")
            return None
        return reader(path)
    except Exception as e:
        print(f"[ERROR] Reading {path}: {e}")
        return None

def download_kaggle_sources(paths=KAGGLE_PATHS, force_download: bool = False) -> Dict[str, pd.DataFrame]:
    """Download configured Kaggle files and return {nickname: DataFrame}.

    Each file is stored in ``DATA_DIR`` as ``<owner>__<dataset>.<ext>``. A file
    that is already present there is reused rather than downloaded again, so
    re-running the notebook does not repeat large downloads.

    Args:
        paths: Iterable of ``(dataset handle, file name)`` pairs.
        force_download: When True, wipe the kagglehub cache and download every
            file again even if a local copy exists.

    Returns:
        Mapping from nickname to the loaded, non-empty DataFrame. Sources that
        fail to download or read are reported on stdout and left out.
    """
    dfs: Dict[str, pd.DataFrame] = {}
    for dataset, filename in paths:
        nick = dataset.replace("/", "__")
        out_path = os.path.join(DATA_DIR, f"{nick}.{filename.split('.')[-1]}")

        try:
            if force_download or not os.path.exists(out_path):
                ok = download_and_move_kaggle_file(
                    out_path, kaggle_dataset=dataset, kaggle_filename=filename, replace_cache=True
                )
                if not ok:
                    continue
            else:
                print(f"[SKIP] Using existing file: {out_path}")

            df = safe_read_df(out_path)
            if df is not None and not df.empty:
                dfs[nick] = df
                display(df.head())
        except Exception as e:
            # Best effort: one broken source must not stop the remaining ones.
            print(f"[ERROR] Processing {dataset}: {e}")
            continue
    return dfs


In [41]:
# Fetch every source listed in KAGGLE_PATHS; the result maps nickname -> DataFrame.
kaggle_frames = download_kaggle_sources()


Removed dataset cache: C:\Users\georg.DESKTOP-2FS9VF1/.cache/kagglehub/datasets\alaknandaa\recipes-data-by-cuisine
Downloading from https://www.kaggle.com/api/v1/datasets/download/alaknandaa/recipes-data-by-cuisine?dataset_version_number=1...


100%|██████████| 3.28M/3.28M [00:01<00:00, 2.97MB/s]

Extracting files...





Saved all_cuisines.xlsx -> c:\Users\georg.DESKTOP-2FS9VF1\source\repos\699-capstone-team14\data\alaknandaa__recipes-data-by-cuisine.xlsx


Unnamed: 0,title,total_time,serving_size,ingr,instructions,nativeCuisine
0,Jamaican Fried Dumplings,20,6 servings,4 cups all-purpose flour,"In a large bowl, stir together the flour, baki...",carribean
1,Jamaican Fried Dumplings,20,6 servings,2 teaspoons baking powder,,carribean
2,Jamaican Fried Dumplings,20,6 servings,1 ½ teaspoons salt,,carribean
3,Jamaican Fried Dumplings,20,6 servings,½ cup butter,,carribean
4,Jamaican Fried Dumplings,20,6 servings,½ cup cold water,,carribean


Removed dataset cache: C:\Users\georg.DESKTOP-2FS9VF1/.cache/kagglehub/datasets\wilmerarltstrmberg\recipe-dataset-over-2m
Downloading from https://www.kaggle.com/api/v1/datasets/download/wilmerarltstrmberg/recipe-dataset-over-2m?dataset_version_number=2...


 17%|█▋        | 107M/635M [00:27<02:15, 4.10MB/s] 


KeyboardInterrupt: 

In [42]:
def spoonacular_complex_search_ids(api_key: str, cuisine: str, number: int = 100) -> List[int]:
    """Collect recipe IDs from Spoonacular complexSearch for a given cuisine.

    Args:
        api_key: Spoonacular API key, sent as the ``apiKey`` query parameter.
        cuisine: Cuisine name; it is lower-cased before being sent.
        number: Value of the ``number`` query parameter.

    Returns:
        The ``id`` of every entry in the response's ``results`` list. An empty
        list is returned on a non-OK HTTP status or on any error, after a
        message has been printed.
    """
    url = "https://api.spoonacular.com/recipes/complexSearch"
    params = {"cuisine": cuisine.lower(), "number": number, "apiKey": api_key}
    try:
        r = requests.get(url, params=params, timeout=20)
        if not r.ok:
            print(f"[Spoonacular] {r.status_code} for cuisine={cuisine}: {(r.text or '')[:180]}")
            return []
        data = r.json()
        # `or []` also covers a response where "results" is present but null.
        return [it["id"] for it in (data.get("results") or []) if "id" in it]
    except Exception as e:
        # The error text may contain the request URL, whose query string carries
        # the API key; mask it so the secret does not end up in saved output.
        detail = str(e)
        if api_key:
            detail = detail.replace(str(api_key), "***")
        print(f"[Spoonacular] Error for cuisine={cuisine}: {detail}")
        return []

def spoonacular_information_bulk(
    api_key: str, ids: Iterable[int], chunk_size: int = 100, pause: float = 0.25, max_retries: int = 3
) -> List[dict]:
    """Fetch recipe details in chunks from Spoonacular /recipes/informationBulk.

    Args:
        api_key: Spoonacular API key, sent as the ``apiKey`` query parameter.
        ids: Recipe ids; each one is converted with ``int``.
        chunk_size: Number of ids sent per request (must be >= 1).
        pause: Base delay in seconds, used between batches and for back-off.
        max_retries: Maximum attempts per batch.

    Returns:
        All recipe dicts collected from the successful batches. A batch that
        fails is reported on stdout and skipped.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    url = "https://api.spoonacular.com/recipes/informationBulk"
    ids = list(map(int, ids))
    out: List[dict] = []

    for i in range(0, len(ids), chunk_size):
        batch = ids[i:i + chunk_size]
        label = f"[Bulk {i//chunk_size}]"
        params = {"ids": ",".join(map(str, batch)), "apiKey": api_key}

        for attempt in range(1, max_retries + 1):
            try:
                resp = requests.get(url, params=params, timeout=30)
                if resp.status_code == 429:
                    if attempt == max_retries:
                        # Report the dropped batch instead of losing it silently.
                        print(f"{label} Still rate-limited (HTTP 429) after {attempt} attempts; batch skipped")
                        break
                    # Rate limit: linear backoff
                    time.sleep(pause * attempt + 0.5)
                    continue
                if not resp.ok:
                    preview = (resp.text or "")[:200].replace("\n", " ")
                    print(f"{label} HTTP {resp.status_code} {preview}")
                    break  # don't retry non-429 errors by default
                try:
                    data = resp.json()
                except ValueError:
                    print(f"{label} Non-JSON response")
                    break

                # Accept either a bare list or a {"recipes": [...]} wrapper.
                if isinstance(data, list):
                    out.extend(data)
                elif isinstance(data, dict) and "recipes" in data and isinstance(data["recipes"], list):
                    out.extend(data["recipes"])
                else:
                    print(f"{label} Unexpected JSON type={type(data)}")
                break  # success -> exit retry loop
            except Exception as e:
                if attempt == max_retries:
                    # The error text may contain the request URL (and thus the
                    # API key in its query string); mask it before printing.
                    detail = str(e)
                    if api_key:
                        detail = detail.replace(str(api_key), "***")
                    print(f"{label} Failed after {attempt} attempts: {detail}")
                else:
                    time.sleep(pause * attempt)
        time.sleep(pause)
    return out

def themealdb_search_by_first_letter(letter: str) -> List[dict]:
    """Return TheMealDB meals whose names start with `letter`.

    The ``search.php`` endpoint is queried with ``f=<letter>``. A non-OK
    status, an exception, or a missing/empty ``meals`` value all yield ``[]``;
    the first two also print a short message.
    """
    endpoint = "https://www.themealdb.com/api/json/v1/1/search.php"
    meals = []
    try:
        reply = requests.get(endpoint, params={"f": letter}, timeout=20)
        if reply.ok:
            found = reply.json().get("meals")
            if found:
                meals = found
        else:
            print(f"[TheMealDB] {reply.status_code} for f={letter}")
    except Exception as e:
        print(f"[TheMealDB] Error for f={letter}: {e}")
        return []
    return meals

def fetch_upstream_recipes() -> List[dict]:
    """Pull recipes from whichever upstream the module-level `API` selects.

    * ``"themealdb"``: one search per entry of ``MEALDB_FIRST_LETTERS``.
    * ``"spoonacular"``: gather ids for every cuisine, de-duplicate and sort
      them, then fetch the details in bulk. Needs ``SPOONACULAR_API_KEY``.

    Any other value (or a missing Spoonacular key) prints a warning and
    returns an empty list.
    """
    if API == "themealdb":
        meals = [
            meal
            for letter in MEALDB_FIRST_LETTERS
            for meal in themealdb_search_by_first_letter(letter)
        ]
        print(f"Total TheMealDB recipes: {len(meals)}")
        return meals

    if API != "spoonacular":
        print(f"[WARN] Unsupported API: {API}")
        return []

    if not SPOONACULAR_API_KEY:
        print("[WARN] SPOONACULAR_API_KEY missing; skipping Spoonacular pull.")
        return []

    # Step 1: ids per cuisine, merged into one sorted, duplicate-free list.
    unique_ids = sorted({
        recipe_id
        for cuisine in CUISINES
        for recipe_id in spoonacular_complex_search_ids(SPOONACULAR_API_KEY, cuisine)
    })
    print(f"Total unique Spoonacular IDs: {len(unique_ids)}")

    # Step 2: details for all ids in bulk.
    return spoonacular_information_bulk(SPOONACULAR_API_KEY, unique_ids)


In [44]:
# Run: API pull
api_recipes = fetch_upstream_recipes()

# Save API results
api_json_path = os.path.join(DATA_DIR, "recipe_api_data.json")
api_csv_path = os.path.join(DATA_DIR, "recipe_api_data.csv")

if api_recipes:
    # Explicit encoding so the file does not depend on the platform default.
    with open(api_json_path, "w", encoding="utf-8") as f:
        json.dump(api_recipes, f)

    pd.DataFrame(api_recipes).to_csv(api_csv_path, index=False)
    print(f"Saved API JSON -> {api_json_path}")
    print(f"Saved API CSV  -> {api_csv_path}")
else:
    # An empty pull (missing key, network error, unsupported API) must not
    # overwrite a previously saved snapshot with empty files.
    print("[WARN] API pull returned no recipes; existing files left untouched.")



Total TheMealDB recipes: 319
Saved API JSON -> c:\Users\georg.DESKTOP-2FS9VF1\source\repos\699-capstone-team14\data\recipe_api_data.json
Saved API CSV  -> c:\Users\georg.DESKTOP-2FS9VF1\source\repos\699-capstone-team14\data\recipe_api_data.csv


In [None]:
# Preview: first rows of the API pull, then the size of every Kaggle frame.
if api_recipes:
    api_preview = pd.DataFrame(api_recipes).head()
    display(api_preview)

for nick, df in kaggle_frames.items():
    row_count, col_count = len(df), len(df.columns)
    print(f"{nick}: {row_count:,} rows, {col_count} cols")

## Composition

We have access to several different datasets. The usual approach would be to consolidate all of the tables into a single large dataframe. Is this the best solution? It depends on our scope. In theory, we could instead build a lookup map, store it in a database, and query it for information about each table.

If we are only interested in matching ingredients (and not, for example, time spent), we can do something similar to this:



| Dataset ID    | Index | Encodings                   |
|---------------|-------|-----------------------------|
| 1             | 3     | [123,421,2,28479]           |
| 2             | 7     | [98, 204, 17, 3902]         |
| 3             | 12    | [56, 789, 34, 1201]         |
| 4             | 5     | [301, 22, 88, 4500]         |
| 5             | 9     | [77, 333, 19, 8765]         |
| 6             | 2     | [210, 654, 31, 9999]        |
| 7             | 8     | [44, 555, 23, 1234]         |

Given the dataset ID and the row index, we can then fetch the entire row from the original table to get the full information.

We can go even further and largely automate the process: by running a model that identifies which column most likely holds the ingredients, we can in theory add any new dataset with minimal manual work.

The same process could also be repeated for any of the other columns, which would let us add further features.