# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
import warnings
warnings.filterwarnings('ignore')

from collections import Counter
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Pre-processing of data

## Data Source1

In [2]:
# Load the JSON data (adjust the file path)
with open('./../../NLP_NLG_Recipe_prediction_generation_app_data/Data_source1/Ingredient_list_train/train.json', 'r') as f:
    data = json.load(f)

# Convert the data to a DataFrame
recipes_df = pd.json_normalize(data)

In [3]:
# Convert the 'ingredients' column from lists to tuples
recipes_df['ingredients'] = recipes_df['ingredients'].apply(tuple)

# Now apply drop_duplicates
recipes_df_clean = recipes_df.drop_duplicates()

# Ensure there are no missing values in 'cuisine' and 'ingredients'
recipes_df_clean = recipes_df_clean.dropna(subset=['cuisine', 'ingredients'])

# Extract the ingredient lists and cuisines
ingredients_df = recipes_df_clean[['cuisine', 'ingredients']].copy()

# Display the cleaned data
ingredients_df.head()

Unnamed: 0,cuisine,ingredients
0,greek,"(romaine lettuce, black olives, grape tomatoes..."
1,southern_us,"(plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,"(eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,"(water, vegetable oil, wheat, salt)"
4,indian,"(black pepper, shallots, cornflour, cayenne pe..."


## Data Source2

In [9]:
# Sample 100 random ingredients from the original list
sampled_ingredients_df = ingredients_df.sample(n=100, random_state=1)  # Setting random_state for reproducibility

# Spoonacular API key (replace with your actual API key)
# api_key = '85e0d17788944bbfb85f8a9e2307c94b' # account1 
# api_key = '178e8ce72a2e49ebabbc77490de51430' # account2 
# api_key = 'e47b410ad742498f8509bdc2379bfd09' # account3

# Function to fetch recipes from Spoonacular API for a single list of ingredients
def fetch_spoonacular_recipes(ingredients_list, api_key):
    ingredients_text = ','.join(ingredients_list)
    
    # Spoonacular API endpoint
    url = f"https://api.spoonacular.com/recipes/findByIngredients?ingredients={ingredients_text}&number=5&apiKey={api_key}"
    
    # Send request to Spoonacular API
    response = requests.get(url)
    
    # Parse the response
    if response.status_code == 200:
        recipes = response.json()
        
        # Extract relevant information for each recipe (e.g., title, ingredients, instructions)
        recipe_data = []
        for recipe in recipes:
            recipe_id = recipe['id']
            title = recipe['title']
            
            # Fetch detailed information for each recipe (ingredients, instructions)
            details_url = f"https://api.spoonacular.com/recipes/{recipe_id}/information?apiKey={api_key}"
            details_response = requests.get(details_url)
            
            if details_response.status_code == 200:
                details = details_response.json()
                ingredients = [ingredient['name'] for ingredient in details['extendedIngredients']]
                instructions = details['instructions']
                
                # Append the data to a list
                recipe_data.append({
                    'id': recipe_id,
                    'title': title,
                    'ingredients': ingredients,
                    'instructions': instructions
                })
        
        # Convert the list of recipes to a pandas DataFrame
        return pd.DataFrame(recipe_data)
    
    else:
        print(f"Error: Unable to fetch data. Status Code {response.status_code}")
        return None

# Function to fetch recipes for all ingredient lists extracted from the original dataset
def fetch_recipes_from_original_data(sampled_ingredients_df, api_key):
    all_recipes = pd.DataFrame()  # Initialize an empty DataFrame to store all recipes
    
    for index, row in sampled_ingredients_df.iterrows():
        ingredients = row['ingredients']  # Extract the ingredients list
        print(f"Fetching recipes for ingredients: {ingredients}")
        recipes_df = fetch_spoonacular_recipes(ingredients, api_key)
        
        if recipes_df is not None:
            # Append the new recipes to the existing DataFrame with cuisine information
            recipes_df['cuisine'] = row['cuisine']  # Add cuisine information
            all_recipes = pd.concat([all_recipes, recipes_df], ignore_index=True)
    
    return all_recipes

all_recipes_df = fetch_recipes_from_original_data(sampled_ingredients_df, api_key)

Fetching recipes for ingredients: ('water', 'yoghurt', 'peanut oil', 'ground cumin', 'ground cinnamon', 'fresh ginger root', 'margarine', 'onions', 'tomatoes', 'fresh cilantro', 'chile pepper', 'ground coriander', 'minced garlic', 'ground black pepper', 'cayenne pepper', 'ground turmeric')
Fetching recipes for ingredients: ('lemon zest', 'whipping cream', 'yellow corn meal', 'baking powder', 'all-purpose flour', 'large eggs', 'salt', 'sugar', 'butter')
Fetching recipes for ingredients: ('salad dressing', 'chuck roast', 'garlic', 'ground black pepper')
Fetching recipes for ingredients: ('sesame seeds', 'worcestershire sauce', 'cucumber', 'pepper', 'red pepper', 'carrots', 'dark soy sauce', 'onion powder', 'rolls', 'beef', 'leaf lettuce')
Fetching recipes for ingredients: ('bay leaves', 'cayenne pepper', 'ground cloves', 'vegetable oil', 'cumin', 'chili powder', 'pork butt', 'garlic powder', 'salt')
Fetching recipes for ingredients: ('light brown sugar', 'butter', 'blackberries', 'peache

Error: Unable to fetch data. Status Code 402
Fetching recipes for ingredients: ('light soy sauce', 'ginger', 'oyster sauce', 'sweet rice', 'mushrooms', 'chinese five-spice powder', 'ground white pepper', 'dark soy sauce', 'Shaoxing wine', 'scallions', 'corn starch', 'boneless chicken skinless thigh', 'sea salt', 'oil', 'lotus leaves')
Error: Unable to fetch data. Status Code 402
Fetching recipes for ingredients: ('water', 'bacon slices', 'grits', 'pepper', 'large eggs', 'fresh parsley', 'milk', 'garlic', 'parmesan cheese', 'salt')
Error: Unable to fetch data. Status Code 402
Fetching recipes for ingredients: ('orange', 'lemon', 'large eggs', 'superfine sugar', 'confectioners sugar', 'whole almonds', 'almond extract')
Error: Unable to fetch data. Status Code 402
Fetching recipes for ingredients: ('fresh spinach', 'dijon mustard', 'ground black pepper', 'kosher salt', 'crÃ¨me fraÃ®che', 'pecorino cheese', 'unsalted butter')
Error: Unable to fetch data. Status Code 402
Fetching recipes fo

In [13]:
#Fetch recipes for all ingredient lists in the ingredients_df DataFrame
all_recipes_df.to_pickle('./../../NLP_NLG_Recipe_prediction_generation_app_data/Data_source1/Ingredient_list_train/all_recipes.pkl')

# Display the combined recipes data
if not all_recipes_df.empty:
    print(all_recipes_df.head(n=100))

        id                                              title  \
0   635601                                   Boiled Egg Curry   
1   637264                              Cashew Butter Chicken   
2   637935  Chicken and Vegetarian Tamales With Red Mole S...   
3   640621                        Creamy Chicken Tikka Masala   
4   650484                              Luscious Palak Paneer   
..     ...                                                ...   
95  637798  Chestnut and Wild Mushroom Soup With Parmesan ...   
96  634671                              Beef Shanks Oso Bucco   
97  635574                                   Boeuf Bourgignon   
98  637907           Chicken and lentil stew with cumin pitta   
99  663994             Turkey Bolognese Ragu With Pappardelle   

                                          ingredients  \
0   [canola oil, chilli powder, hardboiled eggs, c...   
1   [cayenne pepper, cumin seeds, fenugreek leaves...   
2   [roma tomatoes, onions, garlic cloves, olive

## Data Source3

In [14]:
# Load Kaggle dataset
kaggle_recipes_df = pd.read_csv(r'./../../NLP_NLG_Recipe_prediction_generation_app_data/Data_source2/RecipeNLG_dataset.csv')

# Check dataset columns and structure
print(kaggle_recipes_df.head())
print(kaggle_recipes_df.columns)

   Unnamed: 0                  title  \
0           0    No-Bake Nut Cookies   
1           1  Jewell Ball'S Chicken   
2           2            Creamy Corn   
3           3          Chicken Funny   
4           4   Reeses Cups(Candy)     

                                         ingredients  \
0  ["1 c. firmly packed brown sugar", "1/2 c. eva...   
1  ["1 small jar chipped beef, cut up", "4 boned ...   
2  ["2 (16 oz.) pkg. frozen corn", "1 (8 oz.) pkg...   
3  ["1 large whole chicken", "2 (10 1/2 oz.) cans...   
4  ["1 c. peanut butter", "3/4 c. graham cracker ...   

                                          directions  \
0  ["In a heavy 2-quart saucepan, mix brown sugar...   
1  ["Place chipped beef on bottom of baking dish....   
2  ["In a slow cooker, combine all ingredients. C...   
3  ["Boil and debone chicken.", "Put bite size pi...   
4  ["Combine first four ingredients and press in ...   

                                              link    source  \
0   www.cookbooks.com

In [15]:
# Rename columns in the Kaggle dataset for consistency
kaggle_recipes_df = kaggle_recipes_df.rename(columns={
    'title': 'recipe_title',
    'ingredients': 'ingredients_list',
    'directions': 'instructions'
})

# Ensure the ingredients are in a list format if they are stored as strings
kaggle_recipes_df['ingredients_list'] = kaggle_recipes_df['ingredients_list'].apply(
    lambda x: x.split(', ') if isinstance(x, str) else []
)

# Convert list of ingredients to a tuple for uniqueness checking
kaggle_recipes_df['ingredients_list'] = kaggle_recipes_df['ingredients_list'].apply(
    lambda x: tuple(x) if isinstance(x, list) else ()
)

# Now we have the Spoonacular recipes stored in all_recipes_df from the previous code
# Ensure the columns match for concatenation in the Spoonacular dataset
all_recipes_df = all_recipes_df.rename(columns={
    'title': 'recipe_title',
    'ingredients': 'ingredients_list'
})

# Combine the two datasets
combined_recipes_df = pd.concat([all_recipes_df, kaggle_recipes_df], ignore_index=True)
combined_recipes_df.drop(columns=['Unnamed: 0','id'], inplace=True)

# Display the shape of the final combined dataset
print(f"Final Combined Recipes DataFrame Shape: {combined_recipes_df.shape}")

# Display the first few rows of the combined dataset
print("Combined Recipes DataFrame:")
print(combined_recipes_df.head())

Final Combined Recipes DataFrame Shape: (2231264, 7)
Combined Recipes DataFrame:
                                        recipe_title  \
0                                   Boiled Egg Curry   
1                              Cashew Butter Chicken   
2  Chicken and Vegetarian Tamales With Red Mole S...   
3                        Creamy Chicken Tikka Masala   
4                              Luscious Palak Paneer   

                                    ingredients_list  \
0  [canola oil, chilli powder, hardboiled eggs, c...   
1  [cayenne pepper, cumin seeds, fenugreek leaves...   
2  [roma tomatoes, onions, garlic cloves, olive o...   
3  [pepper, tomato sauce, cayenne pepper, cilantr...   
4  [baking powder, cardamom powder, cinnamon powd...   

                                        instructions cuisine link source  NER  
0  <ol><li>Boil 4 eggs in boiling water for about...  indian  NaN    NaN  NaN  
1  <ol><li>Preheat oven to 425F. Line a baking tr...  indian  NaN    NaN  NaN  
2  On

# IDA | Initial Data Analysis

## Describe the data

In [16]:
# Display data overview
print(f"Shape of DataFrame: {combined_recipes_df.shape}")
print("\nData Types and Missing Values:")
print(combined_recipes_df.info())

# Since there are no numerical columns, we'll describe object columns instead
print("\nBasic Stats for Textual Columns:")
print(combined_recipes_df.describe(include='object'))

# Checking how many missing values we have for each column
print("\nMissing Values in Each Column:")
missing_values = combined_recipes_df.isnull().sum()
print(missing_values)

# Percentage of missing values
print("\nPercentage of Missing Values:")
print((missing_values / len(combined_recipes_df)) * 100)

Shape of DataFrame: (2231264, 7)

Data Types and Missing Values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2231264 entries, 0 to 2231263
Data columns (total 7 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   recipe_title      object
 1   ingredients_list  object
 2   instructions      object
 3   cuisine           object
 4   link              object
 5   source            object
 6   NER               object
dtypes: object(7)
memory usage: 119.2+ MB
None

Basic Stats for Textual Columns:
             recipe_title                                 ingredients_list  \
count             2231264                                          2231264   
unique            1312924                                          2226476   
top     Chicken Casserole  (["1 c. peanut butter", "1 c. sugar", "1 egg"])   
freq                 4099                                               28   

                             instructions cuisine  \
count                       

## Data Imputation

In [17]:
# Step 1: Define ingredient-to-cuisine mapping
ingredient_cuisine_map = {
    'turmeric': 'indian',
    'garam masala': 'indian',
    'ghee': 'indian',
    'tortilla': 'mexican',
    'jalapeno': 'mexican',
    'cilantro': 'mexican',
    'soy sauce': 'chinese',
    'tofu': 'chinese',
    'saffron': 'mediterranean',
    'pita': 'mediterranean',
    'basil': 'italian',
    'mozzarella': 'italian',
    'parmesan': 'italian',
    # Add more mappings as needed
}

# Function to infer cuisine based on ingredients
def infer_cuisine_from_ingredients(ingredients_list):
    # Flatten ingredients list into one string to make keyword matching easier
    ingredients_str = ' '.join(ingredients_list).lower()
    
    # Check if any ingredient matches our predefined ingredient-to-cuisine map
    for ingredient, cuisine in ingredient_cuisine_map.items():
        if ingredient in ingredients_str:
            return cuisine
    return None  # If no match is found, return None

# Apply the function to fill missing cuisines
combined_recipes_df['cuisine'] = combined_recipes_df.apply(
    lambda row: infer_cuisine_from_ingredients(row['ingredients_list']) if pd.isnull(row['cuisine']) else row['cuisine'], axis=1
)

# Check the update
print("Updated 'cuisine' column after ingredient-based imputation:")
print(combined_recipes_df['cuisine'].value_counts())
print(f"Missing values in 'cuisine' after ingredient-based imputation: {combined_recipes_df['cuisine'].isna().sum()}")

Updated 'cuisine' column after ingredient-based imputation:
italian          192621
mexican          122179
chinese           76991
indian            23161
mediterranean     10623
southern_us          15
french               10
jamaican              7
japanese              5
moroccan              5
greek                 5
vietnamese            5
brazilian             3
cajun_creole          2
Name: cuisine, dtype: int64
Missing values in 'cuisine' after ingredient-based imputation: 1805632


In [18]:
# Step 2: Define recipe title-to-cuisine mapping
title_cuisine_map = {
    'tikka': 'indian',
    'curry': 'indian',
    'biryani': 'indian',
    'tacos': 'mexican',
    'quesadilla': 'mexican',
    'enchilada': 'mexican',
    'pasta': 'italian',
    'lasagna': 'italian',
    'pizza': 'italian',
    'sushi': 'japanese',
    'ramen': 'japanese',
    'paella': 'spanish',
    'gazpacho': 'spanish',
    'souvlaki': 'greek',
    'gyros': 'greek',
    'falafel': 'middle eastern',
    'shawarma': 'middle eastern',
    'gumbo': 'southern_us',
    'bbq': 'southern_us',
    'brisket': 'southern_us',
    'pho': 'vietnamese',
    'banh mi': 'vietnamese',
    # Add more mappings as needed
}

# Function to infer cuisine based on recipe title
def infer_cuisine_from_title(recipe_title):
    recipe_title = recipe_title.lower()  # Convert title to lowercase for matching
    for keyword, cuisine in title_cuisine_map.items():
        if keyword in recipe_title:
            return cuisine
    return None

# Apply the function to fill missing cuisines
combined_recipes_df['cuisine'] = combined_recipes_df.apply(
    lambda row: infer_cuisine_from_title(row['recipe_title']) if pd.isnull(row['cuisine']) else row['cuisine'], axis=1
)

# Check the update
print("Updated 'cuisine' column after title-based imputation:")
print(combined_recipes_df['cuisine'].value_counts())
print(f"Missing values in 'cuisine' after title-based imputation: {combined_recipes_df['cuisine'].isna().sum()}")

Updated 'cuisine' column after title-based imputation:
italian           216025
mexican           124133
chinese            76991
indian             27987
mediterranean      10623
southern_us         8682
spanish             1213
japanese            1200
middle eastern       231
vietnamese           204
greek                188
french                10
jamaican               7
moroccan               5
brazilian              3
cajun_creole           2
Name: cuisine, dtype: int64
Missing values in 'cuisine' after title-based imputation: 1763760


### Advanced method for Data imputations

In [19]:
# Convert ingredients_list to a single string for each recipe
combined_recipes_df['ingredients_str'] = combined_recipes_df['ingredients_list'].apply(lambda x: ' '.join(x))

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the ingredients strings
ingredient_vectors = tfidf_vectorizer.fit_transform(combined_recipes_df['ingredients_str'])

print(f"Shape of TF-IDF ingredient vectors: {ingredient_vectors.shape}")

Shape of TF-IDF ingredient vectors: (2231264, 72523)


In [20]:
# Step 1: Apply Truncated SVD to reduce dimensionality
n_components = 100  # Choose the number of components
svd = TruncatedSVD(n_components=n_components, random_state=42)

# Transform the TF-IDF vectors into a lower-dimensional space
ingredient_vectors_reduced = svd.fit_transform(ingredient_vectors)

# Step 2: Fit K-Means clustering on the reduced data
n_clusters = 20  # Set the number of clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit the model and predict cluster assignments
combined_recipes_df['cluster'] = kmeans.fit_predict(ingredient_vectors_reduced)

# Check the first few cluster assignments
print(combined_recipes_df[['cuisine', 'cluster']].head())

  cuisine  cluster
0  indian       14
1  indian       14
2  indian       12
3  indian        9
4  indian       14


In [21]:
# Create a mapping of clusters to the most common cuisine within that cluster
cluster_to_cuisine = combined_recipes_df.groupby('cluster')['cuisine'].apply(lambda x: x.mode()[0] if not x.mode().empty else 'Unknown')

# Fill missing cuisine values based on cluster assignment
combined_recipes_df.loc[combined_recipes_df['cuisine'].isnull(), 'cuisine'] = combined_recipes_df['cluster'].map(cluster_to_cuisine)

# Check how many missing values remain
print(f"Missing values in 'cuisine' after K-Means based imputation: {combined_recipes_df['cuisine'].isna().sum()}")

Missing values in 'cuisine' after K-Means based imputation: 0


In [22]:
combined_recipes_df.to_pickle('./../../NLP_NLG_Recipe_prediction_generation_app_data/Data_model/processed_recipes_data.pkl')