In [1]:
import pandas as pd
import re
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz
import time

In [2]:
df_ingredients = pd.read_csv("recipes_ingredients.csv", usecols=["id", "ingredients"])
df_ingredients.shape

(500471, 2)

In [3]:
df_recipes = pd.read_csv("recipes.csv", usecols=["RecipeId", "Name", "Description", "TotalTime", "Keywords", "AggregatedRating", "RecipeInstructions"])
df_recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522517 entries, 0 to 522516
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   RecipeId            522517 non-null  int64  
 1   Name                522517 non-null  object 
 2   TotalTime           522517 non-null  object 
 3   Description         522512 non-null  object 
 4   Keywords            505280 non-null  object 
 5   AggregatedRating    269294 non-null  float64
 6   RecipeInstructions  522517 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 27.9+ MB


In [4]:
df_recipes = df_recipes.rename(columns={"RecipeId" : "id"})
df_recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522517 entries, 0 to 522516
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  522517 non-null  int64  
 1   Name                522517 non-null  object 
 2   TotalTime           522517 non-null  object 
 3   Description         522512 non-null  object 
 4   Keywords            505280 non-null  object 
 5   AggregatedRating    269294 non-null  float64
 6   RecipeInstructions  522517 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 27.9+ MB


In [5]:
df_merged = pd.merge(df_ingredients, df_recipes, on="id", how="inner")
df_merged.head()

Unnamed: 0,id,ingredients,Name,TotalTime,Description,Keywords,AggregatedRating,RecipeInstructions
0,71247,"[""cherry pie filling"", ""condensed milk"", ""melt...",Cherry Streusel Cobbler,PT1H,"I haven't made this in years, so I'm just gues...","c(""Cherries"", ""Fruit"", ""< 60 Mins"", ""Oven"")",,"c(""Preheat oven to 375°F."", ""Spread cherry pie..."
1,76133,"[""corned beef chopped"", ""sauerkraut cold water...",Reuben and Swiss Casserole Bake,PT40M,I think this is even better than a reuben sand...,"c(""< 60 Mins"", ""Oven"")",5.0,"c(""Set oven to 350 degrees F."", ""Butter a 9 x ..."
2,503816,"[""unsalted butter"", ""vegetable oil"", ""all - pu...",Yam-Pecan Recipe,PT1H15M,A lady I work with heard me taking about ZWT a...,"c(""Breads"", ""Nuts"", ""Cajun"", ""< 4 Hours"", ""Easy"")",,"c(""Preheat oven to 350°F In a mixing bowl, us..."
3,418749,"[""orange cake mix"", ""instant vanilla pudding"",...",Tropical Orange Layer Cake,PT40M,Make and share this Tropical Orange Layer Cak...,"c(""Low Protein"", ""< 60 Mins"", ""For Large Groups"")",,"c(""In a large mixing bowl, combine the first 6..."
4,392934,"[""butter"", ""brown sugar"", ""granulated sugar"", ...",Safe to Eat Raw Chocolate Chip Oreo Cookie &qu...,PT15M,I was searching the web for something like thi...,"c(""Cookie & Brownie"", ""< 15 Mins"", ""For Large ...",,"c(""Cream butter and sugars together."", ""Blend ..."


In [6]:
#converting total time from hours to mins
def hours_to_mins(time_str):
    if pd.isna(time_str):
        return 30 #default 30 mins if time is missing

    time_str = str(time_str).upper()
    hours = 0
    minutes = 0

    h_match = re.search(r'(\d+)H', time_str)
    if h_match:
        hours = int(h_match.group(1))
        
    # Find minutes (e.g., '30M')
    m_match = re.search(r'(\d+)M', time_str)
    if m_match:
        minutes = int(m_match.group(1))
        
    total_minutes = (hours * 60) + minutes
    
    # If no H or M found (e.g., 'PT' or 'P1D'), return a default
    if total_minutes == 0:
        return 30
        
    return total_minutes
df_merged["TotalTime"] = df_merged["TotalTime"].apply(hours_to_mins)
df_merged.head()


Unnamed: 0,id,ingredients,Name,TotalTime,Description,Keywords,AggregatedRating,RecipeInstructions
0,71247,"[""cherry pie filling"", ""condensed milk"", ""melt...",Cherry Streusel Cobbler,60,"I haven't made this in years, so I'm just gues...","c(""Cherries"", ""Fruit"", ""< 60 Mins"", ""Oven"")",,"c(""Preheat oven to 375°F."", ""Spread cherry pie..."
1,76133,"[""corned beef chopped"", ""sauerkraut cold water...",Reuben and Swiss Casserole Bake,40,I think this is even better than a reuben sand...,"c(""< 60 Mins"", ""Oven"")",5.0,"c(""Set oven to 350 degrees F."", ""Butter a 9 x ..."
2,503816,"[""unsalted butter"", ""vegetable oil"", ""all - pu...",Yam-Pecan Recipe,75,A lady I work with heard me taking about ZWT a...,"c(""Breads"", ""Nuts"", ""Cajun"", ""< 4 Hours"", ""Easy"")",,"c(""Preheat oven to 350°F In a mixing bowl, us..."
3,418749,"[""orange cake mix"", ""instant vanilla pudding"",...",Tropical Orange Layer Cake,40,Make and share this Tropical Orange Layer Cak...,"c(""Low Protein"", ""< 60 Mins"", ""For Large Groups"")",,"c(""In a large mixing bowl, combine the first 6..."
4,392934,"[""butter"", ""brown sugar"", ""granulated sugar"", ...",Safe to Eat Raw Chocolate Chip Oreo Cookie &qu...,15,I was searching the web for something like thi...,"c(""Cookie & Brownie"", ""< 15 Mins"", ""For Large ...",,"c(""Cream butter and sugars together."", ""Blend ..."


In [7]:
#creating the difficulty column
def difficulty(row):
    tags = str(row["Keywords"]).lower()
    TotalTime = row["TotalTime"]

    if "easy" in tags:
        return "Easy"
    if "intermediate" in tags:
        return "Medium"
    if "advanced" in tags or "difficult" in tags:
        return "Hard"

    if TotalTime <= 40:
        return "Easy"
    if TotalTime <= 120:
        return "Medium"
    return "Hard"

df_merged["Difficulty"] = df_merged.apply(difficulty, axis=1)
df_merged["Difficulty"].head(30)

0     Medium
1       Easy
2       Easy
3       Easy
4       Easy
5     Medium
6       Easy
7       Easy
8       Easy
9       Easy
10      Easy
11      Easy
12    Medium
13      Easy
14      Easy
15      Easy
16      Easy
17      Easy
18      Easy
19      Easy
20      Easy
21    Medium
22    Medium
23      Easy
24      Easy
25      Easy
26      Easy
27      Easy
28      Easy
29    Medium
Name: Difficulty, dtype: object

In [8]:
df_merged["Difficulty"].value_counts()

Difficulty
Easy      358848
Medium    107471
Hard       31998
Name: count, dtype: int64

In [9]:
df_merged["AggregatedRating"].value_counts()

AggregatedRating
5.0    167413
4.0     41303
4.5     33604
3.0      8854
3.5      3849
2.0      1979
1.0      1611
2.5       645
1.5        66
Name: count, dtype: int64

In [None]:
df_merged["AggregatedRating"].isna()value_counts()

In [11]:
df_merged["RecipeInstructions"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 498317 entries, 0 to 498316
Series name: RecipeInstructions
Non-Null Count   Dtype 
--------------   ----- 
498317 non-null  object
dtypes: object(1)
memory usage: 3.8+ MB


In [13]:
# This function uses regex to find all text inside quotes
def clean_steps(steps_str):
    if pd.isna(steps_str):
        return "No instructions provided."
    
    # re.findall(r'"(.*?)"', ...) finds all text between quote marks
    # in a string like 'c("step 1", "step 2")'
    steps_list = re.findall(r'"(.*?)"', str(steps_str))
    
    if not steps_list:
        # If no quotes are found, it might be a simple string.
        # Let's just return the raw text, but clean it a bit.
        clean_str = str(steps_str).strip("[]c()") # Clean up common junk
        if len(clean_str) > 0:
            return clean_str
        else:
            return "No instructions provided."
    
    # Join all the steps with a " | " separator
    return " | ".join(steps_list)

# Apply this function to the 'RecipeInstructions' column
# and save it in a new, clean 'steps_clean' column
df_merged['steps_clean'] = df_merged['RecipeInstructions'].apply(clean_steps)
print(df_merged[['RecipeInstructions', 'steps_clean']].head())

                                  RecipeInstructions  \
0  c("Preheat oven to 375°F.", "Spread cherry pie...   
1  c("Set oven to 350 degrees F.", "Butter a 9 x ...   
2  c("Preheat oven to 350°F  In a mixing bowl, us...   
3  c("In a large mixing bowl, combine the first 6...   
4  c("Cream butter and sugars together.", "Blend ...   

                                         steps_clean  
0  Preheat oven to 375°F. | Spread cherry pie fil...  
1  Set oven to 350 degrees F. | Butter a 9 x 13-i...  
2  Preheat oven to 350°F  In a mixing bowl, using...  
3  In a large mixing bowl, combine the first 6 in...  
4  Cream butter and sugars together. | Blend in m...  


In [14]:
df_merged.head()

Unnamed: 0,id,ingredients,Name,TotalTime,Description,Keywords,AggregatedRating,RecipeInstructions,Difficulty,steps_clean
0,71247,"[""cherry pie filling"", ""condensed milk"", ""melt...",Cherry Streusel Cobbler,60,"I haven't made this in years, so I'm just gues...","c(""Cherries"", ""Fruit"", ""< 60 Mins"", ""Oven"")",,"c(""Preheat oven to 375°F."", ""Spread cherry pie...",Medium,Preheat oven to 375°F. | Spread cherry pie fil...
1,76133,"[""corned beef chopped"", ""sauerkraut cold water...",Reuben and Swiss Casserole Bake,40,I think this is even better than a reuben sand...,"c(""< 60 Mins"", ""Oven"")",5.0,"c(""Set oven to 350 degrees F."", ""Butter a 9 x ...",Easy,Set oven to 350 degrees F. | Butter a 9 x 13-i...
2,503816,"[""unsalted butter"", ""vegetable oil"", ""all - pu...",Yam-Pecan Recipe,75,A lady I work with heard me taking about ZWT a...,"c(""Breads"", ""Nuts"", ""Cajun"", ""< 4 Hours"", ""Easy"")",,"c(""Preheat oven to 350°F In a mixing bowl, us...",Easy,"Preheat oven to 350°F In a mixing bowl, using..."
3,418749,"[""orange cake mix"", ""instant vanilla pudding"",...",Tropical Orange Layer Cake,40,Make and share this Tropical Orange Layer Cak...,"c(""Low Protein"", ""< 60 Mins"", ""For Large Groups"")",,"c(""In a large mixing bowl, combine the first 6...",Easy,"In a large mixing bowl, combine the first 6 in..."
4,392934,"[""butter"", ""brown sugar"", ""granulated sugar"", ...",Safe to Eat Raw Chocolate Chip Oreo Cookie &qu...,15,I was searching the web for something like thi...,"c(""Cookie & Brownie"", ""< 15 Mins"", ""For Large ...",,"c(""Cream butter and sugars together."", ""Blend ...",Easy,Cream butter and sugars together. | Blend in m...


In [17]:
start_time = time.time() # Start timer

# This object that will learn the ingredient vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',    # Ignore common words like 'and', 'the', 'a'
    max_features=10000       # Only learn the top 10,000 most common ingredients (to keep it fast)
)


# We'll fill any 'NaN' ingredient rows with an empty string so the model doesn't crash.
df_merged['ingredients'] = df_merged['ingredients'].fillna('')

# This is the "heavy lifting."
# .fit_transform() does two jobs at once:
# 1. 'fit': It reads ALL ingredient lists to "learn" the vocabulary.
# 2. 'transform': It creates the giant number matrix.
print("Fitting the TF-IDF model to all ingredients: ")
tfidf_matrix = tfidf_vectorizer.fit_transform(df_merged['ingredients'])

end_time = time.time() # End timer
print(f"The shape of our new matrix is: {tfidf_matrix.shape}")

Fitting the TF-IDF model to all ingredients: 
The shape of our new matrix is: (498317, 10000)


In [21]:
# 1. Get the list of all 10,000 feature names (ingredients) from the tfidf matrix
feature_names = tfidf_vectorizer.get_feature_names_out()

# 2. Get the matrix data for the first recipe (row 0)
recipe_vector = tfidf_matrix[0]

# 3. Get the column numbers (indices) of the non-zero scores
non_zero_indices = recipe_vector.indices

# 4. Get the scores for those positions
non_zero_scores = recipe_vector.data

print("Ingredients found in the matrix for recipe 0:\n")
# 5. Loop through them and print the word and its score
for i in range(len(non_zero_indices)):
    index = non_zero_indices[i]
    score = non_zero_scores[i]
    
    # Use the index to look up the word in our feature_names list
    word = feature_names[index]
    
    print(f"Word: '{word}',  Score: {score:.4f}")

Ingredients found in the matrix for recipe 0:

Word: 'cherry',  Score: 0.2474
Word: 'pie',  Score: 0.2284
Word: 'filling',  Score: 0.3198
Word: 'condensed',  Score: 0.2366
Word: 'milk',  Score: 0.1358
Word: 'melted',  Score: 0.2472
Word: 'margarine',  Score: 0.3768
Word: 'cinnamon',  Score: 0.1616
Word: 'nutmeg',  Score: 0.1998
Word: 'light',  Score: 0.2393
Word: 'brown',  Score: 0.1506
Word: 'sugar',  Score: 0.1247
Word: 'flour',  Score: 0.1488
Word: 'chopped',  Score: 0.1094
Word: 'nuts',  Score: 0.2314
Word: 'oats',  Score: 0.2322
Word: 'butter',  Score: 0.1267
Word: 'flavored',  Score: 0.2706
Word: 'cooking',  Score: 0.2389
Word: 'spray',  Score: 0.2397


In [23]:
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf_vectorizer, f)
print("Saved tfidf_vectorizer.pkl")

save_npz("tfidf_matrix.npz", tfidf_matrix)
print("Saved tfidf_matrix.npz")

columns_to_keep = [
    'id',         
    'Name',             
    'Description',      
    'AggregatedRating', 
    'TotalTime',          
    'Difficulty',       
    'steps_clean'       
]


recipes_master_list = df_merged[columns_to_keep]


recipes_master_list = recipes_master_list.rename(columns={
    'Name': 'name',
    'Description': 'description',
    'AggregatedRating': 'rating',
    'steps_clean': 'steps'
})
            
# Save the final CSV
recipes_master_list.to_csv("recipes_master_list.csv", index=False)
print("Saved recipes_master_list.csv")

...Saved tfidf_vectorizer.pkl
...Saved tfidf_matrix.npz
...Saved recipes_master_list.csv
