In [1]:
#Import libraries
import time
import nltk
import string
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
from ast import literal_eval

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\irene\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\irene\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\irene\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
#Combine both Kaggle datasets 
Kaggledf_O = pd.read_csv('..//Data_input//RAW_recipes.csv', delimiter=',')
Kaggledfr_O = pd.read_csv('..//Data_input//RAW_interactions.csv', delimiter=',')
merged_df = pd.merge(Kaggledf_O, Kaggledfr_O, left_on='id', right_on='recipe_id')
average_ratings = merged_df.groupby(['id', 'name'])['rating'].mean().reset_index()
Kaggledf = pd.merge(Kaggledf_O, average_ratings, on=['id', 'name'], how='left')
Kaggledf.rename(columns={'rating': 'average_rating2'}, inplace=True)


In [3]:
#Load DHLab dataset
DHLabdf = pd.read_csv('..//Data_input//DHLabData.csv', delimiter=';')
DHLabdf_translated = pd.read_csv('..//Data_output//translated_DHLabData.csv', delimiter=',')

#Add 'cookingmethodlist' 
def transform_cookingmethods(s):
    if pd.isna(s) or s == '(null)':
        return np.nan    
    else:
        s = s.strip('[]').strip()
        return s
DHLabdf['cookingmethod'] = DHLabdf['cookingmethodlist'].apply(transform_cookingmethods)
DHLabdf_translated['cookingmethod'] = DHLabdf['cookingmethod'] 


In [4]:
#Transform DHLab dataset
DHLabdf_translated.rename(columns={'title': 'Recipe_name1', 'cookingtimelist': 'Cooking_minutes1', 'cookingmethodlist': 'Steps_number1', 'ingredientlist': 'Ingredient_list1', 'n_ingredients': 'Ingredient_number1', 'cookingmethod' : 'cookingmethod1'}, inplace = True)
columns_to_keep = ['Recipe_name1', 'Cooking_minutes1', 'Steps_number1', 'Ingredient_list1', 'Ingredient_number1', 'cookingmethod1']
DHLabdf_translated.drop(columns=[col for col in DHLabdf_translated.columns if col not in columns_to_keep], inplace=True)
DHLabdf_translated['average_rating1'] = float('nan')

In [5]:
#Transform DHLab dataset 
DHLabdf_translated['cookingmethod'] = DHLabdf_translated['cookingmethod1'].str.split(', ')
all_methods = [method for sublist in DHLabdf_translated['cookingmethod'] if isinstance(sublist, list) for method in sublist]
unique_methods = set(all_methods)
unique_methods_list = list(unique_methods)
print("All cooking methods:", unique_methods_list)
DHLabdf_translated.drop(columns=['cookingmethod'], inplace=True)

All cooking methods: ['broil', 'simmer', 'fry', 'score', 'blanch', 'cook', 'heat', 'toast', 'reduce', 'pressure cook', 'saute', 'slow cook', 'bake', 'poach', 'sauté', 'temper', 'microwave', 'stew', 'bbq', 'deep fry', 'chill', 'stir-fry', 'barbeque', 'roast', 'dry roast', 'stir fry', 'boil', 'grill', 'shallow fry', 'steam', 'barbecue']


In [6]:
#Add columns to Kaggle dataset so that they allign with the DHLab dataset
def methods(steps, methods):
    mentioned_methods = []
    for stepss in steps:
        step_list = literal_eval(stepss)
        for step in step_list:
            for method in methods:
                if re.search(r'\b{}\b'.format(method), step):
                    mentioned_methods.append(method)
                    break
    return mentioned_methods

Kaggledf['cookingmethod2'] = Kaggledf['steps'].apply(lambda x: methods([x], unique_methods_list))
Kaggledf['steps_number2'] = Kaggledf['cookingmethod2'].apply(lambda x: len(x))
Kaggledf['cookingmethod2'] = Kaggledf['cookingmethod2'].apply(', '.join)

Kaggledf

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,average_rating2,cookingmethod2,steps_number2
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,5.0,bake,1
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,3.5,"bake, bake",2
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,4.0,cook,1
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,4.5,"boil, cook, bake",3
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,5.0,boil,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231632,zydeco soup,486161,60,227978,2012-08-29,"['ham', '60-minutes-or-less', 'time-to-make', ...","[415.2, 26.0, 34.0, 26.0, 44.0, 21.0, 15.0]",7,"['heat oil in a 4-quart dutch oven', 'add cele...",this is a delicious soup that i originally fou...,"['celery', 'onion', 'green sweet pepper', 'gar...",22,5.0,"heat, cook, cook, simmer",4
231633,zydeco spice mix,493372,5,1500678,2013-01-09,"['15-minutes-or-less', 'time-to-make', 'course...","[14.8, 0.0, 2.0, 58.0, 1.0, 0.0, 1.0]",1,['mix all ingredients together thoroughly'],this spice mix will make your taste buds dance!,"['paprika', 'salt', 'garlic powder', 'onion po...",13,5.0,,0
231634,zydeco ya ya deviled eggs,308080,40,37779,2008-06-07,"['60-minutes-or-less', 'time-to-make', 'course...","[59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]",7,"['in a bowl , combine the mashed yolks and may...","deviled eggs, cajun-style","['hard-cooked eggs', 'mayonnaise', 'dijon must...",8,5.0,chill,1
231635,cookies by design cookies on a stick,298512,29,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]",9,['place melted butter in a large mixing bowl a...,"i've heard of the 'cookies by design' company,...","['butter', 'eagle brand condensed milk', 'ligh...",10,1.0,bake,1


In [16]:
Kaggledf.iloc[231632]['steps']

"['heat oil in a 4-quart dutch oven', 'add celery , onion , sweet pepper and garlic', 'cook for 5 minutes over medium heat', 'stir in ham , paprika , sugar , dry mustard , cumin , basil , oregano , thyme , cloves , black pepper and cayenne pepper', 'cook for 5 more minutes , stirring frequently', 'stir in black-eyed peas , hominy , undrained tomatoes , chicken broth , parsley and molasses', 'bring to boil , then reduce heat and cover dutch over and let soup simmer for 30 minutes']"

In [7]:
#Prepare Kaggle dataset
Kaggledf.rename(columns={'name': 'Recipe_name2', 'minutes': 'Cooking_minutes2', 'ingredients': 'Ingredient_list2', 'n_ingredients': 'Ingredient_number2', 'cookingmethod2': 'cookingmethod2', 'steps_number2': 'steps_number2'}, inplace = True)
columns_to_keep = ['Recipe_name2', 'Cooking_minutes2', 'Ingredient_list2', 'Ingredient_number2', 'cookingmethod2', 'steps_number2', 'average_rating2']
Kaggledf.drop(columns=[col for col in Kaggledf.columns if col not in columns_to_keep], inplace=True)
Kaggledf['Ingredient_list2'] = Kaggledf['Ingredient_list2'].str.strip("[]").str.replace("'", "")

Kaggledf

Unnamed: 0,Recipe_name2,Cooking_minutes2,Ingredient_list2,Ingredient_number2,average_rating2,cookingmethod2,steps_number2
0,arriba baked winter squash mexican style,55,"winter squash, mexican seasoning, mixed spice,...",7,5.0,bake,1
1,a bit different breakfast pizza,30,"prepared pizza crust, sausage patty, eggs, mil...",6,3.5,"bake, bake",2
2,all in the kitchen chili,130,"ground beef, yellow onions, diced tomatoes, to...",13,4.0,cook,1
3,alouette potatoes,45,"spreadable cheese with garlic and herbs, new p...",11,4.5,"boil, cook, bake",3
4,amish tomato ketchup for canning,190,"tomato juice, apple cider vinegar, sugar, salt...",8,5.0,boil,1
...,...,...,...,...,...,...,...
231632,zydeco soup,60,"celery, onion, green sweet pepper, garlic clov...",22,5.0,"heat, heat, cook, boil",4
231633,zydeco spice mix,5,"paprika, salt, garlic powder, onion powder, dr...",13,5.0,,0
231634,zydeco ya ya deviled eggs,40,"hard-cooked eggs, mayonnaise, dijon mustard, s...",8,5.0,chill,1
231635,cookies by design cookies on a stick,29,"butter, eagle brand condensed milk, light brow...",10,1.0,bake,1


In [8]:
#Most frequent recipe name + counts
recipe_counts = Kaggledfr_O['recipe_id'].value_counts()
mf_recipe_name = recipe_counts.idxmax()
mf_recipe_count = recipe_counts.max()

print("most frequent recipe name:", mf_recipe_name)
print("amount:", mf_recipe_count)

Most frequent recipe name: 2886
Number of occurrences: 1613


In [9]:
#Add Lowercase, tokenization, remove punctuation, remove stopwords, Lemmatization, stemming
def preprocess_text(text):
    if isinstance(text, str): 
        text = text.lower()
        tokens = word_tokenize(text)
        tokens = [token for token in tokens if token not in string.punctuation]
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
        return ' '.join(tokens) 
    else:
        return ''

DHLabdf_translated['Recipe_name1'] = DHLabdf_translated['Recipe_name1'].apply(preprocess_text)
Kaggledf['Recipe_name2'] = Kaggledf['Recipe_name2'].apply(preprocess_text)


In [14]:
#Example that uppercase/lowercase, punctuation, stopwords matter
def jaccardtest(word1, word2):
    set1 = set(word1)
    set2 = set(word2)
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))    
    jaccard_similarity = intersection / union
    return jaccard_similarity

word1 = "Thi is an appl."
word2 = "This is an apple."

similarity = jaccardtest(word1, word2)
print("Jaccard similarity between '{}' and '{}': {:.2f}".format(word1, word2, similarity))


Jaccard similarity between 'Thi is an appl.' and 'This is an apple.': 0.91


In [11]:
print(DHLabdf_translated.isna().sum())
Kaggledf = Kaggledf.dropna(subset=['Recipe_name2', 'average_rating2']) #there is 1 recipe without name and rating so drop
print(Kaggledf.isna().sum())


Recipe_name1               0
Cooking_minutes1      190423
Steps_number1         190423
Ingredient_list1      169736
Ingredient_number1    169736
cookingmethod1        190424
average_rating1       252212
dtype: int64
Recipe_name2          0
Cooking_minutes2      0
Ingredient_list2      0
Ingredient_number2    0
average_rating2       0
cookingmethod2        0
steps_number2         0
dtype: int64


Make supervised, unsupervised and compare results of similarity measures

In [12]:
#Drop nan
DHLdf_without_nan = DHLabdf_translated.dropna(subset=['Cooking_minutes1'])
#Add nan
DHLdf_nan = DHLdf_without_nan.copy()
DHLdf_nan.iloc[:, 1:] = float('nan')
#Set sample 
DHLdf_whole = DHLabdf_translated
DHLdf_nan_part = DHLdf_nan[0:300]
Kaggledf_filled = Kaggledf[0:1000]
DHLdf_without_nan_part = DHLdf_without_nan[0:300]


In [13]:
#Check/print total number of stopwords in the column
stop_words = set(stopwords.words('english'))
def count_stopwords(text):
    tokens = nltk.word_tokenize(text)
    stopword_count = sum(1 for word in tokens if word.lower() in stop_words)
    return stopword_count
stopword_counts = DHLabdf_translated['Recipe_name1'].apply(count_stopwords)
total_stopwords = stopword_counts.sum()

print(total_stopwords)

Total number of stopwords in the column: 577


Jaccard similarity

In [None]:
#Jaccard similarity
def jaccard(df1, df2):
    results = []
    for index, row in tqdm(df1[df1['Cooking_minutes1'].isna()].iterrows(), total=len(df1[df1['Cooking_minutes1'].isna()])):        
        #Get recipe name from DHLabdata
        recipe_name = row['Recipe_name1']
        #Calculate Jaccard similarity between recipe names
        jaccard_similarities = [(fuzz.token_set_ratio(recipe_name, name), name) for name in df2['Recipe_name2']]
        #Find recipe name with the highest Jaccard similarity
        closest_match_tuple = max(jaccard_similarities)
        #Extract the closest match from the tuple
        closest_match = closest_match_tuple[1]
        #Get index of closest match
        closest_index = df2[df2['Recipe_name2'] == closest_match].index[0]
        #Store the matched data
        result_row = {
            'Recipe_name1': recipe_name,
            'Cooking_minutes1': df2.at[closest_index, 'Cooking_minutes2'] if pd.isna(row['Cooking_minutes1']) else row['Cooking_minutes1'],
            'Steps_number1': df2.at[closest_index, 'steps_number2'] if pd.isna(row['Steps_number1']) else row['Steps_number1'],
            'cookingmethod1': df2.at[closest_index, 'cookingmethod2'] if pd.isna(row['cookingmethod1']) else row['cookingmethod1'],
            'Ingredient_list1': df2.at[closest_index, 'Ingredient_list2'] if pd.isna(row['Ingredient_list1']) else row['Ingredient_list1'],
            'Ingredient_number1': df2.at[closest_index, 'Ingredient_number2'] if pd.isna(row['Ingredient_number1']) else row['Ingredient_number1'],
            'average_rating1': df2.at[closest_index, 'average_rating2'] if pd.isna(row['average_rating1']) else row['average_rating1']
        }
        results.append(result_row)
        

    return pd.DataFrame(results)

#Print time
start_time = time.time()
DHLdf_jaccard = jaccard(DHLdf_nan_part, Kaggledf_filled)
end_time = time.time()
time_taken = end_time - start_time
print("Time taken:", time_taken)

#mae, mse, rmse
mae = mean_absolute_error(DHLdf_without_nan_part['Predicted_Score'], DHLdf_jaccard['Predicted_Score'])
mse = mean_squared_error(DHLdf_without_nan_part['Predicted_Score'], DHLdf_jaccard['Predicted_Score'])
rmse = mse ** 0.5
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)


100%|██████████| 300/300 [00:37<00:00,  7.92it/s]

Time taken: 37.90108585357666
Mean Absolute Error (MAE) for time_to_make: 1513.3766666666668
Mean Squared Error (MSE) for time_to_make: 62125534.07666667
Root Mean Squared Error (RMSE) for time_to_make: 7881.975264910863





Cosine similarity

In [None]:
#Cosine similarity
def cosine_score(text1, text2):
    vectorizer = CountVectorizer()
    count_matrix = vectorizer.fit_transform([text1, text2])
    cosine_similarities = cosine_similarity(count_matrix[0:1], count_matrix)
    return cosine_similarities[0][1]

def cosine(df1, df2):
    results = []
    for index, row in tqdm(df1.iterrows(), total=len(df1)):
        #Get recipe name from DHLabdata
        recipe_name = row['Recipe_name1']
        if pd.isna(recipe_name):
            continue
        #Calculate cosine similarities between recipe names
        cosine_similarities = [(
            cosine_score(recipe_name, name),
            name
        ) for name in df2['Recipe_name2']]
        #Find recipe name with the highest cosine similarity
        closest_match_tuple = max(cosine_similarities)
        #Extract the closest match from the tuple
        closest_match = closest_match_tuple[1]
        #Get index of closest match
        closest_index = df2[df2['Recipe_name2'] == closest_match].index[0]
        #Store the matched data
        result_row = {
            'Recipe_name1': recipe_name,
            'Cooking_minutes1': df2.at[closest_index, 'Cooking_minutes2'] if pd.isna(row['Cooking_minutes1']) else row['Cooking_minutes1'],
            'Steps_number1': df2.at[closest_index, 'steps_number2'] if pd.isna(row['Steps_number1']) else row['Steps_number1'],
            'cookingmethod1': df2.at[closest_index, 'cookingmethod2'] if pd.isna(row['cookingmethod1']) else row['cookingmethod1'],
            'Ingredient_list1': df2.at[closest_index, 'Ingredient_list2'] if pd.isna(row['Ingredient_list1']) else row['Ingredient_list1'],
            'Ingredient_number1': df2.at[closest_index, 'Ingredient_number2'] if pd.isna(row['Ingredient_number1']) else row['Ingredient_number1'],
            'average_rating1': df2.at[closest_index, 'average_rating2'] if pd.isna(row['average_rating1']) else row['average_rating1']
        }
        results.append(result_row)
    return pd.DataFrame(results)

#Print time
start_time = time.time()
DHLdf_cosine = cosine(DHLdf_nan_part, Kaggledf_filled)
end_time = time.time()
time_taken = end_time - start_time
print("Time taken:", time_taken)

#mae, mse, rmse
mae = mean_absolute_error(DHLdf_without_nan_part['Cooking_minutes1'], DHLdf_cosine['Cooking_minutes1'])
mse = mean_squared_error(DHLdf_without_nan_part['Cooking_minutes1'], DHLdf_cosine['Cooking_minutes1'])
rmse = mse ** 0.5
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)


100%|██████████| 300/300 [05:19<00:00,  1.07s/it]

Time taken: 319.77775526046753
Mean Absolute Error (MAE) for time_to_make: 379.5133333333333
Mean Squared Error (MSE) for time_to_make: 12471888.346666666
Root Mean Squared Error (RMSE) for time_to_make: 3531.556080068199





LSA

In [None]:
#LSA similarity
def LSA_score(text1, text2):
    vectorizer = CountVectorizer()
    count_matrix = vectorizer.fit_transform([text1, text2])    
    #Perform Singular Value Decomposition (SVD)
    svd = TruncatedSVD(n_components=min(count_matrix.shape)-1)
    svd.fit(count_matrix)
    transformed_matrix = svd.transform(count_matrix)    
    #Calculate cosine similarity between the transformed vectors
    cosine_similarities = cosine_similarity(transformed_matrix[0:1], transformed_matrix)
    return cosine_similarities[0][1]

def LSA(df1, df2):
    results = []
    for index, row in tqdm(df1.iterrows(), total=len(df1)):  
        #Get recipe name from DHLabdata
        recipe_name = row['Recipe_name1']
        #Calculate cosine similarities between recipe names
        cosine_similarities = [(
            LSA_score(recipe_name, name),
            name
        ) for name in df2['Recipe_name2']]
        #Find recipe name with the highest cosine similarity
        closest_match_tuple = max(cosine_similarities)
        #Extract the closest match from the tuple
        closest_match = closest_match_tuple[1]
        #Get index of closest match
        closest_index = df2[df2['Recipe_name2'] == closest_match].index[0]
        #Store the matched data
        result_row = {
            'Recipe_name1': recipe_name,
            'Cooking_minutes1': df2.at[closest_index, 'Cooking_minutes2'] if pd.isna(row['Cooking_minutes1']) else row['Cooking_minutes1'],
            'Steps_number1': df2.at[closest_index, 'steps_number2'] if pd.isna(row['Steps_number1']) else row['Steps_number1'],
            'cookingmethod1': df2.at[closest_index, 'cookingmethod2'] if pd.isna(row['cookingmethod1']) else row['cookingmethod1'],
            'Ingredient_list1': df2.at[closest_index, 'Ingredient_list2'] if pd.isna(row['Ingredient_list1']) else row['Ingredient_list1'],
            'Ingredient_number1': df2.at[closest_index, 'Ingredient_number2'] if pd.isna(row['Ingredient_number1']) else row['Ingredient_number1'],
            'average_rating1': df2.at[closest_index, 'average_rating2'] if pd.isna(row['average_rating1']) else row['average_rating1']
        }
        results.append(result_row)
    return pd.DataFrame(results)

#Print time
start_time = time.time()
DHLdf_lsa = LSA(DHLdf_nan_part, Kaggledf_filled)
end_time = time.time()
time_taken = end_time - start_time
print("Time taken:", time_taken)

#mae, mse, rmse
mae = mean_absolute_error(DHLdf_without_nan_part['Cooking_minutes1'], DHLdf_lsa['Cooking_minutes1'])
mse = mean_squared_error(DHLdf_without_nan_part['Cooking_minutes1'], DHLdf_lsa['Cooking_minutes1'])
rmse = mse ** 0.5
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)


100%|██████████| 300/300 [09:35<00:00,  1.92s/it]

Time taken: 575.0852868556976
Mean Absolute Error (MAE) for time_to_make: 76.11333333333333
Mean Squared Error (MSE) for time_to_make: 30387.946666666667
Root Mean Squared Error (RMSE) for time_to_make: 174.32138901083442





WordNet similarity

In [None]:

def WordNet_score(text1, text2):
    #Tokenize the recipe names
    tokens1 = nltk.word_tokenize(text1)
    tokens2 = nltk.word_tokenize(text2)
    #Calculate maximum similarity between words in the two texts
    max_similarity = 0
    for token1 in tokens1:
        for token2 in tokens2:
            synsets1 = wn.synsets(token1)
            synsets2 = wn.synsets(token2)
            if synsets1 and synsets2:
                #Calculate the maximum similarity between synsets
                max_sim = max(s1.path_similarity(s2) or 0 for s1 in synsets1 for s2 in synsets2)
                if max_sim > max_similarity:
                    max_similarity = max_sim
    return max_similarity

def WordNet(df1, df2):
    results = []
    for index, row in tqdm(df1[df1['Cooking_minutes1'].isna()].iterrows(), total=len(df1[df1['Cooking_minutes1'].isna()])):        
        #Get recipe name from DHLabdata
        recipe_name = row['Recipe_name1']
        #Calculate WordNet similarity between recipe names
        wordnet_similarities = [(
            WordNet_score(recipe_name, name),
            name
        ) for name in df2['Recipe_name2']]
        #Find recipe name with the highest WordNet similarity
        closest_match_tuple = max(wordnet_similarities)
        #Extract the closest match from the tuple
        closest_match = closest_match_tuple[1]
        #Get index of closest match
        closest_index = df2[df2['Recipe_name2'] == closest_match].index[0]
        #Store the matched data
        result_row = {
            'Recipe_name1': recipe_name,
            'Cooking_minutes1': df2.at[closest_index, 'Cooking_minutes2'] if pd.isna(row['Cooking_minutes1']) else row['Cooking_minutes1'],
            'Steps_number1': df2.at[closest_index, 'steps_number2'] if pd.isna(row['Steps_number1']) else row['Steps_number1'],
            'cookingmethod1': df2.at[closest_index, 'cookingmethod2'] if pd.isna(row['cookingmethod1']) else row['cookingmethod1'],
            'Ingredient_list1': df2.at[closest_index, 'Ingredient_list2'] if pd.isna(row['Ingredient_list1']) else row['Ingredient_list1'],
            'Ingredient_number1': df2.at[closest_index, 'Ingredient_number2'] if pd.isna(row['Ingredient_number1']) else row['Ingredient_number1'],
            'average_rating1': df2.at[closest_index, 'average_rating2'] if pd.isna(row['average_rating1']) else row['average_rating1']
        }
        results.append(result_row)
    return pd.DataFrame(results)

#Print time
start_time = time.time()
DHLdf_wordnet = WordNet(DHLdf_nan_part, Kaggledf_filled)
end_time = time.time()
time_taken = end_time - start_time
print("Time taken:", time_taken)

#mae, mse, rmse
mae = mean_absolute_error(DHLdf_without_nan_part['Cooking_minutes1'], DHLdf_wordnet['Cooking_minutes1'])
mse = mean_squared_error(DHLdf_without_nan_part['Cooking_minutes1'], DHLdf_wordnet['Cooking_minutes1'])
rmse = mse ** 0.5
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)


100%|██████████| 300/300 [1:13:56<00:00, 14.79s/it]


Time taken: 4436.2573981285095
Mean Absolute Error (MAE) for time_to_make: 80.32
Mean Squared Error (MSE) for time_to_make: 31425.8
Root Mean Squared Error (RMSE) for time_to_make: 177.2732354305071


Final

In [14]:
def cosine_score(text1, text2):
    vectorizer = CountVectorizer()
    count_matrix = vectorizer.fit_transform([text1, text2])
    cosine_similarities = cosine_similarity(count_matrix[0:1], count_matrix)
    return cosine_similarities[0][1]

def cosine(df1, df2):
    results = []
    for index, row in tqdm(df1.iterrows(), total=len(df1)):
        #Get recipe name from DHLabdata
        recipe_name = row['Recipe_name1']
        if pd.isna(recipe_name):
            continue
        #Calculate cosine similarities between recipe names
        cosine_similarities = [(
            cosine_score(recipe_name, name),
            name
        ) for name in df2['Recipe_name2']]
        #Find recipe name with the highest cosine similarity
        closest_match_tuple = max(cosine_similarities)
        #Extract the closest match from the tuple
        closest_match = closest_match_tuple[1]
        #Get index of closest match
        closest_index = df2[df2['Recipe_name2'] == closest_match].index[0]
        #Store the matched data
        result_row = {
            'Recipe_name1': recipe_name,
            'Cooking_minutes1': df2.at[closest_index, 'Cooking_minutes2'] if pd.isna(row['Cooking_minutes1']) else row['Cooking_minutes1'],
            'Steps_number1': df2.at[closest_index, 'steps_number2'] if pd.isna(row['Steps_number1']) else row['Steps_number1'],
            'cookingmethod1': df2.at[closest_index, 'cookingmethod2'] if pd.isna(row['cookingmethod1']) else row['cookingmethod1'],
            'Ingredient_list1': df2.at[closest_index, 'Ingredient_list2'] if pd.isna(row['Ingredient_list1']) else row['Ingredient_list1'],
            'Ingredient_number1': df2.at[closest_index, 'Ingredient_number2'] if pd.isna(row['Ingredient_number1']) else row['Ingredient_number1'],
            'average_rating1': df2.at[closest_index, 'average_rating2'] if pd.isna(row['average_rating1']) else row['average_rating1']
        }
        results.append(result_row)
    return pd.DataFrame(results)


#Print time
start_time = time.time()
DHLdf_cosine = cosine(DHLdf_whole, Kaggledf_filled)
end_time = time.time()
time_taken = end_time - start_time
print("Time taken:", time_taken)

100%|██████████| 252212/252212 [84:34:12<00:00,  1.21s/it]         


Time taken: 304456.6202676296


In [35]:
#save to DHLdf_cosine
DHLdf_o = pd.read_csv('..//Data_output//translated_DHLabData.csv', delimiter=',')

DHLdf_o['cookingtimelist'] = DHLdf_cosine['Cooking_minutes1']
DHLdf_o['steps_number'] = DHLdf_cosine['Steps_number1']
DHLdf_o['ingredientlist'] = DHLdf_cosine['Ingredient_list1']
DHLdf_o['n_ingredients'] = DHLdf_cosine['Ingredient_number1']
DHLdf_o['cookingmethod'] = DHLdf_cosine['cookingmethod1']
DHLdf_o['rating'] = DHLdf_cosine['average_rating1']
DHLdf_o = DHLdf_o.drop('cookingmethodlist', axis=1)
DHLdf_o['cookingmethod'] = DHLdf_o['cookingmethod'].fillna('nan')

DHLdf_o.to_csv('..//Data_output//DHLdf_cosine.csv', index=False)