In [1]:
import pandas as pd
import numpy as np
import sklearn
from gensim import corpora
from gensim import models
from gensim import similarities
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import heapq

In [2]:
df_ingre = pd.read_csv('recipe_dataset/db/recipe_ingre.csv')
df_basic = pd.read_csv('recipe_dataset/db/recipe_basic.csv', encoding="cp949")

df_tmp=df_ingre[['RECIPE_ID','IRDNT_NM']]
df_tmp_arr=[[]for i in range(538)]
num=0
for i in df_tmp['IRDNT_NM'] :
    df_tmp_arr[df_tmp['RECIPE_ID'][num]].append(i)
    num+=1
    
df_tmp_arr.pop(0)
num=0
for i in df_tmp_arr :
    df_tmp_arr[num] = list(set(df_tmp_arr[num]))
    num+=1
    
ingre_dict = {}

for i in range(1, 538, 1):
    ingre_dict[str(i)] = df_tmp_arr[i-1]

In [3]:
bags_of_words = [ Counter(IRDNT_NM) for IRDNT_NM in df_tmp_arr ]

In [4]:
#Find sum of every ingredient using Counter()
sumbags = sum(bags_of_words, Counter())


# Finally, plot the 10 most used ingredients
clean_df = pd.DataFrame.from_dict(sumbags, orient='index').reset_index()
clean_df = clean_df.rename(columns={'index':'IRDNT_NM', 0:'count'})

top_ing = clean_df.sort_values('count', ascending=False)

In [5]:
ingr_only_dict = clean_df['IRDNT_NM'].to_dict()

In [6]:
#Get all clean ingredients in list format per recipe
ingr_list = []
for IRDNT_NM in df_tmp_arr:
    ingr_list.append(IRDNT_NM)

In [7]:
##Get clean ingredients to be used as an input for word2vec model to identify ingredient similarity.

ingr_clean_df = pd.DataFrame({'IRDNT_NM':ingr_list})

In [8]:
#Create a dictionary for all the ingredients in the recipe list

dictionary = corpora.Dictionary(ingr_list)

In [9]:
#Applying doc2bow on the dictionary of ingredients, which converts the ingredient to a number in every recipe
#This input format is needed for TfIdfmodel
bow_corpus = [dictionary.doc2bow(text) for text in ingr_list]

In [10]:
# train the model
tfidf = models.TfidfModel(bow_corpus)

In [11]:
corpus_tfidf = tfidf[bow_corpus]

In [12]:
#Use similarities library from gensim to get the cosine similarity of the tfidf results

index = similarities.MatrixSimilarity(tfidf[bow_corpus])
index.save('ingr.index')
index = similarities.MatrixSimilarity.load('ingr.index')

sims = index[corpus_tfidf]
sims_list = [(i,j) for i,j in enumerate(sims)]

#Creating a list to hold the cosine similarity results for tfidf
tf_idf_list = []

for i,j in enumerate(sims_list):
    tf_idf_list.append(sims_list[i][1])

In [13]:
#Create recipe dict- to be used in creating dataframe in next step - used to decode recipe id
recipe_dict = {k: v for k, v in enumerate(df_basic.RECIPE_NM_KO)}

In [14]:
#Use cosine similarity results to get the top 10 similar recipes for every recipe.
tf_idf_top  = []
similar_recipes_df = pd.DataFrame([])
same_item = []

#Get only top 11 largest values from the tf_idf_list - 1 recipe will be the same as itself (hence 12)
for i,item in enumerate(tf_idf_list):
    tf_idf_top.append(heapq.nlargest(11,enumerate(item), key=lambda x: x[1]))

#Remove the recipe value with 1.0 similarity - since it is the same recipe
for i,list_item in enumerate(tf_idf_top):
    for j,k in enumerate(list_item):
        if tf_idf_top[i][j][1] != 1.0:
            similar_recipes_df = similar_recipes_df.append(pd.DataFrame({'Similar_Recipe_ID': recipe_dict.get(tf_idf_top[i][j][0]),'TF-IDF Value': tf_idf_top[i][j][1],'Recipe_ID': recipe_dict.get(i)}, index=[0]), ignore_index=True)

In [15]:
similar_recipes_df = similar_recipes_df[similar_recipes_df['Recipe_ID'] != similar_recipes_df['Similar_Recipe_ID']]

In [16]:
#Create cosine similarity matrix for all recipes 27637*27637
#Since this is a huge matrix, the top 10 similar recipe logic is a better option.

names = [i for i in range(1,len(tf_idf_list))]
final_df = pd.DataFrame.from_dict(zip(names,tf_idf_list))

In [23]:
df_basic['RECIPE_ID'] = df_basic.index+1
recipe_name_df = df_basic[['RECIPE_ID','RECIPE_NM_KO']]
final_df['RECIPE_ID'] = final_df.index+1

recipe_tf_idf_df = final_df.merge(recipe_name_df,how='left', left_on='RECIPE_ID', right_on='RECIPE_ID')

Unnamed: 0,0,1,RECIPE_ID,RECIPE_NM_KO
0,1,"[0.99999994, 0.0018554941, 0.171482, 0.2716229...",1,나물비빔밥
1,2,"[0.0018554941, 0.99999994, 0.0025249422, 0.003...",2,오곡밥
2,3,"[0.171482, 0.0025249422, 0.99999994, 0.1424119...",3,잡채밥
3,4,"[0.27162293, 0.0032559787, 0.14241198, 1.0, 0....",4,콩나물밥
4,5,"[0.014697531, 0.09412046, 0.0082202945, 0.0106...",5,약식
...,...,...,...,...
531,532,"[0.12739867, 0.0016621039, 0.3002445, 0.021962...",532,배추만두
532,533,"[0.0017570321, 0.0017188772, 0.0023909558, 0.0...",533,식빵고구마파이
533,534,"[0.07703272, 0.0, 0.07409936, 0.09555306, 0.0,...",534,카레토마토달걀볶음밥
534,535,"[0.07008466, 0.0029980952, 0.0041703465, 0.005...",535,쪽파 새우강회


In [18]:
#Create a list from tfidf results
#This will be used to identify ingredient importance within every recipe

corpus_list = []
for doc in corpus_tfidf:
    corpus_list.append(doc)

In [19]:
#Create a flat list to eliminate repetition of ingredients and create a dict to hold the results

flat_list = []
for sublist in ingr_list:
    for item in sublist:
        if item not in flat_list:
            flat_list.append(item)
ing_dict =  {k: v for k, v in enumerate(flat_list)}

In [20]:
#Create a dataframe with tf-idf values per ingredient for every recipe.
corpus_df = pd.DataFrame([])

for i,list_item in enumerate(corpus_list):
    for j,k in enumerate(list_item):
        corpus_df = corpus_df.append(pd.DataFrame({'IRDNT_NM': dictionary.get(corpus_list[i][j][0]),'TF-IDF Value': corpus_list[i][j][1],'Recipe_ID': i}, index=[0]), ignore_index=True)

In [25]:
df_basic['RECIPE_ID'] = df_basic.index
recipe_tf_idf_df = corpus_df.merge(df_basic,how='left', left_on='Recipe_ID', right_on='RECIPE_ID')
recipe_tf_idf_df
# recipe_tf_idf_df = recipe_tf_idf_df[['Recipe_ID','RECIPE_NM_KO','IRDNT_NM','TF-IDF Value']]
# tf_idf_sorting = recipe_tf_idf_df.sort_values(by=['Recipe_ID', 'IRDNT_NM'], axis=0)
# tf_idf_sorting.set_index('Recipe_ID', inplace=True)
# tf_idf_sorting.index = tf_idf_sorting.index+1
# tf_idf_sorting.reset_index(inplace=True)
# tf_idf_sorting.to_csv('recipe_dataset/tf_idf_sorting.csv', encoding="utf-8-sig")

Unnamed: 0,IRDNT_NM_x,TF-IDF Value,Recipe_ID,RECIPE_ID,RECIPE_NM_KO,SUMRY,IRDNT_NM_y,NATION_NM,TY_NM,LEVEL_NM,IRDNT_CODE,IMG_URL
0,간장,0.069606,0,0,나물비빔밥,육수로 지은 밥에 야채를 듬뿍 넣은 영양만점 나물비빔밥!,"숙주, 쌀, 소금, 쇠고기, 계란, 다진파, 고사리, 돼지고기, 묵, 미나리, 참기...",한식,밥,보통,곡류,http://file.okdab.com/UserFiles/searching/reci...
1,계란,0.160771,0,0,나물비빔밥,육수로 지은 밥에 야채를 듬뿍 넣은 영양만점 나물비빔밥!,"숙주, 쌀, 소금, 쇠고기, 계란, 다진파, 고사리, 돼지고기, 묵, 미나리, 참기...",한식,밥,보통,곡류,http://file.okdab.com/UserFiles/searching/reci...
2,고사리,0.370313,0,0,나물비빔밥,육수로 지은 밥에 야채를 듬뿍 넣은 영양만점 나물비빔밥!,"숙주, 쌀, 소금, 쇠고기, 계란, 다진파, 고사리, 돼지고기, 묵, 미나리, 참기...",한식,밥,보통,곡류,http://file.okdab.com/UserFiles/searching/reci...
3,고추장,0.185836,0,0,나물비빔밥,육수로 지은 밥에 야채를 듬뿍 넣은 영양만점 나물비빔밥!,"숙주, 쌀, 소금, 쇠고기, 계란, 다진파, 고사리, 돼지고기, 묵, 미나리, 참기...",한식,밥,보통,곡류,http://file.okdab.com/UserFiles/searching/reci...
4,다진마늘,0.088104,0,0,나물비빔밥,육수로 지은 밥에 야채를 듬뿍 넣은 영양만점 나물비빔밥!,"숙주, 쌀, 소금, 쇠고기, 계란, 다진파, 고사리, 돼지고기, 묵, 미나리, 참기...",한식,밥,보통,곡류,http://file.okdab.com/UserFiles/searching/reci...
...,...,...,...,...,...,...,...,...,...,...,...,...
5557,밀가루,0.202536,536,536,콩비지동그랑땡,두부대신 콩비지를 넣어 만든 동그랑땡 맛도 좋아요!,"전분, 소금, 부침가루, 돼지고기, 계란, 참기름, 다진마늘, 밀가루, 대파, 호박...",한식,부침,보통,콩류,http://file.okdab.com/recipe/14829957726840013...
5558,대파,0.127456,536,536,콩비지동그랑땡,두부대신 콩비지를 넣어 만든 동그랑땡 맛도 좋아요!,"전분, 소금, 부침가루, 돼지고기, 계란, 참기름, 다진마늘, 밀가루, 대파, 호박...",한식,부침,보통,콩류,http://file.okdab.com/recipe/14829957726840013...
5559,배추,0.283980,536,536,콩비지동그랑땡,두부대신 콩비지를 넣어 만든 동그랑땡 맛도 좋아요!,"전분, 소금, 부침가루, 돼지고기, 계란, 참기름, 다진마늘, 밀가루, 대파, 호박...",한식,부침,보통,콩류,http://file.okdab.com/recipe/14829957726840013...
5560,콩비지,0.497114,536,536,콩비지동그랑땡,두부대신 콩비지를 넣어 만든 동그랑땡 맛도 좋아요!,"전분, 소금, 부침가루, 돼지고기, 계란, 참기름, 다진마늘, 밀가루, 대파, 호박...",한식,부침,보통,콩류,http://file.okdab.com/recipe/14829957726840013...


In [None]:
tf_idf_df = pd.read_csv('recipe_dataset/tf_idf_sorting.csv')
tf_idf_df = tf_idf_df.drop(['Unnamed: 0'], axis = 1)
tf_idf_df = pd.concat([tf_idf_df, df_ingre["SCORE"]], axis = 1)
tfidf_weight = pd.DataFrame(tf_idf_df['TF-IDF Value'].mul(tf_idf_df['SCORE'], axis=0, level=None, fill_value=None))
tf_idf_df.insert(3, 'TF-IDF_WEIGHT',tfidf_weight)
tf_idf_df

In [None]:
df_idf_df.to_csv('tf_idf_df.csv', encoding="utf-8-sig")