In [1]:
import pandas as pd
import numpy as np
import sklearn
from gensim import corpora
from gensim import models
from gensim import similarities
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import heapq

In [45]:
#df_ingre = pd.read_csv('recipe_dataset/recipe_ingre.csv')
#df = df_ingre.drop_duplicates(subset = ['RECIPE_ID', 'IRDNT_NM'])
#df.to_csv('recipe_dataset/recipe_ingre2.csv', encoding="utf-8-sig")

In [2]:
df_ingre = pd.read_csv('recipe_dataset/recipe_ingre.csv')
df_basic = pd.read_csv('recipe_dataset/recipe_basic.csv', encoding="cp949")

df_tmp=df_ingre[['RECIPE_ID','IRDNT_NM']]
df_tmp_arr=[[]for i in range(538)]
num=0
for i in df_tmp['IRDNT_NM'] :
    df_tmp_arr[df_tmp['RECIPE_ID'][num]].append(i)
    num+=1
    
df_tmp_arr.pop(0)
num=0
for i in df_tmp_arr :
    df_tmp_arr[num] = list(set(df_tmp_arr[num]))
    num+=1
    
ingre_dict = {}

for i in range(1, 538, 1):
    ingre_dict[str(i)] = df_tmp_arr[i-1]

In [3]:
len(ingre_dict)

537

In [4]:
bags_of_words = [ Counter(IRDNT_NM) for IRDNT_NM in df_tmp_arr ]

In [5]:
len(bags_of_words)

537

In [6]:
#Find sum of every ingredient using Counter()
sumbags = sum(bags_of_words, Counter())


# Finally, plot the 10 most used ingredients
clean_df = pd.DataFrame.from_dict(sumbags, orient='index').reset_index()
clean_df = clean_df.rename(columns={'index':'ingredient', 0:'count'})

top_ing = clean_df.sort_values('count', ascending=False)

In [7]:
top_ing

Unnamed: 0,ingredient,count
11,소금,332
7,간장,249
9,설탕,214
29,후춧가루,210
15,다진마늘,203
...,...,...
320,고들빼기,1
164,총각무,1
322,참나물,1
325,완두콩통조림,1


In [8]:
ingr_only_dict = clean_df['ingredient'].to_dict()

In [9]:
#Get all clean ingredients in list format per recipe
ingr_list = []
for IRDNT_NM in df_tmp_arr:
    ingr_list.append(IRDNT_NM)

In [10]:
len(ingr_list)

537

In [11]:
##Get clean ingredients to be used as an input for word2vec model to identify ingredient similarity.

ingr_clean_df = pd.DataFrame({'Ingredient':ingr_list})

In [12]:
#Create a dictionary for all the ingredients in the recipe list

dictionary = corpora.Dictionary(ingr_list)

In [13]:
#Applying doc2bow on the dictionary of ingredients, which converts the ingredient to a number in every recipe
#This input format is needed for TfIdfmodel
bow_corpus = [dictionary.doc2bow(text) for text in ingr_list]

In [14]:
# train the model
tfidf = models.TfidfModel(bow_corpus)

In [15]:
corpus_tfidf = tfidf[bow_corpus]

In [16]:
#Use similarities library from gensim to get the cosine similarity of the tfidf results

index = similarities.MatrixSimilarity(tfidf[bow_corpus])
index.save('ingr.index')
index = similarities.MatrixSimilarity.load('ingr.index')

sims = index[corpus_tfidf]
sims_list = [(i,j) for i,j in enumerate(sims)]

#Creating a list to hold the cosine similarity results for tfidf
tf_idf_list = []

for i,j in enumerate(sims_list):
    tf_idf_list.append(sims_list[i][1])

In [17]:
#Create recipe dict- to be used in creating dataframe in next step - used to decode recipe id
recipe_dict = {k: v for k, v in enumerate(df_basic.RECIPE_NM_KO)}

In [18]:
#Use cosine similarity results to get the top 10 similar recipes for every recipe.
tf_idf_top  = []
similar_recipes_df = pd.DataFrame([])
same_item = []

#Get only top 11 largest values from the tf_idf_list - 1 recipe will be the same as itself (hence 12)
for i,item in enumerate(tf_idf_list):
    tf_idf_top.append(heapq.nlargest(11,enumerate(item), key=lambda x: x[1]))

#Remove the recipe value with 1.0 similarity - since it is the same recipe
for i,list_item in enumerate(tf_idf_top):
    for j,k in enumerate(list_item):
        if tf_idf_top[i][j][1] != 1.0:
            similar_recipes_df = similar_recipes_df.append(pd.DataFrame({'Similar_Recipe_ID': recipe_dict.get(tf_idf_top[i][j][0]),'TF-IDF Value': tf_idf_top[i][j][1],'Recipe_ID': recipe_dict.get(i)}, index=[0]), ignore_index=True)

In [19]:
similar_recipes_df = similar_recipes_df[similar_recipes_df['Recipe_ID'] != similar_recipes_df['Similar_Recipe_ID']]

In [20]:
#Create cosine similarity matrix for all recipes 27637*27637
#Since this is a huge matrix, the top 10 similar recipe logic is a better option.

names = [i for i in range(1,len(tf_idf_list))]
final_df = pd.DataFrame.from_dict(zip(names,tf_idf_list))

In [21]:
final_df

Unnamed: 0,0,1
0,1,"[1.0000001, 0.0017785624, 0.11558835, 0.333323..."
1,2,"[0.0017785624, 1.0, 0.0024592949, 0.002959687,..."
2,3,"[0.11558835, 0.0024592949, 1.0, 0.04783988, 0...."
3,4,"[0.3333232, 0.002959687, 0.04783988, 1.0, 0.00..."
4,5,"[0.013954182, 0.12799355, 0.007930435, 0.00954..."
...,...,...
531,532,"[0.12392111, 0.0016378999, 0.2967598, 0.020259..."
532,533,"[0.0017090708, 0.0016938464, 0.002363206, 0.00..."
533,534,"[0.030211665, 0.0, 0.093179785, 0.014064255, 0..."
534,535,"[0.068171576, 0.0029544362, 0.0041219443, 0.00..."


In [22]:
df_basic['RECIPE_ID'] = df_basic.index+1
recipe_name_df = df_basic[['RECIPE_ID','RECIPE_NM_KO']]
final_df['RECIPE_ID'] = final_df.index+1

recipe_tf_idf_df = final_df.merge(recipe_name_df,how='left', left_on='RECIPE_ID', right_on='RECIPE_ID')

In [23]:
recipe_tf_idf_df

Unnamed: 0,0,1,RECIPE_ID,RECIPE_NM_KO
0,1,"[1.0000001, 0.0017785624, 0.11558835, 0.333323...",1,나물비빔밥
1,2,"[0.0017785624, 1.0, 0.0024592949, 0.002959687,...",2,오곡밥
2,3,"[0.11558835, 0.0024592949, 1.0, 0.04783988, 0....",3,잡채밥
3,4,"[0.3333232, 0.002959687, 0.04783988, 1.0, 0.00...",4,콩나물밥
4,5,"[0.013954182, 0.12799355, 0.007930435, 0.00954...",5,약식
...,...,...,...,...
531,532,"[0.12392111, 0.0016378999, 0.2967598, 0.020259...",532,배추만두
532,533,"[0.0017090708, 0.0016938464, 0.002363206, 0.00...",533,식빵고구마파이
533,534,"[0.030211665, 0.0, 0.093179785, 0.014064255, 0...",534,카레토마토달걀볶음밥
534,535,"[0.068171576, 0.0029544362, 0.0041219443, 0.00...",535,쪽파 새우강회


In [24]:
#Create a list from tfidf results
#This will be used to identify ingredient importance within every recipe

corpus_list = []
for doc in corpus_tfidf:
    corpus_list.append(doc)

In [25]:
#Create a flat list to eliminate repetition of ingredients and create a dict to hold the results

flat_list = []
for sublist in ingr_list:
    for item in sublist:
        if item not in flat_list:
            flat_list.append(item)
ing_dict =  {k: v for k, v in enumerate(flat_list)}

In [26]:
#Create a dataframe with tf-idf values per ingredient for every recipe.
corpus_df = pd.DataFrame([])

for i,list_item in enumerate(corpus_list):
    for j,k in enumerate(list_item):
        corpus_df = corpus_df.append(pd.DataFrame({'Ingredient': dictionary.get(corpus_list[i][j][0]),'TF-IDF Value': corpus_list[i][j][1],'Recipe_ID': i}, index=[0]), ignore_index=True)

In [38]:
df_basic['RECIPE_ID'] = df_basic.index
recipe_tf_idf_df = corpus_df.merge(df_basic,how='left', left_on='Recipe_ID', right_on='RECIPE_ID')
recipe_tf_idf_df = recipe_tf_idf_df[['Recipe_ID','RECIPE_NM_KO','Ingredient','TF-IDF Value']]
tf_idf_sorting = recipe_tf_idf_df.sort_values(by=['Recipe_ID', 'Ingredient'], axis=0)
tf_idf_sorting.to_csv('recipe_dataset/tf_idf_sorting.csv', encoding="utf-8-sig")

In [41]:
tf_idf_df = pd.read_csv('recipe_dataset/tf_idf_sorting.csv')

In [43]:
tfidf_weight = pd.DataFrame(tf_idf_df['TF-IDF Value'].mul(tf_idf_df['SCORE'], axis=0, level=None, fill_value=None))

In [49]:
tf_idf_df.insert(4, 'TF-IDF_WEIGHT',tfidf_weight)

In [50]:
tf_idf_df

Unnamed: 0,RECIPE_ID,RECIPE_NM_KO,Ingredient,TF-IDF Value,TF-IDF_WEIGHT,SCORE
0,1,나물비빔밥,간장,0.067706,0.203117,3
1,1,나물비빔밥,계란,0.156383,0.156383,1
2,1,나물비빔밥,고사리,0.360205,1.801023,5
3,1,나물비빔밥,고추장,0.180763,0.903817,5
4,1,나물비빔밥,다진마늘,0.085699,0.171398,2
...,...,...,...,...,...,...
5557,537,콩비지동그랑땡,전분,0.421995,1.687980,4
5558,537,콩비지동그랑땡,참기름,0.085066,0.170131,2
5559,537,콩비지동그랑땡,콩비지,0.541392,2.706960,5
5560,537,콩비지동그랑땡,호박,0.283380,0.283380,1


In [51]:
tf_idf_df.to_csv('tf_idf_df.csv', encoding="utf-8-sig")