In [125]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import PorterStemmer
import warnings
warnings.filterwarnings('ignore')

In [126]:
# Load the raw dish table from the CSV next to the notebook, then preview
# its first five rows.
data = pd.read_csv('indian_food.csv')
data.head(n=5)

Unnamed: 0,name,ingredients,diet,prep_time,cook_time,flavor_profile,course,state,region
0,Balu shahi,"Maida flour, yogurt, oil, sugar",vegetarian,45,25,sweet,dessert,West Bengal,East
1,Boondi,"Gram flour, ghee, sugar",vegetarian,80,30,sweet,dessert,Rajasthan,West
2,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins",vegetarian,15,60,sweet,dessert,Punjab,North
3,Ghevar,"Flour, ghee, kewra, milk, clarified butter, su...",vegetarian,15,30,sweet,dessert,Rajasthan,West
4,Gulab jamun,"Milk powder, plain flour, baking powder, ghee,...",vegetarian,15,40,sweet,dessert,West Bengal,East


In [127]:
# (rows, columns) of the loaded table.
data.shape

(255, 9)

In [128]:
# Per-column count of missing values.
data.isnull().sum()

name              0
ingredients       0
diet              0
prep_time         0
cook_time         0
flavor_profile    0
course            0
state             0
region            1
dtype: int64

In [129]:
# Show the rows whose region is missing.
data.loc[data['region'].isna()]

Unnamed: 0,name,ingredients,diet,prep_time,cook_time,flavor_profile,course,state,region
110,Panjeeri,"Whole wheat flour, musk melon seeds, poppy see...",vegetarian,10,25,sweet,dessert,Uttar Pradesh,


In [130]:
# Fill the missing region only. A frame-wide fillna('North') would also write
# 'North' into any other column that ever gains a NaN (e.g. prep_time).
# The single affected row shown above is a dish from Uttar Pradesh, hence 'North'.
data['region'] = data['region'].fillna('North')

In [131]:
# Per-column count of cells equal to -1.
data.eq(-1).sum()

name               0
ingredients        0
diet               0
prep_time         30
cook_time         28
flavor_profile     0
course             0
state              0
region             0
dtype: int64

In [132]:
# Swap the -1 entries (presumably a "not recorded" sentinel - confirm against
# the dataset description) for a display-friendly '-'.
# After this the two columns hold a mix of numbers and the string '-'.
for time_col in ('prep_time', 'cook_time'):
    data[time_col] = data[time_col].replace(-1, '-')

In [133]:
# Persist the cleaned table. index=True is the pandas default, spelled out
# here: the row index is written as an unnamed first column.
data.to_csv('indian_food_cleaned.csv', index=True)

In [134]:
# Peek at the raw ingredient strings.
data['ingredients']

0                        Maida flour, yogurt, oil, sugar
1                                Gram flour, ghee, sugar
2           Carrots, milk, sugar, ghee, cashews, raisins
3      Flour, ghee, kewra, milk, clarified butter, su...
4      Milk powder, plain flour, baking powder, ghee,...
                             ...                        
250              Glutinous rice, black sesame seeds, gur
251    Coconut milk, egg yolks, clarified butter, all...
252    Cottage cheese, dry dates, dried rose petals, ...
253    Milk powder, dry fruits, arrowroot powder, all...
254    Brown rice, fennel seeds, grated coconut, blac...
Name: ingredients, Length: 255, dtype: object

In [135]:
# Ingredient string of the row labelled 0.
data.loc[0, 'ingredients']

'Maida flour, yogurt, oil, sugar'

In [136]:
# Work on an independent copy so text preprocessing leaves `data` untouched.
df = data.copy(deep=True)

In [137]:
# Lower-case every ingredient string.
df['ingredients'] = data['ingredients'].map(str.lower)

In [138]:
# Turn the ", " separators into single spaces.
# Build on df['ingredients'] (already lower-cased in the previous step), not on
# data['ingredients']: reading from `data` silently discarded the lower-casing.
df['ingredients'] = df['ingredients'].str.replace(', ', ' ', regex=False)

In [139]:
ps = PorterStemmer()
def text_preprocess(text):
    """Lower-case and Porter-stem every whitespace-separated token of ``text``.

    Returns the stemmed tokens joined back together with single spaces.
    """
    stemmed_tokens = [ps.stem(token.lower()) for token in text.split()]
    return ' '.join(stemmed_tokens)

In [140]:
# Preview the stemmed ingredient text.
# NOTE(review): the result is only displayed, never assigned back, so
# df['ingredients'] stays unstemmed for the cells that read it afterwards.
# If stemming is intended, assign it here AND stem the query inside
# recommend() the same way, otherwise query tokens will not match the vocabulary.
df['ingredients'].map(text_preprocess)

0                           maida flour yogurt oil sugar
1                                  gram flour ghee sugar
2                   carrot milk sugar ghee cashew raisin
3      flour ghee kewra milk clarifi butter sugar alm...
4      milk powder plain flour bake powder ghee milk ...
                             ...                        
250                     glutin rice black sesam seed gur
251    coconut milk egg yolk clarifi butter all purpo...
252    cottag chees dri date dri rose petal pistachio...
253    milk powder dri fruit arrowroot powder all pur...
254    brown rice fennel seed grate coconut black pep...
Name: ingredients, Length: 255, dtype: object

In [141]:
# Persist the preprocessed frame. index=True is the pandas default, spelled
# out here: the row index is written as an unnamed first column.
df.to_csv('vector_data.csv', index=True)

In [142]:
# Bag-of-words model over the ingredient text: fit the vocabulary, then
# densify the token-count matrix (one row per dish, one column per token).
cv = CountVectorizer()
ingredient_counts = cv.fit_transform(df['ingredients'])
ingredients_matrix = ingredient_counts.toarray()
ingredients_matrix

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(255, 336))

In [143]:
# Columns available on the working copy.
df.columns

Index(['name', 'ingredients', 'diet', 'prep_time', 'cook_time',
       'flavor_profile', 'course', 'state', 'region'],
      dtype='object')

In [144]:
def recommend(user_ingredients, top_n=5):
    """Recommend the dishes whose ingredients best match ``user_ingredients``.

    Parameters
    ----------
    user_ingredients : list of str, or str
        Ingredient names. A single string is treated as a comma-separated
        list; without that, it would be iterated character by character.
    top_n : int, default 5
        Maximum number of dishes to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``data``, best match first. Dishes with zero
        cosine similarity to the query are left out, so the result may have
        fewer than ``top_n`` rows (or none when nothing matches).

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError('top_n must be a positive integer')
    if isinstance(user_ingredients, str):
        user_ingredients = user_ingredients.split(',')

    query = ' '.join(ingredient.lower().strip() for ingredient in user_ingredients)
    query_vector = cv.transform([query])
    similarity = cosine_similarity(query_vector, ingredients_matrix).flatten()

    # argsort is ascending: take the last top_n positions and reverse them.
    top_indices = similarity.argsort()[-top_n:][::-1]
    # Do not pad the result with dishes that share no ingredient token at all.
    top_indices = top_indices[similarity[top_indices] > 0]

    # ingredients_matrix rows are positionally aligned with `data` (df is a copy
    # of it), so positional lookup returns the original display values.
    recommendations = data.iloc[top_indices].copy()
    return recommendations[['name', 'ingredients', 'diet', 'prep_time', 'cook_time', 'flavor_profile', 'course', 'state', 'region']]

In [145]:
recommend(['milk', 'sugar', 'ghee', 'carrots'])

Unnamed: 0,name,ingredients,diet,prep_time,cook_time,flavor_profile,course,state,region
2,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins",vegetarian,15,60,sweet,dessert,Punjab,North
19,Sohan papdi,"Gram flour, ghee, sugar, milk, cardamom",vegetarian,-,60,sweet,dessert,Maharashtra,West
25,Ledikeni,"Chhena, sugar, ghee",vegetarian,45,45,sweet,dessert,West Bengal,East
21,Chhena kheeri,"Chhena, sugar, milk",vegetarian,-,60,sweet,dessert,Odisha,East
56,Basundi,"Sugar, milk, nuts",vegetarian,10,35,sweet,dessert,Gujarat,West
