In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# User ratings: one row per (user, food) rating event.
ratings = pd.read_csv('ratings.csv')
# Food catalogue: one row per food item.
foods = pd.read_csv('foods.csv')

In [3]:
foods.head()

Unnamed: 0,Food_ID,Name,C_Type,Veg_Non,Describe
0,1,summer squash salad,Healthy Food,veg,"white balsamic vinegar, lemon juice, lemon rin..."
1,2,chicken minced salad,Healthy Food,non-veg,"olive oil, chicken mince, garlic (minced), oni..."
2,3,sweet chilli almonds,Snack,veg,"almonds whole, egg white, curry leaves, salt, ..."
3,4,tricolour salad,Healthy Food,veg,"vinegar, honey/sugar, soy sauce, salt, garlic ..."
4,5,christmas cake,Dessert,veg,"christmas dry fruits (pre-soaked), orange zest..."


In [4]:
foods.shape

(400, 5)

In [5]:
foods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Food_ID   400 non-null    int64 
 1   Name      400 non-null    object
 2   C_Type    400 non-null    object
 3   Veg_Non   400 non-null    object
 4   Describe  400 non-null    object
dtypes: int64(1), object(4)
memory usage: 15.8+ KB


In [6]:
foods.isnull().sum()

Food_ID     0
Name        0
C_Type      0
Veg_Non     0
Describe    0
dtype: int64

In [7]:
ratings.head()

Unnamed: 0,User_ID,Food_ID,Rating
0,1.0,88.0,4.0
1,1.0,46.0,3.0
2,1.0,24.0,5.0
3,1.0,25.0,4.0
4,2.0,49.0,1.0


In [8]:
ratings.shape

(512, 3)

In [9]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 512 entries, 0 to 511
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   User_ID  511 non-null    float64
 1   Food_ID  511 non-null    float64
 2   Rating   511 non-null    float64
dtypes: float64(3)
memory usage: 12.1 KB


In [10]:
ratings.isnull().sum()

User_ID    1
Food_ID    1
Rating     1
dtype: int64

In [11]:
# NOTE(review): identical to the null-count check in the previous cell; one of the two can be deleted.
ratings.isnull().sum()

User_ID    1
Food_ID    1
Rating     1
dtype: int64

In [12]:
ratings[ratings['User_ID'].isnull()]

Unnamed: 0,User_ID,Food_ID,Rating
511,,,


In [13]:
# Discard every row that has a missing value in any column.
ratings = ratings.dropna()

In [14]:
ratings.isnull().sum()

User_ID    0
Food_ID    0
Rating     0
dtype: int64

In [15]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 511 entries, 0 to 510
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   User_ID  511 non-null    float64
 1   Food_ID  511 non-null    float64
 2   Rating   511 non-null    float64
dtypes: float64(3)
memory usage: 16.0 KB


In [16]:
# Basic dataset dimensions: total ratings, distinct rated foods, distinct users.
n_ratings = len(ratings)
n_foods = len(ratings['Food_ID'].unique())
n_users = len(ratings['User_ID'].unique())
# Backward-compatible alias: this count was originally named `n_movies`,
# although it counts distinct Food_ID values.
n_movies = n_foods

In [17]:
# Number of foods each user has rated, as a two-column frame (User_ID, n_ratings).
user_freq = (
    ratings.groupby('User_ID')['Food_ID']
    .count()
    .rename('n_ratings')
    .reset_index()
)
user_freq.head()

Unnamed: 0,User_ID,n_ratings
0,1.0,4
1,2.0,4
2,3.0,9
3,4.0,6
4,5.0,6


In [18]:
# Per-food rating count and mean rating, indexed by Food_ID.
# Aggregating the Rating Series directly yields flat 'count' / 'mean' columns.
ratings_by_food = ratings.groupby('Food_ID')['Rating']
food_stats = ratings_by_food.agg(['count', 'mean'])

In [19]:
food_stats.head()

Unnamed: 0_level_0,count,mean
Food_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2,7.5
2.0,3,6.0
3.0,2,4.0
4.0,2,6.0
5.0,6,6.5


In [20]:
from scipy.sparse import csr_matrix
  
def create_matrix(df):
    """Build a sparse food-by-user rating matrix and the ID <-> index lookups.

    Args:
        df: DataFrame with 'User_ID', 'Food_ID' and 'Rating' columns; the two
            ID columns must not contain missing values.

    Returns:
        A 5-tuple ``(X, user_mapper, food_mapper, user_inv_mapper, food_inv_mapper)``:
            X: ``csr_matrix`` of shape (n_foods, n_users) holding the ratings,
               one row per food and one column per user.
            user_mapper / food_mapper: dict from ID to column / row index.
            user_inv_mapper / food_inv_mapper: dict from index back to ID.

    Raises:
        ValueError: if 'User_ID' or 'Food_ID' contains missing values.
    """
    # Fail fast on missing IDs instead of silently giving NaN its own row/column.
    if df[["User_ID", "Food_ID"]].isna().any().any():
        raise ValueError("User_ID and Food_ID must not contain missing values")

    # np.unique returns the sorted distinct IDs; return_inverse additionally
    # gives, for every row of df, the position of its ID in that sorted array.
    user_ids, user_index = np.unique(df["User_ID"].to_numpy(), return_inverse=True)
    food_ids, food_index = np.unique(df["Food_ID"].to_numpy(), return_inverse=True)
    N, M = len(user_ids), len(food_ids)

    # Map IDs to indices
    user_mapper = dict(zip(user_ids, range(N)))
    food_mapper = dict(zip(food_ids, range(M)))

    # Map indices to IDs
    user_inv_mapper = dict(zip(range(N), user_ids))
    food_inv_mapper = dict(zip(range(M), food_ids))

    # Rows are foods, columns are users. Note that csr_matrix sums the values
    # of entries sharing the same (row, col), so a user who rated the same
    # food more than once contributes the sum of those ratings.
    X = csr_matrix((df["Rating"].to_numpy(), (food_index, user_index)), shape=(M, N))

    return X, user_mapper, food_mapper, user_inv_mapper, food_inv_mapper
  
# Build the food-by-user matrix and the ID/index lookups from the ratings frame.
# find_similar_foods reads food_mapper and food_inv_mapper as module-level names.
X, user_mapper, food_mapper, user_inv_mapper, food_inv_mapper = create_matrix(ratings)

In [24]:
from sklearn.neighbors import NearestNeighbors
"""
Find similar foods using KNN
"""
def find_similar_foods(food_id, X, k, metric='cosine', show_distance=False):
    """Return the k foods whose rating vectors are nearest to a given food.

    Uses a brute-force ``NearestNeighbors`` search over the rows of ``X``.
    Reads the module-level ``food_mapper`` (food ID -> row index) and
    ``food_inv_mapper`` (row index -> food ID) lookups.

    Args:
        food_id: ID of the query food; must be a key of ``food_mapper``.
        X: rating matrix with one row per food, in ``food_mapper`` row order.
        k: number of similar foods to return.
        metric: distance metric forwarded to ``NearestNeighbors``.
        show_distance: if True, return ``(food_id, distance)`` pairs instead
            of bare food IDs.

    Returns:
        A list, nearest first, of at most ``k`` food IDs (or
        ``(food_id, distance)`` tuples when ``show_distance`` is True). Fewer
        than ``k`` items are returned when ``X`` has fewer than ``k + 1`` rows.

    Raises:
        KeyError: if ``food_id`` is not present in ``food_mapper``.
    """
    food_ind = food_mapper[food_id]
    food_vec = X[food_ind].reshape(1, -1)

    # The query row is part of the fitted data, so request one extra
    # neighbour, but never more rows than X has (sklearn would raise).
    n_neighbors = min(k + 1, X.shape[0])
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)

    # Always fetch distances: kneighbors returns a (distances, indices) tuple
    # here, and the neighbour ordering is the same either way.
    distances, indices = kNN.kneighbors(food_vec, return_distance=True)

    # Exclude the query row by index rather than by position: a food with an
    # identical rating vector ties at distance 0 and may be ranked first.
    hits = [
        (int(ind), float(dist))
        for ind, dist in zip(indices[0], distances[0])
        if ind != food_ind
    ][:k]

    if show_distance:
        return [(food_inv_mapper[ind], dist) for ind, dist in hits]
    return [food_inv_mapper[ind] for ind, _ in hits]
  
  
# Lookup table from Food_ID to the food's display name.
food_names = foods.set_index('Food_ID')['Name'].to_dict()

In [27]:
# Demo: list the 10 foods most similar to one query food.
food_id = 5

similar_ids = find_similar_foods(food_id, X, k=10)
food_name = food_names[food_id]

# Header names the query food; each following line is one recommendation.
print(f"Since you ate {food_name}")
for similar_id in similar_ids:
    print(food_names[similar_id])

Since you ate christmas cake
french pork chop
egg in a blanket
chicken paella
couscous with ratatouille - tangy tomato sauce
prawn potato soup
apple kheer
sous-vide salmon tikka
chicken minced salad
roast turkey with cranberry sauce
cinnamon star cookies
