In [None]:
# Mount Google Drive (Colab-only) so the project files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os
# Work from the project folder on Drive so the relative CSV paths used by later cells resolve.
os.chdir('/content/drive/My Drive/ChefGPT')


In [None]:
import pandas as pd
# Load the three project tables; paths are relative to the current working directory.
food_df = pd.read_csv('posts.csv')  # dish posts (dishName, ingredients, food_id, ...)
user_df = pd.read_csv('users.csv')  # user profiles
user_interaction_df = pd.read_csv('user_interaction.csv')  # user_id, food_id, swiped, liked

In [None]:
# Column names, dtypes and non-null counts of the posts table.
food_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3550 entries, 0 to 3549
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   course        3550 non-null   object
 1   cuisine       3550 non-null   object
 2   description   3550 non-null   object
 3   diet          3550 non-null   object
 4   dishName      3550 non-null   object
 5   imageUrl      3550 non-null   object
 6   ingredients   3550 non-null   object
 7   instructions  3550 non-null   object
 8   likeCount     3550 non-null   int64 
 9   swipeCount    3550 non-null   int64 
 10  timeTaken     3550 non-null   object
 11  userId        3550 non-null   object
 12  food_id       3550 non-null   object
dtypes: int64(2), object(11)
memory usage: 360.7+ KB


In [None]:
# Column names, dtypes and non-null counts of the users table.
# NOTE(review): the username column is spelled 'userame' in this table, and it has 397 rows
# while the interaction log references 399 distinct user_ids — worth confirming upstream.
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   age       397 non-null    int64 
 1   city      397 non-null    object
 2   country   397 non-null    object
 3   gender    397 non-null    object
 4   language  397 non-null    object
 5   userame   397 non-null    object
 6   user_id   397 non-null    object
dtypes: int64(1), object(6)
memory usage: 21.8+ KB


In [None]:
# Column names, dtypes and non-null counts of the interaction log (user_id, food_id, swiped, liked).
user_interaction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883382 entries, 0 to 883381
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   user_id  883382 non-null  object
 1   food_id  883382 non-null  object
 2   swiped   883382 non-null  int64 
 3   liked    883382 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 27.0+ MB


In [None]:
# Lookup table food_id -> dish name, used by later cells to label cluster members.
food_names = food_df.set_index('food_id')['dishName'].to_dict()

# Dimensions of the (user x food) interaction matrix. These counts must match the index
# spaces built by the dataset class, which also enumerates .unique() values.
n_users = len(user_interaction_df.user_id.unique())
n_items = len(user_interaction_df.food_id.unique())
n_cells = n_users * n_items
n_ratings = len(user_interaction_df)

# Share of matrix cells that have an interaction row.
# Assumes each (user, food) pair appears at most once in the log — TODO confirm.
fill_pct = n_ratings / n_cells * 100

print("Number of unique users:", n_users)
print("Number of unique food:", n_items)
print("The full rating matrix will have:", n_cells, 'elements.')
print('----------')
print("Number of ratings:", n_ratings)
print("Therefore: ", fill_pct, '% of the matrix is filled.')
# Report the measured sparsity instead of asserting it: the previous hard-coded
# "incredibly sparse" message contradicted the computed fill percentage.
print("That leaves", 100 - fill_pct, "% of the user/food pairs without a recorded interaction.")
print("And... as you can imagine, as the number of users and products grow, the number of elements will grow as n_users * n_items")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")


Number of unique users: 399
Number of unique food: 3550
The full rating matrix will have: 1416450 elements.
----------
Number of ratings: 883382
Therefore:  62.365914786967416 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [None]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Dot-product matrix factorization over user and item embeddings.

    Args:
        n_users: number of distinct user indices (rows of the user embedding table).
        n_items: number of distinct item indices (rows of the item embedding table).
        n_factors: length of each latent vector.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # Embedding tables act as lookup tables: index -> latent vector.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small non-negative values in [0, 0.05).
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: integer tensor indexed as data[:, 0] = user index, data[:, 1] = item index.

        Returns:
            1-D tensor with one dot-product score per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item indices given as two separate arguments.

        ``user`` and ``item`` may be ints, sequences or tensors of indices; a single
        value on one side is broadcast against several on the other. The previous
        implementation passed both arguments to ``forward``, which accepts only one
        tensor, so every call raised ``TypeError``.

        Returns:
            1-D tensor of scores, one per (user, item) pair.
        """
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        users, items = torch.broadcast_tensors(users, items)
        return self.forward(torch.stack((users, items), dim=1))

In [None]:

# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

# Note: This isn't 'good' practice, in a MLops sense but we'll roll with this since the data is already loaded in memory.
class Loader(Dataset):
    """Torch dataset of (user index, food index) -> swiped samples.

    Raw string IDs are re-indexed to contiguous integers (in order of first
    appearance) so they can be used as embedding row indices.

    Args:
        interactions: DataFrame with 'user_id', 'food_id' and 'swiped' columns.
            Defaults to the notebook-level ``user_interaction_df`` so the original
            zero-argument ``Loader()`` call keeps working.

    Attributes:
        swipes: copy of the input frame with the ID columns replaced by indices.
        userid2idx / foodid2idx: raw ID -> contiguous index.
        idx2userid / idx2foodid: contiguous index -> raw ID.
        x: integer tensor, one row per interaction: [user index, food index].
        y: tensor of the 'swiped' values.
    """

    def __init__(self, interactions=None):
        if interactions is None:
            interactions = user_interaction_df
        self.swipes = interactions.copy()

        # Extract all user IDs and food IDs
        users = interactions['user_id'].unique()
        foods = interactions['food_id'].unique()

        # Raw ID -> contiguous index, and the inverse lookups.
        self.userid2idx = {raw_id: idx for idx, raw_id in enumerate(users)}
        self.foodid2idx = {raw_id: idx for idx, raw_id in enumerate(foods)}
        self.idx2userid = {idx: raw_id for raw_id, idx in self.userid2idx.items()}
        self.idx2foodid = {idx: raw_id for raw_id, idx in self.foodid2idx.items()}

        # Replace the raw IDs by their indices (dict-based map instead of a per-row lambda).
        self.swipes['food_id'] = interactions['food_id'].map(self.foodid2idx)
        self.swipes['user_id'] = interactions['user_id'].map(self.userid2idx)

        # Select the feature columns by name so the (user, food) order the model's
        # forward() expects does not depend on the column order of the CSV.
        # NOTE(review): the target is 'swiped'; 'liked' is unused — confirm this is intended.
        self.x = torch.tensor(self.swipes[['user_id', 'food_id']].values)
        self.y = torch.tensor(self.swipes['swiped'].values)

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.swipes)

In [None]:
# Build the dataset from user_interaction_df; this also creates the ID <-> index lookup dictionaries.
train_set = Loader()

In [None]:
import pickle

# Save train_set to a file using pickle, so its ID <-> index dictionaries can be reloaded later.
# NOTE(review): unpickling requires the Loader class to be defined/importable in the loading
# session; the absolute Drive path duplicates the working directory set earlier.
train_set_path = '/content/drive/My Drive/ChefGPT/train_set.pkl'
with open(train_set_path, 'wb') as f:
    pickle.dump(train_set, f)

print("train_set saved successfully at:", train_set_path)


train_set saved successfully at: /content/drive/My Drive/ChefGPT/train_set.pkl


In [None]:
# Preview the first few index -> food_id entries instead of dumping the whole mapping
# (one entry per distinct food) into the cell output.
dict(list(train_set.idx2foodid.items())[:10])

{0: '-Nwy3-zVfxq6ivUROnCw',
 1: '-Nwy8ex6KJoYRGv7LJ0g',
 2: '-Nwy7PIgV7EZN_7rUw-0',
 3: '-Nwy8xEr2Duzu1TV1121',
 4: '-Nwy6XZML7guT75yPvLF',
 5: '-Nwy60_VjeItlz5DdvjV',
 6: '-Nwy8DzPV-jl4K2WPySO',
 7: '-Nwy6FgSn2NtW8tXMmGR',
 8: '-Nwy7ixKv7sJqL0l7rZg',
 9: '-Nwy2MLeSN0Hg8Xx7iAG',
 10: '-Nwy3-90KgWm44fuz8KV',
 11: '-Nwy5d4QOZLbwzn2cm1T',
 12: '-Nwy4zBbLEW6ilE4ZJhH',
 13: '-Nwy5BuAAlYi8Mu3MGtp',
 14: '-Nwy6WRxj-e1-lSfPlnL',
 15: '-Nwy99VVM2yDLf9NxO2o',
 16: '-Nwy8Ki1QZUCqTV2iz1Q',
 17: '-Nwy3l0pWJeASpPYS8kV',
 18: '-Nwy4ZBehTd0Ws9DFbIH',
 19: '-Nwy2ZpK3RU3Wryb7f0r',
 20: '-Nwy4DQTT2hep_2MHoxH',
 21: '-Nwy88Wk_YlPGG6TPKZ0',
 22: '-Nwy1i0-L_o1-PJliSwB',
 23: '-Nwy39shezm5bpUv-mpG',
 24: '-Nwy4pXNsXXVY0k3K2-x',
 25: '-Nwy2f87imm-wfvDg72Z',
 26: '-Nwy2V1P3QrNuYpHFEV0',
 27: '-Nwy8_4jXWFHe0u9jkLS',
 28: '-Nwy1yJF6YUBW8JmpnRv',
 29: '-Nwy5Ua1cxWV-L6J53r4',
 30: '-Nwy54nxaIPzq2Eqs6wx',
 31: '-Nwy4yvapAIvaL6GNoO5',
 32: '-Nwy3-hH5kGxKy8kssFB',
 33: '-Nwy4AwFtsgX5y9FuTSt',
 34: '-Nwy6KdsYBU8pOdHMp

In [None]:
num_epochs = 20
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Seed torch's RNG so the embedding initialisation and the DataLoader shuffling
# (and therefore the clusters derived from the embeddings) are repeatable.
torch.manual_seed(0)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
# Show the freshly initialised factors for comparison with the trained ones later.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# GPU enable if you have a GPU...
if cuda:
    model = model.cuda()

# MSE loss
loss_fn = torch.nn.MSELoss()

# ADAM optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train data: reuse the train_set built earlier rather than constructing Loader() again,
# which would re-map every interaction row only to produce the same dataset.
train_loader = DataLoader(train_set, 20, shuffle=True)  # batches of 20 samples

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(399, 8)
  (item_factors): Embedding(3550, 8)
)
user_factors.weight tensor([[0.0261, 0.0167, 0.0216,  ..., 0.0295, 0.0456, 0.0258],
        [0.0320, 0.0402, 0.0141,  ..., 0.0474, 0.0150, 0.0054],
        [0.0487, 0.0264, 0.0214,  ..., 0.0405, 0.0400, 0.0200],
        ...,
        [0.0028, 0.0361, 0.0336,  ..., 0.0486, 0.0272, 0.0125],
        [0.0170, 0.0019, 0.0014,  ..., 0.0403, 0.0329, 0.0012],
        [0.0287, 0.0379, 0.0483,  ..., 0.0207, 0.0296, 0.0330]])
item_factors.weight tensor([[0.0338, 0.0335, 0.0465,  ..., 0.0283, 0.0003, 0.0218],
        [0.0396, 0.0304, 0.0484,  ..., 0.0133, 0.0160, 0.0304],
        [0.0352, 0.0093, 0.0404,  ..., 0.0490, 0.0013, 0.0178],
        ...,
        [0.0262, 0.0494, 0.0481,  ..., 0.0049, 0.0061, 0.0472],
        [0.0187, 0.0096, 0.0167,  ..., 0.0470, 0.0110, 0.0226],
        [0.0480, 0.0485, 0.0485,  ..., 0.0435, 0.0205, 0.0268]])


In [None]:
# One pass over train_loader per epoch; prints the mean batch loss of each epoch.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA. The optimisation step below used to be
        # nested under this `if`, so on a CPU-only runtime nothing was trained and the
        # epoch average divided by zero.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        # The model already returns one score per row, so no squeeze is needed (squeezing a
        # batch of one would produce a 0-d tensor that mismatches the target shape).
        outputs = model(x)
        loss = loss_fn(outputs, y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/20 [00:00<?, ?it/s]

iter #0 Loss: 0.02841308792966383
iter #1 Loss: 0.0007884488715279692
iter #2 Loss: 0.0006789604292327813
iter #3 Loss: 0.0006091051893126044
iter #4 Loss: 0.0005620905136926446
iter #5 Loss: 0.0005287597141008173
iter #6 Loss: 0.000504135187610302
iter #7 Loss: 0.0004859206880373969
iter #8 Loss: 0.00047223621002384305
iter #9 Loss: 0.00046061795132240174
iter #10 Loss: 0.00045172781996937416
iter #11 Loss: 0.00044479486135206133
iter #12 Loss: 0.00043971240736169823
iter #13 Loss: 0.000435579297354538
iter #14 Loss: 0.00043131069638802983
iter #15 Loss: 0.0004289470015992088
iter #16 Loss: 0.00042619973462166346
iter #17 Loss: 0.0004245515538210956
iter #18 Loss: 0.00042323480773333446
iter #19 Loss: 0.0004227819503668803


In [None]:
# By training the model, we will have tuned latent factors for foods and users.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

# Read each factor matrix by attribute name rather than relying on a counter and on the
# order in which named_parameters() yields them.
uw = model.user_factors.weight.data  # user latent factors
iw = model.item_factors.weight.data  # food latent factors

user_factors.weight tensor([[0.2402, 0.2229, 0.2181,  ..., 0.2057, 0.2422, 0.2245],
        [0.2172, 0.2298, 0.2384,  ..., 0.2175, 0.2229, 0.2275],
        [0.2343, 0.2330, 0.2399,  ..., 0.2208, 0.2310, 0.2310],
        ...,
        [0.2120, 0.2259, 0.2378,  ..., 0.2022, 0.2332, 0.2324],
        [0.2160, 0.2203, 0.2281,  ..., 0.2169, 0.2145, 0.2328],
        [0.2163, 0.2340, 0.2492,  ..., 0.2080, 0.2197, 0.2248]],
       device='cuda:0')
item_factors.weight tensor([[0.5449, 0.5409, 0.5710,  ..., 0.5292, 0.5337, 0.5450],
        [0.5219, 0.5472, 0.5415,  ..., 0.5284, 0.5475, 0.5309],
        [0.5611, 0.5170, 0.5562,  ..., 0.5638, 0.5787, 0.5792],
        ...,
        [0.5915, 0.5589, 0.5651,  ..., 0.5380, 0.5486, 0.5497],
        [0.5589, 0.5117, 0.5595,  ..., 0.5412, 0.5162, 0.5366],
        [0.5372, 0.5609, 0.5681,  ..., 0.5739, 0.5503, 0.5475]],
       device='cuda:0')


In [None]:

# Bring the trained item (food) factor matrix to the CPU as a NumPy array; row i belongs to food index i.
trained_food_embeddings = model.item_factors.weight.data.cpu().numpy()

In [None]:

from sklearn.cluster import KMeans
# Fit the clusters on the trained food embeddings: 8 clusters, with a fixed random_state
# so the clustering is repeatable for a given embedding matrix.
kmeans = KMeans(n_clusters=8, random_state=0).fit(trained_food_embeddings)



In [None]:
# List the most-interacted-with dishes of every cluster. The clusters come purely from the
# interaction-trained embeddings: the model never sees dish names or other dish metadata.

# Count interactions per food once, instead of rescanning the whole log for every food.
interaction_counts = user_interaction_df['food_id'].value_counts()

for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    fods = []
    for foodidx in np.where(kmeans.labels_ == cluster)[0]:
        foodid = train_set.idx2foodid[foodidx]
        rat_count = interaction_counts.get(foodid, 0)
        # food_names is looked up with hyphens removed — presumably posts.csv stores
        # food_id without them; verify against the data.
        fods.append((food_names[foodid.replace("-", "")], rat_count))
    # Stable sort: dishes with equal counts keep their index order.
    for mov in sorted(fods, key=lambda tup: tup[1], reverse=True)[:8]:
        print("\t", mov[0])

Cluster #0
	 Hyderabadi Khadi Masoor Dal Recipe - Hyderabadi Style Lentil Curry
	 पुदीना और अनार का रायता रेसिपी - Mint And Pomegranate Raita (Recipe In Hindi)
	 करूवेपिल्लई पचड़ी रेसिपी - Karuvepillai Pachadi Recipe
	 Chikkudukaya Vepudu Recipe-Broad Beans Masala Poriyal
	 Delicious Breakfast Anda Ghotala Recipe With Butter Pav
	 Methi Chaman Recipe -Paneer In Dark Leafy Greens Based Gravy
	 Maize Daliya Khichri Recipe
	 Stuffed Masala Aloo Naan Recipe Made Without Oven
Cluster #1
	 टोफू भुर्जी रेसिपी - Tofu Bhurji Recipe
	 Chicken Chops Recipe
	 Dappalam Recipe- Andhra Vegetable Stew Recipe
	 Kashmiri Style Apple Tamatar Sabzi Recipe - No Onion No Garlic 
	 Pyaz Wali Bhindi Recipe | Bhindi Do Pyaza - Okra Onion Stir Fry
	 Kerala Palada Pradhaman Recipe
	 Chilli Paneer & Oats Dosa Recipe
	 पनीर भरमा भिन्डी रेसिपी - Stuffed Bhindi With Paneer (Recipe In Hindi)
Cluster #2
	 Pahari Style Phanu Recipe - Delicious Mixed Dal 
	 लोबिया मसाला रेसिपी - Lobia Masala (Recipe In Hindi)
	 Pudina & 

In [None]:
def get_cluster_elements(input_food_id, kmeans_model, embeddings, dataset, food_names):
    """Return the dish names of every food in the same cluster as ``input_food_id``.

    Args:
        input_food_id: raw food ID as it appears in ``dataset.foodid2idx``.
        kmeans_model: fitted clustering model exposing ``predict`` and ``labels_``.
        embeddings: array of food embeddings, row i belonging to food index i.
        dataset: object exposing ``foodid2idx`` and ``idx2foodid`` dictionaries.
        food_names: mapping from food ID to dish name.

    Returns:
        List of dish names in label order; an empty list (after printing a message)
        when the ID is unknown.
    """
    if input_food_id not in dataset.foodid2idx:
        print("Input food ID not found.")
        return []

    input_food_idx = dataset.foodid2idx[input_food_id]
    # predict() returns a length-1 array for a single row; keep the scalar label so the
    # comparison below is a plain scalar comparison.
    cluster_label = kmeans_model.predict(embeddings[input_food_idx].reshape(1, -1))[0]

    # Find all food names belonging to the same cluster
    cluster_food_names = []
    for idx, label in enumerate(kmeans_model.labels_):
        if label != cluster_label:
            continue
        food_id = dataset.idx2foodid[idx]
        # Other lookups in this notebook strip hyphens from the ID before indexing
        # food_names; accept either key form so this definition agrees with them.
        key = food_id if food_id in food_names else food_id.replace("-", "")
        cluster_food_names.append(food_names[key])

    return cluster_food_names


In [None]:
import numpy as np

def print_cluster_movies(kmeans, train_set, user_interaction_df, food_names, top_n=8):
    """Print, for every cluster, the dishes with the most interaction rows.

    Args:
        kmeans: fitted clustering model exposing ``labels_`` and ``n_clusters``.
        train_set: object exposing the ``idx2foodid`` dictionary (food index -> raw ID).
        user_interaction_df: interaction log with a 'food_id' column.
        food_names: mapping from hyphen-stripped food ID to dish name.
        top_n: how many dishes to print per cluster (default 8, the previous fixed value).
    """
    # Count interactions per food once, instead of rescanning the whole log for every food.
    interaction_counts = user_interaction_df['food_id'].value_counts()

    # Iterate over the model's own cluster count rather than a hard-coded 8.
    for cluster in range(kmeans.n_clusters):
        print("Cluster #{}".format(cluster))
        fods = []
        for foodidx in np.where(kmeans.labels_ == cluster)[0]:
            foodid = train_set.idx2foodid[foodidx]
            rat_count = interaction_counts.get(foodid, 0)
            fods.append((food_names[foodid.replace("-", "")], rat_count))
        # Stable sort: dishes with equal counts keep their index order.
        for mov in sorted(fods, key=lambda tup: tup[1], reverse=True)[:top_n]:
            print("\t", mov[0])

def get_cluster_elements(input_food_id, kmeans_model, embeddings, dataset, food_names):
    """Collect the dish names of all foods clustered together with ``input_food_id``.

    The food's embedding row is passed to ``kmeans_model.predict`` to obtain its
    cluster; every index whose entry in ``kmeans_model.labels_`` matches is then
    translated back to a raw ID and on to a dish name (``food_names`` is keyed by
    the ID with hyphens removed). Prints a message and returns ``[]`` for an
    unknown ID.
    """
    if input_food_id not in dataset.foodid2idx:
        print("Input food ID not found.")
        return []

    query_vector = embeddings[dataset.foodid2idx[input_food_id]].reshape(1, -1)
    target_cluster = kmeans_model.predict(query_vector)

    # Same cluster -> raw ID -> hyphen-stripped key -> dish name
    return [
        food_names[dataset.idx2foodid[position].replace("-", "")]
        for position, assigned in enumerate(kmeans_model.labels_)
        if assigned == target_cluster
    ]

# Print the most-interacted-with dishes of every cluster
print_cluster_movies(kmeans, train_set, user_interaction_df, food_names)

# Look up every dish that shares a cluster with one specific food ID
input_food_id = '-Nwy3pz9L7sm2FwBXQx8'
cluster_food_names = get_cluster_elements(input_food_id, kmeans, trained_food_embeddings, train_set, food_names)

# Print the cluster food names
print("Food names belonging to the same cluster as the input food ID:")
for food_name in cluster_food_names:
    print("\t", food_name)


Cluster #0
	 Hyderabadi Khadi Masoor Dal Recipe - Hyderabadi Style Lentil Curry
	 पुदीना और अनार का रायता रेसिपी - Mint And Pomegranate Raita (Recipe In Hindi)
	 करूवेपिल्लई पचड़ी रेसिपी - Karuvepillai Pachadi Recipe
	 Chikkudukaya Vepudu Recipe-Broad Beans Masala Poriyal
	 Delicious Breakfast Anda Ghotala Recipe With Butter Pav
	 Methi Chaman Recipe -Paneer In Dark Leafy Greens Based Gravy
	 Maize Daliya Khichri Recipe
	 Stuffed Masala Aloo Naan Recipe Made Without Oven
Cluster #1
	 टोफू भुर्जी रेसिपी - Tofu Bhurji Recipe
	 Chicken Chops Recipe
	 Dappalam Recipe- Andhra Vegetable Stew Recipe
	 Kashmiri Style Apple Tamatar Sabzi Recipe - No Onion No Garlic 
	 Pyaz Wali Bhindi Recipe | Bhindi Do Pyaza - Okra Onion Stir Fry
	 Kerala Palada Pradhaman Recipe
	 Chilli Paneer & Oats Dosa Recipe
	 पनीर भरमा भिन्डी रेसिपी - Stuffed Bhindi With Paneer (Recipe In Hindi)
Cluster #2
	 Pahari Style Phanu Recipe - Delicious Mixed Dal 
	 लोबिया मसाला रेसिपी - Lobia Masala (Recipe In Hindi)
	 Pudina & 

In [None]:
# NOTE(review): this ID differs only in letter case ('NWY' vs 'Nwy') from
# '-Nwy3-zVfxq6ivUROnCw', the ID at index 0 of train_set.idx2foodid. The lookup is an
# exact, case-sensitive match, so this call takes the "not found" path and returns [].
input_food_id = '-NWY3-zVfxq6ivUROnCw'
cluster_food_names = get_cluster_elements(input_food_id, kmeans, trained_food_embeddings, train_set, food_names)

# Print the cluster food names
print("Food names belonging to the same cluster as the input food ID:")
for food_name in cluster_food_names:
    print("\t", food_name)

Input food ID not found.
Food names belonging to the same cluster as the input food ID:


In [None]:
def get_cluster_elements(input_food_id, kmeans_model, embeddings, dataset, food_names):
    """Return dish names for every food assigned to the input food's cluster.

    Unknown IDs print "Input food ID not found." and yield an empty list. Otherwise
    the cluster is obtained by running ``kmeans_model.predict`` on the food's
    embedding row, and names are read from ``food_names`` using the raw ID with
    hyphens removed.
    """
    known_ids = dataset.foodid2idx
    if not (input_food_id in known_ids):
        print("Input food ID not found.")
        return []

    embedding_row = embeddings[known_ids[input_food_id]]
    cluster_label = kmeans_model.predict(embedding_row.reshape(1, -1))

    # Walk the fitted labels by position and keep the members of the predicted cluster.
    members = []
    for idx in range(len(kmeans_model.labels_)):
        if kmeans_model.labels_[idx] == cluster_label:
            raw_id = dataset.idx2foodid[idx]
            members.append(food_names[raw_id.replace("-", "")])

    return members

# (The per-cluster listing was printed by an earlier cell and is not repeated here.)


# Look up every dish that shares a cluster with one specific food ID
input_food_id = '-Nwy3jPP1EDiCRmADdjA'
cluster_food_names = get_cluster_elements(input_food_id, kmeans, trained_food_embeddings, train_set, food_names)

# Print the cluster food names
print("Food names belonging to the same cluster as the input food ID:")
for food_name in cluster_food_names:
    print("\t", food_name)

Food names belonging to the same cluster as the input food ID:
	 आलू बोंदा रेसिपी - Potato Bonda Recipe Flavoured With Sambar Powder in Hindi
	 Instant Tindora And Carrot Pickle Recipe
	 Ukadpendi Recipe - Maharashtrian Style Spiced Rice Flour Porridge
	 Kunafa With Mango Cottage Cheese Cream Recipe
	 दाल बंजारा रेसिपी - Dal Banjara Recipe
	 Homemade Thengai Sevai Recipe - Coconut Idiyappam Recipe
	 Maharashtrian Golyachi Amti Recipe (Besan Balls In Tamarind Based Spicy Gravy)
	 Kongunadu Style Senai Kilangu Masala Recipe - Yam in Spicy Gravy
	 Carrot Beans Sukhi Sabzi Recipe With Simple Masala
	 Cabbage Palya Recipe
	 North Indian Style Mushroom Matar Masala Recipe
	 Chickpea Coconut Milk Curry Recipe
	 Stuffed Matar & Gobi Paratha Recipe (Green Pea & Cauliflower Flat Bread)
	 Broken Wheat and Mixed Millet Upma Recipe
	 Mor Kali (Kazhi) Recipe (Savory Rice Flour Breakfast Pudding)
	 Paatolyo Recipe (Goan Sweet Dish)
	 वेंडकाई पुली कूटू रेसिपी - Vendaikai (Okra) Puli Kootu
	 Gujarati K

In [None]:
import pickle

In [None]:
# Persist only the model's parameters (state_dict); reloading them requires re-creating a
# MatrixFactorization with the same n_users / n_items / n_factors and calling load_state_dict.
torch.save(model.state_dict(), '/content/drive/My Drive/ChefGPT/matrix_factorization_model.pth')


In [None]:
import pickle

# Save the fitted KMeans model to a file.
# NOTE(review): its cluster labels refer to the food indices of this run's train_set
# mapping, so the two pickles should be kept and loaded together.
with open('/content/drive/My Drive/ChefGPT/kmeans_final_model.pkl', 'wb') as f:
    pickle.dump(kmeans, f)
