# TF-IDF Content Based Filtering

In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import re
import string

## Importing Data

In [2]:
df = pd.read_csv('data/food.csv')
df.head()

Unnamed: 0,Food_ID,Name,C_Type,Veg_Non,Describe
0,1,summer squash salad,Healthy Food,veg,"white balsamic vinegar, lemon juice, lemon rin..."
1,2,chicken minced salad,Healthy Food,non-veg,"olive oil, chicken mince, garlic (minced), oni..."
2,3,sweet chilli almonds,Snack,veg,"almonds whole, egg white, curry leaves, salt, ..."
3,4,tricolour salad,Healthy Food,veg,"vinegar, honey/sugar, soy sauce, salt, garlic ..."
4,5,christmas cake,Dessert,veg,"christmas dry fruits (pre-soaked), orange zest..."


In [3]:
len(list(df['Name'].unique()))

400

In [4]:
df['C_Type'].unique()

array(['Healthy Food', 'Snack', 'Dessert', 'Japanese', 'Indian', 'French',
       'Mexican', 'Italian', 'Chinese', 'Beverage', 'Thai', 'Korean',
       ' Korean', 'Vietnames', 'Nepalese', 'Spanish'], dtype=object)

In [5]:
df['Veg_Non'].unique()

array(['veg', 'non-veg'], dtype=object)

In [6]:
len(df)

400

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Food_ID   400 non-null    int64 
 1   Name      400 non-null    object
 2   C_Type    400 non-null    object
 3   Veg_Non   400 non-null    object
 4   Describe  400 non-null    object
dtypes: int64(1), object(4)
memory usage: 15.8+ KB


In [8]:
# Let's make a function to remove all the punctuation from the "Describe" column
def text_cleaning(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

In [9]:
df['Describe'] = df['Describe'].apply(text_cleaning)

In [10]:
df.head()

Unnamed: 0,Food_ID,Name,C_Type,Veg_Non,Describe
0,1,summer squash salad,Healthy Food,veg,white balsamic vinegar lemon juice lemon rind ...
1,2,chicken minced salad,Healthy Food,non-veg,olive oil chicken mince garlic minced onion sa...
2,3,sweet chilli almonds,Snack,veg,almonds whole egg white curry leaves salt suga...
3,4,tricolour salad,Healthy Food,veg,vinegar honeysugar soy sauce salt garlic clove...
4,5,christmas cake,Dessert,veg,christmas dry fruits presoaked orange zest lem...


In [11]:
df.duplicated().sum()

0

In [12]:
df.isnull().sum()

Food_ID     0
Name        0
C_Type      0
Veg_Non     0
Describe    0
dtype: int64

In [13]:
df.describe()

Unnamed: 0,Food_ID
count,400.0
mean,200.5
std,115.614301
min,1.0
25%,100.75
50%,200.5
75%,300.25
max,400.0


## TF-IDF

What is TF-IDF ? **fill here**

In [14]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['Describe'])
tfidf_matrix.shape

(400, 1261)

In [15]:
cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix)
cosine_sim

array([[1.        , 0.16228366, 0.13001124, ..., 0.1286286 , 0.04277223,
        0.09993639],
       [0.16228366, 1.        , 0.06799336, ..., 0.14878001, 0.05688681,
        0.16917639],
       [0.13001124, 0.06799336, 1.        , ..., 0.03291577, 0.11795401,
        0.01834168],
       ...,
       [0.1286286 , 0.14878001, 0.03291577, ..., 1.        , 0.        ,
        0.10087579],
       [0.04277223, 0.05688681, 0.11795401, ..., 0.        , 1.        ,
        0.        ],
       [0.09993639, 0.16917639, 0.01834168, ..., 0.10087579, 0.        ,
        1.        ]])

In [16]:
# Just considering the Food names from the dataframe
indices = pd.Series(df.index, index=df['Name']).drop_duplicates()
indices

Name
summer squash salad                                          0
chicken minced salad                                         1
sweet chilli almonds                                         2
tricolour salad                                              3
christmas cake                                               4
                                                          ... 
Kimchi Toast                                               395
Tacos de Gobernador (Shrimp, Poblano, and Cheese Tacos)    396
Melted Broccoli Pasta With Capers and Anchovies            397
Lemon-Ginger Cake with Pistachios                          398
Rosemary Roasted Vegetables                                399
Length: 400, dtype: int64

In [17]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:6]

    food_indices = [i[0] for i in sim_scores]
    return df['Name'].iloc[food_indices]

In [18]:
get_recommendations('tricolour salad', cosine_sim)

103             chilli chicken
1         chicken minced salad
27     vegetable som tam salad
282          veg hakka noodles
166             veg fried rice
Name: Name, dtype: object

In [19]:
def create_soup(x):
    return x['C_Type'] + " " + x['Veg_Non'] + " " + x['Describe']

In [20]:
df['soup'] = df.apply(create_soup, axis=1)

In [21]:
df.head(2)

Unnamed: 0,Food_ID,Name,C_Type,Veg_Non,Describe,soup
0,1,summer squash salad,Healthy Food,veg,white balsamic vinegar lemon juice lemon rind ...,Healthy Food veg white balsamic vinegar lemon ...
1,2,chicken minced salad,Healthy Food,non-veg,olive oil chicken mince garlic minced onion sa...,Healthy Food non-veg olive oil chicken mince g...


In [22]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['soup'])
tfidf_matrix.shape

(400, 1270)

In [23]:
cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix)
cosine_sim

array([[1.        , 0.20029425, 0.1245012 , ..., 0.1245012 , 0.04407577,
        0.16016086],
       [0.20029425, 1.        , 0.06676684, ..., 0.15374986, 0.07155718,
        0.22058322],
       [0.1245012 , 0.06676684, 1.        , ..., 0.03498495, 0.11327228,
        0.02237259],
       ...,
       [0.1245012 , 0.15374986, 0.03498495, ..., 1.        , 0.02246653,
        0.09727016],
       [0.04407577, 0.07155718, 0.11327228, ..., 0.02246653, 1.        ,
        0.00592245],
       [0.16016086, 0.22058322, 0.02237259, ..., 0.09727016, 0.00592245,
        1.        ]])

In [24]:
get_recommendations('tricolour salad', cosine_sim)

1         chicken minced salad
103             chilli chicken
27     vegetable som tam salad
106        garlic soya chicken
282          veg hakka noodles
Name: Name, dtype: object