# Advanced Content-based filtering

if a user likes an item, she will also like other items with similar characteristics

a liked book X authored by Y
> a will get recommendation of books authored by Y

In [2]:
import pandas as pd
import numpy as np

from sklearn import preprocessing

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import re
import string

## Importing Data

In [3]:
df = pd.read_csv('data/food.csv')
df.head()

Unnamed: 0,Food_ID,Name,C_Type,Veg_Non,Describe
0,1,summer squash salad,Healthy Food,veg,"white balsamic vinegar, lemon juice, lemon rin..."
1,2,chicken minced salad,Healthy Food,non-veg,"olive oil, chicken mince, garlic (minced), oni..."
2,3,sweet chilli almonds,Snack,veg,"almonds whole, egg white, curry leaves, salt, ..."
3,4,tricolour salad,Healthy Food,veg,"vinegar, honey/sugar, soy sauce, salt, garlic ..."
4,5,christmas cake,Dessert,veg,"christmas dry fruits (pre-soaked), orange zest..."


In [4]:
len(list(df['Name'].unique()))

400

In [5]:
df['C_Type'].unique()

array(['Healthy Food', 'Snack', 'Dessert', 'Japanese', 'Indian', 'French',
       'Mexican', 'Italian', 'Chinese', 'Beverage', 'Thai', 'Korean',
       ' Korean', 'Vietnames', 'Nepalese', 'Spanish'], dtype=object)

In [6]:
df['Veg_Non'].unique()

array(['veg', 'non-veg'], dtype=object)

In [7]:
len(df)

400

--> Small dataset...

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Food_ID   400 non-null    int64 
 1   Name      400 non-null    object
 2   C_Type    400 non-null    object
 3   Veg_Non   400 non-null    object
 4   Describe  400 non-null    object
dtypes: int64(1), object(4)
memory usage: 15.8+ KB


In [9]:
# Let's make a function to remove all the punctuation from the "Describe" column
def text_cleaning(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

In [10]:
df['Describe'] = df['Describe'].apply(text_cleaning)

In [11]:
df.head()

Unnamed: 0,Food_ID,Name,C_Type,Veg_Non,Describe
0,1,summer squash salad,Healthy Food,veg,white balsamic vinegar lemon juice lemon rind ...
1,2,chicken minced salad,Healthy Food,non-veg,olive oil chicken mince garlic minced onion sa...
2,3,sweet chilli almonds,Snack,veg,almonds whole egg white curry leaves salt suga...
3,4,tricolour salad,Healthy Food,veg,vinegar honeysugar soy sauce salt garlic clove...
4,5,christmas cake,Dessert,veg,christmas dry fruits presoaked orange zest lem...


In [12]:
df.duplicated().sum()

0

In [13]:
df.isnull().sum()

Food_ID     0
Name        0
C_Type      0
Veg_Non     0
Describe    0
dtype: int64

In [14]:
df.describe()

Unnamed: 0,Food_ID
count,400.0
mean,200.5
std,115.614301
min,1.0
25%,100.75
50%,200.5
75%,300.25
max,400.0


## Advanced Content Based Filtering

Including all the features that will help in recommeding better

In [15]:
features = ['C_Type', 'Veg_Non', 'Describe']

In [16]:
# Soup represents a mixture of elements
# Similarily, I am making one column that will have all the important features
# I am simply concatenating the strings

def create_soup(x):
    return x['C_Type'] + " " + x['Veg_Non'] + " " + x['Describe']

In [17]:
df['soup'] = df.apply(create_soup, axis=1)

In [18]:
df.head()

Unnamed: 0,Food_ID,Name,C_Type,Veg_Non,Describe,soup
0,1,summer squash salad,Healthy Food,veg,white balsamic vinegar lemon juice lemon rind ...,Healthy Food veg white balsamic vinegar lemon ...
1,2,chicken minced salad,Healthy Food,non-veg,olive oil chicken mince garlic minced onion sa...,Healthy Food non-veg olive oil chicken mince g...
2,3,sweet chilli almonds,Snack,veg,almonds whole egg white curry leaves salt suga...,Snack veg almonds whole egg white curry leaves...
3,4,tricolour salad,Healthy Food,veg,vinegar honeysugar soy sauce salt garlic clove...,Healthy Food veg vinegar honeysugar soy sauce ...
4,5,christmas cake,Dessert,veg,christmas dry fruits presoaked orange zest lem...,Dessert veg christmas dry fruits presoaked ora...


What is `CountVectorizer` ?

> Convert a collection of text documents to a matrix of token counts. This implementation produces a sparse representation of the counts using `scipy.sparse.csr_matrix`

In [19]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [23]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [24]:
cosine_sim

array([[1.        , 0.37282186, 0.25253814, ..., 0.33071891, 0.10482848,
        0.39036003],
       [0.37282186, 1.        , 0.17574991, ..., 0.36989772, 0.1823843 ,
        0.42447636],
       [0.25253814, 0.17574991, 1.        , ..., 0.13363062, 0.22237479,
        0.13801311],
       ...,
       [0.33071891, 0.36989772, 0.13363062, ..., 1.        , 0.13867505,
        0.32274861],
       [0.10482848, 0.1823843 , 0.22237479, ..., 0.13867505, 1.        ,
        0.07161149],
       [0.39036003, 0.42447636, 0.13801311, ..., 0.32274861, 0.07161149,
        1.        ]])

In [25]:
df = df.reset_index()
indices = pd.Series(df.index, index=df['Name'])

In [27]:
display(indices)

Name
summer squash salad                                          0
chicken minced salad                                         1
sweet chilli almonds                                         2
tricolour salad                                              3
christmas cake                                               4
                                                          ... 
Kimchi Toast                                               395
Tacos de Gobernador (Shrimp, Poblano, and Cheese Tacos)    396
Melted Broccoli Pasta With Capers and Anchovies            397
Lemon-Ginger Cake with Pistachios                          398
Rosemary Roasted Vegetables                                399
Length: 400, dtype: int64

In [28]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:6]

    food_indices = [i[0] for i in sim_scores]
    return df['Name'].iloc[food_indices]

In [29]:
get_recommendations('tricolour salad', cosine_sim)

1                         chicken minced salad
103                             chilli chicken
27                     vegetable som tam salad
177                        oats shallots pulao
69     shepherds salad (tamatar-kheera salaad)
Name: Name, dtype: object