# TF-IDF demo

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

## General example

In [26]:
# Toy corpus: four short sentences that share most of their vocabulary.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Default vectorizer, i.e. no stop-word filtering. scikit-learn's built-in
# English stop list would drop almost every token of this tiny corpus
# ('and', 'is', 'the', 'this', ...), leaving nearly nothing to look at.
vectorizer = TfidfVectorizer()

# Learn the vocabulary / IDF weights and build the document-term matrix.
X = vectorizer.fit_transform(corpus)

# Last expression of the cell: display the learned vocabulary.
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [27]:
# Dimensions of the matrix returned by fit_transform on the toy corpus.
X.shape

(4, 9)

In [28]:
# Imported in this cell because no earlier cell brings `pd` into scope;
# without it the line below raises NameError on a fresh kernel.
import pandas as pd

# Densify the TF-IDF matrix and label each column with its vocabulary term
# so the weights can be read as a table (one row per corpus entry).
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


## Our data

In [20]:
import pickle as pkl

In [None]:
# Upper bound on how many leading rows of the recipe frame are kept.
num_recipes = 10_000

In [82]:
# NOTE(security): unpickling can execute arbitrary code; only load
# 'data_df.pkl' if it comes from a trusted source.
with open('data_df.pkl', 'rb') as pkl_file:  # handle is closed on exit
    data = pkl.load(pkl_file)

# Keep only the first `num_recipes` rows. The explicit copy gives an
# independent frame, so adding the 'text' column below does not write into
# a slice of the loaded object (avoids pandas' SettingWithCopyWarning).
data = data.iloc[:num_recipes, :].copy()


def f(x):
    """Flatten one recipe row into a single text document.

    The result is three newline-separated parts: the value of 'title_1',
    the items of 'ingredients_3' joined with "; ", and the items of
    'instructions_3' joined with "; ".

    Assumes 'ingredients_3' and 'instructions_3' hold iterables of strings
    (required by str.join) -- TODO confirm against the pickled frame.
    """
    ingredients = "; ".join(x['ingredients_3'])
    instructions = "; ".join(x['instructions_3'])
    return f"{x['title_1']}\n{ingredients}\n{instructions}"


# Row-wise apply: one combined text per recipe.
data['text'] = data.apply(f, axis=1)

In [71]:
# One combined text per recipe, as a plain array of strings.
recipe_texts = data['text'].values

# Rebinds `vectorizer` (previously fitted on the toy corpus) to a new model
# that filters English stop words, then fits it and vectorizes the recipes
# in a single pass.
vectorizer = TfidfVectorizer(stop_words='english')
tf_idfs = vectorizer.fit_transform(recipe_texts)

In [72]:
# Vocabulary learned from the recipe texts, displayed as the cell output.
vectorizer.get_feature_names_out()

array(['00', '02', '07', ..., 'zoom', 'zucchini', 'zylitol'], dtype=object)

In [73]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

# Ensure the TF-IDF matrix is in CSR format. NOTE(review): fit_transform
# presumably already returns a scipy sparse matrix, in which case this
# conversion is effectively a no-op -- confirm before removing.
tf_idfs = sparse.csr_matrix(tf_idfs)
# Called with a single argument, so every row of tf_idfs is compared with
# every other row (including itself). The result is a dense square array;
# NOTE(review): with num_recipes rows that is num_recipes**2 floats, so
# memory grows quadratically with the number of recipes.
similarities = cosine_similarity(tf_idfs)

In [81]:
# Show the pairwise similarity matrix: entry [i, j] is the cosine similarity
# between the texts of rows i and j (numpy abbreviates the large array).
similarities

array([[1.        , 0.07804269, 0.00785451, ..., 0.01061459, 0.0379411 ,
        0.02123816],
       [0.07804269, 1.        , 0.0526755 , ..., 0.03191065, 0.11310457,
        0.12676689],
       [0.00785451, 0.0526755 , 1.        , ..., 0.03284432, 0.03249093,
        0.04900683],
       ...,
       [0.01061459, 0.03191065, 0.03284432, ..., 1.        , 0.01573386,
        0.05786103],
       [0.0379411 , 0.11310457, 0.03249093, ..., 0.01573386, 1.        ,
        0.08762453],
       [0.02123816, 0.12676689, 0.04900683, ..., 0.05786103, 0.08762453,
        1.        ]])