# Content-based Recommender System
---
Code and data based on: https://heartbeat.fritz.ai/recommender-systems-with-python-part-i-content-based-filtering-5df4940bd831

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 

In [3]:
# Source of the sample data:
# https://github.com/nikitaa30/Content-based-Recommender-System/blob/master/sample-data.csv
# The CSV is read from a path relative to the notebook's working directory.

ds = pd.read_csv("data/sample-data.csv")

In [4]:
# Preview the first rows to see which columns were loaded.
ds.head()

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."


In [5]:
# Preview the last rows as well; together with head() this shows the id range.
ds.tail()

Unnamed: 0,id,description
495,496,Cap 2 bottoms - Cut loose from the maddening c...
496,497,Cap 2 crew - This crew takes the edge off fick...
497,498,All-time shell - No need to use that morning T...
498,499,All-wear cargo shorts - All-Wear Cargo Shorts ...
499,500,All-wear shorts - Time to simplify? Our All-We...


In [6]:
# Turn every product description into a TF-IDF vector built from word
# 1-grams, 2-grams and 3-grams, with English stop words removed.
# min_df=1 (the library default) keeps every term that occurs in at least one
# document, i.e. no frequency filtering at all. An integer value of 0 asks for
# the same thing but is rejected by scikit-learn's parameter validation, which
# requires integer min_df >= 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(ds['description'])

In [14]:
# Sanity check: one row per description, one column per n-gram; stored sparse.
tfidf_matrix.shape, type(tfidf_matrix)

((500, 52262), scipy.sparse.csr.csr_matrix)

In [28]:
# NOTE: .toarray() materialises the whole matrix densely (500 x 52262 floats
# for this data set) just to print a truncated view. Acceptable at this size,
# but avoid doing this on a larger corpus.
print(tfidf_matrix.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [39]:
# Pairwise dot products between all description vectors. TfidfVectorizer was
# created without a `norm` argument, so its default (norm='l2') applies and the
# rows are unit length; the plain dot product is then the cosine similarity.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
# Filled in a later cell: maps a product id to its ranked (score, id) neighbours.
results = {}

In [40]:
# Expect a square matrix: one row and one column per product.
cosine_similarities.shape

(500, 500)

In [49]:
# For every product, store its most similar products as (score, id) pairs,
# ordered from most to least similar.
N_NEIGHBOURS = 98  # neighbours kept per product (top 99 by score minus the product itself)

results = {}  # rebuilt from scratch so re-running this cell leaves no stale entries
item_ids = ds['id'].to_numpy()

# enumerate() yields the row *position*, which is what indexes the similarity
# matrix; the DataFrame's index labels are not assumed to be 0..n-1.
for pos, item_id in enumerate(ds['id'].tolist()):
    scores = cosine_similarities[pos]
    ranked = scores.argsort()[::-1]  # row positions, highest score first
    # Remove the product itself explicitly rather than assuming it sorts first:
    # with tied scores (e.g. duplicate descriptions) it may not, and dropping
    # "the first entry" would then discard a genuine neighbour.
    ranked = ranked[ranked != pos][:N_NEIGHBOURS]
    results[item_id] = [(scores[i], item_ids[i]) for i in ranked]

In [58]:
# Ten best matches for product id 1, as (similarity score, product id) pairs.
results[1][:10]

[(0.22037921472617453, 19),
 (0.16938950913002357, 494),
 (0.16769458065321555, 18),
 (0.16485527745622977, 172),
 (0.148126154605864, 442),
 (0.14577863284367545, 171),
 (0.1413764236536125, 21),
 (0.13884463426216978, 495),
 (0.13879533331363048, 25),
 (0.13813550299091404, 496)]

In [59]:
def item(id):
    """Return the short product name for the row whose ``id`` column equals ``id``.

    Descriptions have the form "<name> - <long text>"; only the part before
    the first " - " is returned. Raises IndexError when no row matches.
    """
    matching = ds.loc[ds['id'] == id, 'description']
    first_description = matching.iloc[0]
    name, _, _ = first_description.partition(' - ')
    return name

In [62]:
# Just reads the results out of the dictionary.

def recommend(item_id, num):
    """Print the ``num`` products most similar to product ``item_id``.

    Nothing is computed here: the ranked (score, id) pairs are looked up in
    the ``results`` dictionary and formatted for display via ``item``.
    """
    header = "Recommending {} products similar to {}...".format(str(num), item(item_id))
    print(header)
    print("-------")

    for score, rec_id in results[item_id][:num]:
        print("Recommended: {} (score:{})".format(item(rec_id), str(score)))

In [65]:
# Example: show the five products most similar to product id 1.
recommend(1, 5)

Recommending 5 products similar to Active classic boxers...
-------
Recommended: Cap 1 boxer briefs (score:0.22037921472617453)
Recommended: Active boxer briefs (score:0.16938950913002357)
Recommended: Cap 1 bottoms (score:0.16769458065321555)
Recommended: Cap 1 t-shirt (score:0.16485527745622977)
Recommended: Cap 3 bottoms (score:0.148126154605864)
