In [12]:
# import pandas
import pandas as pd

# import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# Load the sample data CSV straight from its raw GitHub URL
DATA_URL = "https://raw.githubusercontent.com/nikitaa30/Content-based-Recommender-System/master/sample-data.csv"
df = pd.read_csv(DATA_URL)

In [14]:
# Preview the loaded data (the notebook elides the middle rows of a long frame)
df

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."
...,...,...
495,496,Cap 2 bottoms - Cut loose from the maddening c...
496,497,Cap 2 crew - This crew takes the edge off fick...
497,498,All-time shell - No need to use that morning T...
498,499,All-wear cargo shorts - All-Wear Cargo Shorts ...


* explore DataFrame

We will use TF-IDF to find similar items based on their descriptions.
* instantiate TF-IDF

In [15]:
# TF-IDF vectorizer created with scikit-learn's default settings
vec = TfidfVectorizer()

* fit and transform 'description' column with TFIDF

In [16]:
# Learn the vocabulary from the descriptions and encode each one as a TF-IDF row
vectorized = vec.fit_transform(df['description'])
# Shape is (number of descriptions, vocabulary size)
vectorized.shape

(500, 4816)

* calculate the cosine similarity of each item with every other item in the dataset

In [31]:
# Pairwise cosine similarity between every pair of TF-IDF rows (square matrix)
cos_sim = cosine_similarity(vectorized)
cos_sim.shape

(500, 500)

* for each item i, sort all items by their similarity to i and store the ranked item ids in the dictionary `results`

```
results = {
    "1": [5,7,9...],
    "2": [45,2,3...]
}
```

In [33]:
# Wrap the similarity matrix in a DataFrame whose rows are labelled by item id.
# Columns keep their default positional labels (0..n-1), which the ranking loop
# that builds `results` indexes by. Note that the DataFrame constructor has no
# `drop` parameter (that keyword belongs to `reset_index` / `set_index`).
sim_matrix = pd.DataFrame(cos_sim, index=df['id'])
sim_matrix

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.327921,0.208198,0.217250,0.170859,0.152743,0.155677,0.114291,0.180879,0.102675,...,0.169169,0.142999,0.145491,0.445103,0.360511,0.334376,0.309250,0.176970,0.201439,0.225981
2,0.327921,1.000000,0.567351,0.198872,0.180809,0.152622,0.172194,0.108677,0.143409,0.109563,...,0.157160,0.120391,0.153036,0.311482,0.281787,0.255443,0.209181,0.129252,0.211397,0.193964
3,0.208198,0.567351,1.000000,0.169440,0.205306,0.173489,0.200608,0.131848,0.151129,0.119336,...,0.169349,0.119604,0.138864,0.288880,0.370122,0.252944,0.239232,0.135099,0.141858,0.157174
4,0.217250,0.198872,0.169440,1.000000,0.341876,0.327101,0.258561,0.117623,0.198795,0.166627,...,0.143997,0.118187,0.128580,0.226760,0.188964,0.222677,0.208169,0.241172,0.211171,0.234756
5,0.170859,0.180809,0.205306,0.341876,1.000000,0.311279,0.238927,0.110349,0.201359,0.169642,...,0.139153,0.115338,0.143116,0.198302,0.170243,0.205219,0.253940,0.306181,0.148323,0.186818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,0.334376,0.255443,0.252944,0.222677,0.205219,0.183774,0.146374,0.118026,0.184657,0.133002,...,0.204923,0.165733,0.158425,0.412121,0.434495,1.000000,0.570330,0.174132,0.181869,0.222767
497,0.309250,0.209181,0.239232,0.208169,0.253940,0.224929,0.180597,0.109725,0.201743,0.151643,...,0.168963,0.162770,0.157884,0.335268,0.355386,0.570330,1.000000,0.218035,0.190505,0.218547
498,0.176970,0.129252,0.135099,0.241172,0.306181,0.267633,0.181311,0.099160,0.205992,0.156121,...,0.132418,0.115043,0.115340,0.183297,0.165956,0.174132,0.218035,1.000000,0.141871,0.170453
499,0.201439,0.211397,0.141858,0.211171,0.148323,0.165511,0.131069,0.125753,0.168572,0.111763,...,0.195399,0.149670,0.176218,0.212583,0.149122,0.181869,0.190505,0.141871,1.000000,0.558464


In [43]:
# For every item, rank all item ids from most to least similar.
# The k-th row label (an item id) is paired with the k-th column of the matrix;
# sorting that column descending and reading its index yields the ranked ids.
results = {
    item_id: sim_matrix[column].sort_values(ascending=False).index.to_list()
    for item_id, column in zip(sim_matrix.index, sim_matrix.columns)
}

In [54]:
# View the rankings as a table: one column per item id, one row per rank position
pd.DataFrame(results)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,491,492,493,494,495,496,497,498,499,500
0,1,2,3,4,5,6,7,8,9,10,...,491,492,493,494,495,496,497,498,499,500
1,19,3,2,159,308,438,354,220,417,425,...,98,286,347,19,19,173,22,302,500,499
2,494,1,495,384,96,184,403,262,469,466,...,124,56,98,495,494,22,360,386,462,463
3,495,19,300,379,281,387,393,255,474,428,...,116,491,138,1,496,359,359,267,463,462
4,442,494,494,343,204,268,464,261,230,135,...,41,372,116,496,173,497,23,212,32,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,42,396,490,490,332,238,231,468,490,319,...,425,425,468,48,134,131,8,332,48,396
496,134,319,319,241,99,319,418,114,319,371,...,455,135,455,123,241,137,150,48,332,490
497,434,332,332,418,258,48,99,48,332,332,...,420,114,420,319,49,101,220,434,319,319
498,425,48,371,101,241,332,258,371,371,123,...,300,420,155,332,371,241,123,156,434,332


* create function `recommender` that will recommend similar products
    * function must have two input params: **item_id** and **count** of similar products 

In [55]:
def get_recommendations(item_d, count):
    """Return the ids of the `count` items most similar to `item_d`.

    Reads the notebook-level `results` dict, which maps each item id to the
    list of all item ids ordered from most to least similar.

    Parameters
    ----------
    item_d : hashable
        Id of the query item; must be a key of `results`.
    count : int
        Maximum number of recommendations to return (non-negative).

    Returns
    -------
    list
        Up to `count` item ids, best match first, never including `item_d`.

    Raises
    ------
    KeyError
        If `item_d` is not a key of `results`.
    ValueError
        If `count` is negative.
    """
    if count < 0:
        # A negative count would otherwise slice from the tail of the ranking
        # and silently return almost every item.
        raise ValueError(f"count must be non-negative, got {count}")
    # Drop the query item by value rather than assuming it is ranked first:
    # another item with the same similarity score can sort ahead of it.
    candidates = [other for other in results[item_d] if other != item_d]
    return candidates[:count]

* show the top 5 most similar items for the item with item_id = 11

In [56]:
# Top 5 recommendations for the item with id 11
get_recommendations(11, 5)

[419, 465, 412, 404, 475]