In [1]:
# import pandas
import pandas as pd

# import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

# import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Load the sample product data (CSV hosted on GitHub) into a DataFrame.
df = pd.read_csv(
    "https://raw.githubusercontent.com/nikitaa30/Content-based-Recommender-System/master/sample-data.csv"
)

In [None]:
df.shape
# df.info()

In [18]:
df.head()

Unnamed: 0,id,description
0,1,active classic boxers - there's a reason why o...
1,2,active sport boxer briefs - skinning up glory ...
2,3,active sport briefs - these superbreathable no...
3,4,"alpine guide pants - skin in, climb ice, switc..."
4,5,"alpine wind jkt - on high ridges, steep ice an..."


* explore DataFrame

We will use TF-IDF to find similar items based on their descriptions.
* instantiate TF-IDF

In [6]:
# Build the vectorizer; stop_words='english' selects scikit-learn's built-in
# English stop-word list, so those words are dropped from the vocabulary.
tfidf = TfidfVectorizer(stop_words='english')

* fit and transform 'description' column with TFIDF

In [7]:
# Lower-case every description and write it back into the same column.
df["description"] = df["description"].str.lower()

In [17]:
# Fit the vectorizer on the descriptions and encode them in the same call;
# the result has one row per description.
tfidf_matrix = tfidf.fit_transform(df["description"])

In [10]:
tfidf_matrix.shape

(500, 4600)

In [15]:
tfidf_matrix[0:5]

<5x4600 sparse matrix of type '<class 'numpy.float64'>'
	with 426 stored elements in Compressed Sparse Row format>

In [9]:
# Vocabulary terms learned by the fitted vectorizer; presumably one entry per
# tfidf_matrix column, in column order — confirm against the scikit-learn docs.
feature_names = tfidf.get_feature_names_out()

In [10]:
feature_names[40:45]

array(['132', '1324', '1327', '133', '1341'], dtype=object)

* calculate the cosine similarity of each item with every other item in the dataset

In [21]:
# linear_kernel is imported with the other dependencies in the first cell.
# TfidfVectorizer L2-normalises each row by default (norm='l2' was not
# overridden above), so the plain dot product computed by linear_kernel is
# already the cosine similarity between every pair of descriptions.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [22]:
cosine_sim.shape

(500, 500)

In [27]:
# cosine_sim[0]

* sort all items using their similarity for each item i, and store the values in dictionary `results`

```
results = {
    "1": [5,7,9...],
    "2": [45,2,3...]
}
```

In [12]:
# # from walkthrough - don't need this 

#Construct a reverse map of indices and movie titles
# indices = pd.Series(df.index, index=df['id']).drop_duplicates()

In [28]:
# For every row position, store all row positions ranked from most to least
# similar (ascending argsort, then reversed).
results = {
    row: similarities.argsort()[::-1]
    for row, similarities in enumerate(cosine_sim)
}


In [30]:
# results

In [17]:
# # from walkthrough

# # Function that takes in movie title as input and outputs most similar movies
# def get_recommendations(title, cosine_sim=cosine_sim):
#     # Get the index of the movie that matches the title
#     idx = indices[title]

#     # Get the pairwise similarity scores of all movies with that movie
#     sim_scores = list(enumerate(cosine_sim[idx]))

#     # Sort the movies based on the similarity scores
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

#     # Get the scores of the 10 most similar movies
#     sim_scores = sim_scores[1:11]

#     # Get the movie indices
#     movie_indices = [i[0] for i in sim_scores]

#     # Return the top 10 most similar movies
#     return metadata['title'].iloc[movie_indices]

id
1     0
2     1
3     2
4     3
5     4
6     5
7     6
8     7
9     8
10    9
dtype: int64

* create function `recommender` that will recommend similar products
    * function must have two input params: **item_id** and **count** of similar products 

In [47]:
def recommender(item_id, count, include_self=True, ranking=None):
    """Return the row positions of the items most similar to ``item_id``.

    Parameters
    ----------
    item_id : int
        1-based item id. It is converted to a row position with
        ``item_id - 1``, which assumes ids are consecutive and start at 1
        (true for the rows shown in this notebook; TODO confirm before
        reusing on other data).
    count : int
        Number of similar items requested. Must be >= 0.
    include_self : bool, default True
        True keeps the original behaviour: the first ``count + 1`` ranked
        positions are returned, which normally includes the queried item
        itself at the front. False filters out the queried item's own
        position and returns at most ``count`` other positions.
    ranking : mapping, optional
        Maps a row position to a NumPy array of row positions ordered from
        most to least similar. Defaults to the notebook-level ``results``.

    Returns
    -------
    A slice of the stored ranking array (row positions usable with
    ``df.iloc``, not ``id`` values).

    Raises
    ------
    ValueError
        If ``count`` is negative (a negative bound would otherwise slice
        from the wrong end and silently return unrelated items).
    KeyError
        If ``item_id`` has no entry in the ranking.
    """
    if ranking is None:
        # Fall back to the dictionary built earlier in the notebook.
        ranking = results
    if count < 0:
        raise ValueError(f"count must be non-negative, got {count!r}")

    row = item_id - 1  # ids are 1-based, ranking keys are 0-based positions
    if row not in ranking:
        raise KeyError(f"unknown item_id: {item_id!r}")

    ordered = ranking[row]
    if include_self:
        # One extra slot because the queried item is part of its own ranking.
        return ordered[0:count + 1]
    # Drop the queried item wherever it sits (ties can move it off index 0).
    return ordered[ordered != row][0:count]

* show the top 5 most similar items for the item with item_id = 11

In [48]:
# Ask for the 5 nearest neighbours of item 11.
closest_indices = recommender(item_id=11, count=5)

In [49]:
# Positional lookup: closest_indices holds row positions (argsort output),
# not `id` values, so .iloc is used rather than .loc.
closest_items = df.iloc[closest_indices]

In [50]:
closest_items

Unnamed: 0,id,description
10,11,"baby sunshade top - soft, stretchy polyester f..."
418,419,sunshade hoody - put an end to the sunscreen s...
464,465,baby baggies apron dress - this lively dress k...
411,412,"sunshade shirt - this ultralight, moisture wic..."
403,404,hooded monk sweatshirt - the sacred garment of...
465,466,baby baggies shorts - about 70% of the planet ...
