# Content-Based Filtering Example

Content-based filtering is a recommendation technique that uses the attributes of items to recommend similar items to users. In this example, we'll use product attributes — category, name length, description length, and number of photos — to recommend similar products.

In [13]:
import pandas as pd

# Load datasets: read every CSV in a fixed order, then bind each resulting
# DataFrame to its own name (the order of names below matches the order of paths).
_dataset_paths = [
    'data/olist_customers_dataset.csv',
    'data/olist_geolocation_dataset.csv',
    'data/olist_order_items_dataset.csv',
    'data/olist_order_payments_dataset.csv',
    'data/olist_order_reviews_dataset.csv',
    'data/olist_orders_dataset.csv',
    'data/olist_products_dataset.csv',
    'data/olist_sellers_dataset.csv',
    'data/product_category_name_translation.csv',
]
(
    customers,
    geolocations,
    order_items,
    payments,
    reviews,
    orders,
    products,
    sellers,
    product_category_translation,
) = map(pd.read_csv, _dataset_paths)

In [14]:
# Preview the raw products table (a bare last expression is rendered as the cell output)
products

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0
...,...,...,...,...,...,...,...,...,...
32946,a0b7d5a992ccda646f2d34e418fff5a0,moveis_decoracao,45.0,67.0,2.0,12300.0,40.0,40.0,40.0
32947,bf4538d88321d0fd4412a93c974510e6,construcao_ferramentas_iluminacao,41.0,971.0,1.0,1700.0,16.0,19.0,16.0
32948,9a7c6041fa9592d9d9ef6cfe62a71f8c,cama_mesa_banho,50.0,799.0,1.0,1400.0,27.0,7.0,27.0
32949,83808703fc0706a22e264b9d75f04a2e,informatica_acessorios,60.0,156.0,2.0,700.0,31.0,13.0,20.0


## Step 1: Preprocess Data

We'll start by preparing the product data:

- Normalize Numerical Features: Min-max scale the product name length, description length, and number of photos so they share a common range.
- Encode Categorical Features: Convert the English product category names into a numerical format (token counts).
- Combine Features: Concatenate the scaled numerical features and the encoded category into a single feature vector for each product.


In [15]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Numeric attributes that get min-max scaled before being combined with the category encoding
numeric_cols = ['product_name_lenght', 'product_description_lenght', 'product_photos_qty']

# Keep only the attributes used for content-based filtering and attach the
# English category name via a left join on the original category name
product_features = (
    products[['product_id', *numeric_cols, 'product_category_name']]
    .copy()
    .merge(product_category_translation, on='product_category_name', how='left')
)

# Replace every missing value (numeric or categorical) with 0
product_features = product_features.fillna(0)

# Rescale the numeric attributes column by column
scaler = MinMaxScaler()
product_features[numeric_cols] = scaler.fit_transform(product_features[numeric_cols])

# Turn the English category name into a token-count matrix
vectorizer = CountVectorizer()
category_text = product_features['product_category_name_english'].astype('str')
category_matrix = vectorizer.fit_transform(category_text)

# One row per product: scaled numeric columns followed by the category counts
numerical_features = product_features[numeric_cols].values
combined_features = np.hstack((numerical_features, category_matrix.toarray()))


## Step 2: Compute Similarity
We use cosine similarity to measure the similarity between products based on their combined features.

In [11]:
# Compute cosine similarity between products.
# `combined_features` has one row per product, so row i of `cosine_sim` holds the
# similarity of product i to every product (get_similar_products indexes it that way).
# NOTE(review): the result is a dense square matrix over all products — confirm it
# fits in memory for the full catalogue.
cosine_sim = cosine_similarity(combined_features)

## Step 3: Recommend Products
Given a product, we can recommend the top N similar products based on the computed similarities.



In [16]:
# Create a function to get top N similar products
def get_similar_products(product_id, top_n=5):
    """Return the ``top_n`` products most similar to ``product_id``.

    Similarities are read from the module-level ``cosine_sim`` matrix, whose
    rows are expected to line up positionally with the rows of
    ``product_features``.

    Parameters
    ----------
    product_id : str
        Identifier to look up in ``product_features['product_id']``. If the id
        occurs more than once, the first occurrence is used.
    top_n : int, default 5
        Maximum number of similar products to return. Values below zero are
        treated as zero.

    Returns
    -------
    pandas.DataFrame
        Rows of ``product_features`` (columns ``product_id`` and
        ``product_category_name_english``) ordered from most to least similar.
        The queried product itself is never included.

    Raises
    ------
    IndexError
        If ``product_id`` is not present in ``product_features``.
    """
    # Resolve the *positional* row of the query product: both `cosine_sim` and
    # `.iloc` below are positional, so index labels must not be used here.
    matches = np.flatnonzero((product_features['product_id'] == product_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"product_id {product_id!r} not found in product_features")
    product_idx = int(matches[0])

    scores = np.asarray(cosine_sim[product_idx], dtype=float)
    # Stable descending sort: products with equal scores keep their row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query product explicitly instead of assuming it sorts first;
    # another product with an identical feature vector can tie with it and
    # would otherwise push the query product into its own recommendations.
    ranked = ranked[ranked != product_idx][:max(top_n, 0)]
    return product_features.iloc[ranked][['product_id', 'product_category_name_english']]

# Demo: take the first product in the table as the query item and fetch its
# five closest neighbours
example_product_id = product_features.iloc[0]['product_id']
similar_products = get_similar_products(product_id=example_product_id, top_n=5)
similar_products


Unnamed: 0,product_id,product_category_name_english
5177,e96867993d08fadbc6f828871885f897,perfumery
11636,ac06fe86362d2401d00a5a03a1e693ac,perfumery
4790,5c17779da282cc147ca50c6904cf3d01,perfumery
27267,9af4d74cc1b1181f4f35d4286c27bbcd,perfumery
22692,11e02f2cd150fa54525a1109f61ea2de,perfumery
