In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import plotly.express as px

# Loading our data from sephora 
df = pd.read_csv("product_info.csv")

# these are chosen for similarity calculation
df = df[['product_name', 'primary_category', 'secondary_category',
         'price_usd', 'rating', 'loves_count', 'reviews', 'online_only']]
# dropping Na values
df = df.dropna()

# we are now Keeping only categories with at least 5 products for COntent based calculation
counts = df.groupby(['primary_category', 'secondary_category']).size().reset_index(name='item_count')
valid = counts[counts['item_count'] >= 5]
df = df.merge(valid, on=['primary_category', 'secondary_category'])

# Printing the catagory and input where user can select
primary = sorted(df['primary_category'].unique())
for i, cat in enumerate(primary):
    print(f"{i+1}. {cat}")
p = int(input("Select primary category: ")) - 1
chosen_primary = primary[p]


1. Bath & Body
2. Fragrance
3. Hair
4. Makeup
5. Men
6. Mini Size
7. Skincare
8. Tools & Brushes


In [2]:
# we are doing the same for secondary catagory
secondary = sorted(df[df['primary_category'] == chosen_primary]['secondary_category'].unique())
for i, cat in enumerate(secondary):
    print(f"{i+1}. {cat}")
s = int(input("Select secondary category: ")) - 1
chosen_secondary = secondary[s]


1. Bath & Shower
2. Body Care
3. Body Moisturizers
4. Mini Size
5. Self Tanners
6. Value & Gift Sets


In [3]:
# Filtering the data
sub = df[(df['primary_category'] == chosen_primary) & (df['secondary_category'] == chosen_secondary)]

# Lets choose a product given 20 examples to choose for 
products = sub['product_name'].drop_duplicates().tolist()
for i, name in enumerate(products[:20]):
    print(f"{i+1}. {name}")
prod = int(input("Choose a product: ")) - 1
chosen_product = products[prod]




1. Armani Code After Shave Lotion
2. Squalane + Enzyme Sugar Body Scrub
3. Wake the F*ck Up Everyday Body Wash
4. Calm the F*ck Down Everyday Body Wash
5. Leila Lou Everyday Body Wash
6. Rosie Everyday Body Wash
7. James Everyday Body Wash
8. COCO MADEMOISELLE Foaming Shower Gel
9. Sparkle Skin Body Exfoliator
10. J'adore Soap
11. Sauvage Shower Wash
12. Sauvage After-Shave Lotion
13. Sauvage Shaving Gel
14. Cashmere Mist Deodorant
15. Alpha Beta Exfoliating Body Treatment Peel
16. Kamili Cream Body Cleanser
17. Sugared Koffie Almond Milk Body Scrub
18. Scrubbi Bamboes  Body Cleanser
19. Buff Ryder Exfoliating Body Scrub With Superfine Sand + Fruit Enzymes
20. KP Bump Eraser Body Scrub with 10% AHA


In [4]:

# this part is the content collaborative filtering 
# Build feature matrix, the attribute we are comparing for contedn based
features = sub[['product_name', 'price_usd', 'rating', 'loves_count', 'reviews', 'online_only']]
# keep first row per product
# dropping duplicartes of product name
features = features.drop_duplicates(subset='product_name')  
features.set_index('product_name', inplace=True)

# Standardize the features
scaled = StandardScaler().fit_transform(features)
scaled_df = pd.DataFrame(scaled, index=features.index, columns=features.columns)

# Cosine similarity for content base CF
def cos_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Content based comparison within the products, and calculating the cosine similarity
vec = scaled_df.loc[chosen_product].values
content_sim = {}
for name in scaled_df.index:
    if name != chosen_product:
        content_sim[name] = cos_sim(vec, scaled_df.loc[name].values)

In [5]:
# simulate 50 users for user-rating
users = 50
# The list of all unique products in this subcategory
items = sub['product_name'].unique()
#total number of products
n_items = len(items)
# the index in the matrix
item_index = {p: i for i, p in enumerate(items)}
# storing the user rating
matrix = np.zeros((users, n_items))

# Reproductivity purpose
rng = np.random.default_rng(42)

popularity = sub.groupby('product_name')['loves_count'].mean().reindex(items)
popularity /= popularity.max()

for u in range(users):
    # All users rate all products
    # 1 value per item
    loves = popularity.values  
    # popularity-based base rating
    base_ratings = 2.5 + 2.5 * loves  
    # Adding some random variation noise, where mean is 0 and standar deviation is 0.8
    noise = rng.normal(0, 0.8, size=n_items)  
    # where the rating are between 1-5 stars
    ratings = np.clip(base_ratings + noise, 1, 5)
    matrix[u, :] = ratings


#  A Fucntion for Item-to-item cosine similarity in matrix calculation
def cosine_all(mat):
    # compute the length (norm) of each item vector (column)
    norm = np.linalg.norm(mat, axis=0)
    
    # compute dot products between all item pairs
    dot_products = np.dot(mat.T, mat)
    
    # compute outer product of norms to divide by
    norm_products = np.outer(norm, norm)
    
    # cosine similarity where the  it is equal to dot product / product of norms
    sim = dot_products / norm_products
    
    return sim

# we need to scale our data!
# Transpose and scale item vectors
matrix_T = matrix.T
scaler = StandardScaler()
matrix_scaled_T = scaler.fit_transform(matrix_T)
matrix_scaled = matrix_scaled_T.T


# calling the function
cf_sim = cosine_all(matrix_scaled)
cf_scores = cf_sim[item_index[chosen_product]]
# sort and get the top 5
cf_top_idx = np.argsort(cf_scores)[::-1][1:6]
cf_recs = pd.Series(cf_scores[cf_top_idx], index=[items[i] for i in cf_top_idx])

# Combining both
# w1 is the content CF weight
w1 = 0.7 
# w2 is the Item item cf weight
w2 = 0.3

# Adding weight for the final recom
combined = {}
for name in content_sim:
    if name in cf_recs:
        combined[name] = w1 * content_sim[name] + w2 * cf_recs[name]

top = pd.Series(combined).sort_values(ascending=False).head(5)



In [6]:
# Now create a data frame for a better output 
top_df = top.reset_index()
top_df.columns = ['product_name', 'similarity_score']

# Add product details
details = sub[['product_name', 'price_usd', 'rating', 'loves_count']]

# Merging
top_df = pd.merge(top_df, details, on='product_name')

# Show the final recommendation
print("\n The Top 5 Combined CF Recommendations for:", chosen_product, "are:")
top_df



 The Top 5 Combined CF Recommendations for: Armani Code After Shave Lotion are:


Unnamed: 0,product_name,similarity_score,price_usd,rating,loves_count
0,Sauvage Shower Wash,0.756542,50.0,4.5,4281
1,Hinoki Gentle Exfoliating Body Wash,0.484711,34.0,4.8059,10754
2,Charcoal Body Bar Massaging Soap,0.230542,16.0,4.72,3352
3,Cashmere Mist Deodorant,-0.085193,32.0,4.4439,45040
4,Wake the F*ck Up Everyday Body Wash,-0.145355,25.0,4.125,4567


In [9]:

#plotting an interative bar chart with ploty
fig = px.bar(top_df,
    x='product_name',
    y='similarity_score',
    title="Top 5 Recommended Products based on Content and Item-to-Item CF",
    labels={'product_name': 'Product', 'similarity_score': 'Cosine Similarity Score'},
    hover_data={'price_usd': True,'rating': True,'loves_count': True
    }
)
# make the graph taller
fig.update_layout(height=600)
#show it
fig.show()

#Save the plot as an HTML file
fig.write_html("top5_recommendations.html")
