In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [31]:
df = pd.read_csv("test_data\projectData.csv")

In [32]:
df.head()

Unnamed: 0,ProductID,ProductName,category,subcategory,description,price,discount,discounted_price,Rating,reviews_count,images,tags,available,gender,comment
0,1,Men's Classic Cotton T-Shirt,Clothing,T-Shirts,Premium 100% cotton t-shirt with comfortable f...,2500,10,2250,4.3,156,,"t-shirt, cotton, casual, men",True,male,
1,2,Women's Floral Print T-Shirt,Clothing,T-Shirts,Stylish floral print t-shirt made from soft co...,2200,15,1870,4.5,89,,"t-shirt, floral, women, casual",True,female,
2,3,Men's Striped Polo T-Shirt,Clothing,T-Shirts,Classic striped polo t-shirt with collar and t...,3200,20,2560,4.2,234,,"polo, striped, men, casual",True,male,
3,4,Women's V-Neck Basic T-Shirt,Clothing,T-Shirts,Essential v-neck t-shirt in soft jersey fabric...,1800,12,1584,4.4,312,,"v-neck, basic, women, essential",True,female,
4,5,Men's Graphic Print T-Shirt,Clothing,T-Shirts,Trendy graphic print t-shirt with modern desig...,2700,18,2214,4.1,128,,"graphic, print, men, trendy",True,male,


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ProductID         79 non-null     int64  
 1   ProductName       79 non-null     object 
 2   category          79 non-null     object 
 3   subcategory       79 non-null     object 
 4   description       79 non-null     object 
 5   price             79 non-null     int64  
 6   discount          79 non-null     int64  
 7   discounted_price  79 non-null     int64  
 8   Rating            79 non-null     float64
 9   reviews_count     79 non-null     int64  
 10  images            50 non-null     object 
 11  tags              79 non-null     object 
 12  available         79 non-null     object 
 13  gender            79 non-null     object 
 14  comment           0 non-null      float64
dtypes: float64(2), int64(5), object(8)
memory usage: 9.4+ KB


In [34]:
df.isna().count()

ProductID           79
ProductName         79
category            79
subcategory         79
description         79
price               79
discount            79
discounted_price    79
Rating              79
reviews_count       79
images              79
tags                79
available           79
gender              79
comment             79
dtype: int64

In [35]:
df.fillna('', inplace=True)
df['text'] = df['tags'] + ' ' + df['category'] + ' ' + df['subcategory'] + ' ' + df['description']

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(stop_words = 'english')
tfidf_matrix = tfidf.fit_transform(df['text'])

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def recommend_content(product_id, top_n=5):
    if product_id not in df['ProductID'].values:
        return f"Product ID {product_id} not found in the dataset"
    
    index = df[df['ProductID'] == product_id].index[0]
    sim_score = list(enumerate(cosine_sim[index]))
    sim_score = sorted(sim_score, key=lambda x : x[1], reverse=True)[1 : top_n+1]
    similar_indices = [i[0] for i in sim_score]
    print(similar_indices)
    return df.iloc[similar_indices][['ProductID', 'ProductName', 'category', 'Rating']] 

In [37]:
def recommend_popular(top_n=5, by='rating'):
    if by not in df.columns:
        return f"Column '{by}' not found in dataset."
    return df.sort_values(by=by, ascending=False).head(top_n)[['ProductID', 'ProductName', 'category', by]]

In [38]:
def recommend_by_rule(product_id, price_margin=1000):
    product_rows = df[df['ProductID'] == product_id]

    if product_rows.empty:
        return f"Product ID {product_id} not found in the dataset."

    product = product_rows.iloc[0]

    similar = df[
        (df['category'] == product['category']) &
        (df['gender'] == product['gender']) &
        (abs(df['price'] - product['price']) <= price_margin) &
        (df['ProductID'] != product_id)
    ]

    if similar.empty:
        return f"No similar products found for Product ID {product_id}."

    return similar[['ProductID', 'ProductName', 'category', 'price', 'gender']].head(5)


In [39]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Step 1: Select features
features = df[['price', 'discount', 'Rating']]  # adjust as needed

# Step 2: Normalize features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Step 3: Apply clustering
kmeans = KMeans(n_clusters=5, random_state=42)
df['cluster'] = kmeans.fit_predict(scaled_features)


  super()._check_params_vs_input(X, default_n_init=10)


In [40]:
def recommend_from_cluster(product_id, top_n=5):
    product_rows = df[df['ProductID'] == product_id]
    if product_rows.empty:
        return f"Product ID {product_id} not found."

    product = product_rows.iloc[0]
    cluster_group = df[
        (df['cluster'] == product['cluster']) & 
        (df['ProductID'] != product_id)
    ]

    if cluster_group.empty:
        return f"No similar products in cluster for Product ID {product_id}."

    return cluster_group[['ProductID', 'ProductName', 'category', 'cluster']].head(top_n)


In [41]:
valid_id = df['ProductID'].sample(1).values[0] 
print(recommend_content(valid_id, 5))
print(recommend_popular(5, by='Rating'))
print(recommend_by_rule(valid_id))
print(recommend_from_cluster(valid_id))

[67, 24, 12, 61, 11]
    ProductID                          ProductName  category  Rating
67         69                Women's Puffer Jacket  Clothing     4.6
24         26                  Women's Ankle Boots     Shoes     4.5
12         14  Women's Striped Long Sleeve T-Shirt  Clothing     4.4
61         63                   Men's Denim Jacket  Clothing     4.3
11         13         Men's Vintage Washed T-Shirt  Clothing     4.3
    ProductID              ProductName  category  Rating
64         66      Women's Trench Coat  Clothing     4.7
54         56      Women's Silk Blouse  Clothing     4.7
45         47       Women's Yoga Pants  Clothing     4.7
59         61     Men's Leather Jacket  Clothing     4.7
33         35  Men's Chronograph Watch   Watches     4.7
   ProductID                   ProductName  category  price gender
0          1  Men's Classic Cotton T-Shirt  Clothing   2500   male
2          3    Men's Striped Polo T-Shirt  Clothing   3200   male
4          5   Men's G

In [42]:
print(recommend_content(23))

[28, 0, 27, 52, 43]
    ProductID                   ProductName  category  Rating
28         30      Women's Slip-On Sneakers     Shoes     4.4
0           1  Men's Classic Cotton T-Shirt  Clothing     4.3
27         29              Men's Boat Shoes     Shoes     4.2
52         54             Men's Linen Shirt  Clothing     4.4
43         45             Men's Linen Pants  Clothing     4.1


In [43]:
import pickle

# Build your assets dictionary — include only what's needed for recommendations
recommendation_assets = {
    "df": df,
    "tfidf": tfidf,
    "tfidf_matrix": tfidf_matrix,
    "cosine_sim": cosine_sim,
    "kmeans": kmeans
}

# Save the pickle file in the desired location
with open("recommendation_models.pkl", "wb") as f:
    pickle.dump(recommendation_assets, f)

print("Pickle file saved at recommendation_models.pkl")


Pickle file saved at recommendation_models.pkl
