<a href="https://colab.research.google.com/github/Kshireen/ML/blob/main/content_based_reommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the synthetic table below is reproducible.
np.random.seed(42)

# Table dimensions and the value pools the categorical columns are drawn from.
num_products = 1000
num_rows = 10000
categories = ['Electronics', 'Clothing', 'Books', 'Home Decor', 'Beauty', 'Toys', 'Sports']
brands = ['BrandA', 'BrandB', 'BrandC', 'BrandD']
sizes = ['S', 'M', 'L', 'XL']
colors = ['Red', 'Blue', 'Green', 'Black', 'White']

# Draw each column independently. Every draw consumes the same global RNG
# stream, so the order of these statements determines the generated values.
# NOTE: product_id is sampled with replacement (num_rows > num_products), so
# the same ID can appear on several rows with different attributes.
data = {}
data['product_id'] = np.random.randint(1, num_products + 1, num_rows)
data['category'] = np.random.choice(categories, num_rows)
data['price'] = np.round(np.random.uniform(10, 500, num_rows), 2)
data['rating'] = np.round(np.random.uniform(1, 5, num_rows), 1)
data['num_reviews'] = np.random.randint(1, 1000, num_rows)
data['brand'] = np.random.choice(brands, num_rows)
data['size'] = np.random.choice(sizes, num_rows)
data['color'] = np.random.choice(colors, num_rows)

# One DataFrame column per generated array.
df = pd.DataFrame(data)

# Preview the first rows.
df.head()


Unnamed: 0,product_id,category,price,rating,num_reviews,brand,size,color
0,103,Clothing,28.83,3.6,162,BrandC,M,White
1,436,Sports,206.83,1.8,929,BrandB,M,Red
2,861,Books,401.55,4.7,178,BrandA,M,White
3,271,Toys,266.12,3.6,444,BrandC,L,Blue
4,107,Electronics,279.52,3.5,328,BrandD,L,Green


In [3]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Column groups: numeric columns get rescaled, categorical columns get one-hot encoded.
numerical_features = ['price', 'rating', 'num_reviews']
categorical_features = ['category', 'brand', 'size', 'color']

# One transformer per group; sparse_output=False makes the encoder return a
# dense array so the result can be wrapped in a DataFrame directly.
scaler = MinMaxScaler()
encoder = OneHotEncoder(sparse_output=False)

# Apply each transformer to its own column group. Columns not listed in
# either group (e.g. product_id) are not part of the output.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numerical_features),
        ('cat', encoder, categorical_features),
    ]
)

# Learn the scaling/encoding from df and transform it in one step.
processed_data = preprocessor.fit_transform(df)

# Label the output: numeric columns keep their names, encoded columns take the
# names generated by the fitted encoder (e.g. "category_Books").
fitted_encoder = preprocessor.named_transformers_['cat']
encoded_columns = list(fitted_encoder.get_feature_names_out())
processed_df = pd.DataFrame(processed_data, columns=numerical_features + encoded_columns)
processed_df.head()


Unnamed: 0,price,rating,num_reviews,category_Beauty,category_Books,category_Clothing,category_Electronics,category_Home Decor,category_Sports,category_Toys,...,brand_BrandD,size_L,size_M,size_S,size_XL,color_Black,color_Blue,color_Green,color_Red,color_White
0,0.038432,0.65,0.161323,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.401735,0.2,0.92986,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.799163,0.925,0.177355,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.522747,0.65,0.443888,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.550097,0.625,0.327655,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of the processed feature table.
# The result is a dense square array with one row and one column per input
# row, so its memory footprint grows quadratically with the number of rows.
feature_matrix = processed_df.to_numpy()
similarity_matrix = cosine_similarity(feature_matrix)

def recommend(product_id, num_recommendations=5):
    """Return the IDs of the rows most similar to the given product.

    Similarity scores are read from the module-level ``similarity_matrix``;
    row ``i`` of that matrix is taken to describe the ``i``-th row (by
    position) of the module-level ``df``.

    Args:
        product_id: Value to look up in ``df['product_id']``. If several rows
            carry this ID, the first matching row is used as the query.
        num_recommendations: Maximum number of IDs to return. Zero or a
            negative value yields an empty list.

    Returns:
        list: ``product_id`` values of the most similar rows, best match
        first. Rows with equal scores keep their order in ``df``. The query
        row itself is never returned; other rows that happen to share its
        ``product_id`` may be.

    Raises:
        IndexError: If ``product_id`` does not occur in ``df``.
    """
    # Locate the query row by position rather than by index label: the
    # similarity matrix is a plain array, so labels must not leak into it.
    matches = np.flatnonzero(df['product_id'].to_numpy() == product_id)
    if matches.size == 0:
        raise IndexError(f"product_id {product_id!r} not found in df")
    product_idx = int(matches[0])

    if num_recommendations <= 0:
        return []

    scores = np.asarray(similarity_matrix[product_idx])

    # Stable sort of the negated scores == descending order with ties left in
    # their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query row explicitly. Skipping "the first entry" is not
    # enough: a row with identical features ties with the query and may be
    # ranked ahead of it.
    ranked = ranked[ranked != product_idx]

    top_positions = ranked[:num_recommendations]
    return df['product_id'].iloc[top_positions].tolist()

# Demo: list the products most similar to product 10 (default number of results).
example_product_id = 10
recommendations = recommend(example_product_id)
print("Recommended products:", recommendations)


Recommended products: [361, 868, 580, 987, 458]


# Refined Recommender: Detailed Results, Interactive Widgets, and Pipeline

In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity


In [6]:
# Build the synthetic product table (replace this cell to load real data).
# The column recipe matches the earlier data-generation cell. No seed is set
# here, so the values depend on the current state of NumPy's global RNG.
row_count = 10000
product_count = 1000

# Keyword arguments are evaluated left to right, so the RNG draws happen in
# the same column order as the keys appear.
data = dict(
    product_id=np.random.randint(1, product_count + 1, row_count),
    category=np.random.choice(
        ['Electronics', 'Clothing', 'Books', 'Home Decor', 'Beauty', 'Toys', 'Sports'],
        row_count,
    ),
    price=np.round(np.random.uniform(10, 500, row_count), 2),
    rating=np.round(np.random.uniform(1, 5, row_count), 1),
    num_reviews=np.random.randint(1, 1000, row_count),
    brand=np.random.choice(['BrandA', 'BrandB', 'BrandC', 'BrandD'], row_count),
    size=np.random.choice(['S', 'M', 'L', 'XL'], row_count),
    color=np.random.choice(['Red', 'Blue', 'Green', 'Black', 'White'], row_count),
)

df = pd.DataFrame(data)


In [8]:
# Define preprocessing pipeline: numeric columns are min-max scaled,
# categorical columns are one-hot encoded into a dense array.
numerical_features = ['price', 'rating', 'num_reviews']
categorical_features = ['category', 'brand', 'size', 'color']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(sparse_output=False), categorical_features)
    ]
)

# Fit and transform data
processed_data = preprocessor.fit_transform(df)

# Wrap the result in a labelled DataFrame, consistent with the first
# preprocessing cell: the transformer output is laid out as the numeric
# columns followed by the encoder's generated columns (e.g. "category_Books").
# Reusing df.index keeps processed_df row-aligned with df.
encoded_feature_names = list(
    preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
)
processed_df = pd.DataFrame(
    processed_data,
    columns=numerical_features + encoded_feature_names,
    index=df.index,
)


In [9]:
# Cosine similarity of every processed row against every other processed row.
# The output is a dense square array (rows x rows), so memory use grows
# quadratically with the size of the table.
feature_matrix = processed_df.to_numpy()
similarity_matrix = cosine_similarity(feature_matrix)

In [10]:
def recommend(product_id, num_recommendations=5):
    """Return the rows of ``df`` most similar to the given product.

    Similarity scores are read from the module-level ``similarity_matrix``;
    row ``i`` of that matrix is taken to describe the ``i``-th row (by
    position) of the module-level ``df``.

    Args:
        product_id: Value to look up in ``df['product_id']``. If several rows
            carry this ID, the first matching row is used as the query.
        num_recommendations: Maximum number of rows to return. Zero or a
            negative value yields an empty result.

    Returns:
        pandas.DataFrame: The ``product_id``, ``category``, ``price`` and
        ``brand`` columns of the most similar rows, best match first, with
        their original index labels. Rows with equal scores keep their order
        in ``df``. The query row itself is never returned; other rows that
        happen to share its ``product_id`` may be.

    Raises:
        IndexError: If ``product_id`` does not occur in ``df``.
    """
    output_columns = ['product_id', 'category', 'price', 'brand']

    # Locate the query row by position rather than by index label: the
    # similarity matrix is a plain array, so labels must not leak into it.
    matches = np.flatnonzero(df['product_id'].to_numpy() == product_id)
    if matches.size == 0:
        raise IndexError(f"product_id {product_id!r} not found in df")
    product_idx = int(matches[0])

    if num_recommendations <= 0:
        return df.iloc[0:0][output_columns]

    scores = np.asarray(similarity_matrix[product_idx])

    # Stable sort of the negated scores == descending order with ties left in
    # their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query row explicitly. Skipping "the first entry" is not
    # enough: a row with identical features ties with the query and may be
    # ranked ahead of it.
    ranked = ranked[ranked != product_idx]

    recommended_indices = ranked[:num_recommendations]
    return df.iloc[recommended_indices][output_columns]

# Demo: fetch the rows most similar to product 10 and print them as text.
sample_recommendations = recommend(10)
print(sample_recommendations)


      product_id     category   price   brand
4708         982  Electronics  284.91  BrandC
5183         985  Electronics  143.60  BrandC
4775          97  Electronics  170.34  BrandC
8164         929  Electronics  159.34  BrandC
2519         914  Electronics  294.58  BrandC


In [12]:
!pip install ipywidgets


Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


In [13]:
import ipywidgets as widgets
from IPython.display import display

# Sliders driving the interactive demo: which product to query and how many
# similar products to list.
product_id_widget = widgets.IntSlider(
    description='Product ID:',
    value=10,
    min=1,
    max=1000,
    step=1,
)
num_recs_widget = widgets.IntSlider(
    description='Recommendations:',
    value=5,
    min=1,
    max=10,
    step=1,
)

def display_recommendations(product_id, num_recommendations):
    """Show the recommendations for one product (widget callback).

    Args:
        product_id: Product ID to query, passed through to ``recommend``.
        num_recommendations: Number of recommendations to request.

    Prints a heading and displays whatever ``recommend`` returns. If the ID
    is not present in the data, ``recommend`` raises ``IndexError``; that is
    reported as a short message instead of a traceback, because the slider
    can select any value in its range whether or not the ID exists.
    """
    try:
        recommendations = recommend(product_id, num_recommendations)
    except IndexError:
        print(f"Product ID {product_id} was not found in the data.")
        return

    print(f"Recommendations for Product ID {product_id}:\n")
    display(recommendations)

# Stack the sliders vertically and bind their values to the callback's
# keyword arguments; the callback's output is rendered in a separate area.
controls = {
    'product_id': product_id_widget,
    'num_recommendations': num_recs_widget,
}
ui = widgets.VBox([product_id_widget, num_recs_widget])
out = widgets.interactive_output(display_recommendations, controls)
display(ui, out)


VBox(children=(IntSlider(value=10, description='Product ID:', max=1000, min=1), IntSlider(value=5, description…

Output()

In [11]:
from sklearn.pipeline import Pipeline

# Wrap the existing preprocessor in a single-step Pipeline. Only the
# preprocessing lives inside the pipeline; the similarity matrix is computed
# separately from its output below.
pipeline_steps = [('preprocessor', preprocessor)]
recommendation_pipeline = Pipeline(pipeline_steps)

# Fit the pipeline on df and obtain the transformed feature array.
processed_data_pipeline = recommendation_pipeline.fit_transform(df)

# Pairwise cosine similarity over the pipeline's output rows.
similarity_matrix_pipeline = cosine_similarity(processed_data_pipeline)

def recommend_pipeline(product_id, num_recommendations=5):
    """Return the rows of ``df`` most similar to the given product.

    Works from the module-level ``similarity_matrix_pipeline``; row ``i`` of
    that matrix is taken to describe the ``i``-th row (by position) of the
    module-level ``df``.

    Args:
        product_id: Value to look up in ``df['product_id']``. If several rows
            carry this ID, the first matching row is used as the query.
        num_recommendations: Maximum number of rows to return. Zero or a
            negative value yields an empty result.

    Returns:
        pandas.DataFrame: The ``product_id``, ``category``, ``price`` and
        ``brand`` columns of the most similar rows, best match first, with
        their original index labels. Rows with equal scores keep their order
        in ``df``. The query row itself is never returned; other rows that
        happen to share its ``product_id`` may be.

    Raises:
        IndexError: If ``product_id`` does not occur in ``df``.
    """
    output_columns = ['product_id', 'category', 'price', 'brand']

    # Locate the query row by position rather than by index label: the
    # similarity matrix is a plain array, so labels must not leak into it.
    matches = np.flatnonzero(df['product_id'].to_numpy() == product_id)
    if matches.size == 0:
        raise IndexError(f"product_id {product_id!r} not found in df")
    product_idx = int(matches[0])

    if num_recommendations <= 0:
        return df.iloc[0:0][output_columns]

    scores = np.asarray(similarity_matrix_pipeline[product_idx])

    # Stable sort of the negated scores == descending order with ties left in
    # their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query row explicitly. Skipping "the first entry" is not
    # enough: a row with identical features ties with the query and may be
    # ranked ahead of it.
    ranked = ranked[ranked != product_idx]

    recommended_indices = ranked[:num_recommendations]
    return df.iloc[recommended_indices][output_columns]

# Demo: query the pipeline-based recommender for product 10 and print the rows.
pipeline_recommendations = recommend_pipeline(10)
print(pipeline_recommendations)


      product_id     category   price   brand
4708         982  Electronics  284.91  BrandC
5183         985  Electronics  143.60  BrandC
4775          97  Electronics  170.34  BrandC
8164         929  Electronics  159.34  BrandC
2519         914  Electronics  294.58  BrandC
