In [2]:
import pandas as pd
import numpy as np
import random
from io import StringIO
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

# --- 1. Data Parsing and Generation ---

def parse_and_generate_data(product_csv_path, n_users=100, n_interactions=2000, seed=None):
    """Load and clean the product catalogue, then synthesise mock user ratings.

    Args:
        product_csv_path: Path (or file-like object) accepted by
            ``pandas.read_csv`` pointing at the product CSV.
        n_users: Number of synthetic users; user ids are drawn from
            ``1..n_users``. Must be at least 1.
        n_interactions: Number of raw interactions to draw *before*
            duplicate (user, product) pairs are dropped.
        seed: Optional seed for the interaction generator. When ``None``
            (the default) the module-level ``random`` state is used, exactly
            as before; pass an int to get reproducible mock data.

    Returns:
        ``(products_df, interactions_df)`` where ``products_df`` holds the
        cleaned model features plus a generated ``product_id`` and
        ``interactions_df`` has the columns ``user_id``, ``product_id`` and
        ``rating`` with one row per unique (user, product) pair.

    Raises:
        KeyError: If the CSV lacks any column the model needs.
        ValueError: If ``n_users`` is smaller than 1.
    """
    if n_users < 1:
        raise ValueError(f"n_users must be >= 1, got {n_users}")

    print("Parsing and cleaning product data...")

    df = pd.read_csv(product_csv_path)

    # --- 1a. Clean Product Data ---

    num_cols = ['product_rating', 'total_reviews', 'purchased_last_month',
                'discounted_price', 'original_price', 'discount_percentage']
    # Free-text badge columns -> the substring that marks the flag as set.
    flag_markers = {
        'is_best_seller': "Best Seller",
        'is_sponsored': "Sponsored",
        'has_coupon': "coupon",
    }
    text_cols = ['product_title', 'product_category', 'buy_box_availability', 'sustainability_tags']

    # Every one of these columns is selected further down, so a missing column
    # is always fatal; report all of them at once instead of failing on the
    # first one with an opaque KeyError.
    missing = [col for col in num_cols + list(flag_markers) + text_cols if col not in df.columns]
    if missing:
        raise KeyError(f"Product CSV is missing required column(s): {missing}")

    # Add a unique product_id
    df['product_id'] = range(1001, 1001 + len(df))

    # Clean numerical features: unparseable values and NaNs become 0
    for col in num_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # Turn the badge columns into booleans via a substring test on str(value)
    for col, marker in flag_markers.items():
        df[col] = df[col].apply(lambda value, marker=marker: marker in str(value))

    # Fill NaNs in the remaining text features with the literal string "None"
    for col in text_cols:
        df[col] = df[col].fillna("None")

    # Select only the features we'll actually use for the model
    features_to_keep = [
        'product_id', 'product_title', 'product_rating', 'total_reviews',
        'purchased_last_month', 'discounted_price', 'original_price',
        'is_best_seller', 'is_sponsored', 'has_coupon', 'buy_box_availability',
        'sustainability_tags', 'product_category', 'discount_percentage'
    ]
    products_df = df[features_to_keep].copy()

    # --- 1b. Generate User Interaction Data ---
    print("Generating mock user interaction data...")

    # A private generator keeps seeded runs reproducible without touching the
    # global random state; unseeded calls keep using the global module.
    rng = random if seed is None else random.Random(seed)

    product_ids = products_df['product_id'].tolist()

    # Personas: product pools that particular user-id ranges prefer.
    is_apple = products_df['product_title'].str.contains("Apple", na=False)
    is_bargain = products_df['has_coupon'].astype(bool) | (products_df['discount_percentage'] > 20)
    is_tech = products_df['product_category'].isin(['Phones', 'Laptops', 'Other Electronics'])

    apple_product_ids = products_df.loc[is_apple, 'product_id'].tolist()
    bargain_product_ids = products_df.loc[is_bargain, 'product_id'].tolist()
    tech_product_ids = products_df.loc[is_tech, 'product_id'].tolist()

    interactions = []
    # With an empty catalogue there is nothing to rate, so draw nothing
    # rather than crashing inside rng.choice([]).
    draws = n_interactions if product_ids else 0
    for _ in range(draws):
        user_id = rng.randint(1, n_users)

        # A persona only applies when its product pool is non-empty;
        # otherwise the user falls through to the general-shopper branch.
        if 1 <= user_id <= 20 and apple_product_ids:
            p_id = rng.choice(apple_product_ids)
            rating = rng.randint(4, 5)
        elif 21 <= user_id <= 40 and bargain_product_ids:
            p_id = rng.choice(bargain_product_ids)
            rating = rng.randint(4, 5)
        elif 41 <= user_id <= 60 and tech_product_ids:
            p_id = rng.choice(tech_product_ids)
            rating = rng.randint(3, 5)
        else:  # General shoppers
            p_id = rng.choice(product_ids)
            rating = rng.randint(1, 5)

        interactions.append({
            'user_id': user_id,
            'product_id': p_id,
            'rating': rating
        })

    # Explicit columns keep the frame well-formed even when no rows were drawn.
    interactions_df = pd.DataFrame(
        interactions, columns=['user_id', 'product_id', 'rating']
    ).drop_duplicates(subset=['user_id', 'product_id'])

    print(f"Generated {len(products_df)} products.")
    print(f"Generated {len(interactions_df)} user interactions from {n_users} users.")

    return products_df, interactions_df

# --- 2. The Hybrid Recommender Class ---

class HybridRecommender:
    """Two-stage recommender: content-based filtering, collaborative re-ranking.

    Stage 1 finds the products whose content features are nearest (cosine
    distance) to the product being viewed. Stage 2 orders those candidates by
    a per-user score computed from TruncatedSVD user/item factors.

    Both ``fit_content_model`` and ``fit_collaborative_model`` must be called
    before ``get_recommendations``.
    """

    def __init__(self, n_neighbors=50, n_candidates=50, n_factors=50):
        """Store hyper-parameters; no model is built until the fit methods run.

        Args:
            n_neighbors: Default neighbour count given to ``NearestNeighbors``.
            n_candidates: Neighbours requested per query in stage 1 (the
                viewed product itself is removed from them afterwards).
            n_factors: Requested number of SVD components.
        """
        self.n_neighbors = n_neighbors
        self.n_candidates = n_candidates
        self.n_factors = n_factors

        # Content model
        self.content_pipeline = None
        self.content_knn = None
        self.product_df = None

        # Collaborative model
        self.svd_model = None
        self.user_factors = None
        self.item_factors = None
        self.user_id_map = None      # user_id -> row of the user-item matrix
        self.product_id_map = None   # product_id -> column of the user-item matrix
        self.product_idx_map = None  # column of the user-item matrix -> product_id

    def fit_content_model(self, products_df):
        """Fit the feature pipeline and the cosine nearest-neighbour index.

        Args:
            products_df: Product catalogue containing ``product_id`` and the
                text, numerical and categorical feature columns listed below.
                A copy with a fresh 0..n-1 index is stored on the instance.
        """
        print("Fitting content-based model...")
        self.product_df = products_df.copy().reset_index(drop=True)

        text_features = ['product_title', 'product_category']
        numerical_features = ['product_rating', 'total_reviews', 'purchased_last_month',
                              'discounted_price', 'original_price', 'discount_percentage']
        categorical_features = ['is_best_seller', 'is_sponsored', 'has_coupon',
                                'buy_box_availability', 'sustainability_tags']

        # Text columns are passed as scalar names so each TfidfVectorizer
        # receives a 1-D column of strings.
        preprocessor = ColumnTransformer(
            transformers=[
                ('title', TfidfVectorizer(stop_words='english', max_features=50), text_features[0]),
                ('category', TfidfVectorizer(), text_features[1]),
                ('num', StandardScaler(), numerical_features),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
            ],
            remainder='drop'
        )

        self.content_pipeline = Pipeline([
            ('preprocessor', preprocessor),
        ])

        X_content = self.content_pipeline.fit_transform(self.product_df)

        self.content_knn = NearestNeighbors(n_neighbors=self.n_neighbors, metric='cosine')
        self.content_knn.fit(X_content)
        print("Content-based model fitted.")

    def fit_collaborative_model(self, interactions_df):
        """Factorise the user-item rating matrix with TruncatedSVD.

        Args:
            interactions_df: Frame with ``user_id``, ``product_id`` and
                ``rating`` columns. If a (user, product) pair occurs more
                than once, only its first row is used.

        Raises:
            ValueError: If ``interactions_df`` has no rows.
        """
        print("Fitting collaborative model (sklearn TruncatedSVD)...")

        if interactions_df.empty:
            raise ValueError("interactions_df is empty; cannot fit the collaborative model.")

        # The sparse-matrix constructor sums duplicate coordinates, which
        # would turn two ratings of 5 into a rating of 10; keep one row each.
        ratings = interactions_df.drop_duplicates(subset=['user_id', 'product_id'])

        # Create mappings for user/product IDs to matrix indices
        user_ids = ratings['user_id'].unique()
        product_ids = ratings['product_id'].unique()

        self.user_id_map = {user_id: i for i, user_id in enumerate(user_ids)}
        self.product_id_map = {product_id: i for i, product_id in enumerate(product_ids)}
        self.product_idx_map = {i: product_id for product_id, i in self.product_id_map.items()}

        # Map the IDs to indices
        user_index = ratings['user_id'].map(self.user_id_map).to_numpy()
        product_index = ratings['product_id'].map(self.product_id_map).to_numpy()

        # Create the sparse user-item matrix
        user_item_matrix = csr_matrix(
            (ratings['rating'].to_numpy(dtype=float), (user_index, product_index)),
            shape=(len(user_ids), len(product_ids))
        )

        # TruncatedSVD rejects more components than there are item columns,
        # so cap the requested rank on small interaction sets.
        n_components = max(1, min(self.n_factors, len(product_ids)))
        self.svd_model = TruncatedSVD(n_components=n_components, random_state=42)

        # Rows: users in user_id_map order; columns: latent factors.
        self.user_factors = self.svd_model.fit_transform(user_item_matrix)

        # Rows: products in product_id_map order; columns: latent factors.
        self.item_factors = self.svd_model.components_.T

        print("Collaborative model fitted.")

    def get_recommendations(self, user_id, product_id, n_recs=10):
        """Recommend products similar to ``product_id``, ranked for ``user_id``.

        Args:
            user_id: User to personalise for. A user unseen during
                collaborative fitting gets the content-based order, with -1
                as a placeholder score.
            product_id: Product currently being viewed; it is never returned.
            n_recs: Maximum number of rows to return.

        Returns:
            DataFrame with columns ``product_id``, ``title``, ``category`` and
            ``predicted_rating_for_user``, best score first.

        Raises:
            RuntimeError: If either model has not been fitted.
            IndexError: If ``product_id`` is not in the fitted catalogue.
        """
        fitted_parts = (self.content_pipeline, self.content_knn, self.svd_model)
        # Compare against None explicitly; estimator truthiness is not a
        # reliable "is fitted" signal (Pipeline, for one, defines __len__).
        if any(part is None for part in fitted_parts):
            raise RuntimeError("Models are not fitted. Call fit_content_model() and fit_collaborative_model() first.")

        # --- 1. FILTER (Content-Based) ---
        matches = self.product_df.index[self.product_df['product_id'] == product_id]
        if len(matches) == 0:
            raise IndexError(f"product_id {product_id!r} is not in the fitted product catalog.")
        # product_df was re-indexed 0..n-1 in fit_content_model, so the label
        # doubles as the row position.
        product_index = matches[0]

        product_features = self.content_pipeline.transform(self.product_df.iloc[[product_index]])

        # Never ask for more neighbours than there are products.
        n_query = min(self.n_candidates, len(self.product_df))
        _, indices = self.content_knn.kneighbors(product_features, n_neighbors=n_query)

        # Drop the viewed product wherever it lands in the neighbour list:
        # ties with identical items mean it is not guaranteed to come first.
        candidate_indices = [int(idx) for idx in indices[0] if idx != product_index]
        candidate_indices = candidate_indices[:max(n_query - 1, 0)]
        candidate_product_ids = self.product_df.iloc[candidate_indices]['product_id'].tolist()

        # --- 2. RANK (Collaborative) ---
        if user_id not in self.user_id_map:
            print(f"Warning: User {user_id} not in interaction data. Cannot personalize ranking.")
            # Fallback: identical dummy scores keep the content-based order
            # because the sort below is stable.
            scores = [-1] * len(candidate_product_ids)
        else:
            user_vec = self.user_factors[self.user_id_map[user_id]]
            scores = []
            for p_id in candidate_product_ids:
                item_index = self.product_id_map.get(p_id)
                if item_index is None:
                    # Product has no ratings, give it a neutral score
                    scores.append(0)
                else:
                    # Predict rating via dot product of the factor vectors
                    scores.append(np.dot(user_vec, self.item_factors[item_index]))

        ranked_candidates = sorted(
            zip(candidate_indices, candidate_product_ids, scores),
            key=lambda candidate: candidate[2],
            reverse=True,
        )

        # --- 3. RECOMMEND ---
        final_recs_data = []
        for row, p_id, predicted_rating in ranked_candidates[:max(n_recs, 0)]:
            # Look the product up by row position instead of rescanning the
            # whole catalogue for every recommendation.
            product_details = self.product_df.iloc[row]
            final_recs_data.append({
                'product_id': p_id,
                'title': product_details['product_title'],
                'category': product_details['product_category'],
                'predicted_rating_for_user': round(predicted_rating, 2)
            })

        return pd.DataFrame(
            final_recs_data,
            columns=['product_id', 'title', 'category', 'predicted_rating_for_user'],
        )

# --- Main Execution ---
if __name__ == "__main__":

    # NOTE: machine-specific absolute path to the cleaned Amazon products CSV;
    # change it to wherever the file lives. The CSV headers must match the
    # column names that parse_and_generate_data() reads.
    DATA_PATH = "/mnt/10EE4B76EE4B5360/College/pccoe/7th Sem/RS/RS-A5_amazon_products_sales_data_cleaned.csv"
    products_df, interactions_df = parse_and_generate_data(DATA_PATH)

    recommender = HybridRecommender()
    recommender.fit_content_model(products_df)
    recommender.fit_collaborative_model(interactions_df)

    print("\n" + "="*50 + "\n")

    # --- INFERENCE IS CALLED HERE ---
    # Each test product is looked up with an explicit emptiness check rather
    # than a try/except IndexError around the whole case, so an IndexError
    # raised inside the recommender is not misreported as "product not found".

    # --- TEST CASE 1: Tech Fan (User 42) ---
    TEST_USER_ID = 42
    # Find the "Apple AirPods Pro 2"
    matches_1 = products_df.loc[
        products_df['product_title'].str.contains("AirPods Pro 2", na=False), 'product_id'
    ]
    if matches_1.empty:
        print("Could not find test product 'Apple AirPods Pro 2' in the CSV.")
    else:
        TEST_PRODUCT_ID = matches_1.values[0]

        print(f"--- Recommendations for User {TEST_USER_ID} (Tech Fan) ---")
        print(f"Viewing Product: {products_df[products_df['product_id'] == TEST_PRODUCT_ID]['product_title'].values[0]}\n")

        recs_1 = recommender.get_recommendations(TEST_USER_ID, TEST_PRODUCT_ID)
        print(recs_1)

    print("\n" + "="*50 + "\n")

    # --- TEST CASE 2: Bargain Hunter (User 22) ---
    TEST_USER_ID_2 = 22
    # Find the "BOYA BOYALINK" (has a coupon)
    matches_2 = products_df.loc[
        products_df['product_title'].str.contains("BOYA BOYALINK", na=False), 'product_id'
    ]
    if matches_2.empty:
        print("Could not find test product 'BOYA BOYALINK' in the CSV.")
    else:
        TEST_PRODUCT_ID_2 = matches_2.values[0]

        print(f"--- Recommendations for User {TEST_USER_ID_2} (Bargain Hunter) ---")
        print(f"Viewing Product: {products_df[products_df['product_id'] == TEST_PRODUCT_ID_2]['product_title'].values[0]}\n")

        recs_2 = recommender.get_recommendations(TEST_USER_ID_2, TEST_PRODUCT_ID_2)
        print(recs_2)

    print("\n" + "="*50 + "\n")

Parsing and cleaning product data...
Generating mock user interaction data...
Generated 31959 products.
Generated 1991 user interactions from 100 users.
Fitting content-based model...
Content-based model fitted.
Fitting collaborative model (sklearn TruncatedSVD)...
Collaborative model fitted.


--- Recommendations for User 42 (Tech Fan) ---
Viewing Product: Apple AirPods Pro 2 Wireless Earbuds, Active Noise Cancellation, Hearing Aid Feature, Bluetooth Headphones, Transparency, Personalized Spatial Audio, High-Fidelity Sound, H2 Chip, USB-C Charging

   product_id                                              title  \
0        1024  Apple EarPods Headphones with USB-C Plug, Wire...   
1        1007  Apple AirPods 4 Wireless Earbuds, Bluetooth He...   
2        1569  Anker 6 ft Premium Double-Braided Nylon Lightn...   
3        1053  Beats Solo 4 - Wireless Bluetooth On-Ear Headp...   
4        1008  Apple AirTag. Keep Track of and find Your Keys...   
5        1087  Logitech H390 Gaming 