In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder


In [2]:
class ContentBasedRecommendationModel:
    def __init__(self, customers_file, products_file, ratings_file, n_neighbors=5, metric='euclidean'):
        self.customers_file = customers_file
        self.products_file = products_file
        self.ratings_file = ratings_file
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.label_encoder = LabelEncoder()
        self.knn_model = None
        self.merged_df = None

    def load_data(self):
        customers_df = pd.read_json(self.customers_file)
        products_df = pd.read_json(self.products_file)
        ratings_df = pd.read_json(self.ratings_file)
        customers_df.rename(columns={'Id': 'CustomerID'}, inplace=True)
        products_df.rename(columns={'Id': 'ProductID'}, inplace=True)
 
        # Merge ratings data with customer and product dataCustomerID
        self.merged_df = ratings_df.merge(customers_df, on='CustomerID')
        self.merged_df = self.merged_df.merge(products_df, on='ProductID')
        # Encode categorical features like 'age' and 'region'
        self.merged_df['Age'] = self.label_encoder.fit_transform(self.merged_df['Age'])
        self.merged_df['Region'] = self.label_encoder.fit_transform(self.merged_df['Region'])
        self.merged_df['Category'] = self.label_encoder.fit_transform(self.merged_df['Category'])
        self.merged_df['genre'] = self.label_encoder.fit_transform(self.merged_df['genre'])
        print(self.merged_df['genre'])
    def train_model(self):
        X = self.merged_df[['Category', 'Age', 'Region','genre']]
        self.knn_model = NearestNeighbors(n_neighbors=self.n_neighbors, metric=self.metric, algorithm='brute')
        self.knn_model.fit(X)

    def get_recommendations(self, product_features, num_recommendations=5):
        if self.knn_model is None:
            raise Exception("Model has not been trained. Call train_model() first.")
        distances, indices = self.knn_model.kneighbors([product_features], n_neighbors=num_recommendations)
        recommended_products = []
        for index in indices[0]:
            recommended_products.append(self.merged_df.iloc[index]['ProductID'])
        return recommended_products

    def evaluate_recommendations(self, true_product_id, recommended_products, K):
        hit = true_product_id in recommended_products
        precision_at_K = len(set(recommended_products) & set([true_product_id])) / K
        recall_at_K = len(set(recommended_products) & set([true_product_id])) / 1 if hit else 0
        return hit, precision_at_K, recall_at_K

    def calculate_mrr(self, hit):
        if hit:
            return 1
        else:
            return 0

    def calculate_ndcg(self, recommended_products, true_product_id):
        if true_product_id in recommended_products:
            true_position = recommended_products.index(true_product_id) + 1
            return 1 / np.log2(true_position + 1)
        else:
            return 0

In [4]:
if __name__ == "__main__":
    # Example usage of the ContentBasedRecommendationModel class
    customers_file = 'input/dataset/customers.json'
    products_file = 'input/dataset/products.json'
    ratings_file = 'input/dataset/ratings.json'

    model = ContentBasedRecommendationModel(customers_file, products_file, ratings_file)
    model.load_data()
    model.train_model()

    # Example: Get recommendations for a specific product (provide its features)
    product_features_to_recommend_for = [2, 30, 3,1]
    recommended_products = model.get_recommendations(product_features_to_recommend_for)

    # Evaluate recommendations (replace these with actual values)
    true_product_id = 1  # Replace with the actual product ID you want to evaluate
    K = 5  # Top K recommendations
    hit, precision_at_K, recall_at_K = model.evaluate_recommendations(true_product_id, recommended_products, K)
    mrr = model.calculate_mrr(hit)
    ndcg = model.calculate_ndcg(recommended_products, true_product_id)

    # Display evaluation results
    print("Hit Rate:", hit)
    print("Precision at K:", precision_at_K)
    print("Recall at K:", recall_at_K)
    print("Mean Reciprocal Rank (MRR):", mrr)
    print("Normalized Discounted Cumulative Gain (NDCG):", ndcg)


0        0
1        1
2        0
3        1
4        0
        ..
99995    1
99996    0
99997    1
99998    0
99999    0
Name: genre, Length: 100000, dtype: int32
Hit Rate: False
Precision at K: 0.0
Recall at K: 0
Mean Reciprocal Rank (MRR): 0
Normalized Discounted Cumulative Gain (NDCG): 0


