# Preparing the data
We import all the necessary modules and read `ratings.dat` into a pandas DataFrame object:

In [134]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (roc_auc_score, precision_score, recall_score, 
                           f1_score, confusion_matrix, classification_report)
import matplotlib.pyplot as plt


# 2. Define the MovieRecommender Class


In [192]:
class MovieRecommender:
    """Per-movie like/dislike predictor built on ``::``-separated rating files.

    The ratings, movies and users tables are loaded once at construction.
    For a chosen target movie, the ratings of all *other* movies become the
    features and the (binarised) rating of the target movie becomes the label.
    """

    def __init__(self, ratings_path, movies_path, users_path, rating_threshold=3):
        """
        Initialize MovieRecommender with ratings, movies, and users data file paths.

        Parameters
        ----------
        ratings_path : str
            Path to ratings.dat file.
        movies_path : str
            Path to movies.dat file.
        users_path : str
            Path to users.dat file.
        rating_threshold : int, default=3
            Ratings strictly above this value are considered positive
            recommendations.
        """
        self.rating_threshold = rating_threshold
        self.df = self._load_ratings(ratings_path)
        self.movies_df = self._load_movies(movies_path)
        self.users_df = self._load_users(users_path)
        # Counts are taken from the ratings table, not from the users/movies
        # tables, so they only cover ids that actually appear in a rating.
        self.n_users = self.df['user_id'].nunique()
        self.n_movies = self.df['movie_id'].nunique()

    def prepare_movie_data(self, target_movie_id):
        """
        Prepare training data for a specific movie.

        Parameters
        ----------
        target_movie_id : int
            ID of the movie to predict ratings for.

        Returns
        -------
        tuple : (X, Y)
            ``X`` holds, for every user who rated the target movie, that
            user's ratings of all other movies (0 = not rated). ``Y`` is 1
            where the user's rating of the target movie is strictly greater
            than ``rating_threshold`` and 0 otherwise.

        Raises
        ------
        KeyError
            If ``target_movie_id`` does not appear in the ratings data.
        """
        # Create user-movie rating matrix
        data, movie_mapping = self._create_rating_matrix()

        if target_movie_id not in movie_mapping:
            raise KeyError(
                f"movie_id {target_movie_id!r} has no ratings in the ratings data"
            )
        target_col = movie_mapping[target_movie_id]

        # Prepare features and target
        X_raw = np.delete(data, target_col, axis=1)
        Y_raw = data[:, target_col]

        # Filter out users who haven't rated the target movie (0 = no rating)
        rated = Y_raw > 0
        X = X_raw[rated]
        Y = Y_raw[rated]

        # Convert ratings to binary labels
        Y = (Y > self.rating_threshold).astype(int)

        return X, Y

    @staticmethod
    def _read_dat(path, columns):
        """Read a headerless ``::``-separated file and name its columns."""
        # The multi-character separator requires the python parsing engine.
        df = pd.read_csv(path, header=None, sep='::', engine='python', encoding='ISO-8859-1')
        df.columns = columns
        return df

    def _load_ratings(self, path):
        """Load and preprocess ratings data."""
        return self._read_dat(path, ['user_id', 'movie_id', 'rating', 'timestamp'])

    def _load_movies(self, path):
        """Load and preprocess movies data."""
        return self._read_dat(path, ['movie_id', 'title', 'genres'])

    def _load_users(self, path):
        """Load and preprocess users data."""
        return self._read_dat(path, ['user_id', 'gender', 'age', 'occupation', 'zip_code'])

    def _create_rating_matrix(self):
        """
        Create user-movie rating matrix and movie ID mapping.

        Returns
        -------
        tuple : (data, movie_mapping)
            ``data`` is an integer array with one row per distinct user id
            (rows in ascending user-id order) and one column per distinct
            movie id; cells without a rating stay 0. ``movie_mapping`` maps
            each movie id to its column index, assigned in order of first
            appearance in the ratings table.
        """
        # Map user ids to rows explicitly instead of assuming the ids are
        # exactly 1..n_users: a gap in the ids would otherwise index past the
        # end of the matrix, and an id of 0 would silently land in the last row.
        # For contiguous ids 1..N this yields the same row as ``user_id - 1``.
        user_ids = np.sort(self.df['user_id'].unique()).tolist()
        user_mapping = {user_id: row for row, user_id in enumerate(user_ids)}

        data = np.zeros([len(user_ids), self.df['movie_id'].nunique()], dtype=np.intc)
        movie_mapping = {}

        for user_id, movie_id, rating in zip(self.df['user_id'], self.df['movie_id'], self.df['rating']):
            if movie_id not in movie_mapping:
                movie_mapping[movie_id] = len(movie_mapping)
            # A later rating for the same (user, movie) pair overwrites an earlier one.
            data[user_mapping[user_id], movie_mapping[movie_id]] = rating

        return data, movie_mapping

    def tune_hyperparameters(self, X, Y, k=5, alphas=None, fit_priors=None):
        """
        Perform k-fold cross-validation to tune MultinomialNB hyperparameters.

        Parameters
        ----------
        X : array-like
            Feature matrix.
        Y : array-like
            Target vector.
        k : int, default=5
            Number of folds for cross-validation.
        alphas : sequence of float, optional
            Smoothing values to try. Defaults to ``[1, 2, 3, 4, 5, 6]``.
        fit_priors : sequence of bool, optional
            ``fit_prior`` settings to try. Defaults to ``[True, False]``.

        Returns
        -------
        dict
            ``{'best_params': {'alpha': ..., 'fit_prior': ...}, 'best_auc': ...}``
            where ``best_auc`` is the highest mean ROC AUC over the folds.
        """
        if alphas is None:
            alphas = [1, 2, 3, 4, 5, 6]
        if fit_priors is None:
            fit_priors = [True, False]

        k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

        # auc_record[alpha][fit_prior] accumulates the AUC summed over folds.
        auc_record = {}

        # Perform k-fold CV for each parameter combination
        for train_idx, test_idx in k_fold.split(X, Y):
            X_train, X_test = X[train_idx], X[test_idx]
            Y_train, Y_test = Y[train_idx], Y[test_idx]

            for alpha in alphas:
                fold_sums = auc_record.setdefault(alpha, {})
                for fit_prior in fit_priors:
                    clf = MultinomialNB(alpha=alpha, fit_prior=fit_prior)
                    clf.fit(X_train, Y_train)
                    prob = clf.predict_proba(X_test)[:, 1]
                    auc = roc_auc_score(Y_test, prob)
                    fold_sums[fit_prior] = auc + fold_sums.get(fit_prior, 0.0)

        # Find best parameters (first combination wins on ties)
        best_auc = 0
        best_params = {}

        for alpha, prior_dict in auc_record.items():
            for fit_prior, auc_sum in prior_dict.items():
                mean_auc = auc_sum / k
                if mean_auc > best_auc:
                    best_auc = mean_auc
                    best_params = {'alpha': alpha, 'fit_prior': fit_prior}

        return {'best_params': best_params, 'best_auc': best_auc}

    def train_and_evaluate(self, X, Y, test_size=0.2, **model_params):
        """
        Train and evaluate the model with given parameters.

        Parameters
        ----------
        X : array-like
            Feature matrix.
        Y : array-like
            Binary target vector.
        test_size : float, default=0.2
            Passed to ``train_test_split`` as the held-out share.
        **model_params
            Keyword arguments forwarded to ``MultinomialNB``.

        Returns
        -------
        dict
            Keys ``accuracy``, ``precision``, ``recall``, ``f1``, ``auc``,
            ``confusion_matrix`` and ``classification_report``, all computed
            on the held-out split.
        """
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
        clf = MultinomialNB(**model_params)
        clf.fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)
        # Column 1 of predict_proba is used as the positive-class score for AUC.
        Y_prob = clf.predict_proba(X_test)[:, 1]

        metrics = {
            'accuracy': clf.score(X_test, Y_test),
            'precision': precision_score(Y_test, Y_pred),
            'recall': recall_score(Y_test, Y_pred),
            'f1': f1_score(Y_test, Y_pred),
            'auc': roc_auc_score(Y_test, Y_prob),
            'confusion_matrix': confusion_matrix(Y_test, Y_pred),
            'classification_report': classification_report(Y_test, Y_pred)
        }

        return metrics
            

# 3. Define Core Functionalities


# 4. Implement New Functionalities


In [172]:
    def recommend_by_demographics(self, user_id):
        """
        Recommend movies liked by users sharing this user's gender and age.

        Looks the user up in ``users_df``, collects every user with the same
        ``gender`` and ``age`` values (the user themself included), and
        returns the distinct movie ids those users rated strictly above
        ``rating_threshold``, in order of first appearance in the ratings
        table. Prints "User not found." and returns an empty list when the
        user id is unknown.
        """
        users = self.users_df
        profile = users.loc[users['user_id'] == user_id]
        if profile.empty:
            print("User not found.")
            return []

        gender = profile['gender'].values[0]
        age = profile['age'].values[0]

        # Peer group: exact match on both demographic columns.
        same_gender = users['gender'] == gender
        same_age = users['age'] == age
        peer_ids = users.loc[same_gender & same_age, 'user_id'].unique()

        ratings = self.df
        from_peer = ratings['user_id'].isin(peer_ids)
        liked = ratings['rating'] > self.rating_threshold

        return ratings.loc[from_peer & liked, 'movie_id'].unique().tolist()
    
    def recommend_by_genre(self, target_movie_id):
        """
        Recommend well-rated movies whose genre string equals the target's.

        The comparison is an exact match on the whole ``genres`` value, so
        the target movie itself is part of the candidate set. Returns the
        distinct candidate movie ids that received at least one rating
        strictly above ``rating_threshold``, in order of first appearance in
        the ratings table. Prints "Movie not found." and returns an empty
        list when the movie id is unknown.
        """
        catalog = self.movies_df
        match = catalog.loc[catalog['movie_id'] == target_movie_id]
        if match.empty:
            print("Movie not found.")
            return []

        wanted_genres = match['genres'].values[0]
        candidate_ids = catalog.loc[catalog['genres'] == wanted_genres, 'movie_id'].unique()

        ratings = self.df
        is_candidate = ratings['movie_id'].isin(candidate_ids)
        liked = ratings['rating'] > self.rating_threshold

        return ratings.loc[is_candidate & liked, 'movie_id'].unique().tolist()
    
    def cold_start_recommendation(self, user_id=None, movie_id=None):
        """
        Provide recommendations for new users or movies with limited data.

        Parameters
        ----------
        user_id : int, optional
            When given, takes precedence over ``movie_id``. Returns up to ten
            movie ids ranked by how many ratings strictly above
            ``rating_threshold`` they received.
        movie_id : int, optional
            Used only when ``user_id`` is not given. Returns the ids of the
            users (present in ``users_df``) who rated this movie.

        Returns
        -------
        list
            Movie ids (user mode) or user ids (movie mode). An empty list is
            returned, after printing a message, when the given id is unknown
            or when neither argument is supplied.
        """
        # Compare against None rather than relying on truthiness, so that an
        # id of 0 is treated as a real id and not as "argument omitted".
        if user_id is not None:
            user_details = self.users_df[self.users_df['user_id'] == user_id]
            if user_details.empty:
                print("User not found.")
                return []

            popular_movies = (self.df[self.df['rating'] > self.rating_threshold]
                              .groupby('movie_id').size().sort_values(ascending=False))
            return popular_movies.index.tolist()[:10]  # Top 10 popular movies

        if movie_id is not None:
            # Unknown movies are reported the same way the other recommenders
            # do, instead of failing on an empty lookup.
            if not (self.movies_df['movie_id'] == movie_id).any():
                print("Movie not found.")
                return []

            rater_ids = self.df.loc[self.df['movie_id'] == movie_id, 'user_id']
            similar_users = self.users_df[self.users_df['user_id'].isin(rater_ids)]
            return similar_users['user_id'].unique().tolist()

        print("Specify either user_id or movie_id for recommendations.")
        return []

    def evaluate_by_demographics_and_genre(self, X, Y, demographics=True, genre=True):
        """
        Evaluate model performance by demographics and genre.

        Row subsets are selected with masks computed from the ratings table
        ``self.df``, so ``X`` and ``Y`` must hold exactly one row per row of
        ``self.df``, in the same order. The output of ``prepare_movie_data``
        has one row per *user* who rated the target movie and therefore does
        not satisfy this requirement.

        Parameters
        ----------
        X : array-like
            Feature rows aligned with ``self.df``.
        Y : array-like
            Labels aligned with ``self.df``.
        demographics : bool, default=True
            Evaluate separately for each distinct ``age`` value in ``users_df``.
        genre : bool, default=True
            Evaluate separately for each distinct ``genres`` value in
            ``movies_df`` (exact match on the whole genre string).

        Returns
        -------
        dict
            ``{'age': {age: metrics}, 'genres': {genres: metrics}}`` with each
            top-level key present only when the matching flag is set. Groups
            that select no rating rows are omitted.

        Raises
        ------
        ValueError
            If ``X`` or ``Y`` does not have one row per row of ``self.df``.
        """
        n_ratings = len(self.df)
        if len(X) != n_ratings or len(Y) != n_ratings:
            raise ValueError(
                "X and Y must have one row per row of the ratings table "
                f"({n_ratings}); got {len(X)} and {len(Y)}"
            )

        metrics = {}

        if demographics:
            metrics['age'] = {}
            for age in self.users_df['age'].unique():
                user_ids = self.users_df[self.users_df['age'] == age]['user_id']
                # Plain boolean array -> positional selection on X and Y.
                mask = self.df['user_id'].isin(user_ids).to_numpy()
                if not mask.any():
                    # No ratings from this age group: nothing to split or fit.
                    continue
                metrics['age'][age] = self.train_and_evaluate(X[mask], Y[mask])

        if genre:
            metrics['genres'] = {}
            # Loop variable deliberately differs from the `genre` flag.
            for genre_label in self.movies_df['genres'].unique():
                movie_ids = self.movies_df[self.movies_df['genres'] == genre_label]['movie_id']
                mask = self.df['movie_id'].isin(movie_ids).to_numpy()
                if not mask.any():
                    # No rated movie carries this genre string.
                    continue
                metrics['genres'][genre_label] = self.train_and_evaluate(X[mask], Y[mask])

        return metrics


# 5. Add Model Training, Hyperparameter Tuning, and Evaluation Methods


The target movie is ID 2858, and we treat the ratings of all other movies as features. We only use rows
that have a rating for the target movie, so that we can validate how good the prediction is. We construct the dataset accordingly, as follows:

In [194]:
# Build the recommender from the three MovieLens-1M tables
recommender = MovieRecommender(
    ratings_path='ml-1m/ratings.dat',
    movies_path='ml-1m/movies.dat',
    users_path='ml-1m/users.dat',
)

# Features = ratings of every other movie; label = liked / not liked the target
target_movie_id = 2858
X, Y = recommender.prepare_movie_data(target_movie_id=target_movie_id)

# Cross-validated search; the result dict carries 'best_params' and 'best_auc'
best_params = recommender.tune_hyperparameters(X=X, Y=Y)
print(f"Best parameters: {best_params}")

# Fit once more with the winning settings and report held-out metrics
metrics = recommender.train_and_evaluate(X, Y, **best_params['best_params'])
print(f"Model performance:\n{metrics}")


Best parameters: {'best_params': {'alpha': 6, 'fit_prior': True}, 'best_auc': 0.654779124770733}
Model performance:
{'accuracy': 0.7551020408163265, 'precision': 0.889943074003795, 'recall': 0.8100172711571675, 'f1': 0.8481012658227848, 'auc': 0.6806288638161186, 'confusion_matrix': array([[ 49,  58],
       [110, 469]], dtype=int64), 'classification_report': '              precision    recall  f1-score   support\n\n           0       0.31      0.46      0.37       107\n           1       0.89      0.81      0.85       579\n\n    accuracy                           0.76       686\n   macro avg       0.60      0.63      0.61       686\nweighted avg       0.80      0.76      0.77       686\n'}
