In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
import seaborn as sns

In [54]:
class IFFARecommendationSystem:
    def __init__(self):
        self.movies_df = None
        self.ratings_df = None
        self.users_df = None
        self.content_matrix = None
        self.user_item_matrix = None
        self.svd_model = None
    def load_data( self, movies_path, ratings_path = None, users_path = None):
        '''
        load movies, ratings and user data
    
        movies_path (str): Path to movies CSV file
        ratings_path (str, optional): Path to ratings CSV file
        users_path (str, optional): Path to users CSV file
        '''
        # Load movies data (required)
        self.movies_df = pd.read_csv(movies_path)
        print(f"Loaded {len(self.movies_df)} movies")
        
        # Load ratings data if provided
        if ratings_path:
            self.ratings_df = pd.read_csv(ratings_path)
            print(f"Loaded {len(self.ratings_df)} ratings")
        
        # Load users data if provided
        if users_path:
            self.users_df = pd.read_csv(users_path)
            print(f"Loaded {len(self.users_df)} users")
        

In [56]:
def preprocess_data(self):
        """Preprocess the data for recommendation algorithms"""
        # Check if data is loaded
        if self.movies_df is None:
            raise ValueError("Data not loaded. Please call load_data first.")
        
        # Create a combined features column for content-based filtering
        # Assuming columns like 'genres', 'director', 'cast', 'keywords', etc.
        # Adjust based on your actual dataset columns
        features = []
        
        if 'genres' in self.movies_df.columns:
            features.append('genres')
        if 'director' in self.movies_df.columns:
            features.append('director')
        if 'cast' in self.movies_df.columns:
            features.append('cast')
        if 'keywords' in self.movies_df.columns:
            features.append('keywords')
        if 'tags' in self.movies_df.columns:
            features.append('tags')
        if 'description' in self.movies_df.columns:
            features.append('description')
            
        # If no suitable columns found, use title as fallback
        if not features and 'title' in self.movies_df.columns:
            features.append('title')
            
        if not features:
            raise ValueError("No suitable features found for content-based filtering")
            
        # Create combined features
        self.movies_df['combined_features'] = self.movies_df[features].apply(
            lambda row: ' '.join(row.values.astype(str)), axis=1
        )
        
        print("Data preprocessing complete")

In [58]:
def build_content_based_model(self):
        """Build a content-based recommendation model using TF-IDF"""
        if 'combined_features' not in self.movies_df.columns:
            self.preprocess_data()
            
        # Create TF-IDF matrix
        tfidf = TfidfVectorizer(stop_words='english')
        self.content_matrix = tfidf.fit_transform(self.movies_df['combined_features'])
        
        print(f"Content-based model built with shape: {self.content_matrix.shape}")
        
def get_content_based_recommendations(self, movie_id, n=10):
    if self.content_matrix is None:
        self.build_content_based_model()
            
        # Find the movie index
    movie_idx = self.movies_df[self.movies_df['movie_id'] == movie_id].index
    if len(movie_idx) == 0:
        raise ValueError(f"Movie ID {movie_id} not found")
    movie_idx = movie_idx[0]
        
    # Calculate similarity scores
    similarity_scores = cosine_similarity(
        self.content_matrix[movie_idx].reshape(1, -1), 
        self.content_matrix
        ).flatten()
        
    # Get top n similar movies
    similar_indices = similarity_scores.argsort()[::-1][1:n+1]  # Exclude the movie itself
        
    # Return recommended movies
    recommendations = self.movies_df.iloc[similar_indices].copy()
    recommendations['similarity_score'] = similarity_scores[similar_indices]
        
    return recommendations

In [60]:
def build_collaborative_filtering_model(self, n_components=50):
    """
        Build a collaborative filtering model using SVD"""
    if self.ratings_df is None:
        raise ValueError("Ratings data not loaded. Cannot build collaborative filtering model.")
            
        # Create user-item matrix
    user_item_df = self.ratings_df.pivot(
        index='user_id', 
        columns='movie_id', 
        values='rating'
    ).fillna(0)
    self.user_item_matrix = user_item_df.values
    self.user_ids = user_item_df.index.tolist()
    self.movie_ids_cf = user_item_df.columns.tolist()
        
        # Apply SVD
    self.svd_model = TruncatedSVD(n_components=n_components)
    self.user_features = self.svd_model.fit_transform(self.user_item_matrix)
    self.movie_features = self.svd_model.components_.T
        
    print(f"Collaborative filtering model built with {n_components} latent factors")

In [62]:
def get_collaborative_recommendations(self, user_id, n = 10):
    """get collaborative recommendation by filtering for a user
    Parameters:
        user_id: ID of the user to get recommendations for
        n (int): Number of recommendations to return
        
        Returns:
        DataFrame: Top n recommended movies for the user
    """
    if self.svd_model is None:
        raise ValueError("Collaborative filtering model not built")

    # Find user index
    if user_id not in self.user_ids:
        raise ValueError(f"User ID {user_id} not found")
    user_idx = self.user_ids.index(user_id)
        
    # Get already rated movies
    rated_movies = set(self.ratings_df[self.ratings_df['user_id'] == user_id]['movie_id'])
    #Calculate the predicted ratings 
    user_vector = self.user_features[user_idx].reshape(1,-1)
    predicted_ratings = np.dot(user_vector, self.movie_features.T)
    #sort the movies and get the top n unrated movies 
    movie_indices = predicted_ratings.argsort()[::-1]

    #Filter out already rated movies
    unrated_indices = [i for i in movie_indices if self.movie_ids_cf[i] not in rated_movies][:n]
        
    # Get recommendation movie IDs
    recommended_movie_ids = [self.movie_ids_cf[i] for i in unrated_indices]
    recommendation_scores = predicted_ratings[unrated_indices]
        
    # Get movie details
    recommendations = self.movies_df[self.movies_df['movie_id'].isin(recommended_movie_ids)].copy()
        
    # Add predicted rating
    movie_id_to_score = {movie_id: score for movie_id, score in zip(recommended_movie_ids, recommendation_scores)}
    recommendations['predicted_rating'] = recommendations['movie_id'].map(movie_id_to_score)
        
    # Sort by predicted rating
    recommendations = recommendations.sort_values('predicted_rating', ascending=False)


        


In [64]:
def get_hybrid_recommendations(self, user_id, content_weight=0.3, n=10):
    """
    Get hybrid recommendations combining content-based and collaborative filtering
        
    Parameters:
    user_id: ID of the user to get recommendations for
    content_weight (float): Weight for content-based recommendations (0-1)
    n (int): Number of recommendations to return
        
    Returns:
    DataFrame: Top n recommended movies using the hybrid approach
    """
    # Get user's highest rated movies
    if self.ratings_df is None:
        raise ValueError("Ratings data not loaded. Cannot build hybrid recommendations.")
            
    user_ratings = self.ratings_df[self.ratings_df['user_id'] == user_id]
    if len(user_ratings) == 0:
        raise ValueError(f"No ratings found for user ID {user_id}")
            
     # Get top rated movie for content-based recommendations
    top_rated_movie = user_ratings.sort_values('rating', ascending=False).iloc[0]['movie_id']
        
    # Get content-based recommendations from user's top movie
    content_recs = self.get_content_based_recommendations(top_rated_movie, n=50)
        
    # Get collaborative filtering recommendations
    collab_recs = self.get_collaborative_recommendations(user_id, n=50)
        
    # Combine the recommendations
    # Normalize the scores
    content_recs['norm_score'] = (content_recs['similarity_score'] - content_recs['similarity_score'].min()) / \
                                     (content_recs['similarity_score'].max() - content_recs['similarity_score'].min())
        
    collab_recs['norm_score'] = (collab_recs['predicted_rating'] - collab_recs['predicted_rating'].min()) / \
                                   (collab_recs['predicted_rating'].max() - collab_recs['predicted_rating'].min())
        
    # Create a set of all recommended movies
    all_movies = set(content_recs['movie_id']).union(set(collab_recs['movie_id']))
        
    # Calculate hybrid scores
    hybrid_scores = {}
    for movie_id in all_movies:
        content_score = content_recs[content_recs['movie_id'] == movie_id]['norm_score'].values
        content_score = content_score[0] if len(content_score) > 0 else 0
            
        collab_score = collab_recs[collab_recs['movie_id'] == movie_id]['norm_score'].values
        collab_score = collab_score[0] if len(collab_score) > 0 else 0
            
        # Weighted average
        hybrid_scores[movie_id] = content_weight * content_score + (1 - content_weight) * collab_score
        
    # Sort by hybrid score
    sorted_movies = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
    top_movie_ids = [movie_id for movie_id, _ in sorted_movies[:n]]
        
    # Get movie details
    recommendations = self.movies_df[self.movies_df['movie_id'].isin(top_movie_ids)].copy()
        
    # Add hybrid score
    recommendations['hybrid_score'] = recommendations['movie_id'].map(hybrid_scores)
        
    # Sort by hybrid score
    recommendations = recommendations.sort_values('hybrid_score', ascending=False)
        
    return recommendations

In [66]:
def get_trending_recommendations(self, timeframe='month', n=10):
    """
     Get trending movies based on recent popularity
        
    Parameters:
    timeframe (str): Time frame for trending ('week', 'month', 'year')
    n (int): Number of recommendations to return
        
    Returns:
    DataFrame: Top n trending movies
    """
    if self.ratings_df is None or 'timestamp' not in self.ratings_df.columns:
        raise ValueError("Ratings data with timestamps not available")
        
    # Convert timestamp to datetime
    self.ratings_df['date'] = pd.to_datetime(self.ratings_df['timestamp'], unit='s')
        
    # Filter recent ratings based on timeframe
    now = pd.Timestamp.now()
    if timeframe == 'week':
        recent_ratings = self.ratings_df[self.ratings_df['date'] > (now - pd.Timedelta(days=7))]
    elif timeframe == 'month':
        recent_ratings = self.ratings_df[self.ratings_df['date'] > (now - pd.Timedelta(days=30))]
    elif timeframe == 'year':
        recent_ratings = self.ratings_df[self.ratings_df['date'] > (now - pd.Timedelta(days=365))]
    else:
        recent_ratings = self.ratings_df
        
    # Calculate popularity score (avg rating * number of ratings)
    popularity = recent_ratings.groupby('movie_id').agg(
        avg_rating=('rating', 'mean'),
        num_ratings=('rating', 'count')
    ).reset_index()
        
    # Calculate trending score
    popularity['trending_score'] = popularity['avg_rating'] * np.log1p(popularity['num_ratings'])
        
    # Get top trending movies
    top_trending = popularity.sort_values('trending_score', ascending=False).head(n)
        
    # Get movie details
    trending_recs = self.movies_df[self.movies_df['movie_id'].isin(top_trending['movie_id'])].copy()
        
    # Add trending score
    trending_recs = trending_recs.merge(
        top_trending[['movie_id', 'trending_score', 'avg_rating', 'num_ratings']], 
        on='movie_id'
    )
        
    return trending_recs.sort_values('trending_score', ascending=False)
    
def get_personalized_for_you(self, user_id, n=10):
    """
    Get personalized 'For You' recommendations based on user watch history
        
    Parameters:
    user_id: ID of the user to get recommendations for
    n (int): Number of recommendations to return
        
    Returns:
    DataFrame: Top n personalized recommendations
    """
    # This is a more personalized version focusing on user's specific tastes
    return self.get_hybrid_recommendations(user_id, content_weight=0.6, n=n)

In [68]:
def get_because_you_watched(self, movie_id, n=10):
    """
    Get recommendations based on a specific movie (Netflix's "Because you watched X")
        
    Parameters:
    movie_id: ID of the movie to base recommendations on
    n (int): Number of recommendations to return
        
    Returns:
    DataFrame: Top n recommendations
    """
    # This is simply content-based filtering using a specific movie
    return self.get_content_based_recommendations(movie_id, n=n)

In [70]:
def analyze_user_preferences(self, user_id):
    """
    Analyze user preferences to understand their tastes
        
    Parameters:
    user_id: ID of the user to analyze
        
    Returns:
    dict: User preference analysis
    """
    if self.ratings_df is None:
        raise ValueError("Ratings data not loaded")
            
    user_ratings = self.ratings_df[self.ratings_df['user_id'] == user_id]
    if len(user_ratings) == 0:
        raise ValueError(f"No ratings found for user ID {user_id}")
            
    # Get movies the user has rated
    rated_movies = self.movies_df[self.movies_df['movie_id'].isin(user_ratings['movie_id'])]
        
    # Merge ratings with movie details
    user_data = rated_movies.merge(user_ratings[['movie_id', 'rating']], on='movie_id')
        
    # Calculate genre preferences if genre data is available
    genre_prefs = {}
    if 'genres' in rated_movies.columns:
        # Assuming genres are stored as comma-separated strings
        all_genres = []
        for genres in rated_movies['genres']:
            all_genres.extend([g.strip() for g in str(genres).split(',')])
            
        unique_genres = list(set(all_genres))
            
        for genre in unique_genres:
            # Calculate average rating for each genre
            genre_movies = user_data[user_data['genres'].str.contains(genre, na=False)]
            if len(genre_movies) > 0:
                avg_rating = genre_movies['rating'].mean()
                genre_prefs[genre] = {
                    'avg_rating': avg_rating,
                    'movies_count': len(genre_movies)
                }
        
        # Calculate most watched timeframes if timestamp data is available
    time_prefs = {}
    if 'timestamp' in user_ratings.columns:
        user_ratings['date'] = pd.to_datetime(user_ratings['timestamp'], unit='s')
        user_ratings['hour'] = user_ratings['date'].dt.hour
        user_ratings['day_of_week'] = user_ratings['date'].dt.day_name()
            
        # Most active hours
        hour_counts = user_ratings['hour'].value_counts()
        peak_hours = hour_counts.nlargest(3).index.tolist()
            
        # Most active days
        day_counts = user_ratings['day_of_week'].value_counts()
        peak_days = day_counts.nlargest(3).index.tolist()
            
        time_prefs = {
            'peak_hours': peak_hours,
            'peak_days': peak_days
        }
        
    # Overall stats
    overall_stats = {
        'total_ratings': len(user_ratings),
        'avg_rating': user_ratings['rating'].mean(),
        'top_rated': rated_movies.loc[user_ratings['rating'].idxmax()]['title'] 
                    if len(user_ratings) > 0 else None,
        'recently_watched': rated_movies.loc[user_ratings['timestamp'].idxmax()]['title'] 
                             if 'timestamp' in user_ratings.columns and len(user_ratings) > 0 else None
        }
        
    return {
        'overall_stats': overall_stats,
        'genre_preferences': genre_prefs,
        'time_preferences': time_prefs
    }

In [72]:
def visualize_user_preferences(self, user_id):
    """
    Visualize user preferences
        
    Parameters:
    user_id: ID of the user to visualize preferences for
    """
    if self.ratings_df is None:
        raise ValueError("Ratings data not loaded")
            
    user_ratings = self.ratings_df[self.ratings_df['user_id'] == user_id]
    if len(user_ratings) == 0:
        raise ValueError(f"No ratings found for user ID {user_id}")
            
     # Get movies the user has rated
    rated_movies = self.movies_df[self.movies_df['movie_id'].isin(user_ratings['movie_id'])]
        
    # Merge ratings with movie details
    user_data = rated_movies.merge(user_ratings[['movie_id', 'rating']], on='movie_id')
        
    plt.figure(figsize=(15, 10))
        
    # Rating distribution
    plt.subplot(2, 2, 1)
    sns.histplot(user_data['rating'], bins=10, kde=True)
    plt.title(f'Rating Distribution for User {user_id}')
    plt.xlabel('Rating')
    plt.ylabel('Count')
        
    # Genre preferences if available
    if 'genres' in rated_movies.columns:
        plt.subplot(2, 2, 2)
            
        # Process genres
        genre_data = []
        for idx, row in user_data.iterrows():
            genres = str(row['genres']).split(',')
            for genre in genres:
                genre = genre.strip()
                if genre:
                    genre_data.append({
                        'genre': genre,
                        'rating': row['rating']
                    })
            
        genre_df = pd.DataFrame(genre_data)
        genre_avg = genre_df.groupby('genre')['rating'].mean().sort_values(ascending=False)
            
        # Plot top genres
        top_genres = genre_avg.head(10)
        sns.barplot(x=top_genres.values, y=top_genres.index)
        plt.title(f'Top Genres by Average Rating for User {user_id}')
        plt.xlabel('Average Rating')
    if 'timestamp' in user_ratings.columns:
        plt.subplot(2, 2, 3)
        user_ratings['date'] = pd.to_datetime(user_ratings['timestamp'], unit='s')
        user_ratings = user_ratings.sort_values('date')
        sns.lineplot(x=user_ratings['date'], y=user_ratings['rating'])
        plt.title(f'Rating Trend Over Time for User {user_id}')
        plt.xlabel('Date')
        plt.ylabel('Rating')
            
        # Watch activity heatmap by hour and day
        plt.subplot(2, 2, 4)
        user_ratings['hour'] = user_ratings['date'].dt.hour
        user_ratings['day'] = user_ratings['date'].dt.dayofweek
            
        watch_pivot = pd.pivot_table(
            user_ratings,
            values='movie_id',
            index='day',
            columns='hour',
            aggfunc='count',
            fill_value=0
        )
        sns.heatmap(watch_pivot, cmap='YlGnBu')
        plt.title(f'Watch Activity Heatmap for User {user_id}')
        plt.xlabel('Hour of Day')
        plt.ylabel('Day of Week (0=Monday)')
        
    plt.tight_layout()
    plt.show()

In [82]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import json
import re
import datetime
import random

class IFFADataScraper:
    def __init__(self, base_url="https://www.iffa.com.au"):
        self.base_url = base_url
        self.movies_data = []

    def scrape_movies(self):
        """Scrape movie data from IFFA Australia website"""
        try:
            # Send request to the main page
            response = requests.get(self.base_url)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Find movie links - this is a placeholder, you'll need to inspect the actual website
            movie_links = soup.select('a.movie-link')  # Adjust selector based on actual HTML structure
            
            for link in movie_links:
                movie_url = link.get('href')
                if not movie_url.startswith('http'):
                    movie_url = self.base_url + movie_url
                
                # Scrape individual movie page
                self._scrape_movie_page(movie_url)
            
            # Convert to DataFrame
            movies_df = pd.DataFrame(self.movies_data)
            return movies_df
            
        except requests.exceptions.RequestException as e:
            print(f"Error scraping IFFA website: {e}")
            return pd.DataFrame()

    def _scrape_movie_page(self, url):
        """Scrape data from an individual movie page"""
        try:
            response = requests.get(url)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract movie details - adjust selectors based on actual HTML structure
            movie_data = {
                'movie_id': self._generate_movie_id(url),
                'title': self._extract_text(soup, 'h1.movie-title'),
                'genres': self._extract_text(soup, 'div.genres'),
                'director': self._extract_text(soup, 'div.director'),
                'cast': self._extract_text(soup, 'div.cast'),
                'description': self._extract_text(soup, 'div.description'),
                'release_year': self._extract_text(soup, 'div.year'),
                'duration': self._extract_text(soup, 'div.duration'),
                'url': url,
                'image_url': self._extract_attribute(soup, 'img.movie-poster', 'src')
            }
            
            self.movies_data.append(movie_data)
            
        except requests.exceptions.RequestException as e:
            print(f"Error scraping movie page {url}: {e}")

    def _extract_text(self, soup, selector):
        """Extract text from a BeautifulSoup element"""
        element = soup.select_one(selector)
        return element.text.strip() if element else ''

    def _extract_attribute(self, soup, selector, attribute):
        """Extract an attribute from a BeautifulSoup element"""
        element = soup.select_one(selector)
        return element.get(attribute, '') if element else ''

    def _generate_movie_id(self, url):
        """Generate a unique movie ID from URL"""
        # Extract ID from URL if possible
        match = re.search(r'movie[/=](\d+)', url)
        if match:
            return int(match.group(1))
        
        # Otherwise hash the URL
        return abs(hash(url)) % 10000000

    def generate_synthetic_data(self, num_movies=100, num_users=500, num_ratings=5000):
        """
        Generate synthetic data for testing when web scraping isn't possible
        
        Parameters:
        num_movies (int): Number of movies to generate
        num_users (int): Number of users to generate
        num_ratings (int): Number of ratings to generate
        
        Returns:
        tuple: (movies_df, users_df, ratings_df)
        """
        print(f"Generating synthetic data with {num_movies} movies, {num_users} users, and {num_ratings} ratings")
        
        # Define movie genres for IFFA (film festival context)
        genres = [
            "Drama", "Comedy", "Documentary", "Short Film", "Animation", 
            "Thriller", "Horror", "Experimental", "Romance", "Action",
            "Science Fiction", "Fantasy", "Adventure", "Mystery", "Historical"
        ]
        
        # Define directors and cast members (placeholder names)
        directors = [
            "Jane Smith", "John Doe", "Emma Johnson", "Michael Brown", "Sarah Davis",
            "David Wilson", "Lisa Anderson", "Robert Taylor", "Jennifer Martinez", "James Thompson"
        ]
        
        cast_members = [
            "Emma Stone", "Chris Hemsworth", "Cate Blanchett", "Hugh Jackman", "Nicole Kidman",
            "Sam Worthington", "Margot Robbie", "Russell Crowe", "Naomi Watts", "Eric Bana",
            "Toni Collette", "Joel Edgerton", "Rose Byrne", "Guy Pearce", "Isla Fisher",
            "Jason Clarke", "Rebel Wilson", "Ben Mendelsohn", "Mia Wasikowska", "Simon Baker"
        ]
        
        # Generate movie data
        movies_data = []
        for i in range(1, num_movies + 1):
            # Randomly select 1-3 genres
            num_genres = np.random.randint(1, 4)
            movie_genres = ", ".join(np.random.choice(genres, num_genres, replace=False))
            
            # Randomly select director
            director = np.random.choice(directors)
            
            # Randomly select 2-5 cast members
            num_cast = np.random.randint(2, 6)
            movie_cast = ", ".join(np.random.choice(cast_members, num_cast, replace=False))
            
            # Generate random year between 2010 and 2024
            year = np.random.randint(2010, 2025)
            
            # Generate a random duration between 60 and 180 minutes
            duration = np.random.randint(60, 181)
            
            # Create a movie description
            descriptions = [
                f"A compelling {movie_genres.split(',')[0].strip().lower()} film that explores the complexities of human relationships.",
                f"An award-winning {movie_genres.split(',')[0].strip().lower()} masterpiece that captivates audiences with its stunning visuals.",
                f"A thought-provoking journey through {movie_genres.split(',')[0].strip().lower()} themes that challenges conventional thinking.",
                f"A groundbreaking {movie_genres.split(',')[0].strip().lower()} narrative that showcases Australian filmmaking at its finest.",
                f"An emotionally charged {movie_genres.split(',')[0].strip().lower()} experience that resonates with audiences long after viewing."
            ]
            description = np.random.choice(descriptions)
            
            # Generate a movie title
            title_adjectives = ["Lost", "Hidden", "Eternal", "Broken", "Silent", "Wild", "Golden", "Distant", "Secret", "Forgotten"]
            title_nouns = ["Dreams", "Light", "Journey", "Sunset", "Waters", "Echo", "Whisper", "Horizon", "Shadow", "Promise"]
            
            title = f"The {np.random.choice(title_adjectives)} {np.random.choice(title_nouns)}"
            
            movies_data.append({
                'movie_id': i,
                'title': title,
                'genres': movie_genres,
                'director': director,
                'cast': movie_cast,
                'description': description,
                'release_year': year,
                'duration': duration,
                'url': f"https://iffa.com.au/movies/{i}",
                'image_url': f"https://iffa.com.au/images/movies/{i}.jpg",
                'keywords': movie_genres.replace(", ", ","),  # For content-based filtering
                'tags': movie_genres.replace(", ", ",")  # For content-based filtering
            })
        
        # Create movies DataFrame
        movies_df = pd.DataFrame(movies_data)
        
        # Generate user data
        users_data = []
        for i in range(1, num_users + 1):
            # Generate random age between 18 and 80
            age = np.random.randint(18, 81)
            
            # Generate random gender
            gender = np.random.choice(['M', 'F', 'Other'])
            
            # Generate random location
            locations = ["Sydney", "Melbourne", "Brisbane", "Perth", "Adelaide", 
                         "Canberra", "Hobart", "Darwin", "Gold Coast", "Newcastle"]
            location = np.random.choice(locations)
            
            # Generate random favorite genres (1-3)
            num_fav_genres = np.random.randint(1, 4)
            favorite_genres = ", ".join(np.random.choice(genres, num_fav_genres, replace=False))
            
            # Generate registration date
            # Random date between Jan 1, 2020 and today
            start_date = datetime.datetime(2020, 1, 1).timestamp()
            end_date = datetime.datetime.now().timestamp()
            reg_timestamp = np.random.randint(start_date, end_date)
            reg_date = datetime.datetime.fromtimestamp(reg_timestamp).strftime('%Y-%m-%d')
            
            # Generate subscription type
            sub_types = ["Free", "Basic", "Premium", "Festival Pass"]
            subscription = np.random.choice(sub_types)
            
            users_data.append({
                'user_id': i,
                'age': age,
                'gender': gender,
                'location': location,
                'favorite_genres': favorite_genres,
                'registration_date': reg_date,
                'subscription_type': subscription
            })
        
        # Create users DataFrame
        users_df = pd.DataFrame(users_data)
        
        # Generate ratings data
        ratings_data = []
        
        # Start timestamp - January 1, 2020
        start_timestamp = datetime.datetime(2020, 1, 1).timestamp()
        
        # End timestamp - current time
        end_timestamp = datetime.datetime.now().timestamp()
        
        # Generate random user-movie ratings
        for _ in range(num_ratings):
            user_id = np.random.randint(1, num_users + 1)
            movie_id = np.random.randint(1, num_movies + 1)
            
            # Generate rating between 1 and 5, with 0.5 increments
            rating = np.random.choice([1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5])
            
            # Generate random timestamp for the rating
            timestamp = np.random.randint(start_timestamp, end_timestamp)
            
            ratings_data.append({
                'user_id': user_id,
                'movie_id': movie_id,
                'rating': rating,
                'timestamp': timestamp
            })
        
        # Create ratings DataFrame
        ratings_df = pd.DataFrame(ratings_data)
        
        # Remove duplicate user-movie pairs and keep only the most recent rating
        ratings_df = ratings_df.sort_values('timestamp', ascending=False).drop_duplicates(['user_id', 'movie_id']).reset_index(drop=True)
        
        print(f"Generated {len(movies_df)} movies, {len(users_df)} users, and {len(ratings_df)} ratings")
        
        return movies_df, users_df, ratings_df
    
    def generate_sample_data_files(self, output_dir='.'):
        """
        Generate and save sample data files for testing
        
        Parameters:
        output_dir (str): Directory to save the generated files
        """
        movies_df, users_df, ratings_df = self.generate_synthetic_data()
        
        # Save to CSV files
        movies_df.to_csv(f"{output_dir}/iffa_movies.csv", index=False)
        users_df.to_csv(f"{output_dir}/iffa_users.csv", index=False)
        ratings_df.to_csv(f"{output_dir}/iffa_ratings.csv", index=False)
        
        print(f"Sample data files saved to {output_dir}")


# Example usage
if __name__ == "__main__":
    scraper = IFFADataScraper()
    scraper.generate_sample_data_files()

Generating synthetic data with 100 movies, 500 users, and 5000 ratings
Generated 100 movies, 500 users, and 4753 ratings
Sample data files saved to .


In [86]:
def demo_recommender():
    # Create recommender system
    recommender = IFFARecommendationSystem()
    
    # Load data (example paths - update with your actual data)
    recommender.load_data(
        movies_path='iffa_movies.csv',
        ratings_path='iffa_ratings.csv',
        users_path='iffa_users.csv'
    )
    
    # Build recommendation models
    recommender.build_content_based_model()
    recommender.build_collaborative_filtering_model()
    
    # Get different types of recommendations
    
    # 1. Content-based recommendations for a movie
    movie_id = 12345  # Replace with actual movie ID
    content_recs = recommender.get_content_based_recommendations(movie_id)
    print("\nContent-based recommendations:")
    print(content_recs[['title', 'similarity_score']])
    
    # 2. Collaborative filtering recommendations for a user
    user_id = 42  # Replace with actual user ID
    collab_recs = recommender.get_collaborative_recommendations(user_id)
    print("\nCollaborative filtering recommendations:")
    print(collab_recs[['title', 'predicted_rating']])
    
    # 3. Hybrid recommendations
    hybrid_recs = recommender.get_hybrid_recommendations(user_id)
    print("\nHybrid recommendations:")
    print(hybrid_recs[['title', 'hybrid_score']])
    
    # 4. Trending recommendations
    trending_recs = recommender.get_trending_recommendations(timeframe='week')
    print("\nTrending this week:")
    print(trending_recs[['title', 'trending_score', 'avg_rating', 'num_ratings']])
    
    # 5. "For You" personalized recommendations
    for_you_recs = recommender.get_personalized_for_you(user_id)
    print("\nFor You recommendations:")
    print(for_you_recs[['title', 'hybrid_score']])
    
    # 6. "Because you watched" recommendations
    watched_movie_id = 67890  # Replace with actual movie ID
    because_you_watched = recommender.get_because_you_watched(watched_movie_id)
    print("\nBecause you watched recommendations:")
    print(because_you_watched[['title', 'similarity_score']])
    
    # 7. Analyze user preferences
    user_prefs = recommender.analyze_user_preferences(user_id)
    print("\nUser preferences analysis:")
    print(user_prefs)
    
    # 8. Visualize user preferences
    recommender.visualize_user_preferences(user_id)


if __name__ == "__main__":
    demo_recommender()

Loaded 100 movies
Loaded 4753 ratings
Loaded 500 users


AttributeError: 'IFFARecommendationSystem' object has no attribute 'build_content_based_model'