In [None]:
%pip install -q -r requirements.txt

In [2]:
import psycopg2
import pandas as pd
import random
from datetime import datetime
import os

class PostgresPipeline:
    """
    Thin wrapper around a psycopg2 connection and one shared cursor.

    The connection is opened on construction, in autocommit mode. If the
    connection attempt fails, the error is printed and both ``conn`` and
    ``cur`` are left as ``None``.

    Attributes:
        db_config: Keyword arguments forwarded to ``psycopg2.connect``.
        conn: The current connection, or ``None`` if connecting failed.
        cur: A cursor on ``conn``, or ``None`` if connecting failed.
    """

    def __init__(self, db_config):
        self.db_config = db_config
        self.conn = None
        self.cur = None
        self.connect()

    def connect(self):
        """
        Open (or re-open) the connection and create a fresh cursor.

        Connection errors are reported with ``print`` rather than raised;
        afterwards ``conn`` and ``cur`` are both ``None``.
        """
        # Reconnecting must not leak a connection that is still open.
        previous = self.conn
        if previous is not None and not previous.closed:
            previous.close()

        # Drop old handles up front so a failed attempt can never leave a
        # closed cursor behind for later code to trip over.
        self.conn = None
        self.cur = None

        try:
            self.conn = psycopg2.connect(**self.db_config)
            self.conn.set_session(autocommit=True)
            self.cur = self.conn.cursor()
            print("Database connection established.")
        except psycopg2.Error as e:
            print(f"Error: Could not connect to the database. {e}")
            if self.conn is not None:
                # The connection opened but session setup or cursor creation
                # failed: release it (best effort) instead of leaking it.
                try:
                    self.conn.close()
                except psycopg2.Error:
                    pass
            self.conn = None
            self.cur = None

    def close(self):
        """
        Close the cursor and the connection, if they exist.

        The ``conn`` and ``cur`` attributes keep referring to the (now closed)
        objects so callers can still inspect them.
        """
        try:
            if self.cur is not None:
                self.cur.close()
        finally:
            # Always release the connection, even if closing the cursor raised.
            if self.conn is not None:
                self.conn.close()
        print("Database connection closed.")

In [None]:
# Connection smoke test: connect, close, inspect the closed handles, reconnect.
if __name__ == "__main__":
    # Read the connection details from the environment (standard libpq
    # variable names) instead of hardcoding them in the notebook. Unset
    # variables fall back to the previous empty-string values.
    DB_CONFIG = {
        "host": os.environ.get("PGHOST", ""),
        "dbname": os.environ.get("PGDATABASE", ""),
        "user": os.environ.get("PGUSER", ""),
        "password": os.environ.get("PGPASSWORD", ""),
    }

    # Create instance of PostgresPipeline (connects immediately)
    pipeline = PostgresPipeline(DB_CONFIG)

    # Close the connection
    pipeline.close()

    # Check if the connection is closed
    print(pipeline.conn)

    # Check if the cursor is closed
    print(pipeline.cur)

    # Reconnect
    pipeline.connect()

    # Test the connection
    print(pipeline.conn)



Database connection established.
Database connection closed.
<connection object at 0x7f7a17e81e40; dsn: 'user=bkalejaiye password=xxx dbname=testdb host=127.0.0.1', closed: 1>
<cursor object at 0x7f7a5823cd60; closed: -1>
Database connection established.
<connection object at 0x7f7a17e82c00; dsn: 'user=bkalejaiye password=xxx dbname=testdb host=127.0.0.1', closed: 0>


In [4]:
# Fetch data with optional limit
def fetch_data(pipeline, table_name, limit=None):
    """
    Return every row of ``table_name``, optionally capped at ``limit`` rows.

    Args:
        pipeline: Object exposing an open DB-API cursor as ``pipeline.cur``.
        table_name: Table to read. It is interpolated into the SQL text
            as-is, so it must come from trusted code, never from user input.
        limit: Maximum number of rows, or ``None`` for no cap. ``0`` is a
            valid cap and yields no rows.

    Returns:
        The list of rows produced by ``cursor.fetchall()``.

    Raises:
        ValueError: If ``limit`` cannot be converted to an integer.
        TypeError: If ``limit`` is of a type ``int()`` does not accept.
    """
    query = f"SELECT * FROM {table_name}"
    # Compare against None so that limit=0 is honoured instead of being
    # mistaken for "no limit"; int() keeps arbitrary text out of the SQL.
    if limit is not None:
        query += f" LIMIT {int(limit)}"
    pipeline.cur.execute(query)
    return pipeline.cur.fetchall()

def fetch_movies(pipeline, limit=None):
    """Fetch rows from the ``movies`` table via ``fetch_data``, passing ``limit`` through."""
    movie_rows = fetch_data(pipeline, table_name="movies", limit=limit)
    return movie_rows

def fetch_ratings(pipeline, limit=None):
    """Fetch rows from the ``ratings`` table via ``fetch_data``, passing ``limit`` through."""
    rating_rows = fetch_data(pipeline, table_name="ratings", limit=limit)
    return rating_rows
  
def fetch_tags(pipeline, limit=None):
    """Fetch rows from the ``tags`` table via ``fetch_data``, passing ``limit`` through."""
    tag_rows = fetch_data(pipeline, table_name="tags", limit=limit)
    return tag_rows
  
def fetch_links(pipeline, limit=None):
    """Fetch rows from the ``links`` table via ``fetch_data``, passing ``limit`` through."""
    link_rows = fetch_data(pipeline, table_name="links", limit=limit)
    return link_rows


In [None]:
# Pull a five-row sample from each of the four tables.
movies = fetch_movies(pipeline, limit=5)
ratings = fetch_ratings(pipeline, limit=5)
tags = fetch_tags(pipeline, limit=5)
links = fetch_links(pipeline, limit=5)

# A single newline-separated print emits the same four lines as printing
# each sample in turn.
print(movies, ratings, tags, links, sep="\n")

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class RecommendationEngine:
    """
    Recommendation engine backed by the movies, ratings, tags and links tables.

    All four tables are loaded into pandas DataFrames once, at construction
    time, through ``pipeline.cur``. Only content-based (genre similarity)
    recommendation is implemented in this class.
    """

    # Columns selected from each table, in SELECT order.
    TABLE_COLUMNS = {
        "movies": ["movie_id", "title", "genres"],
        "ratings": ["user_id", "movie_id", "rating", "timestamp"],
        "tags": ["user_id", "movie_id", "tag", "timestamp"],
        "links": ["movie_id", "imdb_id", "tmdb_id"],
    }

    def __init__(self, pipeline):
        self.pipeline = pipeline
        self.dataframes = self.fetch_and_prepare_data()

    def fetch_and_prepare_data(self):
        """
        Fetch every table in ``TABLE_COLUMNS`` and return them as DataFrames.

        Returns:
            dict mapping table name to a DataFrame with that table's columns.
        """
        frames = {}
        for table_name, columns in self.TABLE_COLUMNS.items():
            rows, names = self.fetch_data(table_name, columns)
            frames[table_name] = pd.DataFrame(rows, columns=names)
        return frames

    def fetch_data(self, table_name, columns):
        """
        Fetch the given columns of a table.

        ``table_name`` and ``columns`` are interpolated into the SQL text, so
        they must come from trusted code.

        Returns:
            A ``(rows, columns)`` tuple: the fetched rows and the column names
            that were requested.
        """
        query = f"SELECT {', '.join(columns)} FROM {table_name}"
        self.pipeline.cur.execute(query)
        data = self.pipeline.cur.fetchall()
        return data, columns

    def get_movies(self):
        """
        Get the movies DataFrame.
        """
        return self.dataframes["movies"]

    def get_ratings(self):
        """
        Get the ratings DataFrame.
        """
        return self.dataframes["ratings"]

    def get_tags(self):
        """
        Get the tags DataFrame.
        """
        return self.dataframes["tags"]

    def get_links(self):
        """
        Get the links DataFrame.
        """
        return self.dataframes["links"]

    def content_based_recommendation(self, movie_id, top_n=10, window=500):
        """
        Recommend movies whose genres are most similar to the given movie.

        Genres are vectorized with TF-IDF and compared by cosine similarity.
        The DataFrames loaded at construction time are reused, so no
        database access happens here.

        Args:
            movie_id: ``movie_id`` of the movie to find neighbours for.
            top_n: Maximum number of titles to return.
            window: Number of rows, starting at the target movie, to use as
                the candidate pool. Pass ``None`` to compare against every
                loaded movie.

        Returns:
            A NumPy array of titles ordered from most to least similar, or a
            one-element list holding an error message when ``movie_id`` is
            not present.
        """
        movies = self.get_movies()

        # Locate the target by position so the lookup does not depend on the
        # DataFrame's index labels.
        is_target = (movies["movie_id"] == movie_id).to_numpy()
        if not is_target.any():
            return [f"Movie ID {movie_id} not found in the database."]
        target_pos = int(is_target.argmax())

        if window is None:
            candidates = movies
            anchor = target_pos
        else:
            candidates = movies.iloc[target_pos: target_pos + window]
            anchor = 0  # the target is the first row of the window

        tfidf = TfidfVectorizer(stop_words="english")
        # Missing genres become empty documents instead of crashing the vectorizer.
        tfidf_matrix = tfidf.fit_transform(candidates["genres"].fillna(""))

        # Only the target's row of the similarity matrix is needed.
        scores = cosine_similarity(tfidf_matrix[anchor:anchor + 1], tfidf_matrix)[0]

        # Rank every candidate except the target itself; sorted() is stable,
        # so ties keep their original table order.
        others = [pos for pos in range(len(scores)) if pos != anchor]
        ranked = sorted(others, key=lambda pos: scores[pos], reverse=True)
        best = ranked[:max(top_n, 0)]

        # Positions refer to the candidate pool, so titles are looked up there
        # rather than in the full movies frame.
        return candidates["title"].iloc[best].values


# Demo call, kept outside the class body so the class can finish being defined
# before it is instantiated.
if __name__ == "__main__":
    print(RecommendationEngine(pipeline).content_based_recommendation(1, 10))
    