In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import requests
import gradio as gr
import pandas as pd
import numpy as np
from collections import Counter
import time


In [4]:
def get_movie_id(movie_name):
    """Look up a movie on TMDB by name and return its ID and release date.

    The name is sent to the ``/search/movie`` endpoint. A result whose title
    equals the query (ignoring case and surrounding whitespace) is preferred;
    if there is none, the first search result is used.

    Args:
        movie_name: Title text to search for.

    Returns:
        A ``(movie_id, release_date)`` tuple, where ``release_date`` is the
        string from the chosen result or ``""`` when it has none.
        ``(None, None)`` when the name is blank, the search has no results,
        or anything fails (the error is printed, not raised).
    """
    query = str(movie_name or "").strip()
    if not query:
        # Nothing to search for; skip the API round-trip.
        return None, None

    url = f"{BASE_URL}/search/movie"
    params = {"api_key": API_KEY, "query": query}
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        results = response.json().get("results") or []
        if not results:
            return None, None

        wanted = query.casefold()
        # `or ""` guards against a null title: without it one bad entry would
        # raise and the whole lookup would be reported as "not found".
        best = next(
            (
                result
                for result in results
                if (result.get("title") or "").strip().casefold() == wanted
            ),
            results[0],  # no exact match: fall back to the top-ranked result
        )
        return best["id"], best.get("release_date") or ""
    except Exception as e:
        print(f"Error getting movie ID: {e}")
        return None, None

In [5]:
def get_movie_details(movie_id):
    """Fetch a movie's TMDB record and flatten it into a plain dict.

    NOTE(review): a later cell defines ``get_movie_details`` again with a
    different set of keys; when the notebook is run top to bottom that later
    definition replaces this one.

    One request to ``/movie/{movie_id}`` also pulls keywords, credits and
    TMDB's "similar" list via ``append_to_response``.

    Args:
        movie_id: TMDB movie identifier.

    Returns:
        dict with keys ``id``, ``title``, ``overview``, ``genres``,
        ``genre_ids``, ``release_date``, ``vote_average``, ``vote_count``,
        ``keywords``, ``director`` (first crew member whose job is
        "Director", else ``""``), ``cast`` (up to 5 names) and
        ``similar_movies`` (up to 5 ``{id, title, vote_average}`` dicts).
        ``None`` if anything fails; the error is printed.
    """
    endpoint = f"{BASE_URL}/movie/{movie_id}"
    query = {"api_key": API_KEY, "append_to_response": "keywords,credits,similar"}
    try:
        reply = requests.get(endpoint, params=query, timeout=10)
        reply.raise_for_status()
        payload = reply.json()

        return {
            "id": movie_id,
            "title": payload.get("title", ""),
            "overview": payload.get("overview", ""),
            "genres": [entry["name"] for entry in payload.get("genres", [])],
            "genre_ids": [entry["id"] for entry in payload.get("genres", [])],
            "release_date": payload.get("release_date", ""),
            "vote_average": payload.get("vote_average", 0),
            "vote_count": payload.get("vote_count", 0),
            "keywords": [
                entry["name"]
                for entry in payload.get("keywords", {}).get("keywords", [])
            ],
            # First crew entry credited as Director wins; "" when there is none.
            "director": next(
                (
                    person.get("name", "")
                    for person in payload.get("credits", {}).get("crew", [])
                    if person.get("job") == "Director"
                ),
                "",
            ),
            # Billing order is kept; only the first five names are used.
            "cast": [
                actor.get("name", "")
                for actor in payload.get("credits", {}).get("cast", [])[:5]
            ],
            "similar_movies": [
                {
                    "id": related.get("id"),
                    "title": related.get("title"),
                    "vote_average": related.get("vote_average", 0),
                }
                for related in payload.get("similar", {}).get("results", [])[:5]
            ],
        }
    except Exception as err:
        print(f"Error getting movie details: {err}")
        return None

In [6]:
def get_movie_details(movie_id):
    """Fetch a movie's TMDB record and flatten it into a plain dict.

    One request to ``/movie/{movie_id}`` also pulls keywords, credits and the
    "similar" list via ``append_to_response`` (the similar list is requested
    but not copied into the result).

    Args:
        movie_id: TMDB movie identifier.

    Returns:
        dict with keys ``id``, ``title``, ``overview``, ``genres``,
        ``genre_ids``, ``release_date``, ``vote_average``, ``vote_count``,
        ``keywords``, ``director`` (first crew member whose job is
        "Director", else ``""``), ``cast`` (up to 3 names),
        ``production_companies`` (up to 2 names), ``runtime`` and
        ``poster_path`` (``None`` when the record has no poster).
        ``None`` if anything fails; the error is printed.
    """
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {"api_key": API_KEY, "append_to_response": "keywords,credits,similar"}
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        # Extract relevant information
        details = {
            "id": movie_id,
            "title": data.get("title", ""),
            "overview": data.get("overview", ""),
            "genres": [genre["name"] for genre in data.get("genres", [])],
            "genre_ids": [genre["id"] for genre in data.get("genres", [])],
            "release_date": data.get("release_date", ""),
            "vote_average": data.get("vote_average", 0),
            "vote_count": data.get("vote_count", 0),
            "keywords": [kw["name"] for kw in data.get("keywords", {}).get("keywords", [])],
            "director": "",
            "cast": [],
            "production_companies": [company["name"] for company in data.get("production_companies", [])[:2]],
            "runtime": data.get("runtime", 0),
            # recommend_movies() builds the poster URL from this key; without
            # it every recommendation is rendered with no poster.
            "poster_path": data.get("poster_path"),
        }

        # Get director: the first crew entry credited as Director wins.
        for crew_member in data.get("credits", {}).get("crew", []):
            if crew_member.get("job") == "Director":
                details["director"] = crew_member.get("name", "")
                break

        # Get top 3 cast members (billing order as returned by the API).
        for cast_member in data.get("credits", {}).get("cast", [])[:3]:
            details["cast"].append(cast_member.get("name", ""))

        return details
    except Exception as e:
        print(f"Error getting movie details: {e}")
        return None

In [7]:
def create_powerful_soup(movie_details):
    """Flatten a movie's metadata into one space-separated, weighted string.

    Weighting is done by repetition, so repeated terms count more once the
    text is vectorised:

    * genres   - the whole genre list repeated 3 times
    * director - repeated 3 times
    * cast     - each of the first 3 names repeated twice
    * keywords - once each
    * overview - appended once, last

    Args:
        movie_details: dict that may contain ``genres``, ``director``,
            ``cast``, ``keywords`` and ``overview``; missing or empty fields
            contribute nothing.

    Returns:
        The joined text (``""`` when no field has content).
    """
    genres = movie_details.get("genres")
    director = movie_details.get("director", "")
    lead_actors = movie_details.get("cast", [])[:3]
    keywords = movie_details.get("keywords", [])
    overview = movie_details.get("overview", "")

    tokens = []
    if genres:
        tokens += genres * 3
    if director:
        tokens += [director] * 3
    # Each actor appears twice, back to back, in billing order.
    tokens += [name for name in lead_actors for _ in range(2)]
    tokens += keywords
    if overview:
        tokens.append(overview)

    return " ".join(tokens)

In [8]:
def get_content_based_recommendations(movie_details, num_recommendations=10,
                                      max_candidates=15, year_window=8):
    """Rank TMDB "discover" candidates by text similarity to the given movie.

    Steps:
      1. Build a weighted text soup for the query movie.
      2. Ask ``/discover/movie`` for popular films sharing the query's genres
         and released within ``year_window`` years of it.
      3. Fetch full details for up to ``max_candidates`` of them (one API call
         each) and build their soups.
      4. Vectorise all soups with TF-IDF and sort candidates by cosine
         similarity to the query, highest first.

    Args:
        movie_details: Details dict for the query movie (as produced by
            ``get_movie_details``). Falsy input yields ``[]``.
        num_recommendations: Maximum number of movies to return.
        max_candidates: How many discover results to consider; each costs one
            extra details request.
        year_window: Half-width, in years, of the release-date filter.

    Returns:
        List of candidate details dicts, best match first. Each carries a
        ``poster_path`` key (copied from the discover result when the details
        dict has none). ``[]`` when there are no candidates or a request or
        vectorisation step fails (the error is printed).
    """
    if not movie_details:
        return []

    # 1. Create a POWERFUL soup for the input movie
    query_soup = create_powerful_soup(movie_details)
    query_id = movie_details.get("id")

    # 2. Get a larger, better candidate pool from TMDB
    try:
        release_year = int((movie_details.get("release_date") or "")[:4])
    except (TypeError, ValueError):
        # Missing or malformed date: fall back to the same default year the
        # missing-date case has always used instead of raising.
        release_year = 2020

    url = f"{BASE_URL}/discover/movie"
    params = {
        "api_key": API_KEY,
        # Use genres to get a relevant pool
        "with_genres": ",".join(map(str, movie_details.get("genre_ids", []))),
        "sort_by": "popularity.desc",  # Get popular movies first
        "vote_count.gte": 50,  # Ensure some popularity
        "page": 1,
        "primary_release_date.gte": f"{release_year - year_window}-01-01",
        "primary_release_date.lte": f"{release_year + year_window}-12-31",
    }

    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        # 3. We need FULL details for each candidate to build their soup.
        candidates = []
        for movie in data.get("results", [])[:max_candidates]:
            candidate_id = movie.get("id")
            if candidate_id is None or candidate_id == query_id:
                continue
            candidate_details = get_movie_details(candidate_id)
            if not candidate_details:
                continue
            # The discover result already knows the poster; keep it so the
            # caller can render one even if the details dict omits it.
            if not candidate_details.get("poster_path"):
                candidate_details["poster_path"] = movie.get("poster_path")
            candidates.append((candidate_details, create_powerful_soup(candidate_details)))

        if not candidates:
            return []

        # 4. Calculate similarity using TF-IDF: query first, then candidates.
        all_soups = [query_soup] + [soup for _, soup in candidates]
        vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
        tfidf_matrix = vectorizer.fit_transform(all_soups)

        # Compare the query (first row) to all candidates (the rest)
        scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

        # Stable sort: ties keep the discover (popularity) order.
        ranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
        return [details for (details, _soup), _score in ranked[:num_recommendations]]

    except Exception as e:
        print(f"Error in content-based recommendations: {e}")

    return []

In [9]:
def get_movies_by_director(director_name, original_movie_id, num_results=5):
    """Return other films directed by the first person matching a name.

    The name is resolved through ``/search/person`` (first hit wins), then that
    person's ``movie_credits`` crew list is filtered to entries with job
    "Director", excluding ``original_movie_id`` and anything with 100 votes or
    fewer.

    Args:
        director_name: Name to search for; falsy input yields ``[]``.
        original_movie_id: Movie ID to leave out of the results.
        num_results: Maximum number of movies to return.

    Returns:
        Credit dicts sorted by ``vote_count`` descending, at most
        ``num_results`` long. ``[]`` when nothing matches or a request fails
        (the error is printed).
    """
    if not director_name:
        return []

    search_url = f"{BASE_URL}/search/person"
    search_params = {"api_key": API_KEY, "query": director_name}

    try:
        search_reply = requests.get(search_url, params=search_params, timeout=10)
        search_reply.raise_for_status()
        people = search_reply.json()["results"]

        if people:
            person_id = people[0]["id"]
            credits_reply = requests.get(
                f"{BASE_URL}/person/{person_id}/movie_credits",
                params={"api_key": API_KEY},
                timeout=10,
            )
            credits_reply.raise_for_status()

            directed = [
                credit
                for credit in credits_reply.json().get("crew", [])
                if credit.get("job") == "Director"
                and credit.get("id") != original_movie_id
                and credit.get("vote_count", 0) > 100
            ]
            # Vote count stands in for popularity here.
            by_votes = sorted(
                directed,
                key=lambda credit: credit.get("vote_count", 0),
                reverse=True,
            )
            return by_votes[:num_results]
    except Exception as err:
        print(f"Error getting director movies: {err}")

    return []


In [10]:
def get_movies_by_cast(cast_members, original_movie_id, num_results=3):
    """Return popular films featuring the first two listed cast members.

    Each name is resolved through ``/search/person`` (first hit wins) and that
    person's ``movie_credits`` cast list is collected, skipping
    ``original_movie_id`` and anything with 100 votes or fewer. A film shared
    by both actors is returned once (the first credit seen is kept).

    Args:
        cast_members: Sequence of actor names; only the first two are used.
            Falsy input yields ``[]``.
        original_movie_id: Movie ID to leave out of the results.
        num_results: Maximum number of movies to return.

    Returns:
        Credit dicts sorted by ``vote_count`` descending, at most
        ``num_results`` long. A failure for one actor is printed and that
        actor is skipped; the other actor's films are still returned.
    """
    if not cast_members:
        return []

    cast_movies = []
    # De-duplicate by movie ID. Comparing whole credit dicts does not work:
    # the same film carries a different character/credit entry per actor.
    seen_ids = {original_movie_id}

    for actor in cast_members[:2]:  # Limit to top 2 cast members
        url = f"{BASE_URL}/search/person"
        params = {"api_key": API_KEY, "query": actor}

        try:
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            people = response.json().get("results") or []
            if not people:
                continue

            actor_id = people[0]["id"]
            url = f"{BASE_URL}/person/{actor_id}/movie_credits"
            response = requests.get(url, params={"api_key": API_KEY}, timeout=10)
            response.raise_for_status()
            movies_data = response.json()

            for movie in movies_data.get("cast", []):
                movie_id = movie.get("id")
                if movie_id is None or movie_id in seen_ids:
                    continue
                if (movie.get("vote_count") or 0) <= 100:
                    continue
                seen_ids.add(movie_id)
                cast_movies.append(movie)

        except Exception as e:
            print(f"Error getting cast movies for {actor}: {e}")

    # Sort by vote count and take top results
    cast_movies.sort(key=lambda x: x.get("vote_count") or 0, reverse=True)
    return cast_movies[:num_results]


In [11]:

def get_movies_by_genres(genre_ids, release_year, original_movie_id, num_results=10):
    """Return popular movies that share the given genres, near a release year.

    Queries ``/discover/movie`` sorted by popularity, requiring at least 100
    votes and a primary release date within five years either side of
    ``release_year``.

    Args:
        genre_ids: TMDB genre IDs, joined with commas into ``with_genres``.
            Falsy input yields ``[]``.
        release_year: Year as an int or string; a full date string such as
            ``"2010-07-16"`` is accepted (its first four characters are
            used). If no year can be parsed, the date filter is omitted
            rather than raising.
        original_movie_id: Movie ID to leave out of the results.
        num_results: Maximum number of movies to return.

    Returns:
        Discover result dicts in API order, at most ``num_results`` long.
        ``[]`` when the request fails (the error is printed).
    """
    if not genre_ids:
        return []

    url = f"{BASE_URL}/discover/movie"
    params = {
        "api_key": API_KEY,
        "with_genres": ",".join(map(str, genre_ids)),
        "sort_by": "popularity.desc",
        "vote_count.gte": 100,
        "page": 1,
    }

    try:
        year = int(str(release_year).strip()[:4])
    except (TypeError, ValueError):
        # Unknown or blank release date: search without a date window.
        year = None
    if year is not None:
        params["primary_release_date.gte"] = f"{year - 5}-01-01"
        params["primary_release_date.lte"] = f"{year + 5}-12-31"

    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        # Filter out the original movie
        results = [
            movie for movie in data.get("results", [])
            if movie.get("id") != original_movie_id
        ]
        return results[:num_results]
    except Exception as e:
        print(f"Error getting genre-based recommendations: {e}")
        return []

In [12]:
def hybrid_recommendation(movie_details, num_recommendations=10):
    """Return the top content-based matches for a movie.

    Currently a thin wrapper: it asks the content-based recommender for twice
    as many results as needed and keeps the first ``num_recommendations``.

    Args:
        movie_details: Details dict for the query movie; falsy input yields
            ``[]``.
        num_recommendations: Maximum number of movies to return.

    Returns:
        List of recommended movie dicts, best match first.
    """
    if movie_details:
        # Over-fetch, then trim to the requested size.
        ranked = get_content_based_recommendations(movie_details, num_recommendations * 2)
        return ranked[:num_recommendations]
    return []

In [13]:
def calculate_plot_similarity(movie_overview, candidate_overviews):
    """Score how similar each candidate overview is to a movie's overview.

    All texts are vectorised together with TF-IDF (English stop words
    removed) and the first row is compared with the rest by cosine
    similarity.

    Args:
        movie_overview: Plot text of the reference movie; ``None`` is treated
            as an empty string.
        candidate_overviews: Iterable of plot texts to compare against;
            ``None`` entries are treated as empty strings.

    Returns:
        2-D array of shape ``(1, len(candidate_overviews))`` holding the
        similarity scores. The array is empty (``(1, 0)``) when there are no
        candidates and all zeros when no usable vocabulary can be built.
    """
    candidates = [text or "" for text in (candidate_overviews or [])]
    if not candidates:
        # Nothing to compare against; the pairwise call would raise on an
        # empty right-hand side.
        return np.zeros((1, 0))

    documents = [movie_overview or ""] + candidates
    try:
        tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(documents)
    except ValueError:
        # Raised when every document is empty or consists only of stop words
        # ("empty vocabulary"): no shared terms, so every similarity is 0.
        return np.zeros((1, len(candidates)))

    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])  # Similarity scores

In [14]:
def _render_recommendations_html(source_title, movie_rows):
    """Build the HTML card list (poster on the left, details on the right)."""
    import html  # local import keeps this cell self-contained

    esc = html.escape  # API text is not trusted: escape before embedding

    parts = [
        "<h2 style='color: #2c3e50; margin-bottom: 20px;'>"
        f"Movies similar to '{esc(str(source_title))}':</h2>",
        "<div style='display: flex; flex-direction: column; gap: 20px;'>",
    ]

    for row in movie_rows:
        parts.append("<div style='display: flex; gap: 20px; padding: 20px; background: white; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1);'>")

        # LEFT SIDE: Poster image (omitted when the movie has none)
        if row["Poster"]:
            parts.append(f"""
            <div style='flex-shrink: 0;'>
                <img src='{esc(str(row['Poster']))}'
                     style='width: 120px; height: 180px; object-fit: cover; border-radius: 8px; box-shadow: 0 4px 8px rgba(0,0,0,0.2);'
                     alt='Movie poster'>
            </div>
            """)

        # RIGHT SIDE: Movie details
        parts.append("""
        <div style='flex: 1;'>
            <h3 style='margin: 0 0 8px 0; color: #2c3e50; font-size: 1.4em;'>{}</h3>
            <p style='margin: 0 0 12px 0; color: #e74c3c; font-weight: bold; font-size: 1.1em;'>
                ⭐ Rating: {}/10
            </p>
            <p style='margin: 0; color: #555; line-height: 1.5; font-size: 0.95em;'>{}</p>
        </div>
        """.format(
            esc(f"{row['Title']} ({row['Year']})"),
            esc(str(row["Rating"])),
            esc(str(row["Overview"])),
        ))

        parts.append("</div>")

    parts.append("</div>")
    return "".join(parts)


def recommend_movies(movie_name):
    """Main function to get movie recommendations.

    Resolves the name to a TMDB movie, fetches its details, asks
    ``hybrid_recommendation`` for up to 10 similar movies and formats them.

    Args:
        movie_name: Title typed by the user. Surrounding whitespace is
            ignored; ``None`` or a blank string is rejected.

    Returns:
        ``(html, dataframe)`` on success: an HTML string of result cards and a
        DataFrame with columns ``Title``, ``Year``, ``Rating``, ``Overview``
        and ``Poster``. On any failure, ``(message, None)`` where ``message``
        is a plain-text explanation. Text taken from the API is HTML-escaped
        in the cards; the DataFrame holds the raw values.
    """
    if not movie_name or not movie_name.strip():
        return "Please enter a movie name.", None
    movie_name = movie_name.strip()

    print(f"Searching for '{movie_name}'...")

    movie_id, _release_date = get_movie_id(movie_name)
    if not movie_id:
        print(f"Movie '{movie_name}' not found.")
        return f"Movie '{movie_name}' not found.", None

    print(f"Found '{movie_name}'. Getting details...")
    movie_details = get_movie_details(movie_id)
    if not movie_details:
        print(f"Could not get details for '{movie_name}'.")
        return f"Could not get details for '{movie_name}'.", None

    print("Finding similar movies based on content...")
    recommendations = hybrid_recommendation(movie_details, 10)

    if not recommendations:
        print("No similar movies found.")
        return "No similar movies found.", None

    # Format the recommendations
    movie_data = []
    for movie in recommendations:
        poster_path = movie.get('poster_path')
        movie_data.append({
            "Title": movie.get('title') or 'Unknown Title',
            "Year": (movie.get('release_date') or '')[:4] or 'Unknown',
            "Rating": movie.get('vote_average', 0),
            # `or` (not a .get default) so an empty overview gets the fallback
            "Overview": movie.get('overview') or 'No description available.',
            "Poster": f"{IMAGE_BASE_URL}{poster_path}" if poster_path else None,
        })

    # Create a DataFrame for display
    df = pd.DataFrame(movie_data)

    # Render from the plain records rather than df.iterrows(): same data,
    # without per-row Series construction and dtype coercion.
    html_content = _render_recommendations_html(movie_details.get('title', ''), movie_data)

    return html_content, df

In [15]:
# Create Gradio interface: one text input, a submit button, and two tabs that
# show the same recommendations as HTML cards and as a table.
with gr.Blocks(title="Movie Recommendation System", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎬 Movie Recommendation System")
    # Describes what the pipeline actually does: TF-IDF similarity over a
    # metadata "soup", not TMDb's own similar-movies list.
    gr.Markdown(
        "Enter a movie name to get recommendations for similar movies, ranked by "
        "content similarity over genres, director, cast, keywords, and plot overview "
        "(movie data from TMDb)."
    )

    with gr.Row():
        movie_input = gr.Textbox(
            label="Movie Name",
            placeholder="Enter a movie name...",
            scale=4
        )
        submit_btn = gr.Button("Get Recommendations", variant="primary")

    with gr.Tab("Visual Display"):
        html_output = gr.HTML()

    with gr.Tab("Table View"):
        table_output = gr.Dataframe(
            headers=["Title", "Year", "Rating", "Overview", "Poster"],
            datatype=["str", "str", "number", "str", "str"],
            interactive=False,
            wrap=True,
            column_widths=["20%", "10%", "10%", "40%", "20%"]
        )

    def update_all_outputs(movie_name):
        """Run one recommendation and fan the result out to both tabs.

        Returns ``(dataframe, html)`` - the reverse of what
        ``recommend_movies`` returns - to match the ``outputs`` order below.
        """
        # Distinct local names: do not shadow the `html_output` component.
        html_markup, results_df = recommend_movies(movie_name)
        return results_df, html_markup

    # Same handler for the button and for pressing Enter in the textbox.
    # Note the order: first return value goes to the first output component.
    submit_btn.click(
        fn=update_all_outputs,
        inputs=movie_input,
        outputs=[table_output, html_output]
    )
    movie_input.submit(
        fn=update_all_outputs,
        inputs=movie_input,
        outputs=[table_output, html_output]
    )

In [16]:
# Start the Gradio server for the `demo` app built above. The guard keeps an
# import of this code from launching the UI; when the cell is executed in the
# notebook the guard passes and the app starts (see the launch output below).
if __name__ == "__main__":
    demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://80c5c12ee2b7cc13bc.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
