In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("Movies.csv")

# Keep only the required columns. The explicit .copy() makes `df` an
# independent frame, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning (assignment on a frame derived from another frame).
df = df[["movie_id", "title", "genres", "vote_average"]].copy()

# Normalize genre strings to lower case (missing genres stay NaN here)
df["genres"] = df["genres"].str.lower()


In [2]:
# Preview the first five rows to sanity-check the load
df.head()

Unnamed: 0,movie_id,title,genres,vote_average
0,4520010,Inception,"action, science fiction, adventure",8.364
1,4520011,Interstellar,"adventure, drama, science fiction",8.417
2,4520012,The Dark Knight,"drama, action, crime, thriller",8.512
3,4520013,Avatar,"action, adventure, fantasy, science fiction",7.573
4,4520014,The Avengers,"science fiction, action, adventure",7.71


In [3]:
# Inspect the last five rows; the rendered output shows entries with no genres and a 0.0 rating
df.tail()

Unnamed: 0,movie_id,title,genres,vote_average
1048570,5568580,Master of Demon Subjugation,,0.0
1048571,5568581,Storybook Classics: Ivanhoe,,0.0
1048572,5568582,Screen Flicker,,0.0
1048573,5568583,Chamber of Whores 2: Pornworld,,0.0
1048574,5568584,FREEDOMS 9th Anniversary Memorial Conference,,0.0


In [4]:
# Drop every row that has a missing value in any column. Rebinding the name
# (instead of inplace=True) avoids mutating a frame derived from another frame
# and keeps the step usable in a method chain.
df = df.dropna(axis=0)

In [5]:
# Display the frame after dropping incomplete rows (the rendering is truncated for large frames)
df

Unnamed: 0,movie_id,title,genres,vote_average
0,4520010,Inception,"action, science fiction, adventure",8.364
1,4520011,Interstellar,"adventure, drama, science fiction",8.417
2,4520012,The Dark Knight,"drama, action, crime, thriller",8.512
3,4520013,Avatar,"action, adventure, fantasy, science fiction",7.573
4,4520014,The Avengers,"science fiction, action, adventure",7.710
...,...,...,...,...
1048559,5568569,Snowy Northland,drama,0.000
1048560,5568570,Little Steven: At Rockpalast,music,0.000
1048564,5568574,Knorkator: Zu Alt,music,0.000
1048565,5568575,video chat party,documentary,0.000


In [6]:
import pandas as pd
import requests
from IPython.display import Image, display

# ===============================
# LOAD DATA
# ===============================
# Read the raw catalogue, then normalize the three columns used downstream in
# one chained step: titles and genres are lower-cased and trimmed (missing
# genres become ""), and ratings are forced to float.
df = pd.read_csv("Movies.csv")
df = df.assign(
    title=df["title"].str.lower().str.strip(),
    genres=df["genres"].fillna("").str.lower().str.strip(),
    vote_average=df["vote_average"].astype(float),
)

# ===============================
# TMDB CONFIG
# ===============================
import os

# TMDB credentials and endpoints.
# SECURITY: an API key committed to a notebook is exposed to everyone who can
# read the file. Supply it through the TMDB_API_KEY environment variable; the
# literal below is kept only as a backward-compatible fallback and should be
# rotated and removed.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "6a9a5e8b756af2981b8623d6b2a60ee1")
TMDB_SEARCH_URL = "https://api.themoviedb.org/3/search/movie"
IMAGE_BASE_URL = "https://image.tmdb.org/t/p/w500"

# ===============================
# FETCH POSTER
# ===============================
def fetch_poster(title):
    """Look up a movie on TMDB and return the URL of its poster image.

    Sends a search request for ``title`` and uses the first search hit.

    Args:
        title: Movie title to search for.

    Returns:
        The poster URL (``IMAGE_BASE_URL`` + the hit's ``poster_path``), or
        ``None`` when the request fails, the body is not usable JSON, there
        are no results, or the first result has no poster.
    """
    try:
        r = requests.get(
            TMDB_SEARCH_URL,
            params={"api_key": TMDB_API_KEY, "query": title},
            timeout=5,
        )
        data = r.json()
    except (requests.RequestException, ValueError):
        # Posters are best-effort: a network failure or a non-JSON body
        # (json() raises a ValueError subclass) just means "no poster".
        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit
        # and genuine programming errors propagate.
        return None

    # Validate the payload shape explicitly instead of relying on a catch-all.
    results = data.get("results") if isinstance(data, dict) else None
    if not results or not isinstance(results, list):
        return None

    first = results[0]
    poster = first.get("poster_path") if isinstance(first, dict) else None
    if isinstance(poster, str) and poster:
        return IMAGE_BASE_URL + poster
    return None

# ===============================
# STRICT GENRE FILTER (AND)
# ===============================
def filter_by_genres_strict(user_input):
    """Select the movies whose genre string contains every requested genre.

    Args:
        user_input: Genres separated by commas and/or the word "and",
            e.g. ``"action, drama"`` or ``"action and drama"``.

    Returns:
        A ``(filtered, user_genres)`` tuple: the matching rows of the
        module-level ``df`` and the set of parsed genre names. When no genre
        can be parsed, ``filtered`` is an empty frame.
    """
    import re  # local import: only this function needs it

    # Split on commas and on "and" as a whole word only, so words that merely
    # contain the letters (e.g. "grand") are not cut in half.
    user_genres = {
        part.strip()
        for part in re.split(r",|\band\b", user_input)
        if part.strip()
    }

    if not user_genres:
        # all() over an empty set is True for every row, which would return
        # the whole catalogue and later divide by len(user_genres) == 0.
        return df.iloc[0:0], user_genres

    filtered = df[
        df["genres"].apply(lambda g: all(ug in g for ug in user_genres))
    ]
    return filtered, user_genres

# ===============================
# RELEVANCE CALCULATION
# ===============================
def compute_relevance(row, user_genres):
    """Score how well one movie's genres match the requested genres.

    Args:
        row: Mapping-like object (dict or pandas Series) whose ``"genres"``
            entry is a comma-separated genre string.
        user_genres: Set of requested genre names.

    Returns:
        ``0.7 * intent + 0.3 * coverage`` where intent is the share of the
        requested genres the movie has and coverage is the share of the
        movie's genres that were requested. Returns ``0.0`` when no genres
        were requested.
    """
    if not user_genres:
        # Nothing was asked for, so nothing can match (avoids dividing by zero).
        return 0.0

    # Ignore blank tokens so an empty or trailing-comma genre string does not
    # count as a genre of its own.
    movie_genres = {g.strip() for g in row["genres"].split(",") if g.strip()}
    matched = len(user_genres & movie_genres)

    # Intent satisfaction (most important)
    intent_score = matched / len(user_genres)

    # Genre focus (penalizes over-mixed genres)
    coverage_score = matched / len(movie_genres) if movie_genres else 0.0

    # Weighted relevance
    return 0.7 * intent_score + 0.3 * coverage_score

# ===============================
# DISPLAY RESULTS
# ===============================
def show_results(results, user_input, user_genres):
    """Print the ranked recommendations and show a poster for each when available.

    Args:
        results: DataFrame of recommended movies in display order; needs
            ``title``, ``genres`` and ``vote_average`` columns.
        user_input: Label describing the query, shown in the header.
        user_genres: Set of requested genres, passed to ``compute_relevance``.

    Returns:
        List of per-movie relevance scores as percentages, in display order.
    """
    # Report the number of rows actually shown rather than a fixed "5".
    print(f"\nTop {len(results)} movies recommended for: {user_input}\n")

    relevance_scores = []

    for i, row in enumerate(results.itertuples(), start=1):
        relevance = compute_relevance(row._asdict(), user_genres)
        relevance_scores.append(relevance * 100)

        final_score = (
            0.75 * relevance +
            0.25 * (row.vote_average / 10)
        ) * 100

        # The load step does not fill missing titles, so a missing title
        # arrives here as a float NaN, which has no .title() method.
        has_title = isinstance(row.title, str)
        shown_title = row.title.title() if has_title else "(untitled)"

        print(f"{i}. {shown_title}")
        print(f"   Genres: {row.genres}")
        print(f"   Rating: {row.vote_average}/10")
        # round(), not int(): float error can leave e.g. 79.999..., which
        # int() would truncate to 79%.
        print(f"   Relevance Score: {round(relevance * 100)}%")
        print(f"   Final Score    : {round(final_score, 1)}%")

        # Without a title there is nothing to search for.
        poster = fetch_poster(row.title) if has_title else None
        if poster:
            display(Image(url=poster, width=180))

    return relevance_scores

# ===============================
# EVALUATION METRICS
# ===============================
def evaluate(results, relevance_scores):
    """Print summary metrics for one recommendation list.

    Note that "Precision@5" here treats every returned row as relevant, so it
    is the share of the five slots that were filled.

    Args:
        results: DataFrame of recommended movies with a ``vote_average`` column.
        relevance_scores: Per-movie relevance percentages for the same rows.
    """
    # Only the first five rows can count towards an "@5" metric; without the
    # cap a longer list (top_n > 5) reported a value above 1.0.
    precision_at_5 = min(len(results), 5) / 5
    # An empty score list would otherwise raise ZeroDivisionError.
    avg_relevance = (
        sum(relevance_scores) / len(relevance_scores) if relevance_scores else 0.0
    )
    mean_rating = results["vote_average"].mean()

    print("\nEvaluation Metrics")
    print("------------------")
    print(f"Precision@5        : {precision_at_5:.2f}")
    print(f"Avg Relevance (%)  : {avg_relevance:.2f}")
    print(f"Mean Rating        : {mean_rating:.2f}/10")

# ===============================
# MAIN RECOMMENDER (AUTO-DETECT)
# ===============================
def recommend(user_input, top_n=5):
    """Recommend movies for a movie title or a list of genres.

    The input is first looked up as an exact (lower-cased, trimmed) title in
    the module-level ``df``; if found, that movie's genres drive the search.
    Otherwise the input itself is parsed as genres. Candidates are ranked by
    ``0.75 * relevance + 0.25 * vote_average / 10`` and the best ``top_n`` are
    printed, followed by summary metrics.

    Args:
        user_input: A movie title or genres such as ``"action and drama"``.
        top_n: Maximum number of recommendations to show.

    Returns:
        None. Results are printed; "No movies found." is printed when there
        is nothing to recommend.
    """
    user_input = user_input.lower().strip()

    # One scan serves both the "is it a title?" test and the genre lookup.
    title_matches = df[df["title"] == user_input]
    seed_index = None

    if not title_matches.empty:
        # Case 1: movie title (the first match wins when titles repeat)
        seed_index = title_matches.index[0]
        base_genres = title_matches["genres"].values[0]
        filtered, user_genres = filter_by_genres_strict(base_genres)
        display_input = f"{user_input.title()} (based on its genres)"
    else:
        # Case 2: genre input
        filtered, user_genres = filter_by_genres_strict(user_input)
        display_input = user_input

    if seed_index is not None:
        # Never recommend the very movie the user asked about.
        filtered = filtered.drop(index=seed_index, errors="ignore")

    # No parsable genres (blank input, or a title without genre data) would
    # make the relevance computation divide by zero.
    if filtered.empty or not user_genres:
        print("No movies found.")
        return

    # Compute relevance & final score
    filtered = filtered.copy()
    filtered["relevance"] = filtered.apply(
        lambda row: compute_relevance(row, user_genres),
        axis=1
    )

    filtered["final_score"] = (
        0.75 * filtered["relevance"] +
        0.25 * (filtered["vote_average"] / 10)
    )

    results = filtered.sort_values(
        by="final_score",
        ascending=False
    ).head(top_n)

    relevance_scores = show_results(results, display_input, user_genres)
    evaluate(results, relevance_scores)


In [7]:
# Query by title: the printed header shows the lookup resolved to the movie's genres
recommend("inception")


Top 5 movies recommended for: Inception (based on its genres)

1. Blood And Steel
   Genres: science fiction, action, adventure
   Rating: 10.0/10
   Relevance Score: 100%
   Final Score    : 100.0%


2. The Traveler
   Genres: action, adventure, science fiction
   Rating: 10.0/10
   Relevance Score: 100%
   Final Score    : 100.0%
3. Coal Miner'S Adventure
   Genres: action, adventure, science fiction
   Rating: 10.0/10
   Relevance Score: 100%
   Final Score    : 100.0%
4. Doctor Who: The Christmas Invasion
   Genres: science fiction, adventure, action
   Rating: 10.0/10
   Relevance Score: 100%
   Final Score    : 100.0%
5. The Gunner
   Genres: action, adventure, science fiction
   Rating: 10.0/10
   Relevance Score: 100%
   Final Score    : 100.0%



Evaluation Metrics
------------------
Precision@5        : 1.00
Avg Relevance (%)  : 100.00
Mean Rating        : 10.00/10


In [8]:
import pandas as pd
import requests
import pickle
import os
from IPython.display import Image, display

# ===============================
# LOAD DATA (CSV → PKL CACHE)
# ===============================
PKL_FILE = "Movies.pkl"
CSV_FILE = "Movies.csv"

# Use the pickle cache only while it is at least as new as the CSV; a cache
# that is never invalidated would keep serving stale data after the CSV is
# updated. If the CSV is absent, the existing cache is still used.
cache_is_fresh = os.path.exists(PKL_FILE) and (
    not os.path.exists(CSV_FILE)
    or os.path.getmtime(PKL_FILE) >= os.path.getmtime(CSV_FILE)
)

if cache_is_fresh:
    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # cache file that this notebook wrote itself, never one from elsewhere.
    with open(PKL_FILE, "rb") as f:
        df = pickle.load(f)
else:
    df = pd.read_csv(CSV_FILE)

    # Normalize the columns used downstream (missing genres become "").
    df["title"] = df["title"].str.lower().str.strip()
    df["genres"] = df["genres"].fillna("").str.lower().str.strip()
    df["vote_average"] = df["vote_average"].astype(float)

    # Cache the cleaned frame so later runs skip the CSV parse.
    with open(PKL_FILE, "wb") as f:
        pickle.dump(df, f)

# ===============================
# TMDB CONFIG
# ===============================
# TMDB credentials and endpoints.
# SECURITY: an API key committed to a notebook is exposed to everyone who can
# read the file. Supply it through the TMDB_API_KEY environment variable; the
# literal below is kept only as a backward-compatible fallback and should be
# rotated and removed.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "6a9a5e8b756af2981b8623d6b2a60ee1")
TMDB_SEARCH_URL = "https://api.themoviedb.org/3/search/movie"
IMAGE_BASE_URL = "https://image.tmdb.org/t/p/w500"

# ===============================
# FETCH POSTER
# ===============================
def fetch_poster(title):
    """Look up a movie on TMDB and return the URL of its poster image.

    Sends a search request for ``title`` and uses the first search hit.

    Args:
        title: Movie title to search for.

    Returns:
        The poster URL (``IMAGE_BASE_URL`` + the hit's ``poster_path``), or
        ``None`` when the request fails, the body is not usable JSON, there
        are no results, or the first result has no poster.
    """
    try:
        r = requests.get(
            TMDB_SEARCH_URL,
            params={"api_key": TMDB_API_KEY, "query": title},
            timeout=5,
        )
        data = r.json()
    except (requests.RequestException, ValueError):
        # Posters are best-effort: a network failure or a non-JSON body
        # (json() raises a ValueError subclass) just means "no poster".
        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit
        # and genuine programming errors propagate.
        return None

    # Validate the payload shape explicitly instead of relying on a catch-all.
    results = data.get("results") if isinstance(data, dict) else None
    if not results or not isinstance(results, list):
        return None

    first = results[0]
    poster = first.get("poster_path") if isinstance(first, dict) else None
    if isinstance(poster, str) and poster:
        return IMAGE_BASE_URL + poster
    return None

# ===============================
# STRICT GENRE FILTER (AND)
# ===============================
def filter_by_genres_strict(user_input):
    """Select the movies whose genre string contains every requested genre.

    Args:
        user_input: Genres separated by commas and/or the word "and",
            e.g. ``"action, drama"`` or ``"action and drama"``.

    Returns:
        A ``(filtered, user_genres)`` tuple: the matching rows of the
        module-level ``df`` and the set of parsed genre names. When no genre
        can be parsed, ``filtered`` is an empty frame.
    """
    import re  # local import: only this function needs it

    # Split on commas and on "and" as a whole word only, so words that merely
    # contain the letters (e.g. "grand") are not cut in half.
    user_genres = {
        part.strip()
        for part in re.split(r",|\band\b", user_input)
        if part.strip()
    }

    if not user_genres:
        # all() over an empty set is True for every row, which would return
        # the whole catalogue and later divide by len(user_genres) == 0.
        return df.iloc[0:0], user_genres

    filtered = df[
        df["genres"].apply(lambda g: all(ug in g for ug in user_genres))
    ]
    return filtered, user_genres

# ===============================
# RELEVANCE CALCULATION
# ===============================
def compute_relevance(row, user_genres):
    """Score how well one movie's genres match the requested genres.

    Args:
        row: Mapping-like object (dict or pandas Series) whose ``"genres"``
            entry is a comma-separated genre string.
        user_genres: Set of requested genre names.

    Returns:
        ``0.7 * intent + 0.3 * coverage`` where intent is the share of the
        requested genres the movie has and coverage is the share of the
        movie's genres that were requested. Returns ``0.0`` when no genres
        were requested.
    """
    if not user_genres:
        # Nothing was asked for, so nothing can match (avoids dividing by zero).
        return 0.0

    # Ignore blank tokens so an empty or trailing-comma genre string does not
    # count as a genre of its own.
    movie_genres = {g.strip() for g in row["genres"].split(",") if g.strip()}
    matched = len(user_genres & movie_genres)

    # Share of the requested genres that this movie has.
    intent_score = matched / len(user_genres)
    # Share of the movie's own genres that were requested.
    coverage_score = matched / len(movie_genres) if movie_genres else 0.0

    return 0.7 * intent_score + 0.3 * coverage_score

# ===============================
# DISPLAY RESULTS
# ===============================
def show_results(results, user_input, user_genres):
    """Print the ranked recommendations and show a poster for each when available.

    Args:
        results: DataFrame of recommended movies in display order; needs
            ``title``, ``genres`` and ``vote_average`` columns.
        user_input: Label describing the query, shown in the header.
        user_genres: Set of requested genres, passed to ``compute_relevance``.

    Returns:
        List of per-movie relevance scores as percentages, in display order.
    """
    # Report the number of rows actually shown rather than a fixed "5".
    print(f"\nTop {len(results)} movies recommended for: {user_input}\n")

    relevance_scores = []

    for i, row in enumerate(results.itertuples(), start=1):
        relevance = compute_relevance(row._asdict(), user_genres)
        relevance_scores.append(relevance * 100)

        final_score = (
            0.75 * relevance +
            0.25 * (row.vote_average / 10)
        ) * 100

        # The load step does not fill missing titles, so a missing title
        # arrives here as a float NaN, which has no .title() method.
        has_title = isinstance(row.title, str)
        shown_title = row.title.title() if has_title else "(untitled)"

        print(f"{i}. {shown_title}")
        print(f"   Genres: {row.genres}")
        print(f"   Rating: {row.vote_average}/10")
        # round(), not int(): float error can leave e.g. 79.999..., which
        # int() would truncate to 79%.
        print(f"   Relevance Score: {round(relevance * 100)}%")
        print(f"   Final Score    : {round(final_score, 1)}%")

        # Without a title there is nothing to search for.
        poster = fetch_poster(row.title) if has_title else None
        if poster:
            display(Image(url=poster, width=180))

    return relevance_scores

# ===============================
# EVALUATION METRICS
# ===============================
def evaluate(results, relevance_scores):
    """Print summary metrics for one recommendation list.

    Note that "Precision@5" here treats every returned row as relevant, so it
    is the share of the five slots that were filled.

    Args:
        results: DataFrame of recommended movies with a ``vote_average`` column.
        relevance_scores: Per-movie relevance percentages for the same rows.
    """
    # Only the first five rows can count towards an "@5" metric; without the
    # cap a longer list (top_n > 5) reported a value above 1.0.
    precision_at_5 = min(len(results), 5) / 5
    # An empty score list would otherwise raise ZeroDivisionError.
    avg_relevance = (
        sum(relevance_scores) / len(relevance_scores) if relevance_scores else 0.0
    )
    mean_rating = results["vote_average"].mean()

    print("\nEvaluation Metrics")
    print("------------------")
    print(f"Precision@5        : {precision_at_5:.2f}")
    print(f"Avg Relevance (%)  : {avg_relevance:.2f}")
    print(f"Mean Rating        : {mean_rating:.2f}/10")

# ===============================
# MAIN RECOMMENDER (AUTO-DETECT)
# ===============================
def recommend(user_input, top_n=5):
    """Recommend movies for a movie title or a list of genres.

    The input is first looked up as an exact (lower-cased, trimmed) title in
    the module-level ``df``; if found, that movie's genres drive the search.
    Otherwise the input itself is parsed as genres. Candidates are ranked by
    ``0.75 * relevance + 0.25 * vote_average / 10`` and the best ``top_n`` are
    printed, followed by summary metrics.

    Args:
        user_input: A movie title or genres such as ``"action and drama"``.
        top_n: Maximum number of recommendations to show.

    Returns:
        None. Results are printed; "No movies found." is printed when there
        is nothing to recommend.
    """
    user_input = user_input.lower().strip()

    # One scan serves both the "is it a title?" test and the genre lookup.
    title_matches = df[df["title"] == user_input]
    seed_index = None

    if not title_matches.empty:
        # Title query (the first match wins when titles repeat).
        seed_index = title_matches.index[0]
        base_genres = title_matches["genres"].values[0]
        filtered, user_genres = filter_by_genres_strict(base_genres)
        display_input = f"{user_input.title()} (based on its genres)"
    else:
        # Genre query.
        filtered, user_genres = filter_by_genres_strict(user_input)
        display_input = user_input

    if seed_index is not None:
        # Never recommend the very movie the user asked about.
        filtered = filtered.drop(index=seed_index, errors="ignore")

    # No parsable genres (blank input, or a title without genre data) would
    # make the relevance computation divide by zero.
    if filtered.empty or not user_genres:
        print("No movies found.")
        return

    filtered = filtered.copy()
    filtered["relevance"] = filtered.apply(
        lambda row: compute_relevance(row, user_genres),
        axis=1
    )

    # Blend genre relevance with the rating rescaled to 0-1.
    filtered["final_score"] = (
        0.75 * filtered["relevance"] +
        0.25 * (filtered["vote_average"] / 10)
    )

    results = filtered.sort_values(
        by="final_score",
        ascending=False
    ).head(top_n)

    relevance_scores = show_results(results, display_input, user_genres)
    evaluate(results, relevance_scores)


In [28]:
import pandas as pd
import requests
import pickle
import os
from IPython.display import Image, display

# ===============================
# LOAD DATA (CSV → PKL CACHE)
# ===============================
PKL_FILE = "Movies.pkl"
CSV_FILE = "Movies.csv"

# Use the pickle cache only while it is at least as new as the CSV; a cache
# that is never invalidated would keep serving stale data after the CSV is
# updated. If the CSV is absent, the existing cache is still used.
cache_is_fresh = os.path.exists(PKL_FILE) and (
    not os.path.exists(CSV_FILE)
    or os.path.getmtime(PKL_FILE) >= os.path.getmtime(CSV_FILE)
)

if cache_is_fresh:
    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # cache file that this notebook wrote itself, never one from elsewhere.
    with open(PKL_FILE, "rb") as f:
        df = pickle.load(f)
else:
    df = pd.read_csv(CSV_FILE)

    # Normalize the columns used downstream (missing genres become "").
    df["title"] = df["title"].str.lower().str.strip()
    df["genres"] = df["genres"].fillna("").str.lower().str.strip()
    df["vote_average"] = df["vote_average"].astype(float)

    # Cache the cleaned frame so later runs skip the CSV parse.
    with open(PKL_FILE, "wb") as f:
        pickle.dump(df, f)

# ===============================
# TMDB CONFIG
# ===============================
# TMDB credentials and endpoints.
# SECURITY: an API key committed to a notebook is exposed to everyone who can
# read the file. Supply it through the TMDB_API_KEY environment variable; the
# literal below is kept only as a backward-compatible fallback and should be
# rotated and removed.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "6a9a5e8b756af2981b8623d6b2a60ee1")
TMDB_SEARCH_URL = "https://api.themoviedb.org/3/search/movie"
IMAGE_BASE_URL = "https://image.tmdb.org/t/p/w500"

# ===============================
# FETCH POSTER
# ===============================
def fetch_poster(title):
    """Look up a movie on TMDB and return the URL of its poster image.

    Sends a search request for ``title`` and uses the first search hit.

    Args:
        title: Movie title to search for.

    Returns:
        The poster URL (``IMAGE_BASE_URL`` + the hit's ``poster_path``), or
        ``None`` when the request fails, the body is not usable JSON, there
        are no results, or the first result has no poster.
    """
    try:
        r = requests.get(
            TMDB_SEARCH_URL,
            params={"api_key": TMDB_API_KEY, "query": title},
            timeout=5,
        )
        data = r.json()
    except (requests.RequestException, ValueError):
        # Posters are best-effort: a network failure or a non-JSON body
        # (json() raises a ValueError subclass) just means "no poster".
        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit
        # and genuine programming errors propagate.
        return None

    # Validate the payload shape explicitly instead of relying on a catch-all.
    results = data.get("results") if isinstance(data, dict) else None
    if not results or not isinstance(results, list):
        return None

    first = results[0]
    poster = first.get("poster_path") if isinstance(first, dict) else None
    if isinstance(poster, str) and poster:
        return IMAGE_BASE_URL + poster
    return None

# ===============================
# STRICT GENRE FILTER (AND)
# ===============================
def filter_by_genres_strict(user_input):
    """Select the movies whose genre string contains every requested genre.

    Args:
        user_input: Genres separated by commas and/or the word "and",
            e.g. ``"action, drama"`` or ``"action and drama"``.

    Returns:
        A ``(filtered, user_genres)`` tuple: the matching rows of the
        module-level ``df`` and the set of parsed genre names. When no genre
        can be parsed, ``filtered`` is an empty frame.
    """
    import re  # local import: only this function needs it

    # Split on commas and on "and" as a whole word only, so words that merely
    # contain the letters (e.g. "grand") are not cut in half.
    user_genres = {
        part.strip()
        for part in re.split(r",|\band\b", user_input)
        if part.strip()
    }

    if not user_genres:
        # all() over an empty set is True for every row, which would return
        # the whole catalogue and later divide by len(user_genres) == 0.
        return df.iloc[0:0], user_genres

    filtered = df[
        df["genres"].apply(lambda g: all(ug in g for ug in user_genres))
    ]
    return filtered, user_genres

# ===============================
# RELEVANCE CALCULATION
# ===============================
def compute_relevance(row, user_genres):
    """Score how well one movie's genres match the requested genres.

    Args:
        row: Mapping-like object (dict or pandas Series) whose ``"genres"``
            entry is a comma-separated genre string.
        user_genres: Set of requested genre names.

    Returns:
        ``0.7 * intent + 0.3 * coverage`` where intent is the share of the
        requested genres the movie has and coverage is the share of the
        movie's genres that were requested. Returns ``0.0`` when no genres
        were requested.
    """
    if not user_genres:
        # Nothing was asked for, so nothing can match (avoids dividing by zero).
        return 0.0

    # Ignore blank tokens so an empty or trailing-comma genre string does not
    # count as a genre of its own.
    movie_genres = {g.strip() for g in row["genres"].split(",") if g.strip()}
    matched = len(user_genres & movie_genres)

    # Share of the requested genres that this movie has.
    intent_score = matched / len(user_genres)
    # Share of the movie's own genres that were requested.
    coverage_score = matched / len(movie_genres) if movie_genres else 0.0

    return 0.7 * intent_score + 0.3 * coverage_score

# ===============================
# DISPLAY RESULTS
# ===============================
def show_results(results, user_input, user_genres):
    """Print the ranked recommendations and show a poster for each when available.

    Args:
        results: DataFrame of recommended movies in display order; needs
            ``title``, ``genres`` and ``vote_average`` columns.
        user_input: Label describing the query, shown in the header.
        user_genres: Set of requested genres, passed to ``compute_relevance``.

    Returns:
        List of per-movie relevance scores as percentages, in display order.
    """
    # Report the number of rows actually shown rather than a fixed "5".
    print(f"\nTop {len(results)} movies recommended for: {user_input}\n")

    relevance_scores = []

    for i, row in enumerate(results.itertuples(), start=1):
        relevance = compute_relevance(row._asdict(), user_genres)
        relevance_scores.append(relevance * 100)

        final_score = (
            0.75 * relevance +
            0.25 * (row.vote_average / 10)
        ) * 100

        # The load step does not fill missing titles, so a missing title
        # arrives here as a float NaN, which has no .title() method.
        has_title = isinstance(row.title, str)
        shown_title = row.title.title() if has_title else "(untitled)"

        print(f"{i}. {shown_title}")
        print(f"   Genres: {row.genres}")
        print(f"   Rating: {row.vote_average}/10")
        # round(), not int(): float error can leave e.g. 79.999..., which
        # int() would truncate to 79%.
        print(f"   Relevance Score: {round(relevance * 100)}%")
        print(f"   Final Score    : {round(final_score, 1)}%")

        # Without a title there is nothing to search for.
        poster = fetch_poster(row.title) if has_title else None
        if poster:
            display(Image(url=poster, width=180))

    return relevance_scores

# ===============================
# EVALUATION METRICS
# ===============================
def evaluate(results, relevance_scores):
    """Print summary metrics for one recommendation list.

    Note that "Precision@5" here treats every returned row as relevant, so it
    is the share of the five slots that were filled.

    Args:
        results: DataFrame of recommended movies with a ``vote_average`` column.
        relevance_scores: Per-movie relevance percentages for the same rows.
    """
    # Only the first five rows can count towards an "@5" metric; without the
    # cap a longer list (top_n > 5) reported a value above 1.0.
    precision_at_5 = min(len(results), 5) / 5
    # An empty score list would otherwise raise ZeroDivisionError.
    avg_relevance = (
        sum(relevance_scores) / len(relevance_scores) if relevance_scores else 0.0
    )
    mean_rating = results["vote_average"].mean()

    print("\nEvaluation Metrics")
    print("------------------")
    print(f"Precision@5        : {precision_at_5:.2f}")
    print(f"Avg Relevance (%)  : {avg_relevance:.2f}")
    print(f"Mean Rating        : {mean_rating:.2f}/10")

# ===============================
# MAIN RECOMMENDER (AUTO-DETECT)
# ===============================
def recommend(user_input, top_n=5):
    """Recommend movies for a movie title or a list of genres.

    The input is first looked up as an exact (lower-cased, trimmed) title in
    the module-level ``df``; if found, that movie's genres drive the search.
    Otherwise the input itself is parsed as genres. Candidates are ranked by
    ``0.75 * relevance + 0.25 * vote_average / 10`` and the best ``top_n`` are
    printed, followed by summary metrics.

    Args:
        user_input: A movie title or genres such as ``"action and drama"``.
        top_n: Maximum number of recommendations to show.

    Returns:
        None. Results are printed; "No movies found." is printed when there
        is nothing to recommend.
    """
    user_input = user_input.lower().strip()

    # One scan serves both the "is it a title?" test and the genre lookup.
    title_matches = df[df["title"] == user_input]
    seed_index = None

    if not title_matches.empty:
        # Title query (the first match wins when titles repeat).
        seed_index = title_matches.index[0]
        base_genres = title_matches["genres"].values[0]
        filtered, user_genres = filter_by_genres_strict(base_genres)
        display_input = f"{user_input.title()} (based on its genres)"
    else:
        # Genre query.
        filtered, user_genres = filter_by_genres_strict(user_input)
        display_input = user_input

    if seed_index is not None:
        # Never recommend the very movie the user asked about.
        filtered = filtered.drop(index=seed_index, errors="ignore")

    # No parsable genres (blank input, or a title without genre data) would
    # make the relevance computation divide by zero.
    if filtered.empty or not user_genres:
        print("No movies found.")
        return

    filtered = filtered.copy()
    filtered["relevance"] = filtered.apply(
        lambda row: compute_relevance(row, user_genres),
        axis=1
    )

    # Blend genre relevance with the rating rescaled to 0-1.
    filtered["final_score"] = (
        0.75 * filtered["relevance"] +
        0.25 * (filtered["vote_average"] / 10)
    )

    results = filtered.sort_values(
        by="final_score",
        ascending=False
    ).head(top_n)

    relevance_scores = show_results(results, display_input, user_genres)
    evaluate(results, relevance_scores)