In [3]:
# --- 1. Setup & Libraries ---
import tkinter as tk
from tkinter import ttk, messagebox
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
import re # For cleaning text

print("Libraries imported.")

Libraries imported.


In [4]:

# --- 2. Load Data ---
# Location of the movie dataset CSV; adjust to the local copy before running.
DATASET_PATH = "A:/Internship Project/project/final_dataset.csv" # Make sure this path is correct!

try:
    data = pd.read_csv(DATASET_PATH)
except FileNotFoundError:
    print(f"Error: Dataset file not found at {DATASET_PATH}")
    print("Please ensure the file path is correct and the file exists.")
    # Re-raise instead of calling exit(): exit() is an interactive-shell helper
    # that is not meant for program flow, and re-raising stops this cell with a
    # full traceback so later cells never run against an undefined `data`.
    raise
except Exception as e:
    print(f"Error loading dataset: {e}")
    # Keep the original exception (and its traceback) visible to the reader.
    raise
else:
    # Only reached when the CSV was read without error.
    print(f"Dataset loaded successfully from {DATASET_PATH}")
    print(f"Dataset shape: {data.shape}")

Dataset loaded successfully from A:/Internship Project/project/final_dataset.csv
Dataset shape: (33600, 23)


In [5]:

# --- 3. Exploratory Data Analysis (EDA) ---
# Column names, dtypes and non-null counts.
print("\n--- Basic Data Info ---")
data.info()

# Number of missing entries in each column.
print("\n--- Missing Values ---")
missing_per_column = data.isnull().sum()
print(missing_per_column)

# Summary statistics, restricted to the numeric columns.
print("\n--- Descriptive Statistics (Numerical) ---")
numeric_cols = data.select_dtypes(include=np.number).columns
numeric_summary = data[numeric_cols].describe()
print(numeric_summary)

# First few rows as a sanity check of the loaded content.
print("\n--- Example Data Rows ---")
first_rows = data.head()
print(first_rows)



--- Basic Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33600 entries, 0 to 33599
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     33600 non-null  object 
 1   Title                  33600 non-null  object 
 2   Movie Link             33600 non-null  object 
 3   Year                   33600 non-null  int64  
 4   Duration               33379 non-null  object 
 5   MPA                    25624 non-null  object 
 6   Rating                 33462 non-null  float64
 7   Votes                  33462 non-null  object 
 8   budget                 11815 non-null  float64
 9   grossWorldWide         18222 non-null  float64
 10  gross_US_Canada        17571 non-null  float64
 11  opening_weekend_Gross  15523 non-null  float64
 12  directors              33241 non-null  object 
 13  writers                32024 non-null  object 
 14  stars                  33127 

In [6]:
# --- 4. Preprocessing ---
print("\n--- Preprocessing ---")

# Text columns that feed the content model: a missing value becomes an empty
# string so later string operations never see NaN.
# ('writers' is intentionally left out; add it to the tuple if desired.)
for text_column in ('Title', 'genres', 'stars', 'directors'):
    data[text_column] = data[text_column].fillna("")

# Rating: force to numeric (unparseable entries become NaN), then replace
# every NaN with the column median.
rating_numeric = pd.to_numeric(data['Rating'], errors='coerce')
median_rating = rating_numeric.median()
data['Rating'] = rating_numeric.fillna(median_rating)
print(f"NaN values in 'Rating' filled with median value: {median_rating}")

# Clean and combine features for content-based filtering
def clean_text(text):
    """Normalize a value for text matching.

    The value is converted with ``str()``, lowercased, every run of
    whitespace is collapsed to a single space, and leading/trailing
    whitespace is removed.

    Args:
        text: Any value; non-strings are stringified first.

    Returns:
        The normalized string.
    """
    lowered = str(text).lower()
    return re.sub(r'\s+', ' ', lowered).strip()

# Normalized copy of each text column, stored as '<column>_clean'.
# ('writers' is optional and not included.)
for source_column in ('Title', 'genres', 'stars', 'directors'):
    data[f'{source_column}_clean'] = data[source_column].apply(clean_text)

# Combine features: join the cleaned columns, in this order, with single spaces.
feature_parts = [
    data['Title_clean'],
    data['genres_clean'],
    data['stars_clean'],
    data['directors_clean'],
]
combined = feature_parts[0]
for part in feature_parts[1:]:
    combined = combined + ' ' + part
data['combined_features'] = combined

print("Combined features created: Title, Genres, Stars, Directors")



--- Preprocessing ---
NaN values in 'Rating' filled with median value: 6.3
Combined features created: Title, Genres, Stars, Directors


In [7]:
# --- 5. Content-Based Component ---
print("\n--- Building Content-Based Model ---")

# One document per movie: the combined text built during preprocessing.
feature_corpus = data['combined_features']

# Vectorizer configured with scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and produce the document-term matrix in one step.
tfidf_matrix = tfidf.fit_transform(feature_corpus)
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")




--- Building Content-Based Model ---
TF-IDF matrix shape: (33600, 57001)


In [8]:

# --- 6. Hybrid Recommendation Function (Content + Rating Rank) ---
def get_hybrid_recommendations(title, top_n=10):
    """Recommend movies similar to ``title``, re-ranked by rating.

    The query is resolved to one row of ``data``: first by exact match on the
    cleaned title, then by a case-insensitive literal substring match on the
    raw title. The ``top_n`` rows with the highest TF-IDF similarity to that
    row (the row itself excluded) are then sorted by ``Rating`` descending.

    Args:
        title: Movie title typed by the user.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of dicts with keys ``"Title"``, ``"genres"``, ``"Rating"``
        (string, one decimal), ``"Year"`` (int or ``'N/A'``) and
        ``"Similarity"`` (string, two decimals); or a single-element list
        holding a message string when nothing matches or nothing can be
        recommended.
    """
    cleaned_title_input = clean_text(title)

    # Work with row *positions* throughout: tfidf_matrix is addressed by
    # position, so index labels would only be correct for a default RangeIndex.
    exact_mask = (data['Title_clean'] == cleaned_title_input).to_numpy()
    candidate_positions = np.flatnonzero(exact_mask)
    match_kind = "exact"

    if len(candidate_positions) == 0:
        # Fallback: case-insensitive substring search. regex=False treats the
        # user's text literally, so titles such as "C++" or "Mission (" no
        # longer raise re.error or match the wrong rows.
        partial_mask = data['Title'].str.contains(
            title, case=False, na=False, regex=False
        ).to_numpy()
        candidate_positions = np.flatnonzero(partial_mask)
        match_kind = "partial"
        if len(candidate_positions) == 0:
            return ["No match found. Please check the title."]

    idx = int(candidate_positions[0])  # first match wins
    matched_title = data.iloc[idx]['Title']
    print(f"Found {match_kind} match: '{matched_title}'")

    # Similarity of the matched movie against every movie (itself included).
    cosine_similarities = linear_kernel(tfidf_matrix[idx:idx+1], tfidf_matrix).flatten()

    # Rank by descending similarity; the stable sort keeps ties in row order
    # so results are reproducible between runs.
    ranked_positions = np.argsort(-cosine_similarities, kind="stable")
    # Remove the query movie explicitly: it is not guaranteed to be ranked
    # first when another row has an identical (or tied) feature vector.
    ranked_positions = ranked_positions[ranked_positions != idx]
    similar_indices = ranked_positions[:max(top_n, 0)]

    # Details of the content-wise nearest movies.
    similar_movies = data.iloc[similar_indices][['Title', 'genres', 'Rating', 'Year']].copy()
    similar_movies['similarity_score'] = cosine_similarities[similar_indices]

    # Hybrid step: re-rank the shortlist by Rating (higher first). The stable
    # sort leaves equally rated movies in descending-similarity order.
    hybrid_recommendations = similar_movies.sort_values(
        by='Rating', ascending=False, kind='mergesort'
    )

    # Format each row for display.
    recommendations_list = [
        {
            "Title": row.Title,
            "genres": row.genres,
            "Rating": f"{row.Rating:.1f}",
            "Year": int(row.Year) if pd.notna(row.Year) else 'N/A',
            "Similarity": f"{row.similarity_score:.2f}",
        }
        for row in hybrid_recommendations.itertuples(index=False)
    ]

    if not recommendations_list:
        return ["Could not find recommendations for the matched movie."]

    return recommendations_list

In [10]:
# --- 7. GUI Setup ---
print("\n--- Launching GUI ---")

# Create the main window first; the style and widgets below are attached to it.
root = tk.Tk()
root.title("Hybrid Movie Recommendation System (Content + Rating)")
root.geometry("900x650")

# ttk styling: try the "clam" theme and keep the default if Tk rejects it.
style = ttk.Style(root)
try:
    style.theme_use("clam")
except tk.TclError:
    print("Clam theme not available, using default.")
style.configure("Treeview.Heading", font=("Calibri", 12, "bold"))
style.configure("Treeview", font=("Calibri", 11), rowheight=25)

# Search bar: a full-width frame holding the prompt label and the text entry.
search_frame = tk.Frame(root, pady=15)
search_frame.pack(fill=tk.X)

search_label = tk.Label(
    search_frame,
    text="Enter Movie Title:",
    font=("Calibri", 12),
)
search_label.pack(side=tk.LEFT, padx=(20, 5))

search_entry = tk.Entry(
    search_frame,
    width=60,
    font=("Calibri", 12),
)
# ipady adds vertical padding inside the entry widget.
search_entry.pack(side=tk.LEFT, padx=5, ipady=4)

# Function to handle search action
def show_recommendations_gui():
    """Search-button callback: look up the typed title and fill the table.

    Reads the text from ``search_entry``; an empty entry shows an error
    dialog and returns. Otherwise the results table is emptied,
    ``get_hybrid_recommendations`` is called for the top 10, and either its
    message string or one table row per recommendation is inserted.
    """
    title = search_entry.get().strip()
    if title == "":
        messagebox.showerror("Input Error", "Please enter a movie title.")
        return

    # Remove whatever the previous search left in the table.
    for item_id in result_tree.get_children():
        result_tree.delete(item_id)

    print(f"Searching recommendations for: '{title}'") # Debug print
    results = get_hybrid_recommendations(title, top_n=10)
    print(f"Results obtained: {results}") # Debug print

    # A leading string means the lookup returned a message (e.g. "No match
    # found"); show it in the first column and leave the others blank.
    first_result = results[0]
    if isinstance(first_result, str):
        result_tree.insert("", "end", values=(first_result, "", "", "", ""))
        return

    # Otherwise each result is a dict; emit its fields in table-column order.
    field_order = ('Title', 'genres', 'Rating', 'Year', 'Similarity')
    for result in results:
        row_values = tuple(result[field] for field in field_order)
        result_tree.insert("", "end", values=row_values)

# Search button: sits to the right of the entry and triggers the lookup.
search_button = tk.Button(
    search_frame,
    text="Search",
    command=show_recommendations_gui,
    font=("Calibri", 12, "bold"),
    relief=tk.GROOVE,
    padx=10,
)
search_button.pack(side=tk.LEFT, padx=(5, 20))

# Results area: fills the remaining window space.
result_frame = tk.Frame(root, padx=20, pady=10)
result_frame.pack(fill=tk.BOTH, expand=True)

# One entry per table column: (identifier and heading text, width, anchor).
column_layout = (
    ("Title", 300, tk.W),
    ("Genres", 250, tk.W),
    ("Rating", 60, tk.CENTER),
    ("Year", 60, tk.CENTER),
    ("Similarity", 80, tk.CENTER),
)
columns = tuple(column_name for column_name, _, _ in column_layout)
result_tree = ttk.Treeview(result_frame, columns=columns, show="headings")

# Apply heading text and geometry to every column.
for column_name, column_width, column_anchor in column_layout:
    result_tree.heading(column_name, text=column_name)
    result_tree.column(column_name, width=column_width, anchor=column_anchor)

# Vertical scrollbar wired to the table in both directions.
scrollbar = ttk.Scrollbar(result_frame, orient=tk.VERTICAL, command=result_tree.yview)
result_tree.configure(yscroll=scrollbar.set)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

result_tree.pack(fill=tk.BOTH, expand=True)

# Hand control to Tk; this call returns only after the window is closed.
root.mainloop()

print("\n--- GUI Closed ---")


--- Launching GUI ---
Searching recommendations for: 'dune'
Found exact match: 'Dune'
Results obtained: [{'Title': 'Interstellar', 'genres': "['Adventure Epic', 'Epic', 'Quest', 'Sci-Fi Epic', 'Space Sci-Fi', 'Adventure', 'Drama', 'Sci-Fi']", 'Rating': '8.7', 'Year': 2014, 'Similarity': '0.38'}, {'Title': 'Dune: Part Two', 'genres': "['Action Epic', 'Desert Adventure', 'Epic', 'Sci-Fi Epic', 'Space Sci-Fi', 'Action', 'Adventure', 'Drama', 'Sci-Fi']", 'Rating': '8.5', 'Year': 2024, 'Similarity': '0.50'}, {'Title': '2001: A Space Odyssey', 'genres': "['Adventure Epic', 'Artificial Intelligence', 'Sci-Fi Epic', 'Space Sci-Fi', 'Adventure', 'Sci-Fi']", 'Rating': '8.3', 'Year': 1968, 'Similarity': '0.37'}, {'Title': 'Dune: Part One', 'genres': "['Desert Adventure', 'Sci-Fi Epic', 'Space Sci-Fi', 'Action', 'Adventure', 'Drama', 'Sci-Fi']", 'Rating': '8.0', 'Year': 2021, 'Similarity': '0.50'}, {'Title': 'Planet of the Apes', 'genres': "['Dystopian Sci-Fi', 'Sci-Fi Epic', 'Space Sci-Fi', 'Tim