In [1]:
import requests
import pandas as pd

import os

# Prefer a key supplied through the TMDB_API_KEY environment variable so a real
# credential never has to be saved in the notebook; the literal is only the
# original placeholder fallback.
API_KEY = os.environ.get("TMDB_API_KEY", "c13")

# Number of pages to fetch (each page = 20 movies)
NUM_PAGES = 50  # Adjust to collect more/less data

POPULAR_URL = "https://api.themoviedb.org/3/movie/popular"
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection hangs the cell

# Empty list to store movies
movies = []

# Loop through multiple pages
for page in range(1, NUM_PAGES + 1):
    # Passing the query via `params` lets requests handle URL encoding.
    response = requests.get(
        POPULAR_URL,
        params={"api_key": API_KEY, "language": "en-US", "page": page},
        timeout=REQUEST_TIMEOUT,
    )
    # Fail loudly on HTTP errors (e.g. an invalid key) instead of hitting a
    # KeyError on the error payload further down.
    response.raise_for_status()
    data = response.json()

    # Extract movie details; .get() leaves a missing field as None (NaN in the
    # DataFrame) rather than aborting the whole download.
    for movie in data.get("results", []):
        movies.append({
            "Title": movie.get("title"),
            "Genre IDs": movie.get("genre_ids", []),
            "Overview": movie.get("overview"),
            "Popularity": movie.get("popularity"),
            "Release Date": movie.get("release_date"),
            "Vote Average": movie.get("vote_average"),
        })

# Convert to DataFrame
df_movies = pd.DataFrame(movies)

# Save to CSV file
df_movies.to_csv("tmdb_movies.csv", index=False)

print(f"✅ Dataset saved! {len(df_movies)} movies collected.")


✅ Dataset saved! 1000 movies collected.


In [3]:
# Preview the first five collected movies
df_movies.head(n=5)

Unnamed: 0,Title,Genre IDs,Overview,Popularity,Release Date,Vote Average
0,Cosmic Chaos,"[53, 878]","Battles in virtual reality, survival in a post...",573.0464,2023-08-03,5.2
1,Cleaner,"[28, 53]",When a group of radical activists take over an...,572.5151,2025-02-19,6.495
2,Captain America: Brave New World,"[28, 53, 878]",After meeting with newly elected U.S. Presiden...,421.1844,2025-02-12,6.1
3,The Codes of War,"[28, 10752]","War stories about family, ethics and honor inc...",422.4411,2025-03-20,8.1
4,Snow White,"[10751, 14]",Princess Snow White flees the castle when the ...,278.4259,2025-03-19,4.4


In [5]:
# Boolean mask marking which cells are missing
df_movies.isna()

Unnamed: 0,Title,Genre IDs,Overview,Popularity,Release Date,Vote Average
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
995,False,False,False,False,False,False
996,False,False,False,False,False,False
997,False,False,False,False,False,False
998,False,False,False,False,False,False


In [7]:
# Summarise each column's dtype and non-null count
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Title         1000 non-null   object 
 1   Genre IDs     1000 non-null   object 
 2   Overview      1000 non-null   object 
 3   Popularity    1000 non-null   float64
 4   Release Date  1000 non-null   object 
 5   Vote Average  1000 non-null   float64
dtypes: float64(2), object(4)
memory usage: 47.0+ KB


In [9]:
import requests
import pandas as pd
import ast  # To safely convert string list to actual list

import os

# 🔹 Prefer a key supplied through the TMDB_API_KEY environment variable so a
#    real credential never has to be saved in the notebook; the literal is only
#    the original placeholder fallback.
API_KEY = os.environ.get("TMDB_API_KEY", "c1d37")

# 🔹 Load your existing dataset
df = pd.read_csv("tmdb_movies.csv")

# 🔹 TMDB API URL to fetch genre mappings
genre_url = "https://api.themoviedb.org/3/genre/movie/list"

# 🔹 Fetch genre data from TMDB API
#    The timeout stops a stalled connection from hanging the cell, and
#    raise_for_status() surfaces HTTP errors (e.g. an invalid key) directly
#    instead of as a KeyError on "genres" below.
response = requests.get(
    genre_url,
    params={"api_key": API_KEY, "language": "en-US"},
    timeout=10,
)
response.raise_for_status()
genres_data = response.json()

# 🔹 Create a dictionary: {genre_id: genre_name}
genre_dict = {genre["id"]: genre["name"] for genre in genres_data["genres"]}

# 🔹 Function to convert Genre IDs to Genre Names
def map_genres(genre_ids, mapping=None):
    """Translate TMDB genre IDs into genre names.

    Args:
        genre_ids: A list of genre IDs, or the string form of such a list
            (e.g. "[28, 53]") as it appears after a CSV round trip.
        mapping: Optional ``{genre_id: genre_name}`` dict. When omitted, the
            notebook-level ``genre_dict`` is used.

    Returns:
        A list of genre names, with "Unknown" standing in for any ID that is
        not in the mapping. If the input cannot be parsed or iterated
        (malformed string, NaN, ...), ``["Unknown"]`` is returned.
    """
    if mapping is None:
        mapping = genre_dict
    try:
        if isinstance(genre_ids, str):
            genre_ids = ast.literal_eval(genre_ids)
        return [mapping.get(genre_id, "Unknown") for genre_id in genre_ids]
    # Only the failures that bad cell values produce are treated as "Unknown";
    # a bare except would also hide KeyboardInterrupt and genuine bugs.
    except (ValueError, SyntaxError, TypeError):
        return ["Unknown"]

# 🔹 Translate every row's genre IDs into names
genre_names = df["Genre IDs"].apply(map_genres)

# 🔹 Swap the ID column for the new Genres column (appended last)
df = df.drop(columns=["Genre IDs"]).assign(Genres=genre_names)

# 🔹 Persist the cleaned dataset
df.to_csv("tmdb_movies_cleaned.csv", index=False)

print("✅ Genre mapping done! Dataset updated & saved as 'tmdb_movies_cleaned.csv'.")


✅ Genre mapping done! Dataset updated & saved as 'tmdb_movies_cleaned.csv'.


In [None]:
# prompt: check for null values

# Count the missing entries in each column, then report them
null_counts = df.isnull().sum()
print("\nNull value statistics:\n", null_counts)


In [None]:
# Fill missing Overviews.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['col'] is a chained in-place operation that recent pandas versions warn
# about and that does not modify df once Copy-on-Write is enabled.
df['Overview'] = df['Overview'].fillna("No overview available")

# Fill missing Release Date with 'Unknown'
df['Release Date'] = df['Release Date'].fillna("Unknown")

# Verify again
print(df.isnull().sum())


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load dataset
df = pd.read_csv("tmdb_movies_cleaned.csv")

# Ensure missing overviews are handled.
# Assign the result back: fillna(inplace=True) on df['Overview'] is a chained
# in-place call that does not update df under pandas Copy-on-Write, which
# would leave NaN values for TfidfVectorizer to choke on.
df['Overview'] = df['Overview'].fillna("No overview available")

# Step 1: Convert text data (Overview) into numerical representation
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df['Overview'])

# Step 2: Compute cosine similarity between movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Step 3: Create a function to recommend movies
def recommend_movies(title, df, cosine_sim=cosine_sim):
    """Return the titles of the five movies most similar to ``title``.

    Args:
        title: Exact movie title to look up in ``df['Title']``.
        df: DataFrame with a 'Title' column whose row order matches the rows
            of ``cosine_sim``.
        cosine_sim: Square similarity matrix; defaults to the matrix that
            existed when this function was defined.

    Returns:
        A Series holding up to five titles, most similar first.

    Raises:
        IndexError: If ``title`` does not occur in ``df['Title']``.
    """
    # Work with row positions, because cosine_sim is indexed by position, not
    # by the DataFrame's index labels.
    matches = (df['Title'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Title {title!r} not found in dataset")
    idx = matches[0]

    # Rank every movie by its similarity to the requested one
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Drop the queried movie by position rather than assuming it sorts first:
    # another movie with an identical score (e.g. the same overview text) can
    # precede it, in which case slicing [1:6] would recommend the movie itself.
    movie_indices = [pos for pos, _ in ranked if pos != idx][:5]

    # Return recommended movie titles
    return df['Title'].iloc[movie_indices]

# Try the recommender on a sample title
movie_name = "Cosmic Chaos"  # Change this to any movie title in your dataset
similar_titles = recommend_movies(movie_name, df)
print("Movies similar to", movie_name, ":\n", similar_titles)


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import ast  # literal_eval parses list literals without executing arbitrary code

# Load dataset
df = pd.read_csv("tmdb_movies_cleaned.csv")

# Handle missing values
df['Overview'] = df['Overview'].fillna("No overview available")
df['Genres'] = df['Genres'].fillna("[]")  # Missing genres become an empty list literal

# Convert Genres from list format (e.g., "['Action', 'Comedy']") to a string.
# ast.literal_eval is used instead of eval so that text read from the CSV can
# only ever be interpreted as a literal, never run as code.
df['Genres'] = df['Genres'].apply(
    lambda x: " ".join(ast.literal_eval(x)) if isinstance(x, str) else ""
)

# Create a combined text column
df['Combined_Text'] = df['Overview'] + " " + df['Genres']

# TF-IDF Vectorization on Combined Text
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df['Combined_Text'])

# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to recommend movies
def recommend_movies(title, df, cosine_sim=cosine_sim):
    """Return the titles of the five movies most similar to ``title``.

    Args:
        title: Exact movie title to look up in ``df['Title']``.
        df: DataFrame with a 'Title' column whose row order matches the rows
            of ``cosine_sim``.
        cosine_sim: Square similarity matrix; defaults to the matrix that
            existed when this function was defined.

    Returns:
        A Series holding up to five titles, most similar first, or the string
        "Movie not found in dataset." when ``title`` is absent.
    """
    # Work with row positions, because cosine_sim is indexed by position, not
    # by the DataFrame's index labels.
    matches = (df['Title'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        return "Movie not found in dataset."

    idx = matches[0]
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Drop the queried movie by position rather than assuming it sorts first:
    # another movie with an identical score can precede it, in which case
    # slicing [1:6] would recommend the movie itself.
    movie_indices = [pos for pos, _ in ranked if pos != idx][:5]

    return df['Title'].iloc[movie_indices]

In [None]:
# Source of the standalone Gradio app. It is kept as a string so the notebook
# can write it out as app.py; the text itself is not executed here.
code = """import gradio as gr
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process
import os

# Load dataset (ensure correct path)
file_path = os.path.join(os.path.dirname(__file__), "tmdb_movies_cleaned.csv")
df = pd.read_csv(file_path)

# Fill missing values
df['Overview'] = df['Overview'].fillna("No overview available")

# Combine text features
df['Combined'] = df['Overview'] + " " + df['Genres']

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df['Combined'])

# Compute Cosine Similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get recommendations
def get_recommendations(title):
    if title not in df['Title'].values:
        closest_match = process.extractOne(title, df['Title'].values)
        if closest_match and closest_match[1] > 60:
            title = closest_match[0]
        else:
            return ["Movie not found! Try another title."]
    
    idx = df[df['Title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:6]
    movie_indices = [i[0] for i in sim_scores]
    recommended_movies = df['Title'].iloc[movie_indices].tolist()
    
    return recommended_movies

# Gradio Interface
interface = gr.Interface(
    fn=get_recommendations,
    inputs=gr.Textbox(label="Enter Movie Name"),
    outputs=gr.List(label="Recommended Movies"),
    title="🎬 Movie Recommendation System",
    description="Enter a movie title, and get 5 similar movies based on their descriptions and genres."
)

# Launch the app
interface.launch()
"""

# The app source contains a non-ASCII character (the emoji in the title), so
# the encoding is set explicitly instead of relying on the platform default.
with open("app.py", "w", encoding="utf-8") as file:
    file.write(code)

print("✅ app.py has been saved!")


In [None]:
# Packages the generated app depends on, one per line of requirements.txt
reqs = "\n".join([
    "gradio",
    "pandas",
    "scikit-learn",
    "fuzzywuzzy",
    "python-Levenshtein",
])

with open("requirements.txt", "w") as req_file:
    req_file.write(reqs)

print("✅ requirements.txt has been saved!")
