In [None]:
import pandas as pd

# Load TMDB datasets
tmdb_movies = pd.read_csv("../data/tmdb_5000_movies.csv")
tmdb_credits = pd.read_csv("../data/tmdb_5000_credits.csv")

# Load MovieLens ratings
ratings = pd.read_csv("../data/ratings.csv")

print("TMDB Movies:", tmdb_movies.shape)
print("TMDB Credits:", tmdb_credits.shape)
print("Ratings:", ratings.shape)

# Merge TMDB movies and credits on movie ID. merge() defaults to an inner
# join, so only movies present in both tables are kept.
movies = tmdb_movies.merge(tmdb_credits, left_on="id", right_on="movie_id")

# print(movies.shape)
movies.head(2)

# Select relevant columns. The explicit .copy() makes `movies` an independent
# frame, so the column assignments further down do not raise
# SettingWithCopyWarning on a selection taken from the merged frame.
movies = movies[['id','original_title','overview','genres','keywords','cast','crew']].copy()
movies.head(2)

# Convert JSON-like text to lists
import ast

def extract_names(obj):
    """Return the normalized ``name`` of every entry in a stringified list.

    Args:
        obj: Text containing a Python-literal list of dicts that each carry
            a ``'name'`` key, e.g. ``'[{"id": 28, "name": "Action"}]'``.

    Returns:
        A list with each name lower-cased and with spaces removed. An empty
        list is returned when ``obj`` cannot be parsed (including missing
        values such as NaN) or when an entry lacks a usable ``'name'``.
    """
    try:
        items = ast.literal_eval(obj)
        return [i['name'].lower().replace(" ", "") for i in items]
    except (ValueError, SyntaxError, TypeError, KeyError, AttributeError,
            RecursionError, MemoryError):
        # Best-effort parsing: malformed text or unexpected entry shapes
        # yield "no names", while KeyboardInterrupt/SystemExit still propagate.
        return []
# Replace the stringified genre and keyword columns with the lists of
# normalized names produced by extract_names.
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(extract_names)

movies[['original_title','genres','keywords']].head(2)

#   Extract top 3 cast members
def get_top3_cast(obj):
    """Return the normalized names of the first three cast entries.

    Args:
        obj: Text containing a Python-literal list of dicts that each carry
            a ``'name'`` key.

    Returns:
        A list of at most three names, taken in the order they appear, each
        lower-cased and with spaces removed. An empty list is returned when
        ``obj`` cannot be parsed (including missing values such as NaN) or
        when one of the first three entries lacks a usable ``'name'``.
    """
    try:
        items = ast.literal_eval(obj)
        # Only the first three entries are inspected; later ones are ignored.
        return [i['name'].lower().replace(" ", "") for i in items[:3]]
    except (ValueError, SyntaxError, TypeError, KeyError, AttributeError,
            RecursionError, MemoryError):
        # Best-effort parsing: malformed text or unexpected entry shapes
        # yield "no cast", while KeyboardInterrupt/SystemExit still propagate.
        return []
    
# Swap the stringified cast column for the first three normalized names
# returned by get_top3_cast.
top_cast = movies['cast'].apply(get_top3_cast)
movies['cast'] = top_cast
movies[['original_title','cast']].head(2)

#   Extract director from crew
def get_director(obj):
    """Return the normalized name of the first crew entry whose job is Director.

    Args:
        obj: Text containing a Python-literal list of dicts that each carry
            ``'job'`` and ``'name'`` keys.

    Returns:
        A one-element list holding the director's name, lower-cased and with
        spaces removed, so it can be concatenated with the other feature
        lists. An empty list is returned when no entry has the job
        ``'Director'``, when ``obj`` cannot be parsed (including missing
        values such as NaN), or when an entry examined before a match lacks
        the expected keys.
    """
    try:
        crew = ast.literal_eval(obj)
        for member in crew:
            if member['job'] == 'Director':
                # Stop at the first match; any further directors are ignored.
                return [member['name'].lower().replace(" ", "")]
    except (ValueError, SyntaxError, TypeError, KeyError, AttributeError,
            RecursionError, MemoryError):
        # Best-effort parsing: malformed text or unexpected entry shapes
        # yield "no director", while KeyboardInterrupt/SystemExit still propagate.
        return []
    return []

# Derive the director column from the stringified crew column.
director_names = movies['crew'].apply(get_director)
movies['director'] = director_names
movies[['original_title','director']].head(2)

# Process overview text: missing overviews become "", then every overview
# is lower-cased and split on whitespace into a list of tokens.
overview_text = movies['overview'].fillna("")
movies['overview'] = overview_text.apply(lambda text: text.lower().split())
movies[['original_title','overview']].head(2)

# Create the movie profile: concatenate the per-movie token lists in a fixed
# column order, then join the tokens into one space-separated string.
profile_columns = ['overview', 'genres', 'keywords', 'cast', 'director']
profile_tokens = movies[profile_columns[0]]
for column in profile_columns[1:]:
    # Adding two columns of lists concatenates the lists row by row.
    profile_tokens = profile_tokens + movies[column]
movies['movie_profile'] = profile_tokens.apply(" ".join)
movies[['original_title','movie_profile']].head(2)

# Final selection of columns
final_columns = ['id', 'original_title', 'movie_profile']
final_movies = movies[final_columns]
final_movies.head()

Unnamed: 0,id,original_title,movie_profile
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."
