In [38]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

def load_and_merge_data(credits_path, movies_path):
    """Read the credits and movies CSV files and join them into one frame.

    Rows are matched with an inner join on ``credits.movie_id == movies.id``.
    Both inputs carry a ``title`` column, so the merge produces ``title_x``
    (from credits) and ``title_y`` (from movies); the credits copy and the
    redundant ``movie_id`` key are dropped and ``title_y`` becomes ``title``.

    Args:
        credits_path: Path of the credits CSV (needs ``movie_id`` and ``title``).
        movies_path: Path of the movies CSV (needs ``id`` and ``title``).

    Returns:
        The merged ``pandas.DataFrame``.
    """
    credits_df = pd.read_csv(credits_path)
    movies_df = pd.read_csv(movies_path)

    merged = credits_df.merge(movies_df, left_on='movie_id', right_on='id')
    merged = merged.drop(columns=['title_x', 'movie_id'])
    return merged.rename(columns={'title_y': 'title'})

# Load the merged credits + movies table from the Kaggle TMDB input directory.
# This module-level frame is only inspected here; main() reads the same two
# files again into its own local frame.
df = load_and_merge_data(
    '/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv',
    '/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv'
)
# A bare trailing expression makes the notebook render the frame as cell output.
df

Unnamed: 0,cast,crew,budget,genres,homepage,id,keywords,original_language,original_title,overview,...,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,...,"[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...",260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,"[{""cast_id"": 1, ""character"": ""El Mariachi"", ""c...","[{""credit_id"": ""52fe44eec3a36847f80b280b"", ""de...",220000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",,9367,"[{""id"": 5616, ""name"": ""united states\u2013mexi...",es,El Mariachi,El Mariachi just wants to play his guitar and ...,...,"[{""iso_3166_1"": ""MX"", ""name"": ""Mexico""}, {""iso...",1992-09-04,2040920,81.0,"[{""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,"He didn't come looking for trouble, but troubl...",El Mariachi,6.6,238
4799,"[{""cast_id"": 1, ""character"": ""Buzzy"", ""credit_...","[{""credit_id"": ""52fe487dc3a368484e0fb013"", ""de...",9000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...",,72766,[],en,Newlyweds,A newlywed couple's honeymoon is upended by th...,...,[],2011-12-26,0,85.0,[],Released,A newlywed couple's honeymoon is upended by th...,Newlyweds,5.9,5
4800,"[{""cast_id"": 8, ""character"": ""Oliver O\u2019To...","[{""credit_id"": ""52fe4df3c3a36847f8275ecf"", ""de...",0,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...",http://www.hallmarkchannel.com/signedsealeddel...,231617,"[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""nam...",en,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2013-10-13,0,120.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,"Signed, Sealed, Delivered",7.0,6
4801,"[{""cast_id"": 3, ""character"": ""Sam"", ""credit_id...","[{""credit_id"": ""52fe4ad9c3a368484e16a36b"", ""de...",0,[],http://shanghaicalling.com/,126186,[],en,Shanghai Calling,When ambitious New York attorney Sam is sent t...,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-05-03,0,98.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A New Yorker in Shanghai,Shanghai Calling,5.7,7


In [32]:

def _parse_records(value):
    """Return the list of records stored in one raw metadata cell.

    Strings are parsed with ``literal_eval``; a value that is already a list is
    returned unchanged; anything else (``NaN``/``None``) yields an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return literal_eval(value)
    return []


def _record_names(value, limit=None):
    """Return the ``name`` of the first ``limit`` records (all when ``None``)."""
    # Entries that are already plain names (frame processed earlier) pass through.
    return [rec['name'] if isinstance(rec, dict) else rec
            for rec in _parse_records(value)[:limit]]


def _find_director(value):
    """Return the name of the first crew record whose job is 'Director', else NaN."""
    return next(
        (rec['name'] for rec in _parse_records(value)
         if isinstance(rec, dict) and rec.get('job') == 'Director'),
        np.nan,
    )


def process_metadata(df):
    """Process JSON-like columns to extract key features.

    The frame is modified in place and also returned:

    * ``director`` (new column): name of the first ``crew`` record whose
      ``job`` is ``'Director'``, or ``NaN`` when there is none.
    * ``cast``: names of the first 3 cast records.
    * ``keywords``: names of the first 5 keyword records.
    * ``genres``: names of all genre records.

    Missing cells (``NaN``/``None``) are treated as empty lists, and cells that
    already hold parsed lists are left as they are, so running this function a
    second time on the same frame (e.g. re-executing a notebook cell) does not
    raise. Malformed strings still raise from ``literal_eval``.

    Args:
        df: DataFrame with ``crew``, ``cast``, ``keywords`` and ``genres`` columns.

    Returns:
        The same DataFrame object, with the columns above rewritten.
    """
    # Extract director (the raw ``crew`` column itself is left untouched)
    df['director'] = df['crew'].apply(_find_director)

    # Extract top 3 cast members
    df['cast'] = df['cast'].apply(lambda cell: _record_names(cell, 3))

    # Extract top 5 keywords
    df['keywords'] = df['keywords'].apply(lambda cell: _record_names(cell, 5))

    # Extract genres
    df['genres'] = df['genres'].apply(_record_names)

    return df

def clean_text_features(df):
    """Normalise the categorical text columns and fill missing overviews.

    For ``cast``, ``keywords``, ``director`` and ``genres`` every string is
    lower-cased with its spaces removed (lists are handled element by
    element); any value that is neither a list nor a string becomes ``''``.
    Missing ``overview`` values are replaced with ``''``.

    The frame is modified in place and returned.
    """
    def clean_feature(value):
        # Plain string -> single squashed token.
        if isinstance(value, str):
            return value.replace(" ", "").lower()
        # List of strings -> list of squashed tokens.
        if isinstance(value, list):
            return [item.replace(" ", "").lower() for item in value]
        # Anything else (e.g. NaN) contributes nothing.
        return ''

    for column in ('cast', 'keywords', 'director', 'genres'):
        df[column] = df[column].apply(clean_feature)

    df['overview'] = df['overview'].fillna('')
    return df

def create_hybrid_soup(row):
    """Build the space-separated metadata "soup" string for one movie row.

    The pieces appear in this order: keywords, cast, director, genres,
    overview. List-valued fields are joined with single spaces first; an
    empty field still contributes its separator.
    """
    parts = [
        ' '.join(row['keywords']),
        ' '.join(row['cast']),
        row['director'],
        ' '.join(row['genres']),
        row['overview'],
    ]
    return ' '.join(parts)

def create_similarity_matrices(df, overview_weight=0.4, metadata_weight=0.6):
    """Create hybrid similarity matrix using TF-IDF and CountVectorizer.

    Two pairwise similarity matrices are computed over the rows of ``df`` and
    blended linearly:

    * overview similarity: TF-IDF (English stop words removed, unigrams and
      bigrams) over ``df['overview']``, compared with ``linear_kernel``;
    * metadata similarity: token counts (English stop words removed, capped
      at 5000 features) over ``df['soup']``, compared with
      ``cosine_similarity``.

    Args:
        df: DataFrame with string columns ``overview`` and ``soup``.
        overview_weight: Weight of the overview similarity in the blend.
        metadata_weight: Weight of the metadata similarity in the blend.
            The defaults (0.4 / 0.6) reproduce the previously hard-coded blend.

    Returns:
        A square array with one row/column per row of ``df`` holding
        ``overview_weight * overview_sim + metadata_weight * metadata_sim``.
    """
    # TF-IDF for overviews
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
    tfidf_matrix = tfidf.fit_transform(df['overview'])

    # CountVectorizer for metadata soup
    count = CountVectorizer(stop_words='english', max_features=5000)
    soup_matrix = count.fit_transform(df['soup'])

    # TF-IDF rows are L2-normalised by the vectorizer's default settings, so a
    # plain dot product is used for them; raw counts need cosine_similarity.
    overview_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    metadata_sim = cosine_similarity(soup_matrix, soup_matrix)

    return overview_weight * overview_sim + metadata_weight * metadata_sim

# Recommendation Function
def get_content_based_recommendations(title, df, cosine_sim, top_n=10):
    """Generate recommendations using hybrid similarity matrix.

    Args:
        title: Exact movie title to look up in ``df['title']``. When the title
            occurs more than once, the first occurrence is used.
        df: DataFrame with ``title``, ``director``, ``genres`` and
            ``vote_average`` columns, row-aligned with ``cosine_sim``.
        cosine_sim: Square similarity matrix; row/column ``i`` corresponds to
            the ``i``-th row (by position) of ``df``.
        top_n: Maximum number of recommendations to return.

    Returns:
        A DataFrame with the ``top_n`` most similar movies (the query movie
        itself excluded), ordered by decreasing similarity, with an added
        ``similarity_score`` column; or the string
        ``"Movie not found in database"`` when ``title`` is unknown.
    """
    # Map titles to row *positions* (not index labels) so the lookup stays
    # aligned with cosine_sim and .iloc even when df has a non-default index.
    positions = pd.Series(np.arange(len(df)), index=df['title'])
    # De-duplicate on the title itself; otherwise a repeated title would make
    # the lookup below return several rows instead of one position.
    positions = positions[~positions.index.duplicated(keep='first')]

    # Only the title lookup is guarded, so an unrelated KeyError (e.g. a
    # missing column) is not misreported as an unknown movie.
    try:
        idx = int(positions.loc[title])
    except KeyError:
        return "Movie not found in database"

    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()
    # Stable descending sort keeps the original row order among equal scores.
    order = np.argsort(-scores, kind='stable')
    # Drop the query movie explicitly instead of assuming it is ranked first
    # (ties with identical entries would otherwise let it slip through).
    order = order[order != idx][:max(top_n, 0)]

    # .copy() gives an independent frame so adding a column is unambiguous.
    recommendations = df[['title', 'director', 'genres', 'vote_average']].iloc[order].copy()
    recommendations['similarity_score'] = scores[order]
    return recommendations

# Main Execution Flow
def main():
    """Run the whole pipeline and show sample recommendations.

    Loads and merges the two TMDB CSV files, extracts and normalises the
    metadata columns, builds the per-movie ``soup`` text, computes the hybrid
    similarity matrix, then prints a heading and displays the recommendation
    table for 'Avatar' and for 'Inception'.
    """
    credits_csv = '/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv'
    movies_csv = '/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv'

    # Load -> parse JSON-like columns -> normalise text.
    movies = clean_text_features(
        process_metadata(load_and_merge_data(credits_csv, movies_csv))
    )

    # One combined text field per movie, built row by row.
    movies['soup'] = movies.apply(create_hybrid_soup, axis=1)

    hybrid_sim = create_similarity_matrices(movies)

    # (heading, query title) pairs for the demo output.
    showcase = (
        ("\nRecommendations for 'Avatar':", 'Avatar'),
        ("\nRecommendations for 'Inception':", 'Inception'),
    )
    for heading, query_title in showcase:
        print(heading)
        display(get_content_based_recommendations(query_title, movies, hybrid_sim))



In [33]:
# Run the full pipeline; the headings and recommendation tables below are its output.
main()


Recommendations for 'Avatar':


Unnamed: 0,title,director,genres,vote_average,similarity_score
3604,Apollo 18,gonzalolópez-gallego,"[horror, thriller, sciencefiction]",5.0,0.171072
466,The Time Machine,simonwells,"[sciencefiction, adventure, action]",5.8,0.165824
151,Beowulf,robertzemeckis,"[adventure, action, animation]",5.5,0.152942
942,The Book of Life,jorger.gutierrez,"[romance, animation, adventure, comedy, family...",7.3,0.151377
1438,Krull,peteryates,"[fantasy, action, adventure]",5.8,0.149603
4401,The Helix... Loaded,,"[action, comedy, sciencefiction]",4.8,0.14771
495,Journey 2: The Mysterious Island,bradpeyton,"[adventure, action, sciencefiction]",5.8,0.145488
1191,Small Soldiers,joedante,"[comedy, adventure, fantasy, sciencefiction, a...",6.2,0.143019
2995,Mad Max Beyond Thunderdome,georgemiller,"[action, adventure, sciencefiction]",5.9,0.141915
1341,The Inhabited Island,fyodorbondarchuk,"[action, fantasy, sciencefiction, thriller]",5.3,0.140561



Recommendations for 'Inception':


Unnamed: 0,title,director,genres,vote_average,similarity_score
1267,Duplex,dannydevito,"[action, comedy, thriller]",5.9,0.14736
35,Transformers: Revenge of the Fallen,michaelbay,"[sciencefiction, action, adventure]",6.0,0.140052
134,Mission: Impossible - Rogue Nation,christophermcquarrie,"[action, adventure, thriller]",7.1,0.137775
2897,Cypher,vincenzonatali,"[thriller, sciencefiction, mystery]",6.7,0.132198
2815,Star Trek II: The Wrath of Khan,nicholasmeyer,"[action, adventure, sciencefiction, thriller]",7.3,0.132156
1568,Looper,rianjohnson,"[action, thriller, sciencefiction]",6.6,0.130533
3972,Chicago Overcoat,briancaunter,"[action, crime, thriller]",6.1,0.129119
2156,Nancy Drew,andrewfleming,"[action, adventure, crime, family, mystery, th...",5.8,0.127646
4401,The Helix... Loaded,,"[action, comedy, sciencefiction]",4.8,0.126491
1425,Abduction,johnsingleton,"[thriller, action, mystery]",5.6,0.124222
