In [1]:
import json
import warnings

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

warnings.filterwarnings("ignore")


In [2]:
# Ask for the dataset through Colab's upload widget only when the CSV is not already
# in the working directory. This keeps "Restart & Run All" non-interactive on re-runs
# and lets the notebook run outside Colab when the file is already present.
from pathlib import Path

uploaded = {}
if not Path("tmdb_5000_movies.csv").exists():
    from google.colab import files
    uploaded = files.upload()

Saving tmdb_5000_movies.csv to tmdb_5000_movies.csv


In [3]:
# Read the TMDB movie metadata CSV from the working directory into a DataFrame.
movies_df = pd.read_csv(filepath_or_buffer="tmdb_5000_movies.csv")


In [4]:
# Preview the first five rows to see what each column looks like.
movies_df.head(n=5)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [5]:
# Show the column labels; as the cell's last expression the notebook displays them
# without needing a print() wrapper.
movies_df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')


In [6]:
# (rows, columns) of the loaded dataset, displayed as the cell's result.
movies_df.shape


(4803, 20)


In [7]:
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() only appended a stray "None" line after the report.
movies_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [8]:
def clean_data(data):
    """Flatten one JSON-encoded list cell into a space-separated string of names.

    Parameters
    ----------
    data : str or missing value
        JSON text encoding a list of objects that each carry a ``"name"`` key,
        e.g. ``'[{"id": 28, "name": "Action"}]'``. NaN/None is accepted.

    Returns
    -------
    str
        The ``name`` values joined by single spaces in their original order;
        an empty string for a missing value or an empty list.

    Raises
    ------
    ValueError
        If ``data`` is a string that is not valid JSON
        (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    if pd.isna(data):
        return ""
    # json.loads instead of eval(): the text comes from an external CSV and must never
    # be executed as Python. It also accepts the JSON-only literals null/true/false,
    # which eval() rejects with a NameError.
    return " ".join(entry["name"] for entry in json.loads(data))

# Derive plain-text versions of the JSON-encoded genre and keyword columns.
for source_col in ('genres', 'keywords'):
    movies_df[f'{source_col}_cleaned'] = movies_df[source_col].apply(clean_data)


In [9]:
# Build one text document per movie: genre names, keyword names and the plot overview,
# separated by single spaces. A missing overview contributes an empty string.
movies_df['combined_features'] = movies_df['genres_cleaned'].str.cat(
    [movies_df['keywords_cleaned'], movies_df['overview'].fillna("")],
    sep=" ",
)


In [11]:
# Turn each movie's combined text into a TF-IDF vector, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(movies_df["combined_features"])

# With a single argument the rows are compared against themselves, which yields the
# movie-by-movie similarity matrix.
cosine_sim = cosine_similarity(tfidf_matrix)


In [12]:
# Lookup table: movie title -> row label in movies_df, used to turn a title into a
# row of the similarity matrix.
indices = pd.Series(movies_df.index, index=movies_df['title'])
# Series.drop_duplicates() compares the *values* (row labels, already unique), so it
# never removed repeated titles. De-duplicate on the title labels instead, keeping the
# first occurrence, so that indices[title] is always a single integer.
indices = indices[~indices.index.duplicated(keep='first')]


In [13]:
def recommend_movies(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact movie title, looked up in the module-level ``indices`` Series.
    cosine_sim : array-like, optional
        Square similarity matrix whose row ``i`` corresponds to row ``i`` of
        ``movies_df``. The default is bound when this cell is executed, so re-run
        this cell after recomputing the global ``cosine_sim``.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    list of str or str
        Up to ``top_n`` titles ordered from most to least similar, or the
        "not found" message string when ``title`` is unknown.
    """
    if title not in indices:
        return "Ce film n'est pas dans la base de données."

    idx = indices[title]
    # A title that occurs more than once yields a Series of row numbers; use the first.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx], dtype=float)
    # Stable descending order: equal scores keep their original row order.
    ranked = np.argsort(-scores, kind="stable")
    # Exclude the query movie by row number instead of assuming it is ranked first
    # (an all-zero feature vector or an identical entry can place it elsewhere).
    movie_indices = [int(i) for i in ranked if i != idx][:top_n]
    return movies_df['title'].iloc[movie_indices].tolist()


In [14]:
# Sanity check on a well-known title; the cell's last expression is displayed directly.
recommend_movies("Avatar")


['Mission to Mars', 'Aliens', 'Moonraker', 'Alien³', 'Spaceballs', 'Lifeforce', 'Treasure Planet', 'Lockout', 'Alien', 'Planet of the Apes']


In [16]:
# Second spot check using one of the titles recommended for "Avatar".
recommend_movies("Moonraker")


['Gravity', 'Avatar', 'You Only Live Twice', 'Lifeforce', 'Mission to Mars', 'Spaceballs', 'Silent Running', 'Lockout', 'Treasure Planet', '2001: A Space Odyssey']


In [17]:
def recommend_movies_advanced(title, cosine_sim=cosine_sim, top_n=10):
    """Recommend similar movies, breaking similarity ties by ``vote_average``.

    Movies are ordered by similarity score; ``vote_average`` only decides the order
    of movies whose similarity scores are exactly equal. A later cell in this
    notebook redefines this function with a rating threshold.

    Parameters
    ----------
    title : str
        Exact movie title, looked up in the module-level ``indices`` Series.
    cosine_sim : array-like, optional
        Square similarity matrix aligned with the rows of ``movies_df``. The default
        is bound when this cell is executed.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.DataFrame or str
        ``title`` and ``vote_average`` of up to ``top_n`` movies, most similar first,
        or the "not found" message string when ``title`` is unknown.
    """
    if title not in indices:
        return "Ce film n'est pas dans la base de données."

    idx = indices[title]
    # A title that occurs more than once yields a Series of row numbers; use the first.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx], dtype=float)
    # Read the ratings once; building a row object per sort key was needlessly slow.
    votes = movies_df['vote_average'].to_numpy()
    ranked = sorted(
        range(len(scores)),
        key=lambda i: (scores[i], votes[i]),
        reverse=True,
    )
    # Exclude the query movie by row number instead of assuming it is ranked first.
    movie_indices = [i for i in ranked if i != idx][:top_n]
    return movies_df[['title', 'vote_average']].iloc[movie_indices]


In [18]:
# Leave the DataFrame as the cell's last expression so the notebook renders it as a
# table instead of the plain-text dump produced by print().
recommend_movies_advanced("Avatar")


                   title  vote_average
373      Mission to Mars           5.7
2403              Aliens           7.7
1531           Moonraker           5.9
838               Alien³           6.2
2015          Spaceballs           6.7
1914           Lifeforce           6.2
305      Treasure Planet           7.2
2198             Lockout           5.8
3158               Alien           7.9
278   Planet of the Apes           5.6


In [19]:
# Rebuild the per-movie document with the genre and keyword names each written twice,
# followed by the overview (empty string when missing).
genres_twice = (movies_df['genres_cleaned'] + " ").str.repeat(2)
keywords_twice = (movies_df['keywords_cleaned'] + " ").str.repeat(2)
movies_df['combined_features'] = (
    genres_twice + keywords_twice + movies_df['overview'].fillna("")
)


In [20]:
# Re-fit TF-IDF on the re-weighted text (English stop words removed).
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(movies_df["combined_features"])

# Recompute the movie-by-movie similarity matrix. Functions defined before this cell
# captured the previous matrix as their default argument; only functions defined or
# re-run afterwards pick up this one.
cosine_sim = cosine_similarity(tfidf_matrix)


In [21]:
def recommend_movies_advanced(title, cosine_sim=cosine_sim, score_threshold=6.0, top_n=10):
    """Recommend similar movies whose ``vote_average`` meets a minimum rating.

    Parameters
    ----------
    title : str
        Exact movie title, looked up in the module-level ``indices`` Series.
    cosine_sim : array-like, optional
        Square similarity matrix aligned with the rows of ``movies_df``. The default
        is bound when this cell is executed, so re-run this cell after recomputing
        the global ``cosine_sim``.
    score_threshold : float, optional
        Minimum ``vote_average`` a movie needs to be recommended (default 6.0).
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.DataFrame or str
        ``title`` and ``vote_average`` of up to ``top_n`` movies, most similar first,
        or the "not found" message string when ``title`` is unknown.
    """
    if title not in indices:
        return "Ce film n'est pas dans la base de données."

    idx = indices[title]
    # A title that occurs more than once yields a Series of row numbers; use the first.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx], dtype=float)
    # Read the ratings once instead of one DataFrame row lookup per candidate.
    votes = movies_df['vote_average'].to_numpy()
    # Stable descending order: equal scores keep their original row order.
    ranked = np.argsort(-scores, kind="stable")

    # Remove the query movie explicitly. Slicing off the first element after the rating
    # filter dropped the best real match whenever the query movie itself was rated
    # below the threshold (it had already been filtered out).
    movie_indices = [
        int(i) for i in ranked
        if i != idx and votes[i] >= score_threshold
    ][:top_n]
    return movies_df[['title', 'vote_average']].iloc[movie_indices]


In [22]:
# Leave the DataFrame as the cell's last expression so the notebook renders it as a
# table instead of the plain-text dump produced by print().
recommend_movies_advanced("Avatar", score_threshold=6.0)

                        title  vote_average
2403                   Aliens           7.7
838                    Alien³           6.2
4332           Silent Running           6.3
3158                    Alien           7.9
2015               Spaceballs           6.7
47    Star Trek Into Darkness           7.4
305           Treasure Planet           7.2
541                   Soldier           6.1
239                   Gravity           7.3
1951               Space Dogs           6.3


In [33]:
# Spot check on a franchise title; displayed as a rich table via the last expression.
recommend_movies_advanced("The Avengers", score_threshold=6.0)

                                   title  vote_average
7                Avengers: Age of Ultron           7.3
79                            Iron Man 2           6.6
26            Captain America: Civil War           7.1
182                              Ant-Man           7.0
511                                X-Men           6.8
85   Captain America: The Winter Soldier           7.6
169   Captain America: The First Avenger           6.6
126                 Thor: The Dark World           6.8
64                     X-Men: Apocalypse           6.4
68                              Iron Man           7.4
