In [2]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# Load every column of tmdb_5000_movies.csv from the current working directory.
# NOTE(review): `df2` is not referenced by any of the cells visible in this notebook — confirm it is still needed.
df2 = pd.read_csv("./tmdb_5000_movies.csv")

In [7]:
# Location of the movies CSV, relative to the notebook's working directory
csv_file_path = 'tmdb_5000_movies.csv'  # Replace with your actual file path

# Load only the three columns this notebook uses, discard rows whose overview
# is missing (they would cause errors during vectorisation), and renumber the
# remaining rows 0..n-1 so that index labels coincide with row positions.
df = (
    pd.read_csv(csv_file_path, usecols=['id', 'title', 'overview'])
    .dropna(subset=['overview'])
    .reset_index(drop=True)
)

# Preview the first rows to confirm the expected columns were loaded
df.head()


Unnamed: 0,id,overview,title
0,19995,"In the 22nd century, a paraplegic Marine is di...",Avatar
1,285,"Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End
2,206647,A cryptic message from Bond’s past sends him o...,Spectre
3,49026,Following the death of District Attorney Harve...,The Dark Knight Rises
4,49529,"John Carter is a war-weary, former military ca...",John Carter


In [9]:
# Build a TF-IDF vectoriser that filters out scikit-learn's built-in English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the overviews and encode each overview as one TF-IDF row
tfidf_matrix = tfidf.fit_transform(raw_documents=df['overview'])

# Pairwise dot products between all overview vectors. TfidfVectorizer L2-normalises
# every row by default (norm='l2'), so these dot products are cosine similarities.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)



In [18]:
# Map each movie title to its row position in `df`.
# Series.drop_duplicates() compares the *values* (the row numbers, which are all
# distinct), so it would never remove anything: repeated titles would stay in the
# index and looking one up would return a Series instead of a single position.
# De-duplicating on the index (the titles) and keeping the first occurrence
# guarantees exactly one row position per title.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """
    Return the movies whose overviews are most similar to that of ``title``.

    Parameters:
        title (str): Title to look up in the module-level ``indices`` Series.
        cosine_sim (ndarray): Precomputed pairwise similarity matrix; row ``i``
            holds the scores of movie ``i`` against every movie in ``df``.
        top_n (int): Maximum number of recommendations to return; expected to
            be non-negative. Defaults to 10.

    Returns:
        tuple: ``(titles, sim_scores)`` where ``titles`` is a list of the
        recommended movie titles and ``sim_scores`` is the matching list of
        ``(row_index, score)`` pairs, highest score first. Both lists are
        empty when ``title`` is not found, so callers can always unpack the
        result into two names.
    """
    # Retrieve the row position for the movie that matches the title
    idx = indices.get(title)

    # Unknown title: report it and return the same 2-tuple shape as the
    # success path (a bare [] would make `a, b = get_recommendations(...)` raise)
    if idx is None:
        print(f"Title '{title}' not found in dataset.")
        return [], []

    # If several rows share this title the lookup yields a Series of positions
    # rather than a scalar; fall back to the first matching row.
    idx = int(np.atleast_1d(idx)[0])

    # Pair every other movie with its score. The queried movie is excluded by
    # position instead of assuming it sorts first: ties (e.g. identical
    # overviews) or an all-zero TF-IDF row can place another movie ahead of it.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Stable descending sort keeps lower row numbers first among equal scores
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    sim_scores = sim_scores[:top_n]

    # Translate the selected row positions back into titles
    movie_indices = [i for i, _ in sim_scores]
    return df['title'].iloc[movie_indices].tolist(), sim_scores


In [None]:
# Smoke-test the recommender on a single, well-known title
example_title = "Forrest Gump"  # Replace with an actual title from your dataset
recommendations, sscores = get_recommendations(example_title)

# Show the recommended titles one per line, then the raw (row_index, score) pairs
print(f"Recommendations for '{example_title}':")
for recommended_title in recommendations:
    print(recommended_title)
print(sscores)


Recommendations for 'Forrest Gump':
An American Haunting
Heaven is for Real
Niagara
The Shaggy Dog
Room
Frozen
Love in the Time of Cholera
Lovely, Still
Mr. Bean's Holiday
I Married a Strange Person!
[(2685, np.float64(0.15962640204016756)), (2903, np.float64(0.11886080080825893)), (4285, np.float64(0.11589609863543196)), (725, np.float64(0.1042650018990765)), (2759, np.float64(0.1033232292122484)), (124, np.float64(0.0996770447464312)), (1098, np.float64(0.09814023050733961)), (3648, np.float64(0.09539959674535414)), (1871, np.float64(0.09310460038833449)), (4638, np.float64(0.09093249895809938))]


: 