In [31]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Directory the notebook kernel is currently running in.
base_dir = os.getcwd()

# Location of the dataset: a file named movies.csv inside that directory.
file_path = os.path.join(base_dir, "movies.csv")

# Load the dataset into a DataFrame at notebook scope.
df = pd.read_csv(file_path)

In [32]:
def content_based_recommender(csv_path, movie_title, top_n=5):
    """Recommend movies whose genres/overview text is closest to a given title.

    The CSV is read, each movie's ``genres`` and ``overview`` are joined into
    one text field, the text is vectorised with TF-IDF (English stop words
    removed, vocabulary capped at 1000 terms) and the queried movie is
    compared with every other movie by cosine similarity.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file containing at least the columns ``title``, ``genres`` and
        ``overview``.
    movie_title : str
        Exact title to look up in the ``title`` column. If the title occurs
        more than once, the first occurrence is used.
    top_n : int, default 5
        Maximum number of recommendations. Values below 1 yield an empty list.

    Returns
    -------
    list of str or str
        Up to ``top_n`` titles ordered from most to least similar, never
        including the queried row itself. If ``movie_title`` is not present,
        a human-readable message string is returned instead (kept for
        backward compatibility with existing callers).
    """
    df = pd.read_csv(csv_path)

    # Fill each column *before* concatenating: NaN + str is NaN, so filling
    # afterwards would throw away the genres of every movie that merely lacks
    # an overview (and vice versa).
    genres = df["genres"].fillna('').astype(str)
    overview = df["overview"].fillna('').astype(str)
    tags = genres + ' ' + overview

    # Convert the combined genres + overview text into TF-IDF vectors.
    tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
    tfidf_matrix = tfidf.fit_transform(tags)

    # Map title -> row position. De-duplicate on the *titles* (the index);
    # Series.drop_duplicates() would only look at the already-unique row
    # numbers and leave duplicate titles in place.
    indices = pd.Series(range(len(df)), index=df['title'])
    indices = indices[~indices.index.duplicated(keep='first')]

    idx = indices.get(movie_title, None)
    if idx is None:
        return f"Movie '{movie_title}' not found in dataset."
    idx = int(idx)

    # Only the queried movie's row of the similarity matrix is needed, so
    # compare that single vector against all movies instead of building the
    # full n x n matrix.
    scores = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).ravel()

    # Rank every *other* row by similarity. Excluding idx explicitly (rather
    # than dropping whatever sorts first) stays correct when another movie
    # ties with the query or when the query's vector is all zeros. sorted()
    # is stable, so ties keep dataset order.
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != idx),
        key=lambda pos: scores[pos],
        reverse=True,
    )
    sim_indices = ranked[:max(top_n, 0)]

    # Return top recommended movies
    return df['title'].iloc[sim_indices].tolist()

# Demo: ask for the five closest matches to "Inception" in the movies CSV.
csv_path = file_path
recommended_movies = content_based_recommender(
    csv_path=csv_path,
    movie_title="Inception",
    top_n=5,
)
print(recommended_movies)


['The Hunger Games: Mockingjay - Part 1', 'Fahrenheit 451', 'Transformers: Revenge of the Fallen', 'Pitch Perfect 2', 'Billion Dollar Limited']


In [None]:
import pickle
def train_and_pickle(csv_path, output_path='tfidf_model.pkl'):
    """Fit the TF-IDF model on a movie CSV and pickle it with its similarity matrix.

    Each movie's ``genres`` and ``overview`` are joined into one text field,
    vectorised with TF-IDF (English stop words removed, vocabulary capped at
    1000 terms), and the pairwise cosine similarity between all movies is
    computed. Row/column ``i`` of the similarity matrix corresponds to row
    ``i`` of the CSV.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file containing at least the columns ``genres`` and ``overview``.
    output_path : str or path-like, default 'tfidf_model.pkl'
        Where to write the pickle. The default matches the previously
        hard-coded file name.

    Returns
    -------
    None
        The result is written to ``output_path`` as the tuple
        ``(fitted TfidfVectorizer, cosine similarity matrix)``.

    Notes
    -----
    Pickle files can execute arbitrary code when loaded; only unpickle files
    produced by a trusted run of this function.
    """
    df = pd.read_csv(csv_path)

    # Fill each column *before* concatenating: NaN + str is NaN, so filling
    # afterwards would throw away the genres of every movie that merely lacks
    # an overview (and vice versa).
    genres = df["genres"].fillna('').astype(str)
    overview = df["overview"].fillna('').astype(str)
    tags = genres + ' ' + overview

    # Convert the combined genres + overview text into TF-IDF vectors.
    tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
    tfidf_matrix = tfidf.fit_transform(tags)

    # Compute cosine similarity between every pair of movies.
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Save the TfidfVectorizer and cosine similarity matrix to a .pkl file
    with open(output_path, 'wb') as f:
        pickle.dump((tfidf, cosine_sim), f)