In [10]:
import os
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

base_dir = os.getcwd()
file_path = os.path.join(base_dir, "movies.csv")

In [11]:
def preprocess_movies(df):
    df = df[df['vote_average'] != 0.0]
    
    # Combine genres and overview into a single text column
    df["tags"] = (df["genres"].fillna('') + ' ' + df["overview"].fillna('')).str.strip()
    
    return df

In [12]:
def content_based_recommender(csv_path, movie_title, top_n=5):
    # Load dataset
    df = pd.read_csv(csv_path)
    
    # Drop unnecessary column if exists
    if 'Unnamed: 0' in df.columns:
        df.drop(columns=['Unnamed: 0'], inplace=True)

    df = preprocess_movies(df)

    # TF-IDF vectorization
    tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
    tfidf_matrix = tfidf.fit_transform(df['tags'])
    
    # Cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Map movie titles to indices
    indices = pd.Series(df.index, index=df['title']).drop_duplicates()
    
    idx = indices.get(movie_title)
    if idx is None:
        return f"Movie '{movie_title}' not found in dataset."
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    sim_indices = [i[0] for i in sim_scores[1:top_n+1]]
    
    return df['title'].iloc[sim_indices].tolist()

# Example usage
recommended_movies = content_based_recommender(file_path, "Inception", top_n=5)
print(recommended_movies)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tags"] = (df["genres"].fillna('') + ' ' + df["overview"].fillna('')).str.strip()


['Addicted', 'Protocol', 'She-Devil', 'Tarzan II', "Lady Chatterley's Lover"]


In [13]:
def train_and_pickle(csv_path, output_path):
    df = pd.read_csv(csv_path)
    
    if 'Unnamed: 0' in df.columns:
        df.drop(columns=['Unnamed: 0'], inplace=True)
    
    df = preprocess_movies(df)
    
    tfidf = TfidfVectorizer(stop_words='english', max_features=1000, ngram_range=(1, 2))
    tfidf_matrix = tfidf.fit_transform(df['tags'])
    
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Save model and similarity matrix
    try:
        with open(output_path, 'wb') as f:
            pickle.dump((tfidf, cosine_sim), f)
        print(f"Model saved successfully as {output_path}")
    except Exception as e:
        print(f"Error saving the model: {e}")

# Call the function to train and pickle the model
train_and_pickle(file_path, 'tfidf_model2.pkl')

# Check if the file was created
if os.path.exists('tfidf_model.pkl'):
    print("Pickle file is saved successfully!")
else:
    print("Pickle file was not created.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tags"] = (df["genres"].fillna('') + ' ' + df["overview"].fillna('')).str.strip()


Model saved successfully as tfidf_model2.pkl
Pickle file is saved successfully!
