In [1]:
import os
import pandas as pd
import re
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

# Request the NLTK corpora this notebook relies on. nltk.download is invoked
# every time this cell runs; quiet=True suppresses its progress output.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Shared text-processing helpers used by clean_text:
# a WordNet lemmatizer and the English stopword list as a set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [2]:
def clean_text(text):
    """Normalize free text into a space-separated string of lemmatized tokens.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is deleted (not replaced by a space, so "sci-fi" becomes
    "scifi"), tokens found in the module-level ``stop_words`` set are dropped,
    and the remaining tokens are lemmatized with the module-level
    ``lemmatizer``.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces; ``''`` when the input
        is missing or no token survives.
    """
    # Missing values would otherwise be stringified to "none" / "nan" and end
    # up in the output as if they were real words.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    letters_only = re.sub(r'[^a-zA-Z\s]', '', str(text).lower())
    kept = (token for token in letters_only.split() if token not in stop_words)
    return ' '.join(lemmatizer.lemmatize(token) for token in kept)

# Resolve the dataset location relative to the current working directory
base_dir = os.getcwd()
file_path = os.path.join(base_dir, "movies.csv")

# Load the movies table, discarding the 'Unnamed: 0' column when the CSV has one
df = pd.read_csv(file_path).drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
def train_and_pickle(csv_path, output_path="tfidf_model3.pkl"):
    """Build TF-IDF features and a cosine-similarity matrix, then pickle them.

    The CSV is read fresh from ``csv_path``. Its ``overview`` and ``genres``
    columns are cleaned with ``clean_text`` and concatenated into a ``tags``
    string per row, which is vectorized with TF-IDF. The pickle holds the
    2-tuple ``(tfidf_matrix, cosine_sim)``; the fitted vectorizer itself is
    not saved.

    Args:
        csv_path: Path of the CSV to read. It must contain ``overview`` and
            ``genres`` columns.
        output_path: Destination of the pickle file. Defaults to
            ``"tfidf_model3.pkl"``, the file name the loading code in this
            notebook opens, so the single-argument call keeps working.

    Returns:
        None. Success or failure of the save step is reported with ``print``;
        errors raised while writing the pickle are caught and printed.
    """
    # Load dataset
    df = pd.read_csv(csv_path)

    # Blank out missing text BEFORE cleaning: clean_text stringifies its input,
    # so a NaN would otherwise become the token "nan" and a later fillna would
    # have nothing left to fill.
    df['cleaned_overview'] = df['overview'].fillna('').apply(clean_text)
    df['genres'] = df['genres'].fillna('').apply(clean_text)

    # Create a 'tags' column by combining 'genres' and 'cleaned_overview'
    df["tags"] = df["genres"] + ' ' + df["cleaned_overview"]

    # Initialize the TF-IDF vectorizer with desired parameters
    tfidf = TfidfVectorizer(
        max_df=0.8,         # Ignore terms present in more than 80% of documents
        min_df=5,           # Ignore rare terms (in fewer than 5 docs)
        ngram_range=(1, 2), # Use unigrams + bigrams
        stop_words='english'
    )

    # Fit and transform the tags into a TF-IDF matrix (one row per CSV row)
    tfidf_matrix = tfidf.fit_transform(df['tags'])

    # Pairwise cosine similarity between all rows of the TF-IDF matrix
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Save the TF-IDF matrix and cosine similarity matrix in the pickle file
    try:
        with open(output_path, 'wb') as f:
            pickle.dump((tfidf_matrix, cosine_sim), f)
        print(f"Pickle file saved successfully as {output_path}")
    except Exception as e:
        print(f"Error saving the pickle file: {e}")

In [4]:

# Train and pickle the model.
# train_and_pickle takes the destination path as well as the CSV path; the same
# path is reused below so the file that was written is the one that is read.
model_path = 'tfidf_model3.pkl'
train_and_pickle(file_path, model_path)

# Later when you load your pickled model, simply do:
# (pickle.load can execute arbitrary code -- only load files you created yourself)
try:
    with open(model_path, 'rb') as f:
        tfidf_matrix, cosine_sim = pickle.load(f)
    print("Pickle file loaded successfully.")
except Exception as e:
    print(f"Error loading the pickle file: {e}")

Pickle file saved successfully as tfidf_model3.pkl
Pickle file loaded successfully.


In [None]:
def load_pickled_model():
    """Load the movie table and the pickled similarity artifacts.

    Reads ``movies.csv`` and ``tfidf_model3.pkl`` from the directory named by
    the module-level ``base_dir``.

    Returns:
        tuple: ``(tfidf_matrix, cosine_sim, df)`` where the first two items are
        the objects unpacked from the pickle and ``df`` is the DataFrame read
        from the CSV.

    Raises:
        Any exception raised by ``pd.read_csv``, ``open`` or ``pickle.load``
        (for example when a file is missing); nothing is caught here.
    """
    # Load the dataframe from CSV
    df = pd.read_csv(os.path.join(base_dir, "movies.csv"))

    # SECURITY: pickle.load can execute arbitrary code, so only load a file
    # that this notebook produced. The context manager guarantees the file
    # handle is closed instead of leaving it open after the call returns.
    with open(os.path.join(base_dir, "tfidf_model3.pkl"), "rb") as f:
        # The pickle stores exactly two objects: (tfidf_matrix, cosine_sim)
        tfidf_matrix, cosine_sim = pickle.load(f)

    return tfidf_matrix, cosine_sim, df

def content_based_recommender_with_pickled_model(
    movie_title, top_n=5, tfidf=None, cosine_sim=None, df=None
):
    """Recommend the titles most similar to ``movie_title``.

    Args:
        movie_title: Title to look up in ``df["title"]`` (exact match). When
            several rows share the title, the first such row is used.
        top_n: Maximum number of recommendations to return. Values below zero
            are treated as zero.
        tfidf: Unused. Accepted only so existing callers that pass it keep
            working.
        cosine_sim: Row-indexable similarity matrix. Row ``i`` is read as the
            similarities of the ``i``-th row of ``df`` (by position) to every
            row, so it must have been built from the same rows in the same
            order.
        df: DataFrame with a ``title`` column.

    Returns:
        list[str]: Up to ``top_n`` titles ordered from most to least similar,
        never including the queried row itself. If ``movie_title`` is not in
        the dataset, a message string is returned instead of a list.

    Raises:
        ValueError: If ``cosine_sim`` or ``df`` is not supplied.
    """
    if df is None or cosine_sim is None:
        raise ValueError("Both 'cosine_sim' and 'df' must be provided.")

    # Map each title to the POSITION of its first occurrence. Positions (not
    # index labels) are what cosine_sim rows and .iloc expect, and keeping
    # only the first occurrence guarantees a scalar lookup for repeated titles.
    title_to_pos = {}
    for pos, title in enumerate(df["title"]):
        title_to_pos.setdefault(title, pos)

    idx = title_to_pos.get(movie_title)
    if idx is None:
        return f"Movie '{movie_title}' not found in dataset."

    # Rank every other row by similarity. The queried row is excluded
    # explicitly rather than by assuming it sorts first: with tied scores it
    # may not, which would recommend the movie itself and drop a real match.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Honour the requested count directly instead of capping at a fixed pool.
    best = [pos for pos, _ in ranked[:max(top_n, 0)]]

    # Return recommended movies
    return df["title"].iloc[best].tolist()

# Restore the similarity artifacts together with the movie table
tfidf_matrix, cosine_sim, df = load_pickled_model()

# Example usage: request the five closest matches to "Dune: Part Two"
recommended_movies = content_based_recommender_with_pickled_model(
    movie_title="Dune: Part Two",
    top_n=5,
    tfidf=tfidf_matrix,
    cosine_sim=cosine_sim,
    df=df,
)
print(recommended_movies)

['The Wolf of Wall Street', "Mothers' Instinct", 'Carry-On', 'The Collection', 'Mortal Kombat']
