In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import h5py
import joblib
from tabulate import tabulate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the cleaned data and models
df = pd.read_csv('cleaned_data_final.csv')
with h5py.File('tfidf_matrix.h5', 'r') as hdf:
    tfidf_matrix = hdf['tfidf_matrix'][:]
with h5py.File('cosine_sim_matrix.h5', 'r') as hdf:
    cosine_sim = hdf['cosine_sim_matrix'][:]
with h5py.File('svd_matrix.h5', 'r') as hdf:
    svd_matrix = hdf['svd_matrix'][:]
svd = joblib.load('svd_model.pkl')

  df = pd.read_csv('cleaned_data_final.csv')


In [3]:
# Load sentiment analysis model
sentiment_analysis = pipeline("sentiment-analysis")

# Load GPT-like model and tokenizer for extracting movie titles
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [4]:
# Function to analyze sentiment
def analyze_sentiment(query):
    result = sentiment_analysis(query)[0]
    sentiment = "negative" if result['label'] == 'NEGATIVE' else "positive"
    score = result['score']
    return sentiment, score

In [5]:
# Function to extract movie titles using GPT-2
def extract_movie_titles(query):
    inputs = tokenizer(query, return_tensors='pt')
    outputs = model.generate(**inputs, max_length=50, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Assuming the response format: "Titles: [title1, title2]"
    if "Titles:" in generated_text:
        titles = generated_text.replace("Titles:", "").strip().split(',')
        return [title.strip() for title in titles]
    else:
        return []

In [6]:
# Function to perform hybrid recommendations
def hybrid_recommendation(title, df, svd_matrix, cosine_sim_matrix, alpha=0.5, top_n=5):
    idx = df.index[df['title'].str.lower() == title.lower()].tolist()
    if idx:
        idx = idx[0]
        svd_sim = cosine_similarity(svd_matrix[idx].reshape(1, -1), svd_matrix).flatten()
        hybrid_scores = alpha * cosine_sim_matrix[idx] + (1 - alpha) * svd_sim
        top_indices = np.argsort(hybrid_scores)[-top_n-1:-1][::-1]
        return df.iloc[top_indices]
    return pd.DataFrame()

In [7]:
# Function to add diversity and serendipity to recommendations
def add_diversity_and_serendipity(recommendations, df, num_diverse=5, num_serendipitous=5):
    diverse_recs = df.sample(n=num_diverse)
    serendipitous_recs = df[df['tomatoMeter'] > 0.7].sample(n=num_serendipitous) 
    combined_recs = pd.concat([recommendations, diverse_recs, serendipitous_recs])
    final_recommendations = combined_recs.drop_duplicates().head(5).reset_index(drop=True)
    return final_recommendations

In [8]:
# Main function to recommend movies
def recommend_movies(query):
    sentiment, sentiment_score = analyze_sentiment(query)
    titles = extract_movie_titles(query)

    recommendations = pd.DataFrame()

    # If titles are mentioned, recommend movies similar to them
    if titles:
        for title in titles:
            title_recs = hybrid_recommendation(title, df, svd_matrix, cosine_sim, alpha=0.7)
            if not title_recs.empty:
                recommendations = pd.concat([recommendations, title_recs])
                mentioned_movie = df[df['title'].str.lower() == title.lower()]
                recommendations = pd.concat([mentioned_movie, recommendations])

    # Add diversity and serendipity to the recommendations
    recommendations = add_diversity_and_serendipity(recommendations, df)

    # Output the recommendations in table format
    if not recommendations.empty:
        print(f"Query Sentiment: {sentiment.capitalize()} (Score: {sentiment_score:.2f})\n")
        print(tabulate(recommendations[['title', 'genre']], headers='keys', tablefmt='grid'))
    else:
        print("No recommendations available.")
        
    # Evaluate the recommendations
    genre_diversity = calculate_genre_diversity(recommendations)
    similarity = calculate_similarity(recommendations)
    intra_list_similarity = calculate_intra_list_similarity(recommendations)

    # Print the evaluation metrics
    print(f"Genre Diversity: {genre_diversity:.2f}")
    print(f"Similarity: {similarity:.2f}")
    print(f"Intra-List Similarity: {intra_list_similarity:.2f}")

In [9]:
# Evaluation Metrics
def calculate_genre_diversity(recommendations):
    genres = recommendations['genre'].apply(lambda x: x.split(', '))
    unique_genres = set([genre for sublist in genres for genre in sublist])
    return len(unique_genres) / len(recommendations)


def calculate_similarity(recommendations):
    tfidf_subset = tfidf_matrix[recommendations.index, :]
    similarity_matrix = cosine_similarity(tfidf_subset)
    return np.mean(similarity_matrix)


def calculate_intra_list_similarity(recommendations):
    similarity_matrix = cosine_similarity(svd_matrix[recommendations.index])
    return np.mean(similarity_matrix)

In [10]:
# Example usage
query = "I'm Happy Suggest Me Some Movies to Watch!"
recommended_movies = recommend_movies(query)
print(recommended_movies)

Query Sentiment: Positive (Score: 1.00)

+----+----------------------------------+-----------------------------+
|    | title                            | genre                       |
|  0 | she's beautiful when she's angry | documentary, history, drama |
+----+----------------------------------+-----------------------------+
|  1 | my suicide                       | comedy, drama               |
+----+----------------------------------+-----------------------------+
|  2 | perestroika                      | drama                       |
+----+----------------------------------+-----------------------------+
|  3 | daniel deronda                   | unknown                     |
+----+----------------------------------+-----------------------------+
|  4 | states of grace                  | drama                       |
+----+----------------------------------+-----------------------------+
Genre Diversity: 1.00
Similarity: 0.12
Intra-List Similarity: 0.12
None
