In [None]:
!pip install torch
!pip install gradio
!pip install pandas
!pip install numpy
!pip install rapidfuzz
!pip install scikit-learn
!pip install transformers
!pip install huggingface-hub

import os
import torch
import joblib
import numpy as np
import pandas as pd
from rapidfuzz import process
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [None]:
# Load the TMDB export and keep only the rows that have every field the
# recommender displays or embeds.
df = pd.read_csv('/content/top10K-TMDB-movies.csv')
df = df.dropna(subset=['title', 'genre', 'original_language', 'overview', 'popularity', 'release_date', 'vote_average', 'vote_count'])
# dropna() leaves gaps in the index labels. The similarity matrices built
# later are addressed by row *position*, so renumber the rows 0..n-1 to keep
# index labels and matrix positions identical.
df = df.reset_index(drop=True)

In [None]:
# Lexical similarity: turn each overview into a TF-IDF vector (English stop
# words removed) ...
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['overview'])

# ... and compare every overview with every other one. With a single
# argument, cosine_similarity compares the rows of the matrix with themselves.
tfidf_cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # inference only: make sure dropout stays disabled


def get_bert_embeddings(text):
    """Embed one overview as the mean of BERT's last hidden states.

    Args:
        text: The overview string to embed (truncated to 512 tokens).

    Returns:
        A NumPy array with one row: the token-wise mean of
        ``last_hidden_state`` for ``text``.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Gradient mode is thread-local, so it has to be switched off here, inside
    # the function that the worker threads execute. Without it every forward
    # pass builds an autograd graph that is never used.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()


cache_file = '/content/bert_embeddings.pkl'
bert_embeddings = None
if os.path.exists(cache_file):
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only load a cache that this notebook wrote itself.
    with open(cache_file, 'rb') as f:
        bert_embeddings = joblib.load(f)
    # A cache built from a different version of the dataset would silently
    # misalign embeddings and movies; discard it and recompute instead.
    if len(bert_embeddings) != len(df):
        bert_embeddings = None

if bert_embeddings is None:
    with ThreadPoolExecutor() as executor:
        # executor.map preserves input order, so row i belongs to df row i.
        bert_embeddings = list(executor.map(get_bert_embeddings, df['overview'].tolist()))

    bert_embeddings = np.vstack(bert_embeddings)
    with open(cache_file, 'wb') as f:
        joblib.dump(bert_embeddings, f)

# Semantic similarity between every pair of overviews.
bert_cosine_sim = cosine_similarity(bert_embeddings, bert_embeddings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def find_closest_title(input_title, titles):
    """Return the entry of ``titles`` that best fuzzy-matches ``input_title``.

    Matching is case-insensitive; the returned title keeps its original casing.

    Args:
        input_title: The title typed or selected by the user. ``None``,
            non-string values and blank strings are treated as "no query".
        titles: Sequence of candidate title strings.

    Returns:
        The best-matching element of ``titles``, or ``None`` when there is no
        usable query, no candidates, or no match.
    """
    # The UI passes None when nothing is selected; there is nothing to match.
    if not isinstance(input_title, str) or not input_title.strip():
        return None
    if len(titles) == 0:
        return None

    titles_lower = [title.lower() for title in titles]
    match = process.extractOne(input_title.lower(), titles_lower)
    if match is None:
        return None
    # For a list of choices extractOne returns (choice, score, index); use the
    # index directly instead of scanning the list a second time.
    return titles[match[2]]

In [None]:
def hybrid_recommendation(input_title, cosine_sim=tfidf_cosine_sim, bert_cosine_sim=bert_cosine_sim):
    """Recommend the ten movies most similar to ``input_title``.

    The score of a candidate is the sum of its TF-IDF and BERT cosine
    similarities to the query movie.

    Args:
        input_title: Title to look up; it is fuzzy-matched against ``df['title']``.
        cosine_sim: Square TF-IDF similarity matrix, addressed by row position in ``df``.
        bert_cosine_sim: Square BERT similarity matrix, addressed the same way.

    Returns:
        A tuple ``(records, combined_sim_scores)``. ``records`` is a list of
        dicts describing the ten recommended movies. ``combined_sim_scores`` is
        a list of ``(row_position, score)`` pairs: the query movie first, then
        every other movie by descending score. When no title matches, the tuple
        is ``("Movie Not Found.", None)``.
    """
    titles = df['title'].tolist()
    closest_title = find_closest_title(input_title, titles)

    if closest_title is None:
        return "Movie Not Found.", None

    # The similarity matrices are indexed by row position, not by index label.
    # After dropna() the labels of df can have gaps, so look the position up
    # explicitly instead of reading it from df.index.
    idx = int(np.flatnonzero((df['title'] == closest_title).to_numpy())[0])

    combined = np.asarray(cosine_sim[idx]) + np.asarray(bert_cosine_sim[idx])

    # Rank every other movie by combined score (ties keep their row order).
    ranked_others = sorted(
        ((i, score) for i, score in enumerate(combined) if i != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Keep the query movie in slot 0 so that [1:11] is always "top ten others",
    # even when another movie ties with the query's self-similarity.
    combined_sim_scores = [(idx, combined[idx])] + ranked_others

    movie_indices = [i for i, _ in combined_sim_scores[1:11]]
    recommended_movies = df.iloc[movie_indices]

    columns = ['title', 'genre', 'original_language', 'overview', 'popularity', 'release_date', 'vote_average', 'vote_count']
    return recommended_movies[columns].to_dict('records'), combined_sim_scores

In [None]:
def recommend_movie(movie_title):
    """Build the UI payload for the movies recommended for ``movie_title``.

    Args:
        movie_title: Title chosen by the user.

    Returns:
        A tuple ``(similarity_scores, result)``: a dict mapping each
        recommended title to its combined similarity score, and a Markdown
        string describing the recommendations. When the lookup fails the dict
        is empty and ``result`` carries the error message.
    """
    recommendations, combined_sim_scores = hybrid_recommendation(movie_title)
    if isinstance(recommendations, str):
        # hybrid_recommendation signals failure with a message string. Keep the
        # (scores, markdown) order of the success path so the message is shown
        # in the Markdown output rather than being fed to the JSON output.
        return {}, recommendations

    sections = []
    for rec in recommendations:
        sections.append(
            "\n\n---\n\n"
            f"**Title:** {rec['title']}\n\n"
            f"**Genre:** {rec['genre']}\n\n"
            f"**Original Language:** {rec['original_language']}\n\n"
            f"**Overview:** {rec['overview']}\n\n"
            f"**Popularity:** {rec['popularity']}\n\n"
            f"**Release Date:** {rec['release_date']}\n\n"
            f"**Vote Average:** {rec['vote_average']}\n\n"
            f"**Vote Count:** {rec['vote_count']}\n\n"
        )
    result = "".join(sections)

    # Entry 0 is the query movie itself; entries 1..10 are the recommendations.
    # Cast to plain float so the scores serialise cleanly as JSON.
    similarity_scores = {
        df['title'].iloc[position]: float(score)
        for position, score in combined_sim_scores[1:11]
    }
    return similarity_scores, result

In [None]:
import gradio as gr

# Gradio front end for the recommender. Components are created in the order
# in which they should appear on the page.
with gr.Blocks() as iface:
    # Page heading and short usage hint.
    gr.Markdown("# <p style='text-align: center;'>Movie Recommendation System</p>")
    gr.Markdown("Enter The Movie Title and System Will Recommend Similar Movie")

    with gr.Row():
        # Dropdown offering every title present in df.
        movie_title = gr.Dropdown(choices=df['title'].tolist(), label="Select The Title of The Movie")

    submit_btn = gr.Button("Generate")

    # Output areas: title -> score mapping as JSON, recommendation details as Markdown.
    similarity_scores = gr.JSON(label="Similarity Score")
    recommendation = gr.Markdown(label="Recommendation")

    # recommend_movie's two return values are routed, in order, to the JSON
    # output and then to the Markdown output.
    submit_btn.click(recommend_movie, inputs=movie_title, outputs=[similarity_scores, recommendation])

# Start the app.
iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://e43940fdea4be37196.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


