In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [None]:
# Read the IMDb Top 1000 CSV from the Kaggle input directory into a DataFrame
imdb_csv_path = '/kaggle/input/imdb-dataset-of-top-1000-movies-and-tv-shows/imdb_top_1000.csv'
imdb_data = pd.read_csv(imdb_csv_path)

In [None]:
# Preview the first rows to check that the columns loaded as expected
imdb_data.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [None]:
# Preprocessing helpers

# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        # Memoize on the function object so repeated calls (e.g. one per
        # DataFrame row) do not re-read the stop-word corpus.
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text):
    """Normalize a piece of text for embedding.

    The text is lower-cased, stripped of ASCII punctuation, tokenized with
    NLTK's ``word_tokenize``, filtered against the English stop-word list and
    re-joined with single spaces.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example
            ``None`` or a NaN from a missing DataFrame cell) is treated as
            empty text.

    Returns:
        str: The cleaned text; ``''`` when ``text`` is not a string.
    """
    # Missing values in a pandas column arrive as None/float('nan'); calling
    # .lower() on them would raise AttributeError.
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase and remove punctuation
    text = text.lower().translate(_PUNCTUATION_TABLE)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = _english_stop_words()
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into text
    return ' '.join(tokens)

# Clean every movie overview and keep the result in its own column
imdb_data['processed_description'] = imdb_data['Overview'].map(preprocess_text)

In [None]:
from transformers import BertTokenizer, BertModel
import torch

In [None]:
# Tokenizer and encoder are loaded from the same pre-trained checkpoint
# ('distilbert-base-uncased' was noted as another candidate checkpoint)
bert_checkpoint = 'bert-base-uncased'

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

# Load pre-trained BERT model
model = BertModel.from_pretrained(bert_checkpoint)

# Function to extract BERT embeddings
def extract_bert_embeddings(text):
    """Embed text with BERT using attention-mask-aware mean pooling.

    Args:
        text: A single string, or a list of strings to embed as one batch.
            Inputs are padded to the longest item and truncated by the
            tokenizer.

    Returns:
        torch.Tensor: One row per input text; the columns are the mean of the
        model's last hidden states over the real (non-padding) tokens. A
        single string therefore yields a one-row tensor.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs.last_hidden_state
        # attention_mask is 1 for real tokens and 0 for padding. Weighting by
        # it keeps padded positions out of the average; for a single string
        # the mask is all ones, so this equals a plain mean over tokens.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
        token_sum = (hidden_states * mask).sum(dim=1)
        # clamp guards against division by zero for an all-padding row
        token_count = mask.sum(dim=1).clamp(min=1)
        embeddings = token_sum / token_count  # Masked mean pooling
    return embeddings

# Embed every cleaned description and keep the vectors in their own column
imdb_data['bert_embeddings'] = imdb_data['processed_description'].map(extract_bert_embeddings)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Build the title -> embedding lookup.
# Titles are dictionary keys, so a later row replaces an earlier row with the same title.
movie_index = dict(zip(imdb_data['Series_Title'], imdb_data['bert_embeddings']))

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Number of recommendations to return for each query
top_k = 20

In [None]:
# Function to recommend movies based on query
def recommend_movies(query, movie_index, top_k):
    """Rank indexed movies by cosine similarity to a free-text query.

    Args:
        query: Free-text query; it is preprocessed and embedded the same way
            as the movie descriptions.
        movie_index: Mapping of movie title to its single-row embedding.
        top_k: Maximum number of results to return. ``None`` returns every
            movie.

    Returns:
        dict: Title -> similarity score as a plain ``float``, ordered from
        most to least similar, with at most ``top_k`` entries.

    Raises:
        ValueError: If ``top_k`` is negative (a negative slice bound would
            silently drop results from the end instead of limiting them).
    """
    if top_k is not None and top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    query_embedding = extract_bert_embeddings(preprocess_text(query))

    similarities = {}
    for movie_id, embedding in movie_index.items():
        # cosine_similarity returns a 1x1 matrix for two single-row inputs;
        # keep only the scalar so scores sort and print as ordinary numbers.
        similarities[movie_id] = float(cosine_similarity(query_embedding, embedding)[0][0])

    ranked = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:top_k])

In [None]:
# Example usage
# Other queries that were tried here: 'Shawshank Redemption',
# "Fear can hold you prisoner. Hope can set you free.", "Action-packed thriller"
query = 'prisoner'
recommended_movies = recommend_movies(query=query, movie_index=movie_index, top_k=top_k)
print(recommended_movies)

{'Un prophète': array([[0.5187758]], dtype=float32), 'Judgment at Nuremberg': array([[0.51112735]], dtype=float32), 'Badhaai ho': array([[0.49708766]], dtype=float32), 'Cool Hand Luke': array([[0.48609877]], dtype=float32), 'Shutter Island': array([[0.4850768]], dtype=float32), "Hachi: A Dog's Tale": array([[0.4828335]], dtype=float32), 'The Last Emperor': array([[0.48189312]], dtype=float32), 'Boyhood': array([[0.47773862]], dtype=float32), 'Trois couleurs: Blanc': array([[0.46858677]], dtype=float32), 'Eskiya': array([[0.4655677]], dtype=float32), 'Brokeback Mountain': array([[0.4579839]], dtype=float32), 'The Machinist': array([[0.45765325]], dtype=float32), 'Capharnaüm': array([[0.4518708]], dtype=float32), 'The Purple Rose of Cairo': array([[0.45109624]], dtype=float32), 'Lord of War': array([[0.45011637]], dtype=float32), 'The Dirty Dozen': array([[0.449897]], dtype=float32), 'Zulu': array([[0.44650054]], dtype=float32), 'Knives Out': array([[0.44500747]], dtype=float32), 'Ladri 

In [None]:
# Sanity check: cosine similarity between two small hand-written vectors
vector1 = np.array([0.1, 0.2, 0.3])
vector2 = np.array([0.2, 0.3, 0.4])
# cosine_similarity works on 2-D inputs, so each vector becomes a single-row matrix
similarity_matrix = cosine_similarity(vector1[np.newaxis, :], vector2[np.newaxis, :])
similarity_score = similarity_matrix[0, 0]
print("Cosine similarity:", similarity_score)

Cosine similarity: 0.99258333397093
