# In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import re

# Sample dataset: movie titles paired index-for-index with their plot
# descriptions (entry i of 'description' belongs to entry i of 'title').
data = dict(
    title=[
        "The Matrix",
        "Inception",
        "Interstellar",
        "The Dark Knight",
        "The Avengers",
    ],
    description=[
        "A computer hacker learns about the true nature of his reality and his role in the war against its controllers.",
        "A thief who steals corporate secrets through dream-sharing technology is given the inverse task of planting an idea into a CEO's mind.",
        "A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival.",
        "When the menace known as the Joker emerges from his mysterious past, he wreaks havoc and chaos on the people of Gotham.",
        "Earth's mightiest heroes must come together and learn to fight as a team to stop the mischievous Loki and his alien army from enslaving humanity.",
    ],
)

# Load the sample data into a DataFrame: one row per movie, one column per key.
df = pd.DataFrame(data)

# Step 1: Simple tokenization function
def tokenize(text):
    """Lowercase *text*, strip punctuation, and split it into tokens.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space), so "dream-sharing" becomes the single
    token "dreamsharing" and "CEO's" becomes "ceos". The remaining text is
    split on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase tokens; empty when *text* has no word characters.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return without_punctuation.split()

# Tokenize the descriptions: each row of 'tokens' is a list of lowercase words.
df['tokens'] = df['description'].apply(tokenize)

# Step 2: Train a Word2Vec model
# The corpus is handed over as a plain list of token lists rather than a
# pandas Series, and training uses a single worker thread: the gensim docs
# state that a fully reproducible run requires workers=1, and a five-sentence
# corpus gains nothing from extra threads.
# NOTE(review): per the gensim docs, reproducibility across interpreter
# launches additionally needs a fixed PYTHONHASHSEED -- confirm if that matters.
word2vec_model = Word2Vec(
    sentences=df['tokens'].tolist(),
    vector_size=100,
    window=5,
    min_count=1,  # keep every word; the corpus is too small to prune rare ones
    workers=1,
)

# Function to compute the average word vector for a given list of words
def get_average_word_vector(words):
    """Average the Word2Vec embeddings of the known tokens in *words*.

    Tokens that are not in ``word2vec_model.wv`` are skipped.

    Args:
        words: An iterable of tokens.

    Returns:
        The element-wise mean of the embeddings of the known tokens, or a
        zero vector of length ``word2vec_model.vector_size`` when none of the
        tokens is known (including when *words* is empty).
    """
    embeddings = word2vec_model.wv
    known_vectors = [embeddings[token] for token in words if token in embeddings]
    if not known_vectors:
        # Nothing to average: fall back to a zero vector of the model's width.
        return np.zeros(word2vec_model.vector_size)
    return np.mean(known_vectors, axis=0)

# Step 3: Represent each movie by the mean embedding of its description tokens
df['avg_vector'] = df['tokens'].map(get_average_word_vector)

# Step 4: Compute cosine similarity between movies
def compute_cosine_similarity():
    """Return the pairwise cosine similarity of the movies' average vectors.

    Reads ``df['avg_vector']``, stacks its per-movie vectors into one array
    (one row per movie, in ``df`` row order), and passes that array to
    ``cosine_similarity``.

    Returns:
        The result of ``cosine_similarity`` on the stacked vectors.
    """
    per_movie_vectors = df['avg_vector'].tolist()
    embedding_matrix = np.array(per_movie_vectors)
    return cosine_similarity(embedding_matrix)

# Compute the similarity matrix once at module level. recommend_top_movie
# binds this object as the default value of its cosine_sim parameter, so it
# must exist before that function is defined.
cosine_sim = compute_cosine_similarity()

# Function to recommend the top movie based on a given movie title
def recommend_top_movie(title, cosine_sim=cosine_sim):
    """Return the title of the movie most similar to *title*.

    Args:
        title: Exact title to look up in ``df['title']``. If it occurs more
            than once, the first occurrence is used.
        cosine_sim: Pairwise similarity matrix whose rows and columns follow
            the row order of ``df``. Defaults to the module-level matrix as
            it existed when this function was defined.

    Returns:
        The title of the highest-scoring movie other than the query movie.
        Ties are broken in favour of the movie that appears first in ``df``.

    Raises:
        IndexError: If *title* is not present in ``df``, or if there is no
            other movie to recommend.
    """
    # Work with row positions rather than index labels: cosine_sim rows are
    # positional, so this stays correct even if df has a non-default index.
    titles = df['title'].tolist()
    try:
        query_pos = titles.index(title)
    except ValueError:
        # Keep IndexError (what the original lookup raised) but say why.
        raise IndexError(f"movie title not found: {title!r}") from None

    # Exclude the query movie by position instead of assuming it sorts first;
    # a tied score at a lower row would otherwise make it recommend itself.
    candidates = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[query_pos])
        if pos != query_pos
    ]
    if not candidates:
        raise IndexError(f"no other movie available to recommend for {title!r}")

    # max() returns the first maximal entry, i.e. the earliest row on ties.
    best_pos, _ = max(candidates, key=lambda pair: pair[1])
    return titles[best_pos]

# Example: look up the closest match for "Inception" and print it on the line
# below a heading.
recommended_movie = recommend_top_movie('Inception')
print("Top movie recommended for 'Inception':", recommended_movie, sep="\n")


# Output:
# Top movie recommended for 'Inception':
# Interstellar
