In [5]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import re


In [7]:
# Load the Netflix dataset
netflix_data = pd.read_csv("E:/honeywell assignment/netflix_titles.csv")

# Select relevant columns for the recommendation system
netflix_data = netflix_data[['title', 'description', 'rating', 'duration', 'listed_in']]

# Handle missing values by removing rows with NA descriptions
netflix_data = netflix_data.dropna(subset=['description'])

# Preprocess the description: Convert to lowercase and remove punctuation
netflix_data['clean_description'] = netflix_data['description'].str.lower()+' ' +netflix_data['listed_in'].str.lower()
netflix_data['clean_description'] = netflix_data['clean_description'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

In [13]:
netflix_data

Unnamed: 0,title,description,rating,duration,listed_in,clean_description
0,Dick Johnson Is Dead,"As her father nears the end of his life, filmm...",PG-13,90 min,Documentaries,as her father nears the end of his life filmma...
1,Blood & Water,"After crossing paths at a party, a Cape Town t...",TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries",after crossing paths at a party a cape town te...
2,Ganglands,To protect his family from a powerful drug lor...,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",to protect his family from a powerful drug lor...
3,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo...",TV-MA,1 Season,"Docuseries, Reality TV",feuds flirtations and toilet talk go down amon...
4,Kota Factory,In a city of coaching centers known to train I...,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",in a city of coaching centers known to train i...
...,...,...,...,...,...,...
8802,Zodiac,"A political cartoonist, a crime reporter and a...",R,158 min,"Cult Movies, Dramas, Thrillers",a political cartoonist a crime reporter and a ...
8803,Zombie Dumb,"While living alone in a spooky town, a young g...",TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies",while living alone in a spooky town a young gi...
8804,Zombieland,Looking to survive in a world taken over by zo...,R,88 min,"Comedies, Horror Movies",looking to survive in a world taken over by zo...
8805,Zoom,"Dragged from civilian life, a former superhero...",PG,88 min,"Children & Family Movies, Comedies",dragged from civilian life a former superhero ...


In [14]:
# Load a pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to recommend movies based on description similarity using sentence transformers
def recommend_based_on_description(movie_title, data, model, num_recommendations=5):
    # Find the index of the movie
    movie_index = data.index[data['title'] == movie_title].tolist()
    
    if not movie_index:
        raise ValueError("Movie not found!")
    
    movie_index = movie_index[0]

    # Generate embeddings for all movie descriptions
    embeddings = model.encode(data['clean_description'].tolist(), show_progress_bar=True)
    
    # Calculate cosine similarity between the target movie and all others
    cosine_sim = cosine_similarity([embeddings[movie_index]], embeddings).flatten()
    
    # Get the indices of the most similar movies (excluding the target movie itself)
    sim_scores = list(enumerate(cosine_sim))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[0:num_recommendations+1]  # Exclude the first result (the movie itself)
    
    # Get the indices of the recommended movies
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the titles of the recommended movies
    return data['title'].iloc[movie_indices].tolist(),sim_scores



In [15]:

# Example: Get recommendations for a specific movie
movie_title = "The Irishman"
description_recommendations,_ = recommend_based_on_description(movie_title, netflix_data, model, 5)



Batches: 100%|██████████| 276/276 [01:04<00:00,  4.25it/s]


In [16]:
# Print recommendations
print(f"Movies similar to {movie_title}:")
for movie in description_recommendations:
    print(movie)

Movies similar to The Irishman:
The Irishman
Pulp Fiction
All Day and a Night
Küçük Esnaf
The Blue Elephant 2
RattleSnake - The Ahanna Story


***Using Tfidf vectoriser***

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

def recommend_based_on_description(movie_title, data, num_recommendations=5):
    movie_index = data.index[data['title'] == movie_title].tolist()
    
    if not movie_index:
        raise ValueError("Movie not found!")
    
    movie_index = movie_index[0]
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(data['clean_description'])
    cosine_sim = cosine_similarity(tfidf_matrix[movie_index], tfidf_matrix).flatten()
    sim_scores = list(enumerate(cosine_sim))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[0:num_recommendations+1]  # Exclude the first result (the movie itself)
    
    # Get the indices of the recommended movies
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the titles of the recommended movies
    return data['title'].iloc[movie_indices].tolist()

# Example: Get recommendations for a specific movie
movie_title = "The Irishman"
description_recommendations = recommend_based_on_description(movie_title, netflix_data, 5)

# Print recommendations
print(f"Movies similar to {movie_title}:")
for movie in description_recommendations:
    print(movie)


Movies similar to The Irishman:
The Irishman
Rolling Thunder Revue: A Bob Dylan Story by Martin Scorsese
All Day and a Night
Why We Fight: The Battle of Russia
Catch Me If You Can
Pretend It’s a City


**Using KNN for Reccomendation**


In [17]:
from sklearn.neighbors import NearestNeighbors
model = SentenceTransformer('all-MiniLM-L6-v2')
def recommend_with_knn(movie_title, data, model, num_recommendations=5):
    movie_index = data.index[data['title'] == movie_title].tolist()
    if not movie_index:
        raise ValueError("Movie not found!")
    movie_index = movie_index[0]
    embeddings = model.encode(data['clean_description'].tolist(), show_progress_bar=True)
    knn = NearestNeighbors(n_neighbors=num_recommendations + 1, metric='cosine')
    knn.fit(embeddings)
    distances, indices = knn.kneighbors([embeddings[movie_index]])
    recommended_indices = indices.flatten()[1:]  # Skip the first result (the movie itself)
    recommended_distances = distances.flatten()[1:]

    return data['title'].iloc[recommended_indices].tolist(), recommended_distances




In [19]:
# Example: Get recommendations for a specific movie
movie_title = "The Irishman"
description_recommendations,_ = recommend_with_knn(movie_title, netflix_data,model, 5)

# Print recommendations
print(f"Movies similar to {movie_title}:")
for movie in description_recommendations:
    print(movie)


Batches: 100%|██████████| 276/276 [01:09<00:00,  3.97it/s]

Movies similar to The Irishman:
Pulp Fiction
All Day and a Night
Küçük Esnaf
The Blue Elephant 2
RattleSnake - The Ahanna Story



