<a href="https://colab.research.google.com/github/Korsholm22/M4_Group_Assignments/blob/main/Group_Assignment_3/Group_Assignment_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [3]:
# Pip installs
!pip install sentence-transformers -q
!pip install gradio -q

In [4]:
# Imports
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch
from sentence_transformers.util import cos_sim
import gradio as gr
from tqdm import tqdm

# Loading the pretrained sentence-embedding model; the same model embeds both the corpus and the queries.
# NOTE(review): this checkpoint is reportedly flagged as deprecated on its model card -
# consider a newer sentence-transformers model; confirm before switching.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [5]:
# Importing the dataset (CSV hosted in the assignment's GitHub repository)
NETFLIX_TITLES_URL = 'https://raw.githubusercontent.com/Korsholm22/M4_Group_Assignments/main/Group_Assignment_3/Data/netflix_titles.csv'
df_netflix = pd.read_csv(NETFLIX_TITLES_URL)

In [6]:
# Examining the first rows of the dataset to see which columns are available
df_netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


# Preprocessing and Feature Engineering

In [7]:
# Counting the NaN values in each column of the dataset
df_netflix.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [8]:
# Joining title, listed_in and description (in that order, separated by ". ") into a
# single 'information' column so the semantic search has more text to match against
df_netflix['information'] = df_netflix['title'].str.cat(
    [df_netflix['listed_in'], df_netflix['description']],
    sep=". ",
)

# Checking that the merge was successful by looking at one row
df_netflix.loc[1, 'information']

'Blood & Water. International TV Shows, TV Dramas, TV Mysteries. After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.'

In [9]:
# Five example queries used to examine the performance of the model.
# The strings are kept exactly as originally written (including spelling).
search_examples = [
    "Action movie taking place in space",
    "Sad movie where the dog dies",
    "Documentray about turtles and plastic straws",
    "Funny movie with Kevin Hart and the Rock",
    "Dramatic true crime story",
]

In [10]:
# Embedding the search examples in a single batch
embeddings = model.encode(search_examples)

# Checking the result: one embedding row per search example
embeddings.shape

(5, 768)

In [11]:
# Converting the 'information' column to a plain Python list
show_information = df_netflix['information'].tolist()

In [12]:
# Checking that the conversion was successful by previewing the first five entries
show_information[:5]

['Dick Johnson Is Dead. Documentaries. As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.',
 'Blood & Water. International TV Shows, TV Dramas, TV Mysteries. After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.',
 'Ganglands. Crime TV Shows, International TV Shows, TV Action & Adventure. To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war.',
 'Jailbirds New Orleans. Docuseries, Reality TV. Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series.',
 'Kota Factory. International TV Shows, Romantic TV Shows, TV Comedies. In a city of coaching centers known to train India’s finest collegiate minds, an earnest but u

In [13]:
# Embedding every entry of show_information; the result is requested as a tensor
# because the similarity search below operates on it with torch
corpus_embeddings = model.encode(show_information, convert_to_tensor=True)

# Modelling

In [14]:
# For every example query, print the corpus entries closest to it by cosine similarity
top_k = min(5, len(show_information))
for query in search_examples:
    query_vector = model.encode(query, convert_to_tensor=True)

    # Row 0 of the similarity matrix holds this query's score against each corpus entry;
    # torch.topk returns the highest scores together with their positions
    similarities = util.cos_sim(query_vector, corpus_embeddings)[0]
    best_scores, best_indices = torch.topk(similarities, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for rank in range(top_k):
        hit_text = show_information[best_indices[rank]]
        print(hit_text, "(Score: {:.4f})".format(best_scores[rank]))





Query: Action movie taking place in space

Top 5 most similar sentences in corpus:
Star Trek: Deep Space Nine. TV Action & Adventure, TV Sci-Fi & Fantasy. In this "Star Trek" spin-off, Commander Sisko leads the multi-species crew of Deep Space Nine, a Federation space station with a complex mission. (Score: 0.7565)
Nova: Ultimate Mars Challenge. Movies. With access to the scientists and engineers responsible for the Curiosity rover's on-the-ground experiments, NOVA captures its landing on Mars. (Score: 0.7558)
The Search for Life in Space. Documentaries. To determine whether we're alone in the universe, astrobiologists look to Jupiter, Mars and, closer to home, extreme environments on Earth. (Score: 0.7439)
Mobile Suit Gundam III: Encounters in Space. Action & Adventure, Anime Features, International Movies. The Earth Federation prepares to take the war into the Duchy of Zeon's home territory. Veteran pilot Amuro Ray returns to space for the final battle. (Score: 0.7385)
A Year In 

In [15]:
# Extracting the title and description columns as plain Python lists;
# the semantic search function looks results up in them by row position
title = df_netflix['title'].tolist()
description = df_netflix['description'].tolist()

In [16]:
# Defining the semantic search function to return title, description and cosine score
def query_corpus(query):
  """Return the best-matching show for a free-text query.

  The query is embedded with the notebook-level `model` and compared with
  `corpus_embeddings` by cosine similarity. Only the single highest-scoring
  entry is reported; its text is looked up in the notebook-level `title` and
  `description` lists.

  Args:
    query: Free-text search string.

  Returns:
    A tuple of three strings: "Title: ...", "Description: ..." and
    "Cosinus Score: ...".
  """
  query_embedding = model.encode(query, convert_to_tensor=True)
  cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

  # Only the top hit is returned, so request exactly one result. This also
  # removes the dependency on the `top_k` global defined in the demo loop cell.
  best_hit = torch.topk(cos_scores, k=1)
  best_score = best_hit.values[0]
  best_idx = int(best_hit.indices[0])

  title_text = "Title: " + title[best_idx]
  description_text = "Description: " + description[best_idx]
  # Move the score to the CPU first: calling .numpy() directly on a CUDA tensor
  # raises, which would break the function on a GPU runtime.
  score_text = "Cosinus Score: " + str(best_score.cpu().numpy())
  return title_text, description_text, score_text

In [23]:
# Testing a true-crime search term to examine the performance of the model
query_corpus('True crime tv show about brutal murderers')

('Title: Inside the Mind of a Serial Killer',
 "Description: Mixing dramatic re-enactments with real-life footage, this series delves into the tormented psyches of the world's most infamous serial killers.",
 'Cosinus Score: 0.882298')

In [28]:
# Testing a longer, more conversational search term to examine the performance of the model
query_corpus('Happy story about dogs, that will make me stop crying')

('Title: Pup Star',
 "Description: After a singing pup with big dreams of stardom gets dognapped and escapes with a friend's help, her journey home is a fun, music-filled adventure.",
 'Cosinus Score: 0.6289302')

# Gradio

In [24]:
# Setting up gradio: a single text input feeds query_corpus; the returned title and
# description are shown as text and the score as a label

search_demo = gr.Interface(
    fn=query_corpus,
    inputs=["text"],
    outputs=["text", "text", "label"],
)
search_demo.launch()

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

