In [3]:
import pickle
import sys
sys.setrecursionlimit(1500)
!pip install streamlit
!pip install sentence_transformers
!pip install chromadb
import streamlit as st
import pandas as pd
from PIL import Image
from sentence_transformers import SentenceTransformer
import chromadb

# Fetch the titles CSV from GitHub into a DataFrame.
url = 'https://raw.githubusercontent.com/datum-oracle/netflix-movie-titles/main/titles.csv'
df = pd.read_csv(url)

# Inspection only: show the first rows, then the available column names.
# Nothing is combined or modified at this point.
print(df.head(), df.columns, sep='\n')

# Names of the three columns that feed the combined text feature.
title_col, genres_col, description_col = 'title', 'genres', 'description'

# Report any expected column the DataFrame lacks. This only prints a warning;
# execution continues either way.
columns_to_use = [title_col, genres_col, description_col]
for col in columns_to_use:
    if col in df.columns:
        continue
    print(f"Warning: '{col}' column is not in the DataFrame")

# Build the combined 'text' feature from title, genres and description.
# Missing values are replaced with '' before the string conversion because
# astype(str) turns a NaN cell into the literal string 'nan', which would
# otherwise be embedded as if it were real content.
# The title column is required (a KeyError is raised if it is absent);
# genres and description are appended only when present.
df['text'] = df[title_col].fillna('').astype(str)
if genres_col in df.columns:
    df['text'] += ' ' + df[genres_col].fillna('').astype(str)
if description_col in df.columns:
    df['text'] += ' ' + df[description_col].fillna('').astype(str)

# Normalize the text: lowercase, drop every character that is neither a word
# character nor whitespace, then trim leading/trailing whitespace.
# The pattern is a raw string so that \w and \s reach the regex engine
# instead of being parsed as (invalid) string escape sequences.
df['text'] = (
    df['text']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.strip()
)

# Encode every combined text with the pre-trained 'all-MiniLM-L6-v2'
# sentence-transformers model, showing a progress bar while batches run.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(
    df['text'].to_list(),
    show_progress_bar=True,
)

# Create the ChromaDB client that will hold the vector index.
client = chromadb.Client()

# Recreate the 'movie_titles' collection from scratch: if one already exists
# (e.g. from a previous run of this cell) it is deleted first, so the
# create_collection call below always starts with an empty collection.
# Depending on the installed chromadb release, list_collections() yields
# either Collection objects or plain name strings, so both are accepted.
collections = client.list_collections()
existing_names = {
    entry if isinstance(entry, str) else entry.name
    for entry in collections
}
if 'movie_titles' in existing_names:
    client.delete_collection(name='movie_titles')

collection = client.create_collection(name='movie_titles')

# Use each row's index label, converted to a string, as its ChromaDB id.
# NOTE: this overwrites the 'id' column that the loaded CSV already provides.
df['id'] = df.index.astype(str)

# Store the cleaned texts together with their embeddings under those ids.
collection.add(
    ids=df['id'].to_list(),
    embeddings=embeddings,
    documents=df['text'].to_list(),
)

# Embed a sample query with the same model that encoded the corpus.
query = "Romance"
query_embedding = model.encode([query])

# Ask ChromaDB for the five stored entries nearest to the query embedding.
results = collection.query(n_results=5, query_embeddings=query_embedding)

# Map every returned id back to its DataFrame row and print that row's title.
# results['ids'] holds one list per query; only one query was sent.
print("Top movie titles:")
for idx in results['ids'][0]:
    is_hit = df['id'].eq(idx)
    print(df.loc[is_hit, title_col].iloc[0])


         id                                title   type  \
0  ts300399  Five Came Back: The Reference Films   SHOW   
1   tm82169                                Rocky  MOVIE   
2   tm17823                               Grease  MOVIE   
3  tm191099                            The Sting  MOVIE   
4   tm69975                             Rocky II  MOVIE   

                                         description  release_year  \
0  This collection includes 12 World War II-era p...          1945   
1  When world heavyweight boxing champion, Apollo...          1976   
2  Australian good girl Sandy and greaser Danny f...          1978   
3  A novice con man teams up with an acknowledged...          1973   
4  After Rocky goes the distance with champ Apoll...          1979   

  age_certification  runtime                                 genres  \
0             TV-MA       51                      ['documentation']   
1                PG      119                     ['drama', 'sport']   
2          



Batches:   0%|          | 0/192 [00:00<?, ?it/s]

Top movie titles:
The Interest of Love
Nevertheless,
She Would Never Know
More the Merrier
Slay
