## ETL

Create a dataframe with movies for netflix, disney and amazon prime, with the following columns: id, title, genres, vote_average, vote_count, stream_platform

In [1]:
import pandas as pd

movies_df = pd.read_csv('../../data/netflix_titles.csv')
print(movies_df.head())

  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

In [2]:
def representation_df(row):
    textual_representation = f"Title: {row['title']}, Director: {row['director']}, Cast: {row['cast']},  Genres: {row['listed_in']}, Description: {row['description']}"
    return textual_representation

movies_df['textual_representation'] = movies_df.apply(representation_df, axis=1)
print(movies_df.head())    

  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

In [3]:
movies_df["textual_representation"].values[0]

'Title: Dick Johnson Is Dead, Director: Kirsten Johnson, Cast: nan,  Genres: Documentaries, Description: As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.'

In [4]:
import faiss
from langchain_openai import OpenAIEmbeddings
import numpy as np
import getpass
import os

if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")
    
dim= 1536
index = faiss.IndexFlatL2(dim)  

X = np.zeros((len(movies_df), dim) , dtype="float32")

X   

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [7]:
batch_size = 200
num_batches = (len(movies_df) + batch_size - 1) // batch_size  # Calcula cuántos lotes se necesitan

for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, len(movies_df))
    batch_df = movies_df[start_idx:end_idx]
    
    print(f"Processing batch {batch_idx + 1}/{num_batches}, movies {start_idx} to {end_idx - 1}")
    
    for i, text in enumerate(batch_df['textual_representation']):
        try:
            res = OpenAIEmbeddings().embed_query(text)
            X[start_idx + i] = np.array(res)
        except Exception as e:
            print(f"Error processing movie at index {start_idx + i}: {e}")

index.add(X)

Processing batch 1/45, movies 0 to 199


KeyboardInterrupt: 

In [None]:
faiss.write_index(index, "index.faiss")

In [None]:
index = faiss.read_index("index.faiss")