## ETL

Create a dataframe with movies for netflix, disney and amazon prime, with the following columns: id, title, genres, vote_average, vote_count, stream_platform

In [32]:
import pandas as pd

movies_df = pd.read_csv('../../data/netflix_titles.csv')
print(movies_df.head())

  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

In [33]:
def representation_df(row):
    textual_representation = f"Title: {row['title']}, Director: {row['director']}, Cast: {row['cast']},  Genres: {row['listed_in']}, Description: {row['description']}"
    return textual_representation

movies_df['textual_representation'] = movies_df.apply(representation_df, axis=1)
print(movies_df.head())    

  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

In [34]:
movies_df["textual_representation"].values[0]

'Title: Dick Johnson Is Dead, Director: Kirsten Johnson, Cast: nan,  Genres: Documentaries, Description: As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.'

In [35]:
import faiss
from langchain_openai import OpenAIEmbeddings
import numpy as np
import getpass
import os

if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")
    
dim= 1536
index = faiss.IndexFlatL2(dim)  

X = np.zeros((len(movies_df), dim) , dtype="float32")

X   

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [36]:
#probar api formato de embedding
res = OpenAIEmbeddings().embed_query(movies_df["textual_representation"].values[0])
print(res[0:5])
print(len(res))
print(type(res))

faiss.write_index(index, "index.faiss")
index = faiss.read_index("index.faiss")


[-0.008000205270946026, 0.004284423775970936, -0.009954503737390041, -0.006016494240611792, -0.0016340288566425443]
1536
<class 'list'>


In [37]:
movies_df = movies_df.tail(dim)

for i, text in enumerate(movies_df['textual_representation']):
    if i % 200 == 0:
        print(f"Processed {i} movies")
    try:
        res = OpenAIEmbeddings().embed_query(text)
        X[i] = np.array(res)
    except Exception as e:
        print(f"Error processing movie at index {i}: {e}")
index.add(X)

Processed 0 movies
Processed 200 movies
Processed 400 movies
Processed 600 movies
Processed 800 movies
Processed 1000 movies
Processed 1200 movies
Processed 1400 movies


In [42]:
faiss.write_index(index, "index2.faiss")

In [43]:
index = faiss.read_index("index2.faiss")

## INPUT


In [44]:
favorite_movie = "Title: Spider-Man, Director: Sam Raimi, Cast: Tobey Maguire, Kirsten Dunst, Willem Dafoe, Genres: Action, Adventure, Sci-Fi, Description: After being bitten by a genetically altered spider, nerdy high school student Peter Parker is endowed with amazing powers to become the Amazing superhero known as Spider-Man."

query_vector = OpenAIEmbeddings().embed_query(favorite_movie)

D, I = index.search(np.array([query_vector]), k=5)

print(D)
print(I)



[[0.09888822 0.21962631 0.29690358 0.30280223 0.3083499 ]]
[[ 796  797 1534  144 1309]]


# find movie

In [45]:
best_matches = np.array(movies_df['textual_representation'])[I.flatten()]
print(best_matches)

['Title: Spider-Man 3, Director: Sam Raimi, Cast: Tobey Maguire, Kirsten Dunst, James Franco, Thomas Haden Church, Topher Grace, Bryce Dallas Howard, Rosemary Harris, J.K. Simmons, James Cromwell, Theresa Russell, Dylan Baker, Bill Nunn, Bruce Campbell, Elizabeth Banks,  Genres: Action & Adventure, Sci-Fi & Fantasy, Description: The seemingly invincible Spider-Man goes up against an all-new crop of villains in the third installment of the blockbuster adventure series.'
 'Title: Spider-Man: Into the Spider-Verse, Director: Peter Ramsey, Rodney Rothman, Bob Persichetti, Cast: Shameik Moore, Jake Johnson, Hailee Steinfeld, Brian Tyree Henry, Lauren Vélez, Mahershala Ali, Lily Tomlin, Kathryn Hahn, Liev Schreiber, Kimiko Glenn, Nicolas Cage, John Mulaney,  Genres: Action & Adventure, Comedies, Description: After being bitten by a radioactive spider, Brooklyn teen Miles Morales gets a crash course in web-slinging from his alternate-dimension counterparts.'
 'Title: Zoom, Director: Peter Hew

In [48]:
#Intentarlo con troya
favorite_movie = "Tittle: Troy, Director: Wolfgang Petersen, Cast: Brad Pitt, Eric Bana, Orlando Bloom, Genres: Action, Adventure, Drama, History, Description: Achilles and his lover, the Trojan prince Hector, face off against each other in a battle for supremacy."
query_vector = OpenAIEmbeddings().embed_query(favorite_movie)
D, I = index.search(np.array([query_vector]), k=5)
print(D)
print(I)
best_matches = np.array(movies_df['textual_representation'])[I.flatten()]
print(best_matches)


[[0.18550126 0.3099234  0.31138295 0.31188726 0.31564403]]
[[1363  610 1309  480 1423]]
["Title: Troy: The Odyssey, Director: Tekin Girgin, Cast: Dylan Vox, Lara Heller, Hachem Hicham, David Gray, Kelly B. Jones, Daniel Whyte, Eoin O'Brien, Ego Mikitas,  Genres: Action & Adventure, Description: Starting with the Trojan Horse attack, this modest re-telling of Homer's epic covers well-known highlights of Odysseus's long journey home."
 'Title: Rocky III, Director: Sylvester Stallone, Cast: Sylvester Stallone, Talia Shire, Burt Young, Carl Weathers, Burgess Meredith, Tony Burton, Mr. T, Hulk Hogan, Ian Fried,  Genres: Dramas, Sports Movies, Description: After taking a pounding from a powerful young fighter, humbled world champ Rocky Balboa turns to ex-rival Apollo Creed for help in regaining his form.'
 'Title: Thor: Ragnarok, Director: Taika Waititi, Cast: Chris Hemsworth, Tom Hiddleston, Cate Blanchett, Idris Elba, Jeff Goldblum, Tessa Thompson, Karl Urban, Mark Ruffalo, Anthony Hopkins