## Proyecto de búsqueda

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from ast import literal_eval

In [2]:
# Global vars
# Base directory of the project. The cells below build file locations by plain
# string concatenation (path + 'data/...'), so this value must end with a slash.
path = 'G:/Mi unidad/Cursos/Platzi/Embeddings/'

------------------------------------

## Cargando base

In [None]:
# Read the raw movie dataset from the data folder and print its (rows, columns).
df = pd.read_csv(path+'data/25k-imdb-movie-dataset.csv')
print(df.shape)

In [None]:
# Show three randomly chosen rows of the raw data.
df.sample(3)

In [None]:
# Fraction of missing values per column (mean of the boolean NaN mask).
df.isna().mean()

--------------------------------

## Limpieza de datos

In [None]:
def concatenar_lista(lista):
    """Join the items of a stringified Python list into one space-separated string.

    Args:
        lista: Text holding a Python list literal, e.g. "['a', 'b']".
            ``None``, NaN and blank strings are treated as "no items".

    Returns:
        The items joined by single spaces, or '' when there is nothing to join.

    Raises:
        ValueError, SyntaxError: If a non-blank value is not a valid literal.
        TypeError: If the parsed items are not strings.
    """
    # Missing cells arrive as None/NaN or, once NaN has been replaced by ' ',
    # as a blank string; literal_eval cannot parse any of those.
    if lista is None or (isinstance(lista, float) and lista != lista):
        return ''
    if isinstance(lista, str) and not lista.strip():
        return ''
    lista = literal_eval(lista)
    return ' '.join(lista)

In [None]:
def string_to_list(lista):
    """Parse a stringified Python list into a real list.

    Args:
        lista: Text holding a Python list literal, e.g. "['Action', 'Comedy']".
            ``None``, NaN and blank strings are treated as an empty list.

    Returns:
        The parsed value, or [] when the input is missing or blank.

    Raises:
        ValueError, SyntaxError: If a non-blank value is not a valid literal.
    """
    # Missing cells arrive as None/NaN or, once NaN has been replaced by ' ',
    # as a blank string; literal_eval cannot parse any of those.
    if lista is None or (isinstance(lista, float) and lista != lista):
        return []
    if isinstance(lista, str) and not lista.strip():
        return []
    return literal_eval(lista)

In [None]:
# Replace every missing value with a single blank, then derive the cleaned
# columns in one chained step. Each lambda receives the blank-filled frame.
df = df.fillna(' ').assign(
    # Keywords and cast as space-separated text, parsed from stringified lists.
    Keywords=lambda frame: frame['Plot Kyeword'].apply(concatenar_lista),
    Stars=lambda frame: frame['Top 5 Casts'].apply(concatenar_lista),
    # Genres are kept as real Python lists.
    Generes=lambda frame: frame['Generes'].apply(string_to_list),
    # Values that are not numeric become 0.0.
    Rating=lambda frame: pd.to_numeric(frame['Rating'], errors="coerce").fillna(0).astype("float"),
)

In [None]:
# Flatten the per-row genre lists into one value per row and keep the distinct values.
unique_generes = df['Generes'].explode().unique()
unique_generes

In [None]:
# The raw list columns are no longer needed once Keywords and Stars exist.
df = df.drop(columns=['Plot Kyeword','Top 5 Casts'])

In [None]:
# Build the text that gets embedded: overview, keywords and cast joined by
# single spaces. Column-wise concatenation replaces the per-row Python call.
# fillna('') keeps the cell usable when Keywords/Stars contain NaN, which the
# df.info() output further down shows for the reloaded CSV.
df['text'] = (
    df['Overview'].astype(str)
    + ' ' + df['Keywords'].fillna('')
    + ' ' + df['Stars'].fillna('')
)

In [None]:
# Show three randomly chosen rows after cleaning.
df.sample(3)

------------------------------------

## Embeddings model

In [None]:
# Load the sentence-embedding model that later encodes the 'text' column.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Character count of every text, followed by summary statistics of those counts.
df['length_text'] = df['text'].map(len)
df['length_text'].describe()

In [None]:
# Encode every text into an embedding vector. A plain list is passed instead of
# the Series so that items are addressed by position, whatever index df has.
embeddings = model.encode(df['text'].tolist(), show_progress_bar=True)

In [None]:
# Store each vector as a Python list in its row.
df['embeddings'] = embeddings.tolist()
# Use the row index, converted to text, as the identifier column.
df['ids'] = df.index.astype('str')

In [None]:
# Show three randomly chosen rows, now including the embeddings and ids columns.
df.sample(3)

In [None]:
# Save the dataset so the embeddings do not have to be computed again.
# The file is pipe-separated and written without the index column.
df.to_csv(path+'data/25k-imdb-movie-dataset-new.csv', sep='|', index=False)

In [11]:
# Reload the saved pipe-separated dataset and print its (rows, columns).
# NOTE(review): the CSV round trip does not keep Python objects. In the df.info()
# output below, 'embeddings' is an object column (presumably the text form of
# each list) and 'ids' is int64 rather than str.
df = pd.read_csv(path+'data/25k-imdb-movie-dataset-new.csv', sep='|')
print(df.shape)

(24402, 16)


In [13]:
# List every distinct movie title; the whole list is displayed, so the output is long.
df['movie title'].unique().tolist()

['Top Gun: Maverick',
 'Jurassic World Dominion',
 'Top Gun',
 'Lightyear',
 'Spiderhead',
 'Everything Everywhere All at Once',
 'Interceptor',
 'Black Adam',
 'The Northman',
 'Thor: Love and Thunder',
 'Fantastic Beasts: The Secrets of Dumbledore',
 'The Unbearable Weight of Massive Talent',
 'Jurassic Park',
 'Doctor Strange in the Multiverse of Madness',
 'No Time to Die',
 'The Batman',
 'Bullet Train',
 'Jurassic World',
 'Jurassic World: Fallen Kingdom',
 'Vikram',
 'Morbius',
 'The Lost City',
 'Spider-Man: No Way Home',
 'Last Seen Alive',
 'Sonic the Hedgehog 2',
 'Uncharted',
 'Prey',
 'Ambulance',
 'The Lost World: Jurassic Park',
 'Dune',
 'Jurassic Park III',
 'Star Wars',
 'The Gray Man',
 'Avengers: Endgame',
 'The Lady of Heaven',
 'Pirates of the Caribbean: The Curse of the Black Pearl',
 'Eternals',
 'Free Guy',
 'The Matrix Resurrections',
 'Mission: Impossible',
 'Memory',
 'Star Wars: Episode I - The Phantom Menace',
 'The Adam Project',
 "The King's Man",
 'The 

In [7]:
# Column names, non-null counts and dtypes of the reloaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24402 entries, 0 to 24401
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   movie title  24402 non-null  object 
 1   Run Time     24402 non-null  object 
 2   Rating       24402 non-null  float64
 3   User Rating  24402 non-null  object 
 4   Generes      24402 non-null  object 
 5   Overview     24402 non-null  object 
 6   Director     24402 non-null  object 
 7   Writer       24402 non-null  object 
 8   year         24402 non-null  object 
 9   path         24402 non-null  object 
 10  Keywords     22706 non-null  object 
 11  Stars        24401 non-null  object 
 12  text         24402 non-null  object 
 13  length_text  24402 non-null  int64  
 14  embeddings   24402 non-null  object 
 15  ids          24402 non-null  int64  
dtypes: float64(1), int64(2), object(13)
memory usage: 3.0+ MB


In [8]:
# Show three randomly chosen rows of the reloaded frame.
df.sample(3)

Unnamed: 0,movie title,Run Time,Rating,User Rating,Generes,Overview,Director,Writer,year,path,Keywords,Stars,text,length_text,embeddings,ids
13981,The Supergrass,not-released,6.3,611,"['Comedy', 'Crime']",While taking a holiday in the country with his...,Peter Richardson,Peter Richardson,-1985,/title/tt0090102/,female nudity vomiting alternative comedy male...,Pete Richens Adrian Edmondson Jennifer Saunder...,While taking a holiday in the country with his...,509,"[-0.05452210456132889, 0.054123055189847946, -...",13981
3835,Superchick,not-released,3.7,575,"['Action', 'Comedy']",Tara B. True is a flight attendant who makes a...,Ed Forsyth,Gary Crutcher,-1973,/title/tt0070752/,grindhouse film sex in a bathroom erotic 70s f...,John H. Burrows Joyce Jillson Louis Quinn Thom...,Tara B. True is a flight attendant who makes a...,521,"[-0.06153341010212898, -0.0787108764052391, -0...",3835
18756,She's Out of Control,1 hour 30 minutes,5.3,4.4K,['Comedy'],When an average-looking teenage girl gets a ma...,Stan Dragoti,Seth Winston,-1989,/title/tt0098308/,makeover teenage girl beach paranoia therapist...,Michael J. Nathanson Tony Danza Catherine Hick...,When an average-looking teenage girl gets a ma...,333,"[-0.07038451731204987, -0.006437822710722685, ...",18756


--------------------------------------

## Índice Chroma

In [None]:
import chromadb
from chromadb.utils import embedding_functions

In [None]:
# Keep only the first 5461 rows for indexing and show the resulting shape.
# NOTE(review): 5461 presumably matches Chroma's maximum batch size for a single add() call — confirm.
df_sub = df.head(5461)
df_sub.shape

In [None]:
# Embedding function attached to the collection, backed by 'all-MiniLM-L6-v2'.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name = 'all-MiniLM-L6-v2')

# In-memory client. The visible cells never use it; it is kept in case other cells rely on the name.
chroma_client = chromadb.Client()
# On-disk client: the index lives under <path>index/data_embeddings_movie.
client_persistent = chromadb.PersistentClient(path=path+'index/data_embeddings_movie')
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# once 'movies_db' already exists in the persisted index.
db = client_persistent.get_or_create_collection(name='movies_db', embedding_function=sentence_transformer_ef)

In [None]:
# Insert the precomputed vectors together with title and cast metadata.
# Chroma expects string ids and numeric vectors. When df comes from the saved
# CSV, 'ids' is int64 and 'embeddings' is read back as an object column, so both
# are normalised here; values that already have the right type pass through.
db.add(
    ids=df_sub['ids'].astype(str).tolist(),
    embeddings=[
        literal_eval(vector) if isinstance(vector, str) else vector
        for vector in df_sub['embeddings']
    ],
    metadatas= df_sub[['movie title', 'Stars']].to_dict('records')
)

In [None]:

# Open the persisted index again through a second client on the same directory.
client_persistent_2 = chromadb.PersistentClient(path=path+'index/data_embeddings_movie')

In [None]:
# Fetch the existing 'movies_db' collection from the reopened index.
# NOTE(review): no embedding_function is passed here, unlike where the collection
# is created — confirm that text queries will use the intended model.
db_2 = client_persistent_2.get_collection('movies_db')