## Proyecto de búsqueda

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from ast import literal_eval

------------------------------------

## Cargando base

In [2]:
# Location of the raw IMDb export (absolute path on the author's drive).
path_data = 'G:/Mi unidad/Cursos/Platzi/Embeddings/data/25k-imdb-movie-dataset.csv'
# Read the file with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=path_data)
# Print (rows, columns) as a quick sanity check on the load.
print(df.shape)

(24402, 12)


In [3]:
df.sample(3)

Unnamed: 0,movie title,Run Time,Rating,User Rating,Generes,Overview,Plot Kyeword,Director,Top 5 Casts,Writer,year,path
14865,Fruits Basket: Prelude,1 hour 28 minutes,7.6,177,"['Animation', 'Drama', 'Romance']",A compilation film of the Fruits Basket reboot...,['anime animation'],Yoshihide Ibata,"['Taku Kishimoto', 'Yoshimasa Hosoya', 'Manaka...",Natsuki Takaya,-2022,/title/tt15799550/
13585,Ruang talok 69,not-released,7.2,2.2K,"['Comedy', 'Crime', 'Thriller']","A woman, fired from a financial coorporation d...","['police officer', 'deafness', ""imagining one'...",Pen-Ek Ratanaruang,"['Tasanawalai Ongartittichai', 'Black Phomtong...",Pen-Ek Ratanaruang,-1999,/title/tt0235154/
20561,"Money, Women and Guns",not-released,6.2,356,"['Mystery', 'Western']",After a gold prospector is killed by masked ro...,"['reference to billy the kid', 'reference to p...",Richard Bartlett,"['Jock Mahoney', 'Kim Hunter', 'Tim Hovey', 'R...",Montgomery Pittman,-1958,/title/tt0051946/


In [4]:
# Fraction of missing values per column (the mean of a boolean mask is the share of True).
df.isna().mean()

movie title     0.000000
Run Time        0.000000
Rating          0.000000
User Rating     0.000000
Generes         0.000000
Overview        0.009999
Plot Kyeword    0.000000
Director        0.000000
Top 5 Casts     0.000000
Writer          0.000000
year            0.031883
path            0.000000
dtype: float64

--------------------------------

## Limpieza de datos

In [5]:
def concatenar_lista(lista):
    """Turn the text form of a list of strings into one space-separated string.

    Args:
        lista: String holding a Python list literal, e.g. "['a', 'b']".

    Returns:
        The list items joined with single spaces ('' for an empty list).
    """
    return ' '.join(literal_eval(lista))

In [6]:
def string_to_list(lista):
    """Parse the text form of a Python literal (here: a list) into the object itself.

    Args:
        lista: String holding a Python literal, e.g. "['Action', 'Drama']".

    Returns:
        The value produced by ``ast.literal_eval`` for that string.
    """
    return literal_eval(lista)

In [7]:
# Replace every missing value with a single space, then derive the cleaned
# columns in one chained step:
#   Keywords / Stars -> list literals flattened into space-separated text
#   Generes          -> list literal parsed into a real Python list
#   Rating           -> numeric, with unparseable values turned into 0.0
df = df.fillna(' ').assign(
    Keywords=lambda frame: frame['Plot Kyeword'].apply(concatenar_lista),
    Stars=lambda frame: frame['Top 5 Casts'].apply(concatenar_lista),
    Generes=lambda frame: frame['Generes'].apply(string_to_list),
    Rating=lambda frame: pd.to_numeric(frame['Rating'], errors="coerce").fillna(0).astype("float"),
)

In [8]:
# Flatten the per-movie genre lists into one row per genre and keep the
# distinct values; the result is reused later as the dropdown choices.
unique_generes = df['Generes'].explode().unique()
unique_generes

array(['Action', 'Drama', 'Adventure', 'Sci-Fi', 'Animation', 'Crime',
       'Comedy', 'Thriller', 'Fantasy', 'Horror', 'History', 'Mystery',
       'Biography', 'War', 'Western', 'Sport', 'Family', 'Romance',
       'Music', 'Musical', 'Film-Noir', 'Game-Show', 'Adult',
       'Reality-TV'], dtype=object)

In [9]:
# The raw list-literal columns are no longer needed once Keywords/Stars exist.
df.drop(columns=['Plot Kyeword', 'Top 5 Casts'], inplace=True)

In [10]:
# Text that gets embedded: overview, keywords and cast separated by single spaces.
df['text'] = df['Overview'].astype(str) + ' ' + df['Keywords'] + ' ' + df['Stars']

In [11]:
df.sample(3)

Unnamed: 0,movie title,Run Time,Rating,User Rating,Generes,Overview,Director,Writer,year,path,Keywords,Stars,text
11393,"Mystery, Alaska",1 hour 59 minutes,6.7,27K,"[Comedy, Drama, Sport]",The residents of a small town get over-excited...,Jay Roach,David E. Kelley,-1999,/title/tt0134618/,small town alaska male nudity hockey movie usa...,Sean O'Byrne Russell Crowe Burt Reynolds Hank ...,The residents of a small town get over-excited...
19266,The Quatermass Xperiment,1 hour 18 minutes,6.6,6.3K,"[Horror, Sci-Fi]",Professor Bernard Quatermass' manned rocket sh...,Val Guest,Richard H. Landau,-1955,/title/tt0049646/,inspector assistant scotland yard fingerprint ...,Val Guest Nigel Kneale Brian Donlevy Jack Warn...,Professor Bernard Quatermass' manned rocket sh...
14364,Shergar,not-released,5.4,482,"[Crime, Drama, Romance]",When a young stable boy runs away from reform ...,Dennis C. Lewiston,Dennis C. Lewiston,-1999,/title/tt0121744/,horse movie horse sport champion terrorist ira...,Alan Barker Billy Boyle Stephen Brennan Dennis...,When a young stable boy runs away from reform ...


------------------------------------

## Embeddings model

In [12]:
# Load the pretrained sentence-embedding model with this identifier; the same
# `model` object is used below to encode both the movie texts and the queries.
model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [16]:
# Character count of each text to embed, followed by its summary statistics.
df['length_text'] = df['text'].str.len()
df['length_text'].describe()

count    24402.000000
mean       341.046349
std         84.094446
min          8.000000
25%        295.000000
50%        345.000000
75%        394.000000
max        686.000000
Name: length_text, dtype: float64

In [13]:
# Encode every movie text into a dense vector.
# A plain list is passed instead of the Series: the encoder indexes its input
# after sorting by length, and indexing a Series goes by label, which is only
# correct while the frame still has its default 0..n-1 index.
embeddings = model.encode(df['text'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/763 [00:00<?, ?it/s]

In [14]:
# Store each vector as a plain Python list next to its movie row.
df['embeddings'] = embeddings.tolist()
# Vector ids are the row labels rendered as strings.
df['ids'] = df.index.astype('str')

In [15]:
df.sample(3)

Unnamed: 0,movie title,Run Time,Rating,User Rating,Generes,Overview,Director,Writer,year,path,Keywords,Stars,text,embeddings,ids
23785,Monsieur Verdoux,2 hours 4 minutes,7.8,17K,"[Comedy, Crime, Drama]",A suave but cynical man supports his family by...,Charles Chaplin,Charles Chaplin,-1947,/title/tt0039631/,serial killer black comedy execution polygamy ...,Orson Welles Charles Chaplin Mady Correll Alli...,A suave but cynical man supports his family by...,"[-0.07911019027233124, 0.4310271739959717, -0....",23785
13448,Number Seventeen,1 hour 6 minutes,5.7,4.8K,"[Crime, Mystery, Thriller]",A gang of thieves gather at a safe house follo...,Alfred Hitchcock,Joseph Jefferson Farjeon,-1932,/title/tt0023285/,shadow based on play based on novel pretending...,Alma Reville Alfred Hitchcock Leon M. Lion Ann...,A gang of thieves gather at a safe house follo...,"[0.3045026659965515, 0.07984738051891327, -0.0...",13448
20539,The Last Hunt,1 hour 48 minutes,6.9,1.6K,"[Drama, Western]","In 1883 South Dakota, two buffalo hunters star...",Richard Brooks,Richard Brooks,-1956,/title/tt0049432/,south dakota native american reservation based...,Milton Lott Robert Taylor Stewart Granger Lloy...,"In 1883 South Dakota, two buffalo hunters star...","[0.1664801687002182, 0.15159517526626587, -0.1...",20539


In [None]:
# Save the frame so the embeddings do not have to be computed again.
df.to_csv(
    'G:/Mi unidad/Cursos/Platzi/Embeddings/data/25k-imdb-movie-dataset-new.csv',
    index=False,
    sep='|',
)

In [None]:
# Reload the cached frame written by the previous cell.
# CSV stores every cell as text, so the list-valued columns come back as
# strings such as "[0.1, 0.2, ...]" unless they are parsed again. Without this,
# len(df['embeddings'][0]) would measure a string and the upsert would send
# text instead of vectors.
path_data = 'G:/Mi unidad/Cursos/Platzi/Embeddings/data/25k-imdb-movie-dataset-new.csv'
df = pd.read_csv(
    path_data,
    sep='|',
    # Turn the list literals back into real Python lists.
    converters={'embeddings': literal_eval, 'Generes': literal_eval},
    # Keep ids as strings; they would otherwise be inferred as integers.
    dtype={'ids': str},
)
print(df.shape)

(24402, 12)


In [None]:
df.sample(3)

Unnamed: 0,movie title,Run Time,Rating,User Rating,Generes,Overview,Plot Kyeword,Director,Top 5 Casts,Writer,year,path
14865,Fruits Basket: Prelude,1 hour 28 minutes,7.6,177,"['Animation', 'Drama', 'Romance']",A compilation film of the Fruits Basket reboot...,['anime animation'],Yoshihide Ibata,"['Taku Kishimoto', 'Yoshimasa Hosoya', 'Manaka...",Natsuki Takaya,-2022,/title/tt15799550/
13585,Ruang talok 69,not-released,7.2,2.2K,"['Comedy', 'Crime', 'Thriller']","A woman, fired from a financial coorporation d...","['police officer', 'deafness', ""imagining one'...",Pen-Ek Ratanaruang,"['Tasanawalai Ongartittichai', 'Black Phomtong...",Pen-Ek Ratanaruang,-1999,/title/tt0235154/
20561,"Money, Women and Guns",not-released,6.2,356,"['Mystery', 'Western']",After a gold prospector is killed by masked ro...,"['reference to billy the kid', 'reference to p...",Richard Bartlett,"['Jock Mahoney', 'Kim Hunter', 'Tim Hovey', 'R...",Montgomery Pittman,-1958,/title/tt0051946/


In [None]:
# Imports needed by the Pinecone cells below. `getpass` and `pinecone` are
# used there but were never imported, so a fresh kernel failed with NameError.
from getpass import getpass

import pinecone
from langchain.vectorstores import Chroma

In [None]:
# Ask for the Pinecone API key at run time instead of hard-coding it in the notebook.
pincone_api = getpass('Enter the secret value: ')

In [None]:
# Initialise the Pinecone client with the key entered above.
# NOTE(review): the environment name is hard-coded; presumably it has to match
# the Pinecone project this key belongs to — confirm before reuse.
pinecone.init(api_key=pincone_api, environment="asia-southeast1-gcp-free")

In [None]:
# Vector size is read from the first stored embedding (by position) so the
# index dimension always matches what the encoder produced.
dimensions_embeddings = len(df['embeddings'].iloc[0])
index_name = 'movies-embeddings'
all_index = pinecone.list_indexes()
if index_name not in all_index:
    # The encoder is the "...-dot-v1" model, whose embeddings are not
    # normalised and are meant to be compared by dot product, so the index is
    # created with that metric rather than cosine. An index that already
    # exists keeps the metric it was created with.
    pinecone.create_index(index_name, dimension=dimensions_embeddings, metric="dotproduct")
# One handle, whether the index already existed or was just created.
index = pinecone.Index(index_name)

In [None]:
# Bind `index` straight to the 'movies-embeddings' Pinecone index, so this cell
# can be run by itself when the index is already known to exist.
index = pinecone.Index('movies-embeddings')

In [None]:
from tqdm.auto import tqdm

# Upload the vectors to Pinecone in chunks of 64 rows.
batch_size = 64
total_rows = len(df)

for start in tqdm(range(0, total_rows, batch_size)):
    # The last chunk may be shorter than batch_size.
    stop = min(start + batch_size, total_rows)
    chunk = df.iloc[start:stop]

    # Every column except the id, the vector, the embedded text and the
    # path travels with the vector as metadata.
    chunk_metadata = chunk.drop(columns=['ids', 'embeddings', 'text', 'path']).to_dict(orient='records')

    # One (id, vector, metadata) tuple per row.
    records = list(zip(chunk['ids'], chunk['embeddings'], chunk_metadata))
    index.upsert(vectors=records)

# Show the index statistics to check that all vectors were stored.
index.describe_index_stats()

In [None]:
# Smoke test: embed one free-text query and fetch the three nearest movies
# whose genre list contains 'Action'.
query = 'a history of time travel and science'
query_vector = model.encode(query).tolist()

responses = index.query(
    vector=query_vector,
    top_k=3,
    include_metadata=True,
    filter={"Generes": {"$in": ['Action']}},
)

In [None]:
responses

In [None]:
def search(query, genre, rating, top_k):
    """Run a metadata-filtered semantic search over the movie index.

    Uses the notebook-level `model` to embed the query and the notebook-level
    `index` to run the vector search.

    Args:
        query: Free-text description of the movie to look for.
        genre: Genre that must appear in a movie's "Generes" metadata; a falsy
            value (None or '') disables the genre filter.
        rating: Minimum "Rating" (inclusive); a falsy value means 0.
        top_k: Number of results wanted. UI widgets may pass it as a float
            (e.g. 3.0), so it is coerced to an integer of at least 1.

    Returns:
        pandas.DataFrame with one row per match and the columns Title,
        Overview, Director, Genre, year, Rating and Score. When nothing
        matches, the frame is empty but still has those columns.
    """
    query_vector = model.encode(query).tolist()

    # The rating filter is always applied; the genre filter only when given.
    conditions = {"Rating": {"$gte": rating if rating else 0}}
    if genre:
        conditions["Generes"] = {"$in": [genre]}

    responses = index.query(
        vector=query_vector,
        # The query needs an integer count; a missing or zero value falls back to 1.
        top_k=max(1, int(top_k or 1)),
        include_metadata=True,
        filter=conditions,
    )

    # Flatten the matches into display rows; metadata fields that were not
    # stored for a vector show up as None instead of raising KeyError.
    columns = ['Title', 'Overview', 'Director', 'Genre', 'year', 'Rating', 'Score']
    rows = []
    for match in responses['matches']:
        metadata = match['metadata']
        rows.append({
            'Title': metadata.get('movie title'),
            'Overview': metadata.get('Overview'),
            'Director': metadata.get('Director'),
            'Genre': metadata.get('Generes'),
            'year': metadata.get('year'),
            'Rating': metadata.get('Rating'),
            'Score': match['score'],
        })

    # Explicit columns keep the table header even when there are no matches.
    return pd.DataFrame(rows, columns=columns)



In [None]:
import gradio as gr

# Genre choices for the dropdown, taken from the genres found in the dataset.
genres = unique_generes.tolist()
iface = gr.Interface(
    fn=search,
    inputs=[
        gr.Textbox(lines=5, placeholder="Escribe aquí tu consulta...", label="Consulta"),
        gr.Dropdown(choices=genres, label="Género de la película"),
        gr.Slider(minimum=1, maximum=10, value=5, label="Puntuación mínima"),
        # precision=0 makes the widget return an int; without it the result
        # count reaches `search` as a float such as 3.0.
        gr.Number(minimum=1, maximum=10, value=3, precision=0, label="Número de resultados"),
    ],
    outputs=gr.Dataframe(type="pandas", label="Resultados"),
    title="Buscador de películas",
    description="Introduce tu consulta, selecciona un género y define una puntuación mínima para buscar películas.",
)

# Start the web UI.
iface.launch()
