In [None]:
### VIDEO TUTORIAL DEL NOTEBOOK: https://www.youtube.com/watch?v=MqbPVdLu0oU&t=68s

### Initialize Libraries and Keys

In [1]:
# Install libraries into kernel 
%pip install pinecone-client
%pip install tqdm
%pip install openai
%pip install langchain
%pip install tiktoken

Collecting pinecone-client
  Downloading pinecone_client-3.1.0-py3-none-any.whl.metadata (14 kB)
Downloading pinecone_client-3.1.0-py3-none-any.whl (210 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.0/211.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pinecone-client
Successfully installed pinecone-client-3.1.0
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Base Python data handling environment imports 
import pandas as pd
import os
from tqdm.auto import tqdm
import time

# Pinecone is a cloud-based Vector Database we'll use 
# to store embeddings
import pinecone

# OpenAI is used for the embedding LLM and GenAI model 
# used to generate responses
import openai

# Langchain is middleware that ties together the components 
# of the embedding and retrieval pipelines 

# The embedding chain creates searchable vectors of our data
from langchain.embeddings.openai import OpenAIEmbeddings

# A link in the chain to operate a chat session
from langchain.chat_models import ChatOpenAI

# We'll maintain some memory of the chat so follow-up questions
# will be context-sensitive
from langchain.chains.conversation.memory \
import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA

### Get Environment Variables

When using VSCode, install the dotenv extension and create an .env file with these contents:

OPENAI_KEY=YOUR_OPENAI_API_KEY

PINECONE_KEY=YOUR_PINECONE_KEY

In [2]:
# Hay que crearse una cuenta de pinecone para la database (dan 100eur gratis)

OPENAI_KEY=""
openai.api_key = OPENAI_KEY
EMBEDDING_MODEL="text-embedding-ada-002"
GENAI_MODEL='gpt-3.5-turbo'

PINECONE_KEY=""
PINECONE_ENV="gcp-starter"
PINECONE_INDEX_NAME="default" # this will be created below

### Read Input Data

In [37]:
# cleaned datos es un csv con en cada fila el texto de cada html. Hay que limpiarlo mas

df = pd.read_csv("AI AGENT.csv")
df.columns=["barrio", "distrito", "datos"]
df.insert(0, 'ID', range(1, 1 + len(df))) # primera columna con ids
df['ID'] = df['ID'].astype(str) # se ponen como strings
df.dropna() # elimninar valores a null

df.head()

Unnamed: 0,ID,barrio,distrito,datos
0,1,Madrid Centro,Palacio\r,"{\n \""comprar\"": [\n \""Precio de compra de pis..."
1,2,Madrid Centro,Sol,"{\n \""comprar\"": \""Precio de compra de piso..."
2,3,Arganzuela,Delicia,"{\n \""comprar\"": [\n \""Precio de com..."
3,4,Ciudad Lineal,Costillares,"{\n \""comprar\"": [\n \""Precio de com..."
4,5,Madrid Centro,embajadores-lavapies,"{\n \""comprar\"": \""Precio de compra de piso..."


In [None]:
df.shape

In [42]:
filtered_df = df.astype(str)   # Opción de filtrar o mejorar mas el dataframe

### Create the Pinecone Vector Database if does not exist

In [10]:
# Corregido para que funcione con la nueva api de pinecone
from pinecone import Pinecone

pinecone = Pinecone(api_key=PINECONE_KEY, environment = PINECONE_ENV)
index_list = pinecone.list_indexes()
if len(index_list) == 0:
    pinecone.create_index(
      name=PINECONE_INDEX_NAME,
      dimension=1536,
      metric="cosine",
      spec=ServerlessSpec(
        cloud="aws",
        region="us-west-2"
      )
    )

print(pinecone.describe_index(PINECONE_INDEX_NAME))
index = pinecone.Index(PINECONE_INDEX_NAME)

{'dimension': 1536,
 'host': 'default-b06ers8.svc.apw5-4e34-81fa.pinecone.io',
 'metric': 'cosine',
 'name': 'default',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-west-2'}},
 'status': {'ready': True, 'state': 'Ready'}}


### Generate Embedding Index

In [11]:
# This references the text-embedding-ada-002 OpenAI model we'll use to create embeddings 
# Both for indexing ground knowledge content, and later when searching ground knowledge
# For RAG documents to include in LLM Prompts

embed = OpenAIEmbeddings(
    model = EMBEDDING_MODEL,
    openai_api_key= OPENAI_KEY)

  warn_deprecated(


In [None]:
# en nuestro csv las columnas se llaman barrio, distrito y datos. 

# This is a for loop to create embeddings for each of the articles, and 
# Then add the embeddings and orgiional article text to the vector databse
batch_size = 20 

for i in tqdm(range(0, len(filtered_df), batch_size)):
    # OpenAPI has rate limits, and we use batches to slow the pace of embedding requests
    i_end = min(i+batch_size, len(filtered_df))
    batch = filtered_df.iloc[i:i_end]
    
    # When querying the Vector DB for nearest vectors, the metadata 
    # is what is returned and added to the LLM Prompt (the "Grounding Knowledge")
    meta_data = [{"barrio" : row['barrio'], 
                  "distrito": row['distrito'],
              "datos": row['datos']} 
             for i, row in batch.iterrows()]
    
    # Get a list of documents to submit to OpenAI for embedding  
    docs = batch['datos'].tolist() 
    emb_vectors = embed.embed_documents(docs) 

    # The original ID keys are used as the PK in the Vector DB
    ids = batch['ID'].tolist()
    
    # Add embeddings, associated metadata, and the keys to the vector DB
    to_upsert = zip(ids, emb_vectors, meta_data)    
    index.upsert(vectors=to_upsert)
    
    # Pause for 10 seconds after each batch to avoid rate limits
    time.sleep(10) 

### Submit a simple query to the Vector Index to ensure we it works!

In [44]:
from langchain.vectorstores import Pinecone
vectorstore = Pinecone(index, embed, "datos")
#preguntar algo de madrid
query = "Es caro el centro de madrid?" #ask some question that's answerable with the content added to the Vector DB
vectorstore.similarity_search(query, k=3)

[Document(page_content='The average price per square meter for buying a house in PAU de Carabanchel, Madrid Capital, as of February 2024, is approximately 3,369 €/m²1.\r\nThe average value of a property in this area is around 302,306 €1.\r\nThe price per square meter varies depending on the number of rooms:\r\nStudio or 1 room: 3,624 €/m²\r\n2 rooms: 3,353 €/m²\r\n3 rooms: 3,321 €/m²\r\nMore than 3 rooms: Price not available1.\r\nThe average value of a property also varies by size:\r\nLess than 100 m²: 266,273 €\r\nMore than 100 m²: 368,367 €1.\r\nThere are currently 17 properties for sale in the area1.', metadata={'barrio': 'Carabanchel', 'distrito': 'pau-de-carabanchel'}),
 Document(page_content='The average price per square meter for buying a house in Nueva España, Madrid Capital, as of February 2024, is approximately 6,474 €/m²1.\r\nThe average value of a property in this area is around 985,755 €1.\r\nThe price per square meter varies depending on the number of rooms:\r\nStudio or 

### Create a GPT 3.5 Turbo Chatbot with a 5 response memory

In [58]:
# Create a reference to the OpenAI LLM
llm = ChatOpenAI(openai_api_key = OPENAI_KEY,
                model_name = GENAI_MODEL,
                temperature = 0.0) # si se sube la temperatura es mas creativo (probarlo a 0.2)

# Ensure the chat session includes memory of 5 previous messages
conv_mem = ConversationBufferWindowMemory(
    memory_key = 'history',
    k = 5,
    return_messages =True)

# Create the chain to manage the chat session
qa = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff",
    retriever = vectorstore.as_retriever())

### DEMO: Andrew's conversation with the chatbot

In [60]:
qa.run("Estoy buscando vivir en un bonito apartamento dentro del presupuesto de 1000€ en Madrid. ¿A qué barrio voy?")

'Dentro de tu presupuesto de 1000€ para vivir en un bonito apartamento en Madrid, te recomendaría buscar en los barrios de Puerta Bonita, Pueblo Nuevo, o Embajadores - Lavapiés, ya que los precios de la vivienda en esos barrios se encuentran dentro de esa gama de precios.'

In [61]:
qa.run("Es importante para mi tener al menos tres habitaciones")

'En la zona de Quintana, el precio medio por metro cuadrado para comprar una vivienda de 3 habitaciones es de 3.307 €/m² y para alquilar sería de 1.419 €/m². Mientras que en Ventas, el precio medio por metro cuadrado para comprar una vivienda de 3 habitaciones es de 3.522 €/m² y para alquilar sería de 1.473 €/m². En Buena Vista, el precio medio por metro cuadrado para comprar una vivienda de 3 habitaciones es de 2.389 €/m² y para alquilar sería de 998 €/m².'

In [62]:
qa.run("¿Cúanto cuesta comprar un piso en esa zona?")

'En la zona de Butarque, Madrid, el precio medio de compra por metro cuadrado es de 2.693 €/m², con un valor medio de un inmueble de 222.453 €. Sin embargo, ten en cuenta que estos datos son aproximados y pueden variar dependiendo de las características específicas de la vivienda.'

### Now have a conversation about the documents that were added to the grounding data vector database

In [29]:
qa.run("Is Sol an expensive area in Madrid?")

  warn_deprecated(


"Based on the information provided, Sol seems to be a relatively expensive area in Madrid. The average price per square meter for buying property in Sol is around €7,767, which is higher than some other areas mentioned in the data. However, without a direct comparison to other areas, it's difficult to determine if Sol is one of the most expensive areas in Madrid."

In [30]:
qa.run("How many rooms do flats have there?")

'In the provided information, the flats mentioned have different numbers of rooms. In Palacio, the most common number of rooms is 3. In Pilar, the most common number of rooms is also 3. In Trafalgar, the most common number of rooms is 3 for buying and 1 for renting. In Embajadores - Lavapiés, the most common number of rooms is 3.'