# Workshop: Using Cloud tools for Information Retrieval

In [1]:
!pip install chromadb



In [2]:
!pip install pinecone-client




## Objective:

Learn how to use two powerful vector databases, ChromaDB and Pinecone, for performing similarity searches with text embeddings. Vector databases are essential tools in the field of Information Retrieval (IR) and are widely used in various applications such as search engines, recommendation systems, and natural language processing (NLP).


In [15]:
import pandas as pd
import chromadb
import numpy as np
from pinecone import Pinecone, ServerlessSpec
from google.colab import drive
from gensim.models import KeyedVectors
import gensim.downloader as api

In [18]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Start API Key

In [5]:
pc = Pinecone(api_key="a9b0bbf3-6d88-493d-80aa-9f12c74e8a0a")

Create index called viernes300 with 300 dimensions, cosine metric, and aws serveless

In [9]:
pc.create_index(
    name="viernes300",
    dimension=300, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

In [10]:
index = pc.Index("viernes300")

Import data and load word2vec model

In [11]:
wine_df = pd.read_csv("/content/drive/MyDrive/bases_de_datos_para_colab/winemag-data_first150k.csv", delimiter=",")
wine_df

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude
...,...,...,...,...,...,...,...,...,...,...,...
150925,150925,Italy,Many people feel Fiano represents southern Ita...,,91,20.0,Southern Italy,Fiano di Avellino,,White Blend,Feudi di San Gregorio
150926,150926,France,"Offers an intriguing nose with ginger, lime an...",Cuvée Prestige,91,27.0,Champagne,Champagne,,Champagne Blend,H.Germain
150927,150927,Italy,This classic example comes from a cru vineyard...,Terre di Dora,91,20.0,Southern Italy,Fiano di Avellino,,White Blend,Terredora
150928,150928,France,"A perfect salmon shade, with scents of peaches...",Grand Brut Rosé,90,52.0,Champagne,Champagne,,Champagne Blend,Gosset


In [20]:
# Load pre-trained Word2Vec model
#word2vec_model = api.load('word2vec-google-news-300')
word2vec_model = KeyedVectors.load_word2vec_format("//content/drive/MyDrive/bases_de_datos_para_colab/GoogleNews-vectors-negative300.bin", binary=True)

Define corpus

In [21]:
corpus = wine_df[['Unnamed: 0','description']][:30]
corpus

Unnamed: 0.1,Unnamed: 0,description
0,0,This tremendous 100% varietal wine hails from ...
1,1,"Ripe aromas of fig, blackberry and cassis are ..."
2,2,Mac Watson honors the memory of a wine once ma...
3,3,"This spent 20 months in 30% new French oak, an..."
4,4,"This is the top wine from La Bégude, named aft..."
5,5,"Deep, dense and pure from the opening bell, th..."
6,6,Slightly gritty black-fruit aromas include a s...
7,7,Lush cedary black-fruit aromas are luxe and of...
8,8,This re-named vineyard was formerly bottled as...
9,9,The producer sources from two blocks of the vi...


Create function for making embedding with word2vec

In [22]:
def generate_word2vec_embeddings(texts):
    embeddings = []
    for text in texts:
        tokens = text.lower().split()
        word_vectors = [word2vec_model[word] for word in tokens if word in word2vec_model]
        if word_vectors:
            embeddings.append(np.mean(word_vectors, axis=0))
        else:
            embeddings.append(np.zeros(word2vec_model.vector_size))
    return np.array(embeddings)


word2vec_embeddings = generate_word2vec_embeddings(corpus['description'])
print("Word2Vec Embeddings:", word2vec_embeddings)
print("Word2Vec Shape:", word2vec_embeddings.shape)

Word2Vec Embeddings: [[ 1.97866447e-02  3.41472141e-02 -8.84628296e-03 ... -1.57333612e-02
   6.62658662e-02 -2.78472900e-02]
 [ 1.68609619e-03 -1.24740601e-03 -6.54935837e-04 ... -4.45375443e-02
   6.40835762e-02  3.22151184e-02]
 [-1.75819397e-02  6.40892386e-02  2.40856409e-02 ... -4.09250259e-02
   9.11022425e-02  1.76935196e-02]
 ...
 [ 2.24064998e-02  5.79735003e-02 -1.06399122e-04 ... -2.36428753e-02
   6.22100830e-02 -2.15536579e-02]
 [ 7.06324074e-03  3.76312993e-02  2.09319014e-02 ... -3.86342034e-02
   8.60798284e-02  3.15088741e-02]
 [ 2.73168087e-02  3.64656448e-02  5.79528809e-02 ... -7.14282990e-02
   1.07534885e-01  2.65576839e-02]]
Word2Vec Shape: (30, 300)


Create vector to Pinecone

In [25]:
vectors = [{'id': str(i), 'values': word2vec_embeddings[i]} for i in range(30)]

In [26]:
pd.DataFrame(vectors)

Unnamed: 0,id,values
0,0,"[0.019786645, 0.034147214, -0.008846283, 0.051..."
1,1,"[0.0016860962, -0.001247406, -0.00065493584, 0..."
2,2,"[-0.01758194, 0.06408924, 0.02408564, 0.108896..."
3,3,"[0.032550234, 0.04590465, 0.0056318804, 0.1015..."
4,4,"[0.021037823, 0.0036380924, 0.03481565, 0.0596..."
5,5,"[0.024773298, 0.03794842, -0.007517787, 0.0935..."
6,6,"[-0.013163249, -0.010037509, -0.005572695, 0.1..."
7,7,"[0.028636653, 0.0016968889, -0.0020921289, 0.1..."
8,8,"[0.033648174, 0.006037839, -0.031522624, 0.078..."
9,9,"[0.018751256, 0.038642913, 0.0055281697, 0.069..."


In [27]:
index.upsert(vectors=vectors, namespace='vectors')

{'upserted_count': 30}

In [28]:
print(index.describe_index_stats())

{'dimension': 300,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


Create a query and use embeddings with word2vec. Then get the answer

In [29]:
query = 'rose'
query_vector = generate_word2vec_embeddings([query])
pd.DataFrame(query_vector)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.094727,0.060791,-0.163086,-0.018311,0.123047,-0.337891,0.076172,-0.146484,0.289062,0.080566,...,0.044678,-0.014282,-0.326172,0.108887,-0.347656,-0.166992,-0.168945,-0.139648,0.259766,0.174805


In [30]:
index.query(
    namespace="vectors",
    vector=query_vector.tolist(),
    top_k=3,
    include_values=False
)

{'matches': [{'id': '26', 'score': 0.368666798, 'values': []},
             {'id': '25', 'score': 0.305214375, 'values': []},
             {'id': '1', 'score': 0.286543876, 'values': []}],
 'namespace': 'vectors',
 'usage': {'read_units': 5}}

In [31]:
wine_df[wine_df['Unnamed: 0'] == 27]['description']

Unnamed: 0,description
27,"Focused and dense, this intense wine captures ..."


## Chromadb

Start with a client, create collections, and ids with vectors

In [32]:
client = chromadb.Client()

In [33]:
collection = client.create_collection(name="viernes_collection")

In [34]:
ids = [str(i) for i in range(30)]
vectors = [word2vec_embeddings[i].tolist() for i in range(30)]

In [35]:
collection.add(ids=ids, embeddings=vectors)

Create a query and get the answer

In [36]:
query = 'rose'
query_vector = generate_word2vec_embeddings([query])[0].tolist()

Get 3 asnwers

In [39]:
answer = collection.query(query_embeddings=query_vector, n_results=3)
answer

{'ids': [['26', '25', '4']],
 'distances': [[10.757364273071289, 11.1366548538208, 11.362028121948242]],
 'metadatas': [[None, None, None]],
 'embeddings': None,
 'documents': [[None, None, None]],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [40]:
for dist,id in zip(answer['distances'][0], answer['ids'][0]):
   print('')
   print('id: ',id)
   print('texto: ',wine_df[wine_df['Unnamed: 0'] == int(id)]['description'].values[0])
   print('distancia: ',dist)


id:  26
texto:  Bergström has made a Shea designate since 2003, intent on showcasing a “pretty” style of the vineyard. Here are lovely aromatics, with grape jelly, rose petals and plum in the nose, and a complex run of red fruits in the mouth. Streaks of cola, brown sugar and more come up in a generous finish.
distancia:  10.757364273071289

id:  25
texto:  Yields were down in 2015, but intensity is up, giving this medium-bodied, silky wine the potential to drink well through at least 2025. Hickory smoke outlines white peach before ending in a long flurry of lime zest.
distancia:  11.1366548538208

id:  4
texto:  This is the top wine from La Bégude, named after the highest point in the vineyard at 1200 feet. It has structure, density and considerable acidity that is still calming down. With 18 months in wood, the wine has developing an extra richness and concentration. Produced by the Tari family, formerly of Château Giscours in Margaux, it is a wine made for aging. Drink from 2020.
d