# 🌲  Pinecone quickstart  

Author: J Visbeen  
Target: Practice  
Main source: https://github.com/pinecone-io/examples/blob/master/learn/search/hybrid-search/ecommerce-search/ecommerce-search.ipynb

In [100]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os
load_dotenv()

pinecone = Pinecone(api_key=os.getenv('PINECONE_API'))
index = pinecone.Index('amazonproducts')
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

Create index only if not already exist. I found it easier to make one through the UI.

In [None]:
'''
pinecone.create_index(
    name='amazonproducts',
    dimension=8,
    metric='cosine',
    spec=ServerlessSpec(
        cloud='GCP',
        region='us-central1-gcp'
    )
)
'''

Prep dataset

In [31]:
import pandas as pd

df_categories = (
    pd.read_csv('../amazon_categories.csv')
    .dropna()
    .reset_index()
)

df_products = (
    pd.read_csv('../amazon_products.csv')
    .dropna()
    .sample(1000000)
    .reset_index()
)
pd.read_excel('').dropna()
merged_df = pd.merge(df_products, df_categories, left_on='category_id', right_on='id', how='left')
merged_df.drop('category_id', axis=1)
merged_df['title']

0         mDesign Large Metal Farmhouse Storage Organize...
1         Vegan Premium Canvas Fanny Pack Diaper Bag: Wi...
2         100+PCS Technic Gears and Axles Compatible wit...
3         Girl's Graphic Print Long Sleeve Sweatshirt To...
4         Women Boho Handmade Natural Stone Crystal Leat...
                                ...                        
999995    Saucer Swing for Kids, 45 Inch Swing Chair, Ou...
999996    LED Strobe Beacon Lights 12V Amber Flashing Wa...
999997    Dust Free Hamster Bath Sand/Natural Cleansing ...
999998    Fast Wall Charger 5FT Charging Cable Cord fit ...
999999    to My Son Pocket Hug Token with Leather Keycha...
Name: title, Length: 1000000, dtype: object

Train BM25 encoder.

In [32]:
from pinecone_text.sparse import BM25Encoder

t_set = merged_df['title'].tolist()
bm25 = BM25Encoder()
bm25.fit(t_set)

  0%|          | 0/1000000 [00:00<?, ?it/s]

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x715e0ad76b90>

In [61]:
bm25.encode_documents(merged_df['title'].tolist()[0])

{'indices': [207125060,
  2177509083,
  3594301313,
  2654641506,
  1119423170,
  3337197764,
  1516167604,
  805888890,
  2357056077,
  1027044273,
  1579027372,
  693667937,
  2626449335,
  91242678,
  2361787631,
  3033147134,
  869237476,
  303109060,
  2784200036,
  1014127006,
  19522071,
  453650801,
  1043209972],
 'values': [0.38406565826405037,
  0.38406565826405037,
  0.38406565826405037,
  0.38406565826405037,
  0.38406565826405037,
  0.554981847820371,
  0.38406565826405037,
  0.554981847820371,
  0.38406565826405037,
  0.38406565826405037,
  0.38406565826405037,
  0.38406565826405037,
  0.38406565826405037,
  0.38406565826405037,
  0.38406565826405037,
  0.38406565826405037,
  0.38406565826405037,
  0.38406565826405037,
  0.38406565826405037,
  0.38406565826405037,
  0.38406565826405037,
  0.38406565826405037,
  0.38406565826405037]}

Test sentence transformer

In [85]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [114]:
meta = merged_df[:1000:]
db_sample_set = meta['title'].tolist()
dense_vec = model.encode(db_sample_set[0])

In [87]:
meta.iloc[1]

index_x                                                        1209568
asin                                                        B0C762Z1F5
title                Vegan Premium Canvas Fanny Pack Diaper Bag: Wi...
imgUrl               https://m.media-amazon.com/images/I/61wzaz-+mj...
productURL                        https://www.amazon.com/dp/B0C762Z1F5
stars                                                              4.6
reviews                                                              0
price                                                             89.0
listPrice                                                          0.0
category_id                                                         36
isBestSeller                                                     False
boughtInLastMonth                                                    0
index_y                                                             35
id                                                                  36
catego

Encode data and upsert data

In [115]:
from tqdm.auto import tqdm

upserts = []
i = 0
for item in tqdm(range(0, len(db_sample_set))):
    upserts.append({
        'id': str(i),
        'sparse_values': bm25.encode_documents(db_sample_set[i]),
        'values': model.encode(db_sample_set[i]).tolist(),
        'metadata': meta.iloc[i].to_dict()
    })
    i+=1

index.upsert(upserts)

  0%|          | 0/1000 [00:00<?, ?it/s]

{'upserted_count': 1000}

Query the data

In [118]:
query = 'Beautiful flowers for in the garden'

sparse = bm25.encode_queries(query)
dense = model.encode(query).tolist()

result = index.query(
    top_k=20,
    vector=dense,
    sparse_vector=sparse,
    include_metadata=True
)
result

{'matches': [{'id': '209',
              'metadata': {'asin': 'B096FFCYCJ',
                           'boughtInLastMonth': 400.0,
                           'category_id': 163.0,
                           'category_name': 'Bath Products',
                           'id': 163.0,
                           'imgUrl': 'https://m.media-amazon.com/images/I/71XV0KDDa-S._AC_UL320_.jpg',
                           'index_x': 388935.0,
                           'index_y': 149.0,
                           'isBestSeller': False,
                           'listPrice': 0.0,
                           'price': 19.99,
                           'productURL': 'https://www.amazon.com/dp/B096FFCYCJ',
                           'reviews': 0.0,
                           'stars': 4.6,
                           'title': 'Rustic Farmhouse Shower Curtain, Farm '
                                    'Teal Daisy Floral Flowers and Butterfly '
                                    'on Country Wooden Shower Cu

In [155]:
def hybrid_scale(dense_query, sparse_query, alpha: float):
    
    if alpha < 0 or alpha > 1:
        raise ValueError("aplha needs to be between 1 or 0")
    hsparse = {
        'indices': sparse_query['indices'],
        'values':  [v * (1.0 - alpha) for v in sparse_query['values']]
    }
    hdense = []
    for i in dense_query:
        for j in i:
            hdense.append(dense_query[i][j] * alpha)
    
    hdense = [v * alpha for v in dense_query]
    
    return hdense, hsparse

In [156]:
def hybrid_query(query: str, top_k: int, alpha: float):
    sparse_vec = bm25.encode_queries(query)
    dense_vec = model.encode([query]).tolist()

    dense_vec, sparse_vec = hybrid_scale(
        sparse_query=sparse_vec,
        dense_query=dense_vec,
        alpha=alpha)
    
    result = index.query(
        vector=dense_vec,
        sparse_vector=sparse_vec,
        top_k=top_k,
        include_metadata=True
    )

    return result

In [157]:
hybrid_query('Shooter games for kids', 10, 0.3)

TypeError: list indices must be integers or slices, not list