# This notebook is intended to create the initial knowledgebase for pinecone. We shall be having another page for expanding the knowledgebase


In [1]:
import pandas as pd


In [2]:

df = pd.read_excel("Goederen code knowledge base.xlsx")

In [3]:
df.columns

Index(['Item Name', 'Goederen Omschrijving', 'Goederen Code (HS Code)'], dtype='object')

In [4]:
import os
from dotenv import load_dotenv
load_dotenv(override=True)

True

In [5]:
df

Unnamed: 0,Item Name,Goederen Omschrijving,Goederen Code (HS Code)
0,Engine Valve Cover Gasket,AUTO PAKKING,84841010
1,SG3 Hide and Seek Keychain Pendant,SPEELGOED,95030000
2,Cross Necklace with Secret Compartment,ONEDELE LIJFSIERADEN,71171910
3,Plush Stuffed Figure Pillow,SPEELGOED,95030000
4,Radiator Coolant Water Hose,FITTING,73249000
...,...,...,...
527,Full diamond mx grips bicycle handles,FIETSDELEN,87149110
528,Maymom Pump Parts Compatible with Spectra S1 S...,POMP DELEN,84138100
529,Replacement Backflow Protector Tubing Compatib...,STOFZUIGERS DELEN,85086000
530,Puffy 3d paint,ART ARTIKLES,97019090


In [6]:
from langchain.embeddings.openai import OpenAIEmbeddings
from pinecone import Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore

  from .autonotebook import tqdm as notebook_tqdm

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


In [7]:
embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))  

  embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))


In [8]:
import getpass
import os
# Pinecone client
if not os.getenv("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter your Pinecone API key: ")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

In [9]:
index_name = "ship2aruba" 
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [10]:
index = pc.Index(index_name)
vector_store = PineconeVectorStore(index=index, embedding=embeddings)


In [11]:
from langchain_core.documents import Document
# How pinecone expects each chunk to be
docs = [Document(page_content="The description of this item is:"+ row['Item Name'] + "/nThe description of the goods is:"+row["Goederen Omschrijving"],metadata={"desc":row["Item Name"],"code":row["Goederen Code (HS Code)"],"gdesc":row["Goederen Omschrijving"],"category":"NaN"}) for index, row in df.iterrows()]


In [12]:
docs # 

[Document(metadata={'desc': 'Engine Valve Cover Gasket', 'code': 84841010, 'gdesc': 'AUTO PAKKING', 'category': 'NaN'}, page_content='The description of this item is:Engine Valve Cover Gasket/nThe description of the goods is:AUTO PAKKING'),
 Document(metadata={'desc': 'SG3 Hide and Seek Keychain Pendant', 'code': 95030000, 'gdesc': 'SPEELGOED', 'category': 'NaN'}, page_content='The description of this item is:SG3 Hide and Seek Keychain Pendant/nThe description of the goods is:SPEELGOED'),
 Document(metadata={'desc': 'Cross Necklace with Secret Compartment', 'code': 71171910, 'gdesc': 'ONEDELE LIJFSIERADEN', 'category': 'NaN'}, page_content='The description of this item is:Cross Necklace with Secret Compartment/nThe description of the goods is:ONEDELE LIJFSIERADEN'),
 Document(metadata={'desc': 'Plush Stuffed Figure Pillow', 'code': 95030000, 'gdesc': 'SPEELGOED', 'category': 'NaN'}, page_content='The description of this item is:Plush Stuffed Figure Pillow/nThe description of the goods 

In [13]:
# Now add all of the docs in the pinceone namespace
uuids = [str(doc.metadata["desc"]).encode("ascii","ignore").decode() for doc in ((docs))]
batch_size = 10
for i in range(0, len(docs), batch_size):
    batch = docs[i:i+batch_size]
    batch_ids = uuids[i:i+batch_size]
    print("Current Batch Index is:",batch_ids)
    vector_store.add_documents(batch,ids=batch_ids)

Current Batch Index is: ['Engine Valve Cover Gasket', 'SG3 Hide and Seek Keychain Pendant', 'Cross Necklace with Secret Compartment', 'Plush Stuffed Figure Pillow', 'Radiator Coolant Water Hose', 'Id Lanyard', 'Boys Uniform Quick Dry Chino Shorts', 'Maternity Underwear', "Women's Socks & Ankle Socks", "Women's Shoes"]
Current Batch Index is: ["Women's Slim Fit T-Shirt", 'Universal Remote Controls', 'Casio Digital Watch', "Automatic Men's Watch", "100 Rock 'N' Roll Hits 4CD", 'FINIS Forearm Fulcrum Swim Trainer', "Fizik Men's Biking Shoes (2 pairs)", "Various Women's Dresses & Jumpsuits", 'Cubic Zirconia Heart Ring', "Women's Headscarf/Turban"]
Current Batch Index is: ["Women's Bikinis", "Tween Girls' Clothing Sets", 'OLED Display for iPhone 13', 'Car Windshield Sun Shade', 'Reflective Polarizer Film', 'Korean Skincare Ampoule, Toner Pads, Cream, Toner', 'Curling Iron Wand', 'Hair Color', 'Eyelash Extensions', 'Hair Crimper Waver']
Current Batch Index is: ['Dog Wart Remover', 'NuBest Ta

In [14]:
vector_store.similarity_search(query="Something used for freshening",k=20)

[Document(id='Chemical Guys New Car Smell Freshener', metadata={'category': 'NaN', 'code': 33073000.0, 'desc': 'Chemical Guys New Car Smell Freshener', 'gdesc': 'LUCHTVERFRISSER'}, page_content='The description of this item is:Chemical Guys New Car Smell Freshener/nThe description of the goods is:LUCHTVERFRISSER'),
 Document(id='Chemical Guys Cherry Blast Freshener', metadata={'category': 'NaN', 'code': 33073000.0, 'desc': 'Chemical Guys Cherry Blast Freshener', 'gdesc': 'LUCHTVERFRISSER'}, page_content='The description of this item is:Chemical Guys Cherry Blast Freshener/nThe description of the goods is:LUCHTVERFRISSER'),
 Document(id='LITTLE TREES Car Air Freshener (Watermelon)', metadata={'category': 'NaN', 'code': 33073000.0, 'desc': 'LITTLE TREES Car Air Freshener (Watermelon)', 'gdesc': 'LUCHTVERFRISSER'}, page_content='The description of this item is:LITTLE TREES Car Air Freshener (Watermelon)/nThe description of the goods is:LUCHTVERFRISSER'),
 Document(id='Little Trees Bubble 