In [2]:
%pip install -qU \
    langchain==0.0.354 \
    openai==1.6.1 \
    datasets==2.10.1 \
    pinecone-client==3.1.0 \
    tiktoken==0.5.2

Note: you may need to restart the kernel to use updated packages.


In [11]:
import os
from langchain_openai import ChatOpenAI

os.environ["API_KEY"]="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"

chat = ChatOpenAI(
    openai_api_key=os.environ["API_KEY"],
    model='gpt-3.5-turbo'
)

In [12]:
from datasets import load_dataset

# Specify the path to your dataset folder
dataset_folder_path = r"C:\Users\mirut\OneDrive\Desktop\INT2\Miruthula's Datasets\Miruthula's Datasets\California_Rail_Stations.csv"

# Load the dataset
dataset = load_dataset(
    "csv",  # Assuming your dataset is in CSV format
    data_files=dataset_folder_path,
    split="train"
)


Found cached dataset csv (C:/Users/mirut/.cache/huggingface/datasets/csv/default-1a90c5bceca93ed0/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


In [13]:
# Print the loaded dataset
print(dataset)

Dataset({
    features: ['X', 'Y', 'OBJECTID', 'LOCATION', 'STATION', 'CODE', 'ADDRESS', 'ZIP', 'PASS_OP', 'PASS_NETWO', 'COMM_OP', 'COMM_NETWO', 'BUS_ROUTES', 'TRANSIT', 'AIRPORT', 'STATION_TY', 'INTERMODAL', 'DIST', 'CO'],
    num_rows: 292
})


In [14]:
dataset[0]

{'X': -13569099.945,
 'Y': 4383544.0745,
 'OBJECTID': 1,
 'LOCATION': 'Parking Garage',
 'STATION': 'MONTEREY - Parking Garage',
 'CODE': '-',
 'ADDRESS': 'Tyler, between Del Monte & Franklin',
 'ZIP': 93940,
 'PASS_OP': ' ',
 'PASS_NETWO': ' ',
 'COMM_OP': ' ',
 'COMM_NETWO': ' ',
 'BUS_ROUTES': '55',
 'TRANSIT': ' ',
 'AIRPORT': ' ',
 'STATION_TY': 2,
 'INTERMODAL': 0,
 'DIST': 5,
 'CO': 'MON'}

In [16]:
from pinecone import Pinecone

# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = os.getenv("PINECONE_API_KEY") or "c8b8503f-7230-4bab-a4fa-57f638bfc7c6"


# configure client
pc = Pinecone(api_key=api_key)

In [17]:

#Creating a vector database

import time
from pinecone import Pinecone, PodSpec

pc = Pinecone(api_key="c8b8503f-7230-4bab-a4fa-57f638bfc7c6")

index_name = 'testcone'
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

if index_name not in existing_indexes:
    pc.create_index(
        index_name,
        dimension=1536, 
        metric='euclidean',
        spec=PodSpec(environment="gcp-starter")
    )

    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

index = pc.Index(index_name)
time.sleep(1)

index.describe_index_stats()



{'dimension': 1536,
 'index_fullness': 0.00449,
 'namespaces': {'': {'vector_count': 449}},
 'total_vector_count': 449}

In [18]:
import os
from langchain_openai import OpenAIEmbeddings
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=os.environ["API_KEY"])


In [19]:
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]

res = embed_model.embed_documents(texts)
len(res), len(res[0])

(2, 1536)

In [20]:

from tqdm.auto import tqdm  # for progress bar

data = dataset.to_pandas()  # this makes it easier to iterate over the dataset

batch_size = 100

for i in tqdm(range(0, len(data), batch_size)):
    i_end = min(len(data), i+batch_size)
    # get batch of data
    batch = data.iloc[i:i_end]
    # generate unique ids for each chunk
    ids = [f"{x['OBJECTID']}-{x['CODE']}" for i, x in batch.iterrows()]
    # get text to embed
    texts = [x['LOCATION'] for _, x in batch.iterrows()]
    # embed text
    embeds = embed_model.embed_documents(texts)
    # get metadata to store in Pinecone
    metadata = [
        {'text': x['LOCATION'],  
         'source': x['STATION_TY'],
         'title': x['STATION']} for i, x in batch.iterrows()
    ]
    # add to Pinecone
    index.upsert(vectors=zip(ids, embeds, metadata))

100%|██████████| 3/3 [00:24<00:00,  8.13s/it]


In [21]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00449,
 'namespaces': {'': {'vector_count': 449}},
 'total_vector_count': 449}

In [24]:

from langchain.vectorstores import Pinecone

text_field = "text"  # the metadata field that contains our text

# initialize the vector store object
vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)

In [25]:
query = "where is MONTEREY - parking Garage?"

vectorstore.similarity_search(query, k=3)

[Document(page_content='Parking Garage', metadata={'source': 2.0, 'title': 'MONTEREY - Parking Garage'}),
 Document(page_content='Parking Garage', metadata={'source': 2.0, 'title': 'MONTEREY - Parking Garage'}),
 Document(page_content='Monterey Marriott Hotel', metadata={'source': 2.0, 'title': 'MONTEREY-Marriott'})]

In [26]:
def augment_prompt(query: str):
    # get top 3 results from knowledge base
    results = vectorstore.similarity_search(query, k=3)
    # get the text from the results
    source_knowledge = "\n".join([x.page_content for x in results])
    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.

    Contexts:
    {source_knowledge}

    Query: {query}"""
    return augmented_prompt

In [27]:
print(augment_prompt(query))

Using the contexts below, answer the query.

    Contexts:
    Parking Garage
Parking Garage
Monterey Marriott Hotel

    Query: where is MONTEREY - parking Garage?
