In [1]:
!pip install \
  tiktoken==0.4.0 \
  openai==0.27.7 \
  langchain==0.0.179 \
  pinecone-client \
  datasets==2.13.1



In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../scraper/data.csv')

In [3]:
df.head()

Unnamed: 0,item_name,text,url
0,Refrigerator Ice and Water Filter,...,https://www.partselect.com//PS11701542-Whirlpo...
1,Refrigerator Door Shelf Bin,...,https://www.partselect.com//PS12364199-Frigida...
2,Refrigerator BELT DRIVE,...,https://www.partselect.com//PS16542496-GE-WE03...
3,Refrigerator Door Shelf Bin,...,https://www.partselect.com//PS11752778-Whirlpo...
4,Refrigerator Oven Bake Element,...,https://www.partselect.com//PS438018-Frigidair...


In [4]:
import tiktoken

tokenizer_name = tiktoken.encoding_for_model('gpt-4')
tokenizer = tiktoken.get_encoding(tokenizer_name.name)

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [6]:
from typing_extensions import Concatenate
from uuid import uuid4
from tqdm.auto import tqdm

chunks = []

for i, page in tqdm(df.iterrows()):
    if len(page['text']) < 200:
        # if page content is short we can skip
        continue
    texts = text_splitter.split_text(page['text'])
    chunks.extend([{
        'id': page['item_name'] + f'-{i}',
        'text': texts[i],
        'url': page['url'],
        'chunk': i
    } for i in range(len(texts))])
len(chunks)

  from .autonotebook import tqdm as notebook_tqdm
20it [00:00, 34.99it/s]


551

In [7]:
import os
import openai
from dotenv import load_dotenv

load_dotenv()

# get API key from top-right dropdown on OpenAI website
openai.api_key = os.getenv("OPENAI_API_KEY") or "OPENAI_API_KEY"

openai.Engine.list()  # check we have authenticated

<OpenAIObject list at 0x15ffd5b50> JSON: {
  "data": [
    {
      "created": null,
      "id": "gpt-3.5-turbo-0301",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "gpt-3.5-turbo-16k-0613",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "dall-e-2",
      "object": "engine",
      "owner": "system",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "gpt-3.5-turbo-16k",
      "object": "engine",
      "owner": "openai-internal",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "whisper-1",
      "object": "engine",
      "owner": "openai-internal",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "text-embedding-ada-002",
      "object": "engine",
      "own

In [8]:
embed_model = "text-embedding-ada-002"

res = openai.Embedding.create(
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch"
    ], engine=embed_model
)

In [10]:
from pinecone import Pinecone

# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = os.getenv("PINECONE_API_KEY") or "PINECONE_API_KEY"
# find your environment next to the api key in pinecone console
# env = os.getenv("PINECONE_ENVIRONMENT") or "PINECONE_ENVIRONMENT"
pc = Pinecone(api_key=api_key)
pinecone = pc

In [11]:
index_name = 'gpt-4-part-data'

In [12]:
import time
from pinecone import ServerlessSpec

# check if index already exists (it shouldn't if this is first time)
if index_name not in pinecone.list_indexes().names():
    # if does not exist, create index
    pinecone.create_index(
        index_name,
        dimension=len(res['data'][0]['embedding']),
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-west-2'
        )
    )
    # wait for index to be initialized
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pinecone.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [13]:
from tqdm.auto import tqdm

batch_size = 100  # how many embeddings we create and insert at once

for i in tqdm(range(0, len(chunks), batch_size)):
    # find end of batch
    i_end = min(len(chunks), i+batch_size)
    meta_batch = chunks[i:i_end]
    # get ids
    ids_batch = [x['id'] for x in meta_batch]
    # get texts to encode
    texts = [x['text'] for x in meta_batch]
    # create embeddings (try-except added to avoid RateLimitError)
    try:
        res = openai.Embedding.create(input=texts, engine=embed_model)
    except:
        done = False
        while not done:
            time.sleep(5)
            try:
                res = openai.Embedding.create(input=texts, engine=embed_model)
                done = True
            except:
                pass
    embeds = [record['embedding'] for record in res['data']]
    # cleanup metadata
    meta_batch = [{
        'text': x['text'],
        'chunk': x['chunk'],
        'url': x['url']
    } for x in meta_batch]
    to_upsert = list(zip(ids_batch, embeds, meta_batch))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [01:08<00:00, 11.41s/it]


In [25]:
query = "How can I install part number PS11752778?"

res = openai.Embedding.create(
    input=[query],
    engine=embed_model
)

# retrieve from Pinecone
xq = res['data'][0]['embedding']

# get relevant contexts (including the questions)
res = index.query(vector=xq, top_k=5, include_metadata=True)

In [26]:
res

{'matches': [{'id': 'Dishwasher Rack Track Stop-3',
              'metadata': {'chunk': 3.0,
                           'text': 'PartSelect site in U.S.  Would you like to '
                                   'shop on the Canadian site?      Stay on '
                                   'this site     Go to Canadian '
                                   'site                       Install '
                                   'videos!             See part 7 in the '
                                   'diagram               ( Grid squares '
                                   'measure 1x1 inch '
                                   ')                          Get in touch, '
                                   "we're here to help!     1-888-738-4871  "
                                   'Open until 12am EST  '
                                   'customerservice@partselect.com      Rack '
                                   'Track Stop WP8565925       ★★★★★  ★★★★★   '
                           

In [16]:
# get list of retrieved text
contexts = [item['metadata']['text'] for item in res['matches']]

augmented_query = "\n\n---\n\n".join(contexts)+"\n\n-----\n\n"+query

In [17]:
print(augmented_query)

PartSelect site in U.S.  Would you like to shop on the Canadian site?      Stay on this site     Go to Canadian site                       Install videos!             See part 7 in the diagram               ( Grid squares measure 1x1 inch )                          Get in touch, we're here to help!     1-888-738-4871  Open until 12am EST  customerservice@partselect.com      Rack Track Stop WP8565925       ★★★★★  ★★★★★   114 Reviews         Rated by 41 customers          Really Easy     30 - 60 mins      Ratings submitted by customers like you who bought this part.           $  10.49     In Stock      1  2  3  4  5  6  7  8  9  10+    Add to cart               Get this part fast! Average delivery time for in-stock parts via standard shipping: 1.8 days.       PartSelect Number PS11746591  Manufacturer Part Number WP8565925  Manufactured by  Whirlpool      Product Description  Part Videos  Troubleshooting  Customer Reviews

---

". Here's a guide for finding your model number .    Questio

In [22]:
# system message to 'prime' the model
primer = f"""You are Q&A bot. A highly intelligent system that answers
user questions based on the information provided by the user above
each question. If the information can not be found in the information
provided by the user you truthfully say "I don't know".
"""

res = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": primer},
        {"role": "user", "content": augmented_query}
    ]
)

In [23]:
from IPython.display import Markdown

display(Markdown(res['choices'][0]['message']['content']))

I don't have enough information to answer your question. Could you please provide more details about the part PS11752778 and the specific appliance it is for?