In [36]:
!pip install pinecone-client





In [23]:
import os
from dotenv import load_dotenv
load_dotenv('./Sn33k/.env')

True

In [43]:
# Access the variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_ENVIRONMENT= os.getenv("PINECONE_ENVIRONMENT")


In [46]:
import pinecone

pinecone.init(
	api_key=PINECONE_API_KEY,
	environment=PINECONE_ENVIRONMENT
)
index = pinecone.Index(INDEX_NAME)
index


<pinecone.index.Index at 0x24cbbcf8b20>

In [47]:
import openai

openai.api_key = OPENAI_API_KEY
# get API key from top-right dropdown on OpenAI website

openai.Engine.list()  # check we have authenticated


<OpenAIObject list at 0x24cbbd14590> JSON: {
  "object": "list",
  "data": [
    {
      "object": "engine",
      "id": "whisper-1",
      "ready": true,
      "owner": "openai-internal",
      "permissions": null,
      "created": null
    },
    {
      "object": "engine",
      "id": "babbage",
      "ready": true,
      "owner": "openai",
      "permissions": null,
      "created": null
    },
    {
      "object": "engine",
      "id": "text-davinci-003",
      "ready": true,
      "owner": "openai-internal",
      "permissions": null,
      "created": null
    },
    {
      "object": "engine",
      "id": "davinci",
      "ready": true,
      "owner": "openai",
      "permissions": null,
      "created": null
    },
    {
      "object": "engine",
      "id": "text-davinci-edit-001",
      "ready": true,
      "owner": "openai",
      "permissions": null,
      "created": null
    },
    {
      "object": "engine",
      "id": "babbage-code-search-code",
      "ready": true,
  

In [29]:
MODEL = "text-embedding-ada-002"

res = openai.Embedding.create(
    input=[
        """The Great Depression (1929–1939) was an economic shock that impacted most countries across the world. It was a period of economic depression that became evident after a major fall in stock prices in the United States. The economic contagion began around September 1929 and led to the Wall Street stock market crash of October 24 (Black Thursday). It was the longest, deepest, and most widespread depression of the 20th century.""",
        """Between 1929 and 1932, worldwide gross domestic product (GDP) fell by an estimated 15%. By comparison, worldwide GDP fell by less than 1% from 2008 to 2009 during the Great Recession. Some economies started to recover by the mid-1930s. However, in many countries,[specify] the negative effects of the Great Depression lasted until the beginning of World War II. Devastating effects were seen in both rich and poor countries with falling personal income, prices, tax revenues, and profits. International trade fell by more than 50%, unemployment in the U.S. rose to 23% and in some countries rose as high as 33%.""",
        "Cities around the world were hit hard, especially those dependent on heavy industry. Construction was virtually halted in many countries. Farming communities and rural areas suffered as crop prices fell by about 60%. Faced with plummeting demand and few job alternatives, areas dependent on primary sector industries suffered the most.",
        "Economic historians usually consider the catalyst of the Great Depression to be the sudden devastating collapse of U.S. stock market prices, starting on October 24, 1929. However, some dispute this conclusion, seeing the stock crash less as a cause of the Depression and more as a symptom of the rising nervousness of investors partly due to gradual price declines caused by falling sales of consumer goods (as a result of overproduction because of new production techniques, falling exports and income inequality, among other factors) that had already been underway as part of a gradual Depression."
    ], engine=MODEL
)


In [None]:
# extract embeddings to a list
embeds = [record['embedding'] for record in res['data']]

In [7]:
from datasets import load_dataset

# load the first 1K rows of the TREC dataset
trec = load_dataset('trec', split='train[:1000]')


Downloading:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset trec/default (download: 350.79 KiB, generated: 403.39 KiB, post-processed: Unknown size, total: 754.18 KiB) to C:\Users\Soh Hong Yu\.cache\huggingface\datasets\trec\default\1.1.0\751da1ab101b8d297a3d6e9c79ee9b0173ff94c4497b75677b59b61d5467a9b9...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/336k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.4k [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset trec downloaded and prepared to C:\Users\Soh Hong Yu\.cache\huggingface\datasets\trec\default\1.1.0\751da1ab101b8d297a3d6e9c79ee9b0173ff94c4497b75677b59b61d5467a9b9. Subsequent calls will reuse this data.


In [8]:
from tqdm.auto import tqdm  # this is our progress bar

batch_size = 32  # process everything in batches of 32
for i in tqdm(range(0, len(trec['text']), batch_size)):
    # set end position of batch
    i_end = min(i+batch_size, len(trec['text']))
    # get batch of lines and IDs
    lines_batch = trec['text'][i: i+batch_size]
    ids_batch = [str(n) for n in range(i, i_end)]
    # create embeddings
    res = openai.Embedding.create(input=lines_batch, engine=MODEL)
    embeds = [record['embedding'] for record in res['data']]
    # prep metadata and upsert batch
    meta = [{'text': line} for line in lines_batch]
    to_upsert = zip(ids_batch, embeds, meta)
    # upsert to Pinecone
    index.upsert(vectors=list(to_upsert))

  0%|          | 0/32 [00:00<?, ?it/s]

In [16]:
query = "What caused the 1929 Great Depression?"

xq = openai.Embedding.create(input=query, engine=MODEL)['data'][0]['embedding']


In [17]:
res = index.query([xq], top_k=5, include_metadata=True)


In [18]:
res


{'matches': [{'id': '932',
              'metadata': {'text': 'Why did the world enter a global '
                                   'depression in 1929 ?'},
              'score': 0.918035,
              'values': []},
             {'id': '787',
              'metadata': {'text': "When was `` the Great Depression '' ?"},
              'score': 0.871160746,
              'values': []},
             {'id': '775',
              'metadata': {'text': 'What historical event happened in Dogtown '
                                   'in 1899 ?'},
              'score': 0.798837781,
              'values': []},
             {'id': '481',
              'metadata': {'text': 'What caused the Lynmouth floods ?'},
              'score': 0.792251,
              'values': []},
             {'id': '864',
              'metadata': {'text': 'When did the Dow first reach ?'},
              'score': 0.791675508,
              'values': []}],
 'namespace': ''}

In [11]:
for match in res['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")


0.92: Why did the world enter a global depression in 1929 ?
0.87: When was `` the Great Depression '' ?
0.81: What crop failure caused the Irish Famine ?
0.80: What historical event happened in Dogtown in 1899 ?
0.79: What caused the Lynmouth floods ?


In [59]:
query = "<meta"

xq = openai.Embedding.create(input=query, engine=MODEL)['data'][0]['embedding']

In [60]:
res = index.query([xq], top_k=5, include_metadata=True)


In [62]:
res


{'matches': [{'id': '214', 'score': 0.801155746, 'values': []},
             {'id': '133', 'score': 0.799858332, 'values': []},
             {'id': '438', 'score': 0.795604944, 'values': []},
             {'id': '213', 'score': 0.794442654, 'values': []},
             {'id': '306', 'score': 0.793788075, 'values': []}],
 'namespace': ''}

In [52]:
for match in res['matches']:
  print(match)
    # print(f"{match['score']:.2f}: {match['metadata']['text']}")


{'id': '108', 'score': 0.762225866, 'values': []}
{'id': '413', 'score': 0.762186289, 'values': []}
{'id': '111', 'score': 0.757854939, 'values': []}
{'id': '416', 'score': 0.757763565, 'values': []}
{'id': '412', 'score': 0.757707357, 'values': []}


In [47]:
dir(index)


['_ApiClient__call_api',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_iter_dataframe',
 '_parse_non_empty_args',
 '_parse_sparse_values_arg',
 '_pool',
 '_upsert_batch',
 '_vector_api',
 'call_api',
 'close',
 'configuration',
 'cookie',
 'default_headers',
 'delete',
 'describe_index_stats',
 'deserialize',
 'fetch',
 'files_parameters',
 'get_file_data_and_close_file',
 'last_response',
 'parameters_to_multipart',
 'parameters_to_tuples',
 'pool',
 'pool_threads',
 'query',
 'request',
 'rest_client',
 'sanitize_for_serialization',
 'select_header_accept',
 'select_header_content_type',
 'set_default_header',
 'update',
 'update_params

In [54]:
pinecone.whoami()


WhoAmIResponse(username='1ff62de', user_label='webscrape', projectname='e0961eb')

In [55]:
index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 460}},
 'total_vector_count': 460}

In [58]:
index.fetch(["3", "10"])


{'namespace': '',
 'vectors': {'10': {'id': '10',
                    'values': [0.00462727202,
                               -0.0105208131,
                               -0.0115176635,
                               -0.00930842757,
                               -0.0241803583,
                               0.0138346674,
                               -0.0107363481,
                               -0.00677588861,
                               -0.0140097896,
                               -0.0359943844,
                               0.0145082148,
                               0.0190479252,
                               -0.0337043218,
                               0.0124404235,
                               -0.00203074608,
                               0.0193577576,
                               0.0238705259,
                               -0.0313873179,
                               0.018441733,
                               -0.00297539658,
                               -0.

In [65]:
query = "Singlife"

xq = openai.Embedding.create(input=query, engine=MODEL)['data'][0]['embedding']


In [67]:
res = index.query([xq], top_k=5, include_metadata=True)
res

{'matches': [{'id': '402', 'score': 0.836676478, 'values': []},
             {'id': '270', 'score': 0.836104572, 'values': []},
             {'id': '173', 'score': 0.824328244, 'values': []},
             {'id': '305', 'score': 0.824004233, 'values': []},
             {'id': '438', 'score': 0.815329671, 'values': []}],
 'namespace': ''}

In [68]:
for match in res['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")


ApiAttributeError: ScoredVector has no attribute 'metadata' at ['['received_data', 'matches', 0]']['metadata']