In [1]:
# Import libraries
import time
import tqdm
import glob
import os

import pandas as pd

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import JSONLoader

from datasets import load_dataset

import pinecone

from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the variables declared in the .env file into the process environment
load_dotenv()

# API credentials used further down; each one is None when its variable is not set
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT")

# Setup Vector DB (Pinecone)

In [3]:
# Initialise the Pinecone client with the API key and environment read from the env variables above
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT
)

In [4]:
# Name of the Pinecone index this notebook reads from and writes to
index_name = 'rag-ml'

# Only create the index when it does not exist yet; an existing one is reused as-is
if index_name not in pinecone.list_indexes():
    # dimension=1536 has to match the size of the embedding vectors inserted later
    pinecone.create_index(index_name, dimension=1536, metric='cosine')

    # poll once per second until the new index reports that it is ready
    while True:
        if pinecone.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# Handle used for all subsequent index operations
# (alternative kept for reference: pinecone.GRPCIndex(index_name))
index = pinecone.Index(index_name)

In [5]:
# Show the index statistics (dimension, vector counts) before any documents are inserted
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [6]:
# Embedding model that turns text into vectors: OpenAI "text-embedding-ada-002".
# disallowed_special=() passes an empty tuple of disallowed special tokens
# (NOTE(review): presumably so special-token text inside the papers does not
# raise during tokenisation — confirm against the OpenAIEmbeddings docs).
embed_model = OpenAIEmbeddings(
    disallowed_special=(),
    model="text-embedding-ada-002",
)

# Load documents

In [None]:
# Load all paper content into a DataFrame
# NOTE: this code is disabled — it is wrapped in a triple-quoted string, so
# evaluating the cell only yields that string; no file is read and `df` is not created.
"""
df = pd.DataFrame(columns=["text"])

papers = glob.glob("../data/txt/content/*.txt")

for paper_path in papers:
    
    with open(paper_path, "r") as infile:
        paper = infile.read()

    df = pd.concat([df, pd.DataFrame([paper], columns=["text"])], ignore_index=True)"""

In [7]:
# Define the metadata extraction function.
def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the title, source and id of a JSON record into the metadata dict.

    Args:
        record: One parsed JSON record.
        metadata: Metadata dict to extend; it is modified in place.

    Returns:
        The same ``metadata`` object, with ``"title"``, ``"source"`` and
        ``"id"`` set to the record's values (``None`` for any key the record
        does not contain).
    """
    for field in ("title", "source", "id"):
        metadata[field] = record.get(field)
    return metadata

# LOAD PAPER CONTENT
# `cat` is assigned here because this cell runs before the later `cat = "cs_CL"`
# cell; without it the f-string below raises NameError on a fresh kernel
# (Restart Kernel -> Run All).
cat = "cs_CL"

# Every line of the JSONL file is one record (jq_schema='.' selects the whole
# record); its "content" field is used as the document text and metadata_func
# adds title/source/id to the document metadata.
loader = JSONLoader(
    file_path=f"../data/{cat}papers_aug_clean.jsonl",
    jq_schema='.',
    content_key="content",
    metadata_func=metadata_func,
    json_lines=True)

# NOTE: `docs` is reassigned by the later cell that loads f"../data/{cat}_train.jsonl".
docs = loader.load()

In [9]:
# Dataset prefix interpolated into the data file paths (presumably the arXiv category cs.CL — confirm)
cat = "cs_CL"

In [10]:
# Load the train file for the selected category: one JSON record per line,
# with the record's "text" field used as the document content
loader = JSONLoader(
    json_lines=True,
    jq_schema='.',
    file_path=f"../data/{cat}_train.jsonl",
    content_key="text",
    metadata_func=metadata_func,
)

# `docs` is the list of documents that gets embedded and inserted into Pinecone below
docs = loader.load()

In [None]:
# CURRENTLY ONLY USING ABSTRACT SUMMARY TO INSERT AS EMBEDDING VECTORS IN DB
# This DataFrame structure is currently used to insert docs into Pinecone

#dataset = pd.read_json("../data/cs_CLpapers.jsonl", lines=True)
#dataset.dropna(subset=["summary"], inplace=True)

In [None]:
# Load dataset from Huggingface (the "train" split of kieranschubert/arxiv_cs_CL_train).
# NOTE(review): the insertion loop below iterates over `docs` from the JSONLoader,
# not over this dataset — confirm whether this cell is still needed.
documents = load_dataset("kieranschubert/arxiv_cs_CL_train", split="train")

# Insert docs into Pinecone

- **TODO:** Update the code so that special tokens can be explicitly allowed or disallowed.

In [11]:
# Number of documents embedded and upserted per request
batch_size = 100

# (start offset, exception) of every batch whose upsert failed, kept so the
# affected documents can be inspected and retried after the loop
failed_batches = []

for i in tqdm.tqdm(range(0, len(docs), batch_size)):

    i_end = min(len(docs), i+batch_size)

    # get batch of data
    batch = docs[i:i_end]

    # get id for each record
    ids = [x.metadata["id"] for x in batch]

    # get text (named `texts` so the Hugging Face `documents` dataset is not overwritten)
    texts = [x.page_content for x in batch]

    # embed text
    embeds = embed_model.embed_documents(texts)

    # get metadata to store in Pinecone
    metadata = [
        {'text': x.page_content,
         'source': x.metadata['source'],
         'title': x.metadata['title']} for x in batch
    ]

    # insert into Pinecone; a failing batch is reported and recorded, and the
    # loop carries on with the next batch (best effort)
    try:
        # materialise the zip so the client receives a concrete list of (id, vector, metadata) tuples
        index.upsert(vectors=list(zip(ids, embeds, metadata)))
    except Exception as e:
        failed_batches.append((i, e))
        # f-string: the original plain string printed the placeholders literally
        print(f"Batch no: {i // batch_size} (docs {i}-{i_end - 1}), Exception: {e!r}")

if failed_batches:
    print(f"{len(failed_batches)} batch(es) failed to upsert; start offsets: {[start for start, _ in failed_batches]}")


 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 531/594 [55:40<10:25:13, 595.45s/it]

Batch no: {i}, Exception: {e}


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 594/594 [58:30<00:00,  5.91s/it]


In [12]:
# Show the index statistics again to check how many vectors the index holds after the insert loop
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.59,
 'namespaces': {'': {'vector_count': 59000}},
 'total_vector_count': 59000}