# Importing Libraries

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. paths, webscraper) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import shutil
import sys
sys.path.append("../source/")
sys.path.append("../")

import paths
import webscraper

from dotenv import load_dotenv

load_dotenv("../.env")

import pinecone
from langchain.llms import Replicate
from langchain.vectorstores import Pinecone
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import ConversationalRetrievalChain

# Credentials are read from the process environment (load_dotenv("../.env") ran
# above). Indexing os.environ directly means a missing token raises KeyError
# right here instead of surfacing later as an authentication failure.
REPLICATE_API_TOKEN = os.environ["REPLICATE_API_TOKEN"]
PINECONE_API_TOKEN = os.environ["PINECONE_API_TOKEN"]

# Configure the Pinecone client for the rest of the notebook.
pinecone.init(
    api_key=PINECONE_API_TOKEN,
    environment="gcp-starter",
)

  from tqdm.autonotebook import tqdm


# Populating the Transformed Data Directory with PDF Files

In [8]:
# Run the project scraper for the selected year range; per the section heading,
# this is the step that fills the transformed-data directory with PDF files.
# NOTE(review): whether EndYear is inclusive is decided inside webscraper.ScrapeData - confirm.
ScrapeYears = {"StartYear": 2023, "EndYear": 2023}
webscraper.ScrapeData(**ScrapeYears)

# Populating the Vector Database

In [21]:
# Ingest every .pdf file found in paths.TRANSFORMED_DATA_DIR (data/transformed/):
# each file is loaded, split into chunks, embedded, and the embeddings are pushed
# to the Pinecone index (the vector database, aka VectorStore). After a successful
# push the file is moved to paths.STORAGE_DATA_DIR, so re-running this cell does
# not upload the same document a second time.

# Loop-invariant objects are built once rather than once per file. Constructing
# HuggingFaceEmbeddings loads the embedding model, which is expensive to repeat.
IndexName = "centralbanksllm"
Index = pinecone.Index(IndexName)
TextSplitter = CharacterTextSplitter(chunk_size = 1000, chunk_overlap = 0)
Embeddings = HuggingFaceEmbeddings() #(model_name = "sentence-transformers/all-MiniLM-L6-v2")

# Make sure the destination exists BEFORE anything is uploaded: if the move
# failed after the upload, the file would stay in the input directory and be
# embedded again (duplicated in the index) on the next run.
os.makedirs(paths.STORAGE_DATA_DIR, exist_ok = True)

# sorted() gives a deterministic processing order; os.listdir order is arbitrary.
for FileName in sorted(os.listdir(paths.TRANSFORMED_DATA_DIR)):
    if not FileName.endswith(".pdf"):
        continue

    FilePath = str(paths.TRANSFORMED_DATA_DIR / FileName)

    # Load the PDF and split it into chunks of at most ~1000 characters, no overlap
    Document = PyPDFLoader(FilePath).load()
    Text = TextSplitter.split_documents(Document)

    # Embed the chunks and push them to the Pinecone index
    VectorDB = Pinecone.from_documents(Text, Embeddings, index_name = IndexName)

    # After loading the document's embeddings into the vector store,
    # the file is moved to the storage directory
    shutil.move(FilePath, str(paths.STORAGE_DATA_DIR / FileName))