Install the chromadb package

In [None]:
!pip install chromadb==0.4.24

In [1]:
# Import Chroma's embedding-function helpers and instantiate the library's
# default embedding function; it is handed to the collection when the
# collection is created.
from chromadb.utils import embedding_functions
default_ef = embedding_functions.DefaultEmbeddingFunction()

In [2]:
# Open the persistent Chroma client rooted at ./rag/collections_1, then fetch the
# "pirls_2021" collection, creating it if it does not exist yet. The collection
# is bound to the default embedding function.
import chromadb

client = chromadb.PersistentClient(path="./rag/collections_1")
collection = client.get_or_create_collection(
    embedding_function=default_ef,
    name="pirls_2021",
)

⚠️ It looks like you upgraded from a version below 0.6 and could benefit from vacuuming your database. Run chromadb utils vacuum --help for more information.


In [7]:
# Create an S3 resource and request the two source text files from the bucket.
# Each response is kept as returned; its 'Body' entry is read in a later cell.
import boto3

s3 = boto3.resource('s3')

response = s3.meta.client.get_object(
    Bucket="gdsc-bucket-058264313357",
    Key='web_data/PIRLS 2021– Findings.txt',
)
response_2 = s3.meta.client.get_object(
    Bucket="gdsc-bucket-058264313357",
    Key='web_data/What can we learn from PIRLS 2021.txt',
)

In [143]:
# Read each response body in full and decode the bytes as UTF-8 text.
# NOTE(review): the S3 response body is a stream, so it can presumably be
# consumed only once -- re-running this cell without first re-running the
# download cell would then produce empty strings. Confirm against the boto3
# StreamingBody documentation.
pirls_findings = response['Body'].read().decode("utf-8")
learn_from_pirls = response_2['Body'].read().decode("utf-8")

In [144]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Text splitter from LangChain that breaks large documents into smaller pieces.
# It targets chunks of about 1500 characters (measured with len) and lets
# neighbouring chunks overlap by up to 300 characters; separators are treated
# as plain strings, not regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=300,
    length_function=len,
    is_separator_regex=False,
)

# Split the pirls_findings document into chunks and load them into the vector
# database, tagging every chunk with its source file name.
# upsert is used instead of add so the cell can be re-run against the persistent
# store: chunks whose ids already exist are replaced with the current text
# instead of being left stale.
texts = text_splitter.create_documents([pirls_findings])
collection.upsert(
    ids=[f"pirls_findings_{i}" for i in range(len(texts))],
    documents=[text.page_content for text in texts],
    metadatas=[{'source': "PIRLS 2021– Findings.txt"} for _ in texts],
)

In [145]:
# Split the learn_from_pirls document into chunks (using the text_splitter
# configured in an earlier cell) and load them into the vector database,
# tagging every chunk with its source file name.
# upsert is used instead of add so the cell can be re-run against the persistent
# store: chunks whose ids already exist are replaced with the current text
# instead of being left stale.
texts = text_splitter.create_documents([learn_from_pirls])
collection.upsert(
    ids=[f"learn_from_pirls_{i}" for i in range(len(texts))],
    documents=[text.page_content for text in texts],
    metadatas=[{'source': "What can we learn from PIRLS 2021.txt"} for _ in texts],
)

In [15]:
# Upload the persisted RAG collection to S3 so that a single vector store can be
# shared by multiple models.
import glob
import os

source_path = 'rag/collections_1'

# Recursively list everything below source_path and keep regular files only.
# Testing for a "." in the path would skip files without an extension and let
# through directories whose names contain a dot, which upload_file cannot handle.
files = [
    path
    for path in glob.glob(source_path + '/**', recursive=True)
    if os.path.isfile(path)
]

for file in files:
    # The object key mirrors the local relative path; separators are normalised
    # to "/" so the key layout is the same regardless of the local OS.
    s3.meta.client.upload_file(file, "gdsc-bucket-058264313357", file.replace(os.sep, "/"))