In [16]:
from langchain_openai import OpenAIEmbeddings
from dotenv import dotenv_values
import json
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_redis import RedisConfig, RedisVectorStore

# Read the key/value settings stored in the local .env file.
env_vars = dotenv_values(".env")

# Connection URL of the Redis instance that holds the vector index.
REDIS_URL = env_vars["REDIS_URL"]

# Embedding client; the OpenAI API key is taken from the .env settings.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", api_key=env_vars["OPENAI_API_KEY"])

# Index configuration: index name, target Redis URL, and a single
# "category" metadata field declared with the "tag" type.
config = RedisConfig(
    index_name="documents",
    redis_url=REDIS_URL,
    metadata_schema=[{"name": "category", "type": "tag"}],
)

# Vector store shared by the ingestion helpers and the cells below.
vector_store = RedisVectorStore(embeddings, config=config)

# handling JSON files
def json_adder(filename, vector_store):
    """Index the entries of a JSON file in the given vector store.

    The file must hold a JSON list of objects, each with a ``"content"`` key
    (the text handed to the store) and a ``"category"`` key (stored as the
    entry's metadata).

    Args:
        filename: Path of the JSON file to read.
        vector_store: Object exposing ``add_texts(texts, metadatas)``.

    Raises:
        KeyError: If an entry lacks ``"content"`` or ``"category"``.
    """
    # Decode explicitly as UTF-8 so the platform's default encoding cannot
    # garble non-ASCII text in the file.
    with open(filename, "r", encoding="utf-8") as json_file:
        entries = json.load(json_file)

    # Parallel lists: texts[i] is described by metadatas[i].
    texts = [entry["content"] for entry in entries]
    metadatas = [{"category": entry["category"]} for entry in entries]

    # Nothing to index: skip the store call for an empty file.
    if not texts:
        return

    vector_store.add_texts(texts, metadatas)

# handling PDF files
def pdf_adder(filename, vector_store, chunk_size=1000, chunk_overlap=200):
    """Load a PDF, split it into overlapping chunks, and index the chunks.

    Each chunk is stored with two metadata fields: ``"page_number"`` (the
    loader's ``"page"`` metadata value, or ``"unknown"`` when absent) and
    ``"category"`` (always ``"pdf"``).

    Args:
        filename: Path of the PDF file to load.
        vector_store: Object exposing ``add_texts(texts, metadatas)``.
        chunk_size: ``chunk_size`` passed to the text splitter. Defaults to
            the previously hard-coded 1000.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter.
            Defaults to the previously hard-coded 200.
    """
    pages = PyPDFLoader(filename).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(pages)

    # A PDF that yields no chunks has nothing to index; skip the store call.
    if not chunks:
        return

    # Parallel lists: texts[i] is described by metadatas[i].
    texts = [chunk.page_content for chunk in chunks]
    metadatas = [
        {"page_number": chunk.metadata.get("page", "unknown"), "category": "pdf"}
        for chunk in chunks
    ]

    vector_store.add_texts(texts, metadatas)

# Single JSON file location, kept for one-off runs of json_adder.
json_location = "/home/krispy_noodles/vector_redis/assets/sit_data.json"

# Index every JSON asset, in this order.
for json_path in (
    "/home/krispy_noodles/vector_redis/assets/Graduate_Employment_Survey_2023.json",
    "/home/krispy_noodles/vector_redis/assets/CCR_data.json",
    "/home/krispy_noodles/vector_redis/assets/sit_data.json",
):
    json_adder(json_path, vector_store)

# Single PDF location, kept for one-off runs of pdf_adder.
pdf_location = "/home/krispy_noodles/vector_redis/assets/Rules-and-Regulations-of-SIT-Learner-Behaviour-Final.pdf"

# Index every PDF asset, in this order.
for pdf_path in (
    "/home/krispy_noodles/vector_redis/assets/SIT_handbook.pdf",
    "/home/krispy_noodles/vector_redis/assets/Rules-and-Regulations-of-SIT-Learner-Behaviour-Final.pdf",
):
    pdf_adder(pdf_path, vector_store)

In [17]:
import redis

# Re-read the .env settings for this cell.
env_vars = dotenv_values(".env")
REDIS_URL = env_vars["REDIS_URL"]

# Plain Redis client, authenticated with the password from .env.
# NOTE(review): REDIS_URL is loaded above but not used here; host and port
# are hard-coded instead. Confirm both point at the same instance.
r = redis.Redis(
    host="redis-10327.c84.us-east-1-2.ec2.redns.redis-cloud.com",
    port=10327,
    password=env_vars["REDIS_PW"],
)

# Write the vector store's index schema to a YAML file on disk.
yml_file_name = "redis_schema.yaml"
vector_store.index.schema.to_yaml(yml_file_name)

# Load the YAML text back so it can be stored as a Redis string value.
with open(yml_file_name, "r") as f:
    schema_yaml = f.read()

# Save the schema under a fixed key; as the cell's last expression, the
# result of the SET command is what the notebook displays.
r.set("vector_store_schema", schema_yaml)

True

In [18]:
# Similarity-search retriever over the vector store, configured with k=3.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

# Sample query; the bare name on the last line makes the notebook display
# the retrieved documents.
reply = retriever.invoke("Why join SIT?")
reply

[Document(metadata={'category': 'General FAQ'}, page_content="Question: What is unique about SIT? Answer: As Singapore's first university of applied learning, we develop individuals and innovate with industry to impact the economy and society in meaningful ways. And we do this through our unique applied learning pedagogy. Read more about our latest successes and happenings on our [newsroom](https://www.singaporetech.edu.sg/news) and learn more about the SIT experience on the [Why SIT page](https://www.singaporetech.edu.sg/why-sit)."),
 Document(metadata={'category': 'pdf'}, page_content='We believe that learning should not be limited to the \nclassroom. At SIT, we encourage you to venture out of \nyour comfort zone and brave new challenges to deepen \nand broaden your learning journey. \nFor students pursuing SIT -conferred and joint degree \nprogrammes, you will get a chance to participate in \ncarefully designed programmes such as the Overseas \nExposure Programme (OEP)1, Internation

In [None]:
# # clearing redis database

# import redis

# r = redis.Redis(
#   host='redis-10327.c84.us-east-1-2.ec2.redns.redis-cloud.com',
#   port=10327,
#   password=env_vars["REDIS_PW"])  # read from .env; never hard-code the password

# # Delete all keys in db
# r.flushdb()

True