In [1]:
# imports and constants
import json
import os
import time
from datetime import datetime, date

import numpy as np
import openai
import pdfminer
import requests
import tiktoken
from IPython.display import clear_output, display, Markdown
from pdfminer.high_level import extract_text
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from redis import Redis
from redis.commands.search.field import (
    TextField,
    VectorField,
    NumericField
)
from redis.commands.search.indexDefinition import (
    IndexDefinition,
    IndexType
)
from redis.commands.search.query import Query
from redis.exceptions import ResponseError
from tqdm.notebook import tqdm  # for printing progress bars


# Configuration for the notebook: OpenAI credentials, model name, and the
# RediSearch index settings shared by the cells below.

# Never commit API keys to a notebook: read the key from the environment.
# NOTE(review): the key that used to be hardcoded on this line has been exposed
# in the file history and should be revoked and rotated.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )
GPT_MODEL = "gpt-4"

INDEX_NAME = "SangforWP"                  # RediSearch index name, also used as the key prefix
VECTOR_DIM = 1536                         # length of the vectors stored in the index's vector field
DISTANCE_METRIC = "COSINE"                # distance metric for the vectors (ex. COSINE, IP, L2)

r = Redis() #Initialize Redis client with default settings



None


In [None]:
# Load the text of the first white paper, one entry per page.

# Candidate files: every regular file in ./WhitePapers except macOS's .DS_Store.
data_dir = os.path.join(os.curdir, 'WhitePapers')
pdf_files = sorted(
    name for name in os.listdir(data_dir)
    if name != '.DS_Store' and os.path.isfile(os.path.join(data_dir, name))
)
# Fail with an explicit message instead of a bare IndexError on pdf_files[0].
if not pdf_files:
    raise FileNotFoundError(f"No files found in {data_dir!r}; nothing to extract.")

# Use pdfminer to extract text from the PDF
# Only the first file (in sorted order) is processed.
full_path = os.path.join(data_dir, pdf_files[0])
with open(full_path, 'rb') as f:
    # Count pages lazily rather than materialising every PDFPage in a list.
    number_of_pages = sum(1 for _ in PDFPage.get_pages(f))

# Map page number -> page text. extract_text is called with n from range(),
# so it is indexed from 0, while the dict keys start at 1.
pdf_content = {}
for n in range(number_of_pages):
    text = extract_text(full_path, page_numbers=[n])
    pdf_content[n + 1] = text
pdf_content

In [None]:
# Create the RediSearch index, then embed every page and store it as a Redis hash.

# define RediSearch vector fields to use FLAT index
page_embedding = VectorField("page_embedding",
    "FLAT", {
        "TYPE": "FLOAT32",
        "DIM": VECTOR_DIM,
        "DISTANCE_METRIC": DISTANCE_METRIC
    }
)

# Define RediSearch fields for each of the columns in the dataset
# This is where you should add any additional metadata you want to capture
page_num = NumericField("page_num", sortable=True)
content = TextField("content")

fields = [page_num, content, page_embedding]
# Index every hash whose key starts with INDEX_NAME (see the key built in the loop below).
indexDefinition = IndexDefinition(prefix=[INDEX_NAME], index_type=IndexType.HASH)

try:
    r.ft(INDEX_NAME).create_index(fields=fields, definition=indexDefinition)
except ResponseError as e:
    # Re-running the cell is expected to hit "Index already exists"; report it and
    # carry on. Any other server error (e.g. the search module is not loaded)
    # is a real failure and must not be swallowed.
    if "Index already exists" not in str(e):
        raise
    print(e)
print(r.ft(INDEX_NAME).info())

for item_key, item_value in pdf_content.items():
    # The embeddings endpoint rejects empty input, so skip pages with no extractable text.
    if not item_value.strip():
        continue
    # Create embedding with GPT(ada)
    embedding = openai.Embedding.create(input=item_value, model="text-embedding-ada-002")["data"][0]["embedding"]
    # Prepare embedding vector for RediSearch: raw float32 bytes, matching the field's TYPE.
    # Separate names are used here so the VectorField in `page_embedding` is not overwritten.
    embedding_bytes = np.array(embedding, dtype=np.float32).tobytes()
    key = f"{INDEX_NAME}:Reliability:{item_key}"
    r.hset(key, mapping={'page_num': item_key, 'content': item_value, 'page_embedding': embedding_bytes})