<a href="https://colab.research.google.com/github/MatthewHsu1/RAGsystemOpenAI/blob/main/Open_AI_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets pandas openai pymongo pypdf langchain_community

In [252]:
from langchain_community.document_loaders import PyPDFLoader

# Raw GitHub URL of the F80 / F83 / F85 treadmill user manual PDF.
url = "https://raw.githubusercontent.com/MatthewHsu1/RAGsystemOpenAI/main/F80_F83_F85_OM_810.pdf"

# Point PyPDFLoader at the manual's URL and load it into `pages`, a list of documents
# whose text is read via `.page_content` by the cells below.
# NOTE(review): load_and_split() presumably also applies LangChain's default text
# splitter, so entries of `pages` may be chunks rather than whole PDF pages — confirm.
loader = PyPDFLoader(url)
pages = loader.load_and_split()

In [261]:
# List of the manual's section headers, used as split points when chunking the text.
# Each header appears once: the list is only turned into a regex alternation, so
# repeated entries add nothing.
# NOTE(review): the two "F80 / F83 / F83 ..." entries look like they should read
# "F80 / F83 / F85 ..." — confirm against the PDF text before changing them.

headers = [
    "TABLE OF CONTENTS",
    "PRODUCT REGISTRATION",
    "IMPORTANT SAFETY INSTRUCTIONS",
    "IMPORTANT ELECTRICAL INSTRUCTIONS",
    "GROUNDING INSTRUCTIONS",
    "IMPORTANT OPERATION INSTRUCTIONS",
    "PREVENTATIVE MAINTENANCE CHART",
    "F80 / F83 / F83 ASSEMBLY PACK CHECKLIST",
    "F80 / F83 / F83 ASSEMBLY INSTRUCTIONS",
    "FOLDING INSTRUCTIONS",
    "TRANSPORTATION INSTRUCTIONS",
    "OPERATION OF YOUR TREADMILL",
    "GETTING STARTED",
    "QUICK-START OPERATION",
    "PAUSE/STOP/RESET FEATURE",
    "INCLINE FEATURE",
    "DOT MATRIX CENTER DISPLAY",
    "PROGRAMMABLE FEATURES",
    "HEART RATE PROGRAMS",
    "USING HEART RATE TRANSMITTER",
    "GENERAL MAINTENANCE",
    "BELT ADJUSTMENTS",
    "TREAD-BELT TRACKING ADJUSTMENT",
    "BELT/DECK LUBRICATION",
    "SERVICE CHECKLIST - DIAGNOSIS GUIDE",
    "MANUFACTURER’S LIMITED WARRANTY"
]

In [254]:
import re

def strip_top(page):
  """
    Clean the text of every given document in place.

    Documents are expected to begin with the running title 'F80 /  F83 / F85 TREADMILL';
    when it is present, only the text after it is kept. Documents without it are
    cleaned as they are. Cleaning turns newlines into spaces, removes the '\\x84'
    character and collapses runs of whitespace into a single space.

    Args:
      page: Iterable of document objects exposing a writable `page_content` string.
        Despite the singular name (kept so existing callers are unaffected), this is
        the whole collection of documents, not a single one.

    Returns:
      None. Each document's `page_content` is overwritten with the cleaned text.
  """
  pattern = re.compile(r"F80 /  F83 / F85 TREADMILL(.*)", re.DOTALL)

  # Iterate the argument that was passed in, not a notebook-level variable.
  for document in page:
    match = pattern.search(document.page_content)

    # Keep only what follows the running title when the document has one.
    clean_text = match.group(1).strip() if match else document.page_content

    clean_text = clean_text.replace("\n", " ")
    clean_text = clean_text.replace("\x84", "")
    document.page_content = re.sub(r'\s+', ' ', clean_text)

In [255]:
# The pdf is in plain text: clean each loaded document's text in place
# (drop the running title where present and normalize whitespace).
strip_top(pages)

In [279]:
import re

def split_by_header(headers, pages):
  """
    Split every page's text into chunks at each occurrence of a known section header.

    Args:
      headers: Iterable of header strings; each one is matched literally.
      pages: Iterable of document objects exposing a `page_content` string.

    Returns:
      A flat list with the non-empty, whitespace-stripped chunks of all pages, in page
      order. The matched header text itself is not part of any chunk.
  """
  # Build the alternation once: it does not depend on the page being processed.
  # Empty header strings are skipped because an empty alternative matches everywhere.
  alternation = '|'.join(re.escape(header) for header in headers if header)

  # An empty pattern would make re.split cut the text between every character, so
  # with no usable headers each page is kept as one chunk.
  splitter = re.compile(alternation) if alternation else None

  list_of_string = []
  for page in pages:
    pieces = splitter.split(page.page_content) if splitter else [page.page_content]

    # Get rid of any empty strings and surrounding whitespace.
    list_of_string.extend(piece.strip() for piece in pieces if piece.strip())

  return list_of_string

In [276]:
# Chunk the manual: one list entry per non-empty piece of text between section headers.
text = split_by_header(headers, pages)

In [284]:
# Check to see if any string in the list has the possibility to exceed the text embedding token limit.
# The whitespace-separated word count is used as a rough proxy for the token count.

MAX_WORDS_PER_CHUNK = 5500

for index, string in enumerate(text):
  word_count = len(string.split())
  # Report which chunk failed and how large it is, so it can be found and re-split.
  assert word_count <= MAX_WORDS_PER_CHUNK, (
      f"A string may potentially exceed the token limit (chunk {index}: {word_count} words)"
  )

In [289]:
# Convert the list 'text' into a pandas DataFrame with a single 'text' column.

import pandas as pd

# Naming the column at construction also works when `text` is empty; assigning
# `.columns` afterwards raises a length-mismatch error on a frame with no columns.
dataset_df = pd.DataFrame(text, columns=['text'])

(43, 1)

In [290]:
import openai
from google.colab import userdata

# Read the OpenAI API key from the Colab userdata entry named 'open_ai', so the key
# is not written into the notebook source.
openai.api_key = userdata.get('open_ai')

# Embedding model passed to OpenAI by get_embedding.
EMBEDDING_MODEL = 'text-embedding-3-small'

def get_embedding(text):
  """
    Request a vector embedding for `text` from OpenAI using EMBEDDING_MODEL.

    Returns the embedding on success. Returns None when `text` is empty or is not a
    string, and also when the request raises (the error is printed, not re-raised).
  """
  # Only non-empty strings are sent to the API.
  if not (text and isinstance(text, str)):
    return None

  try:
    response = openai.embeddings.create(input=text, model=EMBEDDING_MODEL)
    vector = response.data[0].embedding
  except Exception as e:
    print(f'Error in get_embedding: {e}')
    return None

  return vector

# Call get_embedding once per row of the 'text' column; a row whose embedding could
# not be produced holds None.
dataset_df['embedding'] = dataset_df['text'].apply(get_embedding)

# Preview the first rows to check that the new column was filled in.
dataset_df.head()

Unnamed: 0,text,embedding
0,PLEASE CAREFULLY READ THIS ENTIRE MANUAL BEFOR...,"[0.03159578889608383, 0.04981619864702225, -0...."
1,Product Registration Important Safety Instruct...,"[0.005775596015155315, 0.012183739803731441, 0..."
2,RECORD YOUR SERIAL NUMBER Please record the Se...,"[5.853141919942573e-05, -0.008088338188827038,..."
3,F80_F83_F85_20111222,"[0.0039483290165662766, 0.007950534112751484, ..."
4,WARNING - Read all instructions before using t...,"[-0.020630232989788055, 0.036889005452394485, ..."


In [291]:
import pymongo
from google.colab import userdata

def get_mongo_client(mongo_uri):
  """
    Connect to MongoDB and return a client instance.

    Args:
      mongo_uri: MongoDB connection string.

    Returns:
      A `pymongo.MongoClient` whose connection has been verified, or None when the
      server could not be reached (the failure is printed). Errors that are not
      connection failures are not caught here.
  """
  try:
    client = pymongo.MongoClient(mongo_uri)
    # MongoClient connects lazily: its constructor succeeds even when the server is
    # unreachable. A cheap 'ping' forces a round trip so a failure surfaces here
    # instead of at the first real operation.
    client.admin.command('ping')
    print('Connection to MongoDB successful')
    return client
  except pymongo.errors.ConnectionFailure as e:
    print(f'Connection failed: {e}')
    return None

# Read the MongoDB connection string from the Colab userdata entry named 'mongo_url'.
mongo_uri = userdata.get('mongo_url')
if not mongo_uri:
  # Stop here rather than attempting a connection without a URI.
  raise ValueError('Mongo_uri not set in environment variables')

mongo_client = get_mongo_client(mongo_uri)
if mongo_client is None:
  # get_mongo_client has already printed the reason; subscripting None below would
  # only raise an unrelated TypeError.
  raise ConnectionError('Could not connect to MongoDB; see the message printed above')

db = mongo_client['Treadmill']
collection = db['F80_F83_F85']

# One document per DataFrame row, keyed by column name.
# NOTE: re-running this cell inserts the same documents again.
documents = dataset_df.to_dict('records')
collection.insert_many(documents)

print('Data ingestion into MongoDB completed')

Connection to MongoDB successful
Data ingestion into MongoDB completed


In [295]:
def vector_search(user_query, collection, limit=5, num_candidates=10):
  """
    Run a vector search for `user_query` against `collection`.

    Args:
      user_query: Query text; it is embedded with get_embedding.
      collection: Collection object whose `aggregate` method runs the pipeline.
      limit: Maximum number of documents to return (default 5).
      num_candidates: Number of candidates considered by the search (default 10).
        Raised to `limit` when smaller, since $vectorSearch requires
        limit <= numCandidates.

    Returns:
      A list of result documents carrying 'text' and 'score' (the vector search
      score). If the query could not be embedded, the message string
      "Invalid query or embedding generation failed." is returned instead of a
      list, so callers must check the type.
  """
  query_embedding = get_embedding(user_query)

  if query_embedding is None:
    return "Invalid query or embedding generation failed."

  pipeline = [
      {
          "$vectorSearch": {
              "index": "vector_index",
              "queryVector": query_embedding,
              "path": "embedding",
              "numCandidates": max(num_candidates, limit),
              "limit": limit
          }
      },
      {
          "$project": {
              "text": 1,
              "score": {
                  "$meta": "vectorSearchScore"
              }
          }
      }
  ]

  # Materialize the cursor so callers get a plain list.
  return list(collection.aggregate(pipeline))

In [293]:
def handle_user_query(query, collection):
  """
    Answer `query` with the chat model, using vector search results as context.

    Args:
      query: The user's question.
      collection: Collection object passed through to vector_search.

    Returns:
      A tuple `(response, search_result)`: the model's answer and the context text
      that was sent to it (one "Instruction: ..." line per retrieved document).
      When vector_search reports a failure, the tuple is `(failure message, '')` and
      the chat model is not called.
  """
  get_knowledge = vector_search(query, collection)

  # vector_search signals failure with a plain message string instead of a list.
  # Iterating that string would yield single characters and crash on `.get`, so
  # hand the message back as the response with no source information.
  if isinstance(get_knowledge, str):
    return get_knowledge, ''

  search_result = ''.join(
      f"Instruction: {result.get('text', 'N/A')} \n" for result in get_knowledge
  )

  completion = openai.chat.completions.create(
      model='gpt-3.5-turbo',
      messages=[
          {'role': "system", 'content': 'You are a treadmill technician gives helpful information by forming a cohisive answer with the context given. You are not allowed to pick anything outside of your choices. And do not mention anyhting along the lines "based on the context provided"'},
          {'role': 'user', 'content': 'Answer this user query: ' + query + ' using the following context: ' + search_result}
      ]
  )

  return (completion.choices[0].message.content), search_result

In [296]:
# Example end-to-end run: retrieve context for the question and let the chat model answer it.
query = "How do I start my F80 treadmill?"
response, source_information = handle_user_query(query, collection)

# Show the generated answer, followed by the retrieved context that was sent with the query.
print(f"Response: {response}")
print(f"Source Information: \n{source_information}")

Response: To start your F80 treadmill, begin by plugging it into a suitable wall outlet. Next, locate the power switch positioned below the motor hood at the front of the treadmill and turn it on. Ensure that the Safety Key is inserted as the treadmill will not activate without it. Once powered on, a message displaying the current software version will scroll across the dot matrix screen. Following this, the Time and Distance windows will show Odometer readings briefly - Time indicating hours of use and Distance showing miles (or kilometers if set to metric readings). The treadmill will then enter idle mode, which is the starting point for operation. Remember to always read and understand any operational changes before use and never operate the treadmill during an electrical storm to avoid potential damage.
Source Information: 
Instruction: F80 / F83 / F85 CONSOLES Power the treadmill on by plugging it into an appropriate wall outlet, then turn on the power switch located at the front 