In [2]:
!pip install datasets pandas openai pymongo pypdf langchain_community

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting openai
  Downloading openai-1.35.3-py3-none-any.whl (327 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.4/327.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymongo
  Downloading pymongo-4.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (669 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m669.1/669.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_community
  Downloading langchain_community-0.2.5-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# URL of the treadmill user manual PDF (raw file hosted on GitHub).
url = "https://raw.githubusercontent.com/MatthewHsu1/RAGsystemOpenAI/main/F80_F83_F85_OM_810.pdf"

# Build a PyPDFLoader for the manual; the loader is handed the URL itself, not a local path.
loader = PyPDFLoader(url)
# `pages` holds the loaded documents that the following cells clean and split; each one
# exposes its text as `.page_content`.
# NOTE(review): load_and_split() presumably applies LangChain's default text splitter, so an
# entry may be a chunk of a page rather than a whole page -- confirm against the LangChain docs.
pages = loader.load_and_split()

In [None]:
# Section headers of the manual (a list, not a dictionary). They are joined into a single
# regular expression later and used as delimiters to split the page text into sections.
# NOTE(review): "IMPORTANT SAFETY INSTRUCTIONS" appears three times; the repeats add nothing
# once the headers are OR-ed together into one pattern.
# NOTE(review): the two "F80 / F83 / F83 ..." entries may be meant to read "F80 / F83 / F85".
# A header only has an effect if it matches the extracted PDF text exactly -- confirm.

headers = [
    "TABLE OF CONTENTS",
    "PRODUCT REGISTRATION",
    "IMPORTANT SAFETY INSTRUCTIONS",
    "IMPORTANT ELECTRICAL INSTRUCTIONS",
    "GROUNDING INSTRUCTIONS",
    "IMPORTANT OPERATION INSTRUCTIONS",
    "IMPORTANT SAFETY INSTRUCTIONS",
    "IMPORTANT SAFETY INSTRUCTIONS",
    "PREVENTATIVE MAINTENANCE CHART",
    "F80 / F83 / F83 ASSEMBLY PACK CHECKLIST",
    "F80 / F83 / F83 ASSEMBLY INSTRUCTIONS",
    "FOLDING INSTRUCTIONS",
    "TRANSPORTATION INSTRUCTIONS",
    "OPERATION OF YOUR TREADMILL",
    "GETTING STARTED",
    "QUICK-START OPERATION",
    "PAUSE/STOP/RESET FEATURE",
    "INCLINE FEATURE",
    "DOT MATRIX CENTER DISPLAY",
    "PROGRAMMABLE FEATURES",
    "HEART RATE PROGRAMS",
    "USING HEART RATE TRANSMITTER",
    "GENERAL MAINTENANCE",
    "BELT ADJUSTMENTS",
    "TREAD-BELT TRACKING ADJUSTMENT",
    "BELT/DECK LUBRICATION",
    "SERVICE CHECKLIST - DIAGNOSIS GUIDE",
    "MANUFACTURER’S LIMITED WARRANTY"
]

In [None]:
import re

def strip_top(page):
  """
  Clean the text of one or more PDF documents in place.

  Each document is expected to start with the running header
  'F80 /  F83 / F85 TREADMILL'; the real content begins after that string.
  When the header is found, only the text after it is kept (with surrounding
  whitespace stripped). When it is not found, the whole text is kept. In both
  cases the '\\x84' control character is removed and every run of whitespace
  (newlines included) is collapsed to a single space.

  Args:
    page: Either a single document exposing a `page_content` attribute, or an
      iterable of such documents. The argument itself is processed; no global
      state is read.

  Returns:
    None. Each document's `page_content` is overwritten with the cleaned text.
  """
  # Accept a lone document as well as a collection, so the parameter name holds either way.
  documents = [page] if hasattr(page, "page_content") else page

  # DOTALL lets '.' span newlines so the capture runs to the end of the page text.
  header_pattern = re.compile(r"F80 /  F83 / F85 TREADMILL(.*)", re.DOTALL)

  for document in documents:
    match = header_pattern.search(document.page_content)

    # Header present: keep what follows it. Header absent: keep everything.
    clean_text = match.group(1).strip() if match else document.page_content

    clean_text = clean_text.replace("\x84", "")
    # '\s+' already covers newlines, so no separate newline replacement is needed.
    document.page_content = re.sub(r'\s+', ' ', clean_text)

In [None]:
# Clean the loaded documents in place: each document's page_content is overwritten with
# header-stripped, whitespace-normalised plain text.
strip_top(pages)

In [None]:
import re

def split_by_header(headers, pages):
  """
  Split the text of every page into pieces delimited by the given headers.

  Args:
    headers: Iterable of header strings, matched literally. Empty strings and
      duplicates are ignored. If no usable header remains, pages are not split.
    pages: Iterable of documents exposing their text as `page_content`.

  Returns:
    A flat list of the non-empty, whitespace-stripped pieces in page order.
    The header strings themselves are not part of the result.
  """
  # dict.fromkeys drops duplicates; sorting longest-first stops a header that is a
  # prefix of another (e.g. "A" vs "A B") from winning the alternation.
  unique_headers = sorted(dict.fromkeys(h for h in headers if h), key=len, reverse=True)

  # The pattern does not depend on the page, so build it once. An empty pattern
  # would match between every character, hence the explicit None.
  splitter = None
  if unique_headers:
    splitter = re.compile('|'.join(map(re.escape, unique_headers)))

  list_of_string = []
  for page in pages:
    if splitter is None:
      pieces = [page.page_content]
    else:
      pieces = splitter.split(page.page_content)

    # Get rid of any empty or whitespace-only pieces.
    list_of_string.extend(piece.strip() for piece in pieces if piece.strip())

  return list_of_string

In [None]:
# Flatten all pages into one list of header-delimited text pieces; these are the chunks
# that get embedded later.
text = split_by_header(headers, pages)

In [None]:
# Check to see if any string in the list has the possibility to exceed the text embedding token limit.
# NOTE(review): the word count is only a heuristic stand-in for the model's token limit.

MAX_WORDS_PER_CHUNK = 5500

for chunk_index, string in enumerate(text):
  word_count = len(string.split())
  # Report which chunk is too long so it can be found and split further.
  assert word_count <= MAX_WORDS_PER_CHUNK, (
      "A string may potentially exceed the token limit "
      f"(chunk {chunk_index} has {word_count} words, limit is {MAX_WORDS_PER_CHUNK})"
  )

In [None]:
# Convert the list 'text' into a pandas DataFrame with a single 'text' column.

import pandas as pd

# Name the column at construction time. Assigning `.columns` afterwards fails with a
# length mismatch when `text` is empty, because the frame then has no columns at all.
dataset_df = pd.DataFrame({'text': text})

(43, 1)

In [9]:
import openai
from google.colab import userdata

# Read the OpenAI API key from Colab's user data (stored under the name 'open_ai') instead of
# hard-coding it in the notebook.
openai.api_key = userdata.get('open_ai')

# Model name passed to the OpenAI embeddings endpoint by get_embedding.
EMBEDDING_MODEL = 'text-embedding-3-small'

# This method will take a text and get a vector embedding from OpenAI
def get_embedding(text):
  """
  Return the OpenAI embedding vector for `text`.

  Args:
    text: The string to embed.

  Returns:
    The embedding of the first result returned by the embeddings endpoint, or
    None when `text` is not a string, is empty, or the request raises.
  """
  # Check the type before the truth value: truth-testing an arbitrary object
  # (for example pandas.NA or a numpy array) can raise, isinstance cannot.
  if not isinstance(text, str) or not text:
    return None

  try:
    response = openai.embeddings.create(input=text, model=EMBEDDING_MODEL)
    return response.data[0].embedding
  except Exception as e:
    # Best effort by design: report the failure and hand None back to the caller.
    print(f'Error in get_embedding: {e}')
    return None

# Embed every chunk (get_embedding is called once per row) and store the result in a new
# 'embedding' column.
# NOTE(review): get_embedding returns None on failure, so a row can end up with a None
# embedding; consider checking dataset_df['embedding'].isna() before ingesting.
dataset_df['embedding'] = dataset_df['text'].apply(get_embedding)

dataset_df.head()

In [7]:
import pymongo
from google.colab import userdata

# This method will connect to mongoDB and get a client instance.
def get_mongo_client(mongo_uri):
  """
  Create a MongoClient for `mongo_uri` and verify that a server is reachable.

  Args:
    mongo_uri: MongoDB connection string.

  Returns:
    The connected client, or None when no server could be reached
    (pymongo.errors.ConnectionFailure). Other pymongo errors, such as an
    invalid URI, are not caught here and propagate to the caller.
  """
  client = None
  try:
    client = pymongo.MongoClient(mongo_uri)
    # MongoClient connects lazily, so constructing it proves nothing. A ping forces
    # a round trip and raises ConnectionFailure when no server is reachable.
    client.admin.command('ping')
  except pymongo.errors.ConnectionFailure as e:
    print(f'Connection failed: {e}')
    # Release the client's background resources before giving up.
    if client is not None:
      client.close()
    return None

  print('Connection to MongoDB successful')
  return client

mongo_uri = userdata.get('mongo_url')
if not mongo_uri:
  # Stop here: continuing without a URI would make the client fall back to its default host.
  raise ValueError('Mongo_uri not set in environment variables')

mongo_client = get_mongo_client(mongo_uri)
if mongo_client is None:
  # get_mongo_client signals an unreachable server with None; indexing None would only
  # produce a confusing TypeError further down.
  raise ConnectionError('Could not connect to MongoDB: get_mongo_client returned None')

db = mongo_client['Treadmill']
collection = db['F80_F83_F85']

# One MongoDB document per DataFrame row (keys: the DataFrame's column names).
# NOTE: re-running this cell inserts the same documents again.
documents = dataset_df.to_dict('records')
collection.insert_many(documents)

print('Data ingestion into MongoDB completed')

Connection to MongoDB successful


## Below is the query part

In [None]:
import openai
from google.colab import userdata

# Read the OpenAI API key from Colab's user data (stored under the name 'open_ai') instead of
# hard-coding it in the notebook.
openai.api_key = userdata.get('open_ai')

# Model name passed to the OpenAI embeddings endpoint by get_embedding.
EMBEDDING_MODEL = 'text-embedding-3-small'

# This method will take a text and get a vector embedding from OpenAI
def get_embedding(text):
  """
  Return the OpenAI embedding vector for `text`.

  Args:
    text: The string to embed.

  Returns:
    The embedding of the first result returned by the embeddings endpoint, or
    None when `text` is not a string, is empty, or the request raises.
  """
  # Check the type before the truth value: truth-testing an arbitrary object
  # (for example pandas.NA or a numpy array) can raise, isinstance cannot.
  if not isinstance(text, str) or not text:
    return None

  try:
    response = openai.embeddings.create(input=text, model=EMBEDDING_MODEL)
    return response.data[0].embedding
  except Exception as e:
    # Best effort by design: report the failure and hand None back to the caller.
    print(f'Error in get_embedding: {e}')
    return None

In [None]:
import pymongo
from google.colab import userdata

# This method will connect to mongoDB and get a client instance.
def get_mongo_client(mongo_uri):
  """
  Create a MongoClient for `mongo_uri` and verify that a server is reachable.

  Args:
    mongo_uri: MongoDB connection string.

  Returns:
    The connected client, or None when no server could be reached
    (pymongo.errors.ConnectionFailure). Other pymongo errors, such as an
    invalid URI, are not caught here and propagate to the caller.
  """
  client = None
  try:
    client = pymongo.MongoClient(mongo_uri)
    # MongoClient connects lazily, so constructing it proves nothing. A ping forces
    # a round trip and raises ConnectionFailure when no server is reachable.
    client.admin.command('ping')
  except pymongo.errors.ConnectionFailure as e:
    print(f'Connection failed: {e}')
    # Release the client's background resources before giving up.
    if client is not None:
      client.close()
    return None

  print('Connection to MongoDB successful')
  return client

mongo_uri = userdata.get('mongo_url')
if not mongo_uri:
  # Stop here: continuing without a URI would make the client fall back to its default host.
  raise ValueError('Mongo_uri not set in environment variables')

mongo_client = get_mongo_client(mongo_uri)
if mongo_client is None:
  # get_mongo_client signals an unreachable server with None; indexing None would only
  # produce a confusing TypeError further down.
  raise ConnectionError('Could not connect to MongoDB: get_mongo_client returned None')

# Database and collection that hold the embedded manual chunks.
db = mongo_client['Treadmill']
collection = db['F80_F83_F85']

In [5]:
def vector_search(user_query, collection, limit=4, num_candidates=10, index_name="vector_index"):
  """
  Run a $vectorSearch aggregation for `user_query` against `collection`.

  Args:
    user_query: The question to embed and search for.
    collection: Collection object whose `aggregate` method runs the pipeline.
    limit: Maximum number of documents to return.
    num_candidates: Number of nearest-neighbour candidates considered during
      the search; must be at least `limit`.
    index_name: Name of the vector search index to query.

  Returns:
    A list of result documents, each projected to its `text` and a `score`
    taken from the vector search metadata. If the query cannot be embedded,
    the string "Invalid query or embedding generation failed." is returned
    instead of a list.

  Raises:
    ValueError: If `limit` is below 1 or `num_candidates` is below `limit`.
  """
  # Validate before spending an embedding request on a search that cannot run.
  if limit < 1 or num_candidates < limit:
    raise ValueError("limit must be >= 1 and num_candidates must be >= limit")

  query_embedding = get_embedding(user_query)

  if query_embedding is None:
    return "Invalid query or embedding generation failed."

  pipeline = [
      {
          "$vectorSearch": {
              "index": index_name,
              "queryVector": query_embedding,
              "path": "embedding",
              "numCandidates": num_candidates,
              "limit": limit
          }
      },
      {
          # Return only the chunk text and its similarity score, not the stored vector.
          "$project": {
              "text": 1,
              "score": {
                  "$meta": "vectorSearchScore"
              }
          }
      }
  ]

  return list(collection.aggregate(pipeline))

In [11]:
def handle_user_query(query, collection, model='gpt-3.5-turbo'):
  """
  Answer `query` using context retrieved from `collection`.

  Args:
    query: The user's question.
    collection: Collection passed through to vector_search.
    model: Chat model name sent to the OpenAI chat completions endpoint.

  Returns:
    A tuple `(answer, search_result)`: the model's reply and the context
    string it was given. If the vector search reports a failure, the tuple is
    `(failure message, '')` and no chat completion is requested.
  """
  get_knowledge = vector_search(query, collection)

  # vector_search reports failure as a plain message string rather than a list of
  # hits. Iterating that string would yield single characters, which have no .get().
  if isinstance(get_knowledge, str):
    return get_knowledge, ''

  # One "Instruction: ..." line per retrieved chunk.
  search_result = ''.join(
      f"Instruction: {result.get('text', 'N/A')} \n" for result in get_knowledge
  )

  completion = openai.chat.completions.create(
      model=model,
      messages=[
          {'role': "system", 'content': 'You are a treadmill technician that provides helpful information by forming a cohisive answer with the context given. If possible, you can use numbered list or bullet points. You are not allowed to pick anything outside of your choices. And do not mention anyhting along the lines "based on the context provided"'},
          {'role': 'user', 'content': 'Answer this user query: ' + query + ' using the following context: ' + search_result}
      ]
  )

  return (completion.choices[0].message.content), search_result

In [14]:
# Example question run through the whole pipeline.
query = "what to do if its making a noise while running?"
# handle_user_query returns the model's answer together with the retrieved context it was given.
response, source_information = handle_user_query(query, collection)

print(f"Response: {response} \n")
print(f"Source Information: \n{source_information}")

Response: If your treadmill is making a noise while running, here are steps to address the issue based on the context provided:

1. **Check Tread-Belt Tension**:
   - Use the 6 mm Allen wrench provided to adjust the tension at the rear roller.
   - Tighten the rear roller to prevent slippage at the front roller.
   - Turn the tension adjusting bolts 1/4 turn each and test the tension by walking on the belt to ensure it's not slipping.

2. **Ensure Proper Belt Tracking**:
   - Use the left side bolt for tracking adjustments with the Allen wrench.
   - Set the belt speed at 3 mph for adjustments.
   - Make small 1/4 turn adjustments to center the tread-belt if it drifts towards one side while in use.

3. **General Maintenance**:
   - Clean the deck monthly to maximize performance.
   - Use a mild soap solution and nylon scrub brush to clean the textured belt.
   - Regularly vacuum underneath and inside the treadmill to prevent dirt buildup.

4. **Avoid Over-tightening**:
   - Do not over

## TODO: Switch the model to the Assistants API with file search, and evaluate how well file search and the external vector search each perform.