# **Final Project**

In [None]:
# !pip install -r requirements.txt
# !pip install ipykernel langchain_experimental llama-index-vector-stores-pinecone ipykernel PyMuPDF pinecone-client pypdf faiss-cpu langchain_community transformers sentence_transformers
# !pip install llama_index.embeddings.huggingface

In [None]:
import io
import json
import math
import os
import re
import sqlite3
import time

import datasets
import dotenv
import faiss
import fitz
import huggingface_hub
import langchain
import langchain_community
import nltk
import numpy as np
import openai
import pandas as pd
import pinecone
import pypdf
import requests
import torch
import transformers
from dotenv import load_dotenv
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from llama_index.core import (SimpleDirectoryReader, StorageContext,
                              VectorStoreIndex)
from llama_index.core.extractors import (QuestionsAnsweredExtractor,
                                         TitleExtractor)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.schema import TextNode
# sentence transformers
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [None]:
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Read credentials from the environment instead of hard-coding them in the notebook,
# so that secrets never end up in a saved or shared copy of this file.
openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# Only required when a pod-based (non-serverless) Pinecone index is used.
environment = os.getenv("PINECONE_ENV")

# Fail fast with a clear message rather than with an authentication error several cells later.
_missing_keys = [
    name
    for name, value in (
        ("OPENAI_API_KEY", openai_api_key),
        ("PINECONE_API_KEY", pinecone_api_key),
    )
    if not value
]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in your shell or in a .env file next to this notebook."
    )

# configure Pinecone client
pc = Pinecone(api_key=pinecone_api_key)

In [None]:
# Open the product-review PDF with fitz; the path is relative to the notebook's
# working directory. `doc` is iterated page by page in the chunking cell.
doc = fitz.open("Product_review.pdf")


### **Token chunking method to chunk the text data:**

- [Token chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/split_by_token)

##### Choose a type of chunker (here: `TokenTextSplitter` from LlamaIndex):

In [None]:

# Splitter used to break each page of the product review into chunks.
CHUNK_SIZE = 1024
text_parser = TokenTextSplitter(chunk_size=CHUNK_SIZE)

In [None]:
# Split every page into chunks and remember, for each chunk, the index of the
# page it came from (text_chunks[i] originates from page doc_idxs[i]).
text_chunks, doc_idxs = [], []

for page_number, pdf_page in enumerate(doc):
    page_chunks = text_parser.split_text(pdf_page.get_text("text"))
    text_chunks += page_chunks
    doc_idxs += [page_number] * len(page_chunks)

In [None]:
# Display every chunk produced by the splitter (one list entry per chunk).
text_chunks

['Product 1: Mesh Office Chair Mid Back Swivel Ergonomic Home \nDesk Chair with Flip-up Arms \n \nDescription:  \n● The computer desk chair is made of the high-density and breathable mesh \nback with lumbar support and thick cushion to provide you the most \ncomfortable posture. The thickened explosion-proof seat plate give you more \ncomfort and double coverage. Max capacity: 300 lbs. \n● The mesh office chair comes with adjustable padded armrests which can be \nflipped up or down. It saves your space and can be easily stored. Suitable for \noffice, living room, study room, meeting room, etc. \n● Pneumatic seat height adjustment; Tilt lock function(angle adjustable); 360 \ndegree swivel; Solid rolling silencer casters for smooth movement. \n● We offer installation instruction and video to help you. It usually takes about \n10-20 minutes to assemble. If you have any questions, please feel free to \ncontact us. \nReviews: \n● Easy to assemble and made with decent materials. And it is co

In [None]:
# Wrap every text chunk in a TextNode so it can be enriched, embedded and upserted.
# No per-page lookup is done here because nothing from the source page is attached
# to the node; `doc_idxs[i]` still gives the page index of `text_chunks[i]` if needed.
nodes = [TextNode(text=text_chunk) for text_chunk in text_chunks]

### **Create the vector store using the chosen similarity metric:**

In [None]:
# Selects which kind of Pinecone spec the next cell builds: serverless (True) or pod-based (False).
use_serverless = True # hard-coded; env-driven alternative: os.environ.get("USE_SERVERLESS", "False").lower() == "true"

In [None]:
# Build the deployment spec that is passed to pc.create_index().
PINECONE_CLOUD = "aws"
PINECONE_REGION = "us-east-1"

if use_serverless:
    spec = ServerlessSpec(cloud=PINECONE_CLOUD, region=PINECONE_REGION)
else:
    # Pod-based indexes need an environment name; read it here so this branch
    # does not depend on a variable that may never have been defined.
    environment = os.getenv("PINECONE_ENV")
    if not environment:
        raise ValueError(
            "use_serverless is False but the PINECONE_ENV environment variable is not set."
        )
    spec = pinecone.PodSpec(environment=environment)

In [None]:
# Name of the Pinecone index used by this notebook.
index_name = "hw04"

# Start from a clean slate: drop any existing index that has the same name.
existing_index_names = pc.list_indexes().names()
if index_name in existing_index_names:
    pc.delete_index(index_name)

### **Choose a similarity metric to use for the vector store:**

In [None]:

# Dimension of the vectors stored in the index. It must equal the output size of the
# *embedding model* used for both the nodes and the queries (not the chat LLM).
# OpenAI's text-embedding-3-small and text-embedding-ada-002 both return 1536-dimensional vectors.
dimensions = 1536

# Similarity metric used to compare query vectors with stored vectors.
# Pinecone accepts "cosine", "dotproduct" or "euclidean"; "cosine" is used here.
pc.create_index(
    name=index_name,
    dimension=dimensions,
    metric="cosine",
    spec=spec,
)

# wait for index to be ready before connecting
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

# List every index in the project to confirm the new one exists.
for index_info in pc.list_indexes():
    print(index_info['name'])

# Handle to the index, always addressed through `index_name` rather than a hard-coded name.
pc_index = pc.Index(index_name)

# Vector store wrapper through which embedded nodes are added to the index.
vector_store = PineconeVectorStore(pinecone_index=pc_index)

hw04
hw02-rag
hw02


In [None]:
# Re-create the index handle and vector-store wrapper without re-creating the index itself.
# NOTE(review): this repeats the last two statements of the previous cell.
pc_index = pc.Index(index_name)  # handle to the Pinecone index named by index_name
vector_store = PineconeVectorStore(pinecone_index=pc_index)    # vector store wrapper around that index

In [None]:

# Show the index statistics before any vectors are added.
pc_index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
llm = OpenAI(model="gpt-3.5-turbo",
             api_key=openai_api_key)

extractors = [
    TitleExtractor(nodes=5, llm=llm),
    QuestionsAnsweredExtractor(questions=3, llm=llm),
]

pipeline = IngestionPipeline(
    transformations=extractors,
)
nodes = await pipeline.arun(nodes=nodes, in_place=False)

100%|██████████| 5/5 [00:01<00:00,  4.23it/s]
100%|██████████| 8/8 [00:02<00:00,  3.62it/s]


### **Choose an embedding model to use for the vector store:**

#### **OpenAI Embeddings**

In [None]:
# Names of the two OpenAI embedding models considered in this notebook.
# NOTE(review): neither variable is referenced by the cells visible below, which
# spell the model names out again — confirm whether these are still needed.
model_ada="text-embedding-ada-002"
small_txt_embedmodel_="text-embedding-3-small"


In [None]:
# Embedding model used for the stored nodes.
embed_model = OpenAIEmbedding(
    api_key=openai_api_key,
    model="text-embedding-3-small",
)

# Embed each node's content together with all of its metadata
# and store the resulting vector on the node itself.
for node in nodes:
    node_content = node.get_content(metadata_mode="all")
    node.embedding = embed_model.get_text_embedding(node_content)

### **Load the embeddings into the vector store (i.e. populate the vector store):**

In [None]:
# Upsert the embedded nodes into the Pinecone-backed vector store; the cell output lists the stored node IDs.
vector_store.add(nodes)

Upserted vectors: 100%|██████████| 8/8 [00:00<00:00, 29.09it/s]


['a6473a45-e86f-4211-89fb-e51ae7293a95',
 '032f414a-7918-4a14-9f46-8e2c2f12a676',
 'cb2d9631-221f-4cc7-be78-46d2edf61428',
 'cf1a1fb3-d0ef-43d8-8f9d-ae6a92d2c7ed',
 'e351fbc3-6448-4a9f-9950-990d1581ed52',
 'f6b38dc0-0749-4de0-b388-0588f21f81ee',
 'd77be865-3ceb-4a88-815c-3856a174fa32',
 '7086d367-115f-4871-8905-0816c02604db']

In [None]:

# Re-check the index statistics after the upsert.
# NOTE(review): the recorded output still reports total_vector_count = 0 although the
# previous cell upserted 8 vectors — presumably the stats had not refreshed yet;
# re-run this cell after a short wait to confirm the count.
pc_index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
# Inspect the metadata attached to the first node (the recorded output shows the
# 'document_title' and 'questions_this_excerpt_can_answer' keys).
print(nodes[0].metadata)

{'document_title': '"Comprehensive Review of Office Chairs and Monitors: Comfort, Visuals, Connectivity, and Customer Service"', 'questions_this_excerpt_can_answer': '1. How long does it typically take to assemble the Mesh Office Chair Mid Back Swivel Ergonomic Home Desk Chair with Flip-up Arms?\n2. What are some common positive reviews from customers who have purchased the Mesh Office Chair?\n3. What specific features does the Mesh Office Chair offer, such as adjustable armrests and pneumatic seat height adjustment?'}


In [None]:
# Display the full node list.
# NOTE(review): the repr includes every node's embedding vector, so this output is
# very large; consider showing len(nodes) or nodes[:1] instead.
nodes

[TextNode(id_='a6473a45-e86f-4211-89fb-e51ae7293a95', embedding=[0.015835769474506378, 0.043330978602170944, 0.00570418406277895, 0.021420542150735855, 0.0021953301038593054, -0.045804932713508606, -0.024372143670916557, 0.03608056902885437, 0.03485583886504173, 0.02706655114889145, 0.021138854324817657, -0.027605433017015457, -0.005529659800231457, -0.06025675684213638, -0.010838868096470833, 0.048523835837841034, -0.006833998020738363, -0.013055630959570408, -0.026699131354689598, 0.014733511954545975, 0.030103884637355804, 0.0432329997420311, 0.005841966252774, 0.01987738162279129, -0.03571315109729767, 0.031010184437036514, -0.00830367486923933, -0.04849934205412865, -0.025278445333242416, -0.023453595116734505, 0.008897669613361359, -0.02922207862138748, 0.021249080076813698, -0.03103468008339405, -0.029809948056936264, 0.013949683867394924, -0.045217063277959824, 0.01084499154239893, 0.013092372566461563, -0.029516013339161873, 0.04928316920995712, 0.062265314161777496, 0.0172442

In [None]:
# Print the short, human-readable form of the first node (ID plus truncated text).
print(nodes[0])

Node ID: a6473a45-e86f-4211-89fb-e51ae7293a95
Text: Product 1: Mesh Office Chair Mid Back Swivel Ergonomic Home
Desk Chair with Flip-up Arms    Description:   ● The computer desk
chair is made of the high-density and breathable mesh  back with
lumbar support and thick cushion to provide you the most  comfortable
posture. The thickened explosion-proof seat plate give you more
comfort and double ...


### **Retrieve Content from the Vector Store**

In [None]:
# OpenAI client used below for both query embeddings and chat completions.
client = openai.OpenAI(api_key=openai_api_key)

In [None]:
# Query the vector store with the six queries defined below
# (don't forget to record the responses in your homework submission!)

k = 5  # number of nearest neighbours to retrieve per query
queries = [
    "Summarize the product description and customer reviews in a concise bullet-point format highlighting the key features and overall user sentiment for mesh office chair.",
    "Summarize the product description and customer reviews in a concise bullet-point format highlighting the key features and overall user sentiment Dell monitor.",
    "Summarize the product description and customer reviews in a concise bullet-point format highlighting the key features and overall user sentiment Bomber jacket.",
    "Based on customer reviews, identify the key visual features of this office chair, including its material, color, shape, armrest design, backrest style, and any additional ergonomic details.",
    "Extract the visual characteristics of this Dell monitor from customer reviews, including screen size, bezel thickness, color, stand design, and any notable display features.",
    "Analyze customer reviews to determine the key visual features of this bomber jacket, such as material (e.g., leather, nylon), color options, fit (slim or oversized), collar style, pocket placement, and any distinctive design elements."
]
responses = []


# Available OpenAI embedding models:
embed_model_ada = "text-embedding-ada-002"
embed_model_3_small = "text-embedding-3-small"

# The nodes in the index were embedded with text-embedding-3-small, so the queries
# must be embedded with the same model. Vectors from different embedding models
# live in unrelated spaces, and comparing them yields meaningless similarity scores.
query_embed_model = embed_model_3_small

for query in tqdm(queries):
    # Embed the query text.
    res = client.embeddings.create(
        input=[query],
        model=query_embed_model
    )
    xq = res.data[0].embedding

    # Retrieve the k most similar stored vectors together with their metadata.
    res2 = pc_index.query(vector=xq, top_k=k, include_metadata=True)

    # responses[i] holds the retrieval result for queries[i].
    responses.append(res2)

100%|██████████| 6/6 [00:03<00:00,  1.98it/s]


In [None]:
# Ask gpt-4o to answer each query using the Pinecone matches retrieved for it.
# chat_responses[i] is the answer to queries[i].
chat_responses = []
for query, response in zip(queries, responses):
    # The retrieved context is interpolated into the system message explicitly.
    # A plain (non-f) string containing "{response}" would be sent to the model
    # verbatim, leaving the instruction pointing at nothing.
    system_message = (
        "Instruction: use the information in the retrieved context below "
        "to answer the user's question.\n\n"
        "Retrieved context:\n" + str(response)
    )
    chat_response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": query},
        ],
    )
    chat_responses.append(chat_response.choices[0].message.content)

In [None]:
# Answer to queries[0]: description/review summary for the mesh office chair.
print(chat_responses[0])

Here is a summary of the product description and customer reviews for a mesh office chair:

**Product Features:**
- Ergonomic design with adjustable height and tilt functions.
- Breathable mesh back for comfort and temperature regulation.
- Durable construction with a sturdy base and smooth-rolling casters.
- Lumbar support for improved posture during long hours of sitting.
- Contoured seat cushion for enhanced comfort.

**Customer Reviews and Sentiment:**
- Generally positive feedback on comfort and support, particularly for extended use.
- Many users appreciate the breathability of the mesh material.
- Complaints mainly involve assembly difficulties and durability of certain components.
- Overall satisfaction is high, with several reviewers noting the chair's good value for the price.

This summarizes key features and overall user sentiment for the mesh office chair.


In [None]:
# Answer to queries[1]: description/review summary for the Dell monitor.
print(chat_responses[1])

**Dell Monitor Summary:**

- **Key Features:**
  - 27-inch 4K UHD IPS display for vivid clarity.
  - Ultra-thin bezel design for immersive viewing.
  - AMD FreeSync technology for smooth visuals.
  - Connectivity options include HDMI and DisplayPort.

- **Customer Reviews:**
  - Positive feedback on build quality, clear picture, and ease of setup.
  - Color accuracy and picture quality praised, particularly for photo and video editing.
  - Some users report issues with color calibration and compatibility with M1 MacBook Pro.
  - Mixed experiences with customer service; some issues with replacements and setup instructions.
  - Overall sentiment is generally positive with some exceptions.


In [None]:
# Answer to queries[2]: description/review summary for the bomber jacket.
print(chat_responses[2])

- **Product Description:**
  - High visibility insulated bomber jacket designed for road work and construction.
  - Features a weatherproof polyester shell with a quilted lining.
  - Includes a concealed hood, zipper front closure, and reflective material.
  - Offers 5 pockets including a cell phone/radio chest pocket with a clear panel.
  - Available in a range of sizes from Small to 6XLarge.

- **Customer Reviews:**
  - General satisfaction with the jacket's warmth, quality, and visibility.
  - Mixed feedback on the hood; some find it bulky or not very functional.
  - Positive comments on fast shipping and good material quality.
  - Some issues with sizing discrepancies and customer service responsiveness.
  - Overall, most customers find the jacket to offer good value and effectiveness for its purpose.


In [None]:
# Answer to queries[3]: key visual features of the office chair.
print(chat_responses[3])

Based on customer reviews, the key visual features of the office chair include:

- **Material**: Made from high-quality fabric or synthetic leather, providing durability and comfort.
- **Color**: Comes in a variety of colors, often including black, grey, and other neutral tones to match office decor.
- **Shape**: Ergonomically designed with a contoured seat and backrest to support the body.
- **Armrest Design**: Features adjustable armrests that can be raised or lowered for optimal comfort.
- **Backrest Style**: Typically includes a mesh backrest for breathability or a padded backrest for added comfort.
- **Additional Ergonomic Details**: Includes features like lumbar support, adjustable tilt and height mechanisms, and a swivel base with caster wheels for easy movement.

These features are intended to provide comfort during long hours of sitting and to fit various body types and preferences.


In [None]:
# Answer to queries[4]: visual characteristics of the Dell monitor.
print(chat_responses[4])

The visual characteristics of the Dell monitor based on customer reviews include:

- **Screen Size:** The monitor is described as being large and suitable for productivity tasks but no specific size is mentioned.
- **Bezel Thickness:** The monitor has a minimal bezel with about 3/8-inch at the top and sides, and about 3/4-inch at the bottom, offering a nearly full-screen display experience.
- **Color:** The color quality is described positively, with colors being true and accurate, although some customers experienced issues with color calibration.
- **Stand Design:** Initially wobbly for some, but stable once the connection between the base and the post is tightened as per the set-up instructions.
- **Notable Display Features:** The monitor offers a crisp, sharp picture with effective anti-glare coating. It does not support USB-C/Thunderbolt connectivity, limiting it to HDMI and DisplayPort. There are some compatibility issues with M1 MacBook Pros, and it's noted for having bright defa

In [None]:
# Answer to queries[5]: key visual features of the bomber jacket.
print(chat_responses[5])

The reviews highlight several key visual features of the bomber jacket:

1. **Material**: The jacket has a weatherproof polyester shell with a quilted lining.
2. **Color Options**: It is available in a high visibility lime green with a black bottom.
3. **Fit**: It does not specify if it's slim or oversized, but it's available in sizes ranging from Small to 6XL.
4. **Collar Style**: It includes a concealed hood, but no specific collar style is mentioned.
5. **Pocket Placement**: There are a total of 5 pockets, with 4 on the outside and 1 on the inside. There is a cell phone/radio chest pocket with a clear panel.
6. **Distinctive Design Elements**: The jacket features 2" silver reflective material, mic tabs, and pencil pockets, indicating that it is designed for high visibility and work utility.

Overall, the jacket is designed for road work or construction with safety and functionality in mind.


In [None]:
# Second round: same six queries, but the three visual-feature queries now ask for
# an answer "in 70 tokens" (don't forget to record the responses in your homework submission!)

k = 5  # number of nearest neighbours to retrieve per query
queries = [
    "Summarize the product description and customer reviews in a concise bullet-point format highlighting the key features and overall user sentiment for mesh office chair.",
    "Summarize the product description and customer reviews in a concise bullet-point format highlighting the key features and overall user sentiment Dell monitor.",
    "Summarize the product description and customer reviews in a concise bullet-point format highlighting the key features and overall user sentiment Bomber jacket.",
    "Based on customer reviews, identify the key visual features of this office chair, including its material, color, shape, armrest design, backrest style, and any additional ergonomic details in 70 tokens.",
    "Extract the visual characteristics of this Dell monitor from customer reviews, including screen size, bezel thickness, color, stand design, and any notable display features in 70 tokens.",
    "Analyze customer reviews to determine the key visual features of this bomber jacket, such as material (e.g., leather, nylon), color options, fit (slim or oversized), collar style, pocket placement, and any distinctive design elements in 70 tokens."
]
responses = []


# Available OpenAI embedding models:
embed_model_ada = "text-embedding-ada-002"
embed_model_3_small = "text-embedding-3-small"

# The nodes in the index were embedded with text-embedding-3-small, so the queries
# must be embedded with the same model. Vectors from different embedding models
# live in unrelated spaces, and comparing them yields meaningless similarity scores.
query_embed_model = embed_model_3_small

for query in tqdm(queries):
    # Embed the query text.
    res = client.embeddings.create(
        input=[query],
        model=query_embed_model
    )
    xq = res.data[0].embedding

    # Retrieve the k most similar stored vectors together with their metadata.
    res2 = pc_index.query(vector=xq, top_k=k, include_metadata=True)

    # responses[i] holds the retrieval result for queries[i].
    responses.append(res2)

100%|██████████| 6/6 [00:04<00:00,  1.25it/s]


In [None]:
# Ask gpt-4o to answer each second-round query using the Pinecone matches retrieved for it.
# chat_responses[i] is the answer to queries[i].
chat_responses = []
for query, response in zip(queries, responses):
    # The retrieved context is interpolated into the system message explicitly.
    # A plain (non-f) string containing "{response}" would be sent to the model
    # verbatim, leaving the instruction pointing at nothing.
    system_message = (
        "Instruction: use the information in the retrieved context below "
        "to answer the user's question.\n\n"
        "Retrieved context:\n" + str(response)
    )
    chat_response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": query},
        ],
    )
    chat_responses.append(chat_response.choices[0].message.content)

In [None]:
# Answer to queries[3] of the second round: office-chair visual features, requested "in 70 tokens".
print(chat_responses[3])

Based on customer reviews, the key visual features of this office chair include its sleek, ergonomic shape designed for comfort. Commonly made with breathable mesh material, it often comes in neutral colors like black or gray to fit various office aesthetics. The chair features adjustable armrests to support different arm positions and a contoured backrest that provides lumbar support for better posture. Ergonomic details like height adjustment, tilt tension control, and a swivel base contribute to its functionality, making it a popular choice for office workers seeking both style and comfort.


In [None]:

# Answer to queries[4] of the second round: Dell-monitor visual characteristics, requested "in 70 tokens".
print(chat_responses[4])

The Dell monitor is noted for its 27-inch screen size with a 4K UHD IPS display, known for remarkable clarity and vibrant colors. It features an ultra-thin bezel offering an immersive viewing experience. The monitor’s color accuracy and picture quality are highlighted, especially for photo and high-quality video work. It includes AMD FreeSync technology for smooth visuals, making it ideal for both work and play. The stand design is sturdy, with additional connectivity options including HDMI and DisplayPort, enhancing its functionality as a modern, adaptable monitor.


In [None]:

# Answer to queries[5] of the second round: bomber-jacket visual features, requested "in 70 tokens".
print(chat_responses[5])

The bomber jacket is typically made of weatherproof polyester with a quilted lining. It is available in high visibility lime green with a black bottom. The fit is not specified, but it is available in various sizes from Small to 6XLarge. The jacket features a zipper front closure and comes with an attachable, concealed hood. For storage, it includes five pockets: four on the outside and one inside, with a cell phone/radio chest pocket featuring a clear panel. Additionally, the jacket has mic tabs, a pencil pocket, and meets ANSI/ISEA 107-2015 Type R, Class 3 standards.
