<h1>Diamonds Are Forever</h1>
<h2>Import the needed modules</h2>

In [7]:
# !pip install --upgrade scikit-learn
# !pip install pinecone-client
# !pip install -U langchain-cli
# !pip install transformers

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Load environment variables
from dotenv import load_dotenv
import os
from typing import List

# Data handling
import pandas as pd

## Regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor 
from sklearn.neighbors import KNeighborsRegressor

# Modelling Helpers
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

# Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Pinecone
from pinecone import Pinecone, ServerlessSpec

# OpenAI
from openai import OpenAI

# Tokenization
import nltk
import tiktoken

# Downloads
nltk.download('punkt')

# Langchain
from langchain_openai import ChatOpenAI
from langchain.docstore.document import Document
from langchain.chains.question_answering import load_qa_chain

# Transformers
from transformers import pipeline
import gradio as gr

# Matplotlib
import matplotlib.pyplot as plt

# Import all functions from file
from inc.functions import *


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\James\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<h2>Define Needed Variables</h2>

In [2]:
# Load environment variables.
variables_to_clear = ['OPENAI_API_KEY',
                      'LANGCHAIN_TRACING_V2',
                      'LANGCHAIN_ENDPOINT',
                      'LANGCHAIN_API_KEY',
                      'LANGCHAIN_PROJECT',
                      'PINECONE_API_KEY']

for var in variables_to_clear:
    if var in os.environ:
        del os.environ[var]

load_dotenv("inc/api_keys.env")

## Get the API keys defined
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Check the API key
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY environment variable is not set.")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Check the API key
if not OPENAI_API_KEY:
    raise ValueError("PINECONE_API_KEY environment variable is not set.")

pc = Pinecone(api_key=PINECONE_API_KEY)

## Attempt to access the index
try:
    index = pc.Index("diamonds")
    print("Successfully accessed the index 'diamonds'.")
except Exception as e:
    print(f"Error accessing the index 'diamonds': {e}")

## Set the model name for our LLMs.
OPENAI_MODEL = "gpt-3.5-turbo"
EMBED_MODEL = "text-embedding-ada-002"

client = OpenAI(api_key=OPENAI_API_KEY)
MAX_TOKENS = 1536

# Define vector list for chunking
vectors = []
filename = 'README1.md'

# Pull book and article text for chunking
text = get_diamond_info()

query_responses = []
answers = []
questions = ["What is the most famous type of diamond cut?",
            "What is the process of diamond certification?",
            "Who is the top diamond dealer in the world?",
            "What are the signs of diamond impurities?",
            "What are Blood diamonds?",
            "What is the history of the diamond industry?",
            "Who are the major players in the diamond market?",
            "How many diamonds are typically used in high-end jewelry?",
            "What will Langsmith help us learn about diamond appraisals?"]

Successfully accessed the index 'diamonds'.


In [3]:
""" Added Below for chunking"""
def prep(text: str):
    return text.replace("\n", " ").replace("\r", " ").replace("\t", " ")

def tokenize(text: List[str]):
    encoding = tiktoken.encoding_for_model(EMBED_MODEL)
    return encoding.encode(text)

def embed(tokens: List[int]):
    response = client.embeddings.create(input=tokens,model=EMBED_MODEL)
    return response.data[0].embedding

def chunk_text(text:str):
    current_chunk = []
    current_para = ""
    chunks = []
    paras = []
    current_len = 0
    sentences = nltk.sent_tokenize(text)
    chunks_of_tokens = []
    
    for sentence in sentences:
        # Tokenize the sentence
        sentence_tokens = tokenize(sentence)
        sentence_token_len = len(sentence_tokens)
        
        # Check if adding the next sentence exceeds the max token limit
        if current_len + sentence_token_len > MAX_TOKENS:
            # Add the current chunk to the list and start a new one
            paras.append(current_para)
            current_para = ""
            chunks_of_tokens.append(current_chunk)
            embeddings = embed(current_chunk)
            chunks.append(embeddings)
            current_chunk = []
            current_len = 0
        
        # Add the sentence to the current chunk
        current_para += " " + sentence
        current_chunk.extend(sentence_tokens)
        current_len += sentence_token_len
    
    # Add the last chunk if it's not empty
    if current_chunk:
        paras.append(current_para)
        chunks_of_tokens.append(current_chunk)
        embeddings = embed(current_chunk)
        chunks.append(embeddings)

    return paras, chunks, chunks_of_tokens

def create_embeddings(filename: str):
    with open(filename, "r") as file:
        text = file.read()
    text = prep(text)
    return chunk_text(text)
    
def create_embeddings_prompt(prompt:str):
    prompt = prep(prompt)
    return chunk_text(prompt)

def vectorize_chunks(paras: List, chunks: List, **kwargs):
    vectors = []
    for i in range(len(chunks)):
        if "filename" in kwargs:
            vectors.append({"id": f"{i}", "values": chunks[i], "metadata": {"file": filename, "para": f"{paras[i]}"}})
        else:
            vectors.append({"id": f"{i}", "values": chunks[i], "metadata": {"para": f"{paras[i]}"}})
        
    return vectors


def ask_a_question(prompt):
    # convert the prompt to chunks of  embeddings
    paras, chunks, chunks_of_tokens  = create_embeddings_prompt(prompt)
    print(f"Embeddings: {chunks[0]}")
    # vectorize the embeddings
    prompt_vectors = vectorize_chunks(paras, chunks)
    print(f"Vectorized: {prompt_vectors[0]}")
    # search the index for the best match using semantic search
    query_response = index.query(
        top_k=2,
        vector=prompt_vectors[0]["values"]
    )
    query_responses.append(query_response)
    print(f"Query response: {query_response}")
    # get the id of the best match
    best_id = query_response["matches"][0]["id"]
    print(f"Best ID: {best_id}")
    # fetch the best match from the index
    result = index.fetch(ids=[best_id])
    # get the paragraph of interest from the result metadata
    para_of_interest = result["vectors"][best_id]["metadata"]["para"]
    print(f"Para of interest: {para_of_interest}")
    # Initialize the langchain chat model.
    llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name=OPENAI_MODEL, temperature=0.0)
    # turn the para_of_interest into a Document
    document = Document(page_content=para_of_interest)
    # Create the QA chain using the LLM.
    chain = load_qa_chain(llm)
    # Pass the para_of_interest and the prompt to the chain, and print the result.
    question = "If you can't find the answer in the provided document, say, I don't know the answer to that beautiful person, otherwise, answer the question. " + prompt
    result = chain.invoke({"input_documents": [document], "question": question})
    return result["output_text"]

<h3>Vectorize Information</h3>

In [4]:
## Vectorize text information
# Vectors from text from get_diamond_info()
paras, chunks, chunks_of_tokens  = create_embeddings_prompt(text)
vectors_from_text = vectorize_chunks(paras, chunks)
vectors.extend(vectors_from_text)

# Vectors from filename README1.md
paras, chunks, chunks_of_tokens = create_embeddings(filename)
vectors_from_file = vectorize_chunks(paras, chunks, filename=filename)
vectors.extend(vectors_from_file)

# Create index.upsert from vectors above
index.upsert(
    vectors=vectors     
)

{'upserted_count': 91}

<h4>Question the chatbot</h4>

In [5]:
for question in questions:
    answers.append(ask_a_question(question))

Embeddings: [-0.015488379634916782, 0.0076020946726202965, 0.005125116556882858, -0.01578548736870289, -0.01063130609691143, 0.01889866590499878, -0.029710819944739342, 0.0063232374377548695, -0.009055341593921185, -0.008603219874203205, 0.01164534967392683, 0.012233107350766659, -0.004136908799409866, -0.0081510990858078, -0.011839115992188454, 0.024543721228837967, 0.026946421712636948, 0.011180310510098934, 0.0184077899903059, -0.009178060106933117, -0.03973499312996864, 0.005874346010386944, 0.012982336804270744, -0.021314283832907677, -0.0066268048249185085, -0.0048409258015453815, 0.029271617531776428, -0.012594804167747498, -0.0057968394830822945, -0.02265772968530655, 0.018394872546195984, 0.0030728091951459646, -0.03327611833810806, -0.0017584284069016576, -0.018265696242451668, 0.018317366018891335, 3.910646410076879e-05, 0.002478592796251178, 0.011619513854384422, 0.005025004036724567, 0.007001419551670551, -0.008674267679452896, 6.146022496977821e-05, -0.0032552725169807673

<h4>Test the results of the answers.</h4>

In [6]:
ix = 0
for query_response in query_responses:
    print(f"Match Score: {query_response['matches'][0]['score']}")
    print(f"Question: {questions[ix]}")
    print(f"Answer:   {answers[ix]}\n\n")
    ix += 1

Match Score: 0.862713218
Question: What is the most famous type of diamond cut?
Answer:   The most popular and famous type of diamond cut is the brilliant cut.


Match Score: 0.870021701
Question: What is the process of diamond certification?
Answer:   The process of diamond certification involves having a diamond evaluated and graded based on its characteristics such as carat weight, cut, color, and clarity, which are known as the 4Cs. This evaluation is typically conducted by a reputable gemological laboratory like the Gemological Institute of America (GIA) or the International Gemological Institute (IGI). Once the assessment is complete, a certificate detailing these characteristics is provided to verify the diamond's quality and authenticity.


Match Score: 0.834507346
Question: Who is the top diamond dealer in the world?
Answer:   I don't know the answer to that beautiful person.


Match Score: 0.829236031
Question: What are the signs of diamond impurities?
Answer:   I don't know 

<h2>Ask it a question</h2>

In [8]:
app = gr.Interface(fn=ask_a_question,
                   inputs=gr.Textbox(label="Ask me about Diamonds"),
                   outputs=gr.Textbox(lines=10, label="Your answer about diamonds:", show_copy_button=True))
app.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




Embeddings: [-0.01576436311006546, -0.025222981348633766, 0.0023195657413452864, -0.03532285988330841, -0.015096381306648254, 0.008309689350426197, -0.022457538172602654, -0.018369490280747414, -0.023339273408055305, -0.01969209313392639, 0.02076086401939392, 0.010841339826583862, -0.018409568816423416, -0.027547556906938553, -0.012765126302838326, 0.01583116129040718, 0.033452510833740234, -0.02072078548371792, 0.02471531555056572, -0.01470895204693079, -0.032837968319654465, 0.010734462179243565, 0.013352950103580952, -0.003860933007672429, -0.008977671153843403, -0.0018219194607809186, 0.014254724606871605, -0.009966284036636353, -0.004619091749191284, -0.005430689547210932, 0.0019905848894268274, -0.011295566335320473, -0.00784878246486187, 0.009819327853620052, -0.014374961145222187, 0.022911764681339264, -0.0015154830180108547, 0.003787454916164279, 0.004796106833964586, 0.020319996401667595, -0.002683615544810891, 0.004124785773456097, 0.001634884625673294, -0.017167123034596443