In [105]:
import os
from dotenv import load_dotenv

# Load credentials from a local .env file into the process environment so they
# never have to be hard-coded in the notebook.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Fail fast with an actionable message: os.getenv() silently returns None for
# an unset variable, which would otherwise only surface as an authentication
# error several cells later.
_missing_keys = [
    name
    for name, value in (
        ("OPENAI_API_KEY", OPENAI_API_KEY),
        ("PINECONE_API_KEY", PINECONE_API_KEY),
    )
    if not value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in your shell or in a .env file next to this notebook."
    )

In [106]:
# Set up the chat model that the chains below are built on.

from langchain_openai.chat_models import ChatOpenAI

model = ChatOpenAI(
    model="gpt-4o",
    openai_api_key=OPENAI_API_KEY,
)

In [107]:
# Smoke test: check that the model can be invoked.
# The call returns an AIMessage object rather than a plain string.

model.invoke("Who is the HyperSn1per?")

AIMessage(content='As of my last update in 2023, there isn\'t any widely recognized individual or entity known specifically as "HyperSn1per." This could be a username or handle used by someone on social media, gaming platforms, or other online communities. If "HyperSn1per" is someone specific you\'re referring to, please provide more context so I can better assist you!', response_metadata={'token_usage': {'completion_tokens': 74, 'prompt_tokens': 15, 'total_tokens': 89}, 'model_name': 'gpt-4o', 'system_fingerprint': 'fp_43dfabdef1', 'finish_reason': 'stop', 'logprobs': None})

In [108]:
# Invoking the model directly returns an `AIMessage`.
# Chain the model with an output parser so the chain returns a plain string instead.

from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

# `|` feeds the model's output into the parser.
chain = model | parser
chain.invoke("How does the message look like now?")

"I'm not exactly sure which message you are referring to. Could you please provide more context or details about the message you're asking about? This will help me give you a more accurate and helpful response."

In [109]:
# We want to provide the model with some context and the question.
# We use prompt templates, which are a way to format the input for the model.

from langchain.prompts import ChatPromptTemplate

# Prompt text with two placeholders, {context} and {question}.
# It is written as explicit per-line literals so the trailing space after
# "can't" (part of the prompt) cannot be stripped silently by an editor.
template = (
    "\n"
    "Answer the question based on the context below. If you can't \n"
    'answer the question, reply "I have no knowledge of that".\n'
    "\n"
    "Context: {context}\n"
    "\n"
    "Question: {question}\n"
)

# Turn the template into a chat prompt and render it once to inspect the result.
prompt = ChatPromptTemplate.from_template(template)
prompt.format(
    context="Barbara Scalvini and Rúni Klein Hansen",
    question="Who is supervising Jacobs bachelor project?",
)

'Human: \nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I have no knowledge of that".\n\nContext: Barbara Scalvini and Rúni Klein Hansen\n\nQuestion: Who is supervising Jacobs bachelor project?\n'

In [110]:
# Compose prompt -> model -> parser into one chain, then ask a question with
# the context supplied inline.

chain = prompt | model | parser
chain.invoke(
    dict(
        context="Barbara Scalvini and Rúni Klein Hansen are supervising Jacobs bachelor project.",
        question="Who is supervising Jacobs bachelor project?",
    )
)

'Barbara Scalvini and Rúni Klein Hansen are supervising Jacobs bachelor project.'

In [111]:
# Load the source document, decoding it as UTF-8.
with open("Artificialus_Gaia_Terra_Simple_Format.txt", mode="r", encoding="utf-8") as file:
    content = file.read()

# Strip every occurrence of the problematic '\x81' character.
content_cleaned = content.replace("\x81", "")

# Write the cleaned text to a new file encoded as CP1252. Characters that
# cannot be encoded are substituted (errors="replace") instead of raising.
with open(
    "Artificialus_Gaia_Terra_Simple_Format_CP1252.txt",
    mode="w",
    encoding="cp1252",
    errors="replace",
) as file:
    file.write(content_cleaned)


In [112]:
# Read the converted document back in and preview its first 500 characters.
# The file is written with encoding="cp1252", so decode it with the same codec
# explicitly. Without it, open() uses the platform default (UTF-8 on most
# Linux/macOS setups), which fails or garbles any non-ASCII CP1252 byte.
with open("Artificialus_Gaia_Terra_Simple_Format_CP1252.txt", encoding="cp1252") as file:
    artificialus = file.read()
artificialus[:500]

'# Discovery and Exploration of Artificialus Gaia Terra\n\n## Introduction: The Discovery of Artificialus Gaia Terra\n\nIn the year 2045, an international team of astronomers and astrophysicists embarked on a groundbreaking mission to explore the uncharted regions of the galaxy. Utilizing the latest advancements in deep space telescopy and interstellar probe technology, the team aimed to uncover new celestial bodies that might harbor life. After years of meticulous research and countless hours of obs'

In [113]:
# Try passing the whole document as the context.
# If the transcript is too long, the model will not be able to process it; any
# exception raised by the call is caught and printed so the notebook keeps running.
# Note: the chain's return value is not kept or displayed, so a successful call
# produces no output in this cell.

try:
    chain.invoke({
        "context": artificialus,
        "question": "what is on artificialus gaia terra?"
    })
except Exception as e:
    print(e)

In [114]:
# Load the document with TextLoader so it can be split into smaller chunks.

from langchain_community.document_loaders import TextLoader

# The file is stored as CP1252, so name the codec explicitly; with no
# `encoding` argument the loader decodes with the platform default, which is
# not CP1252 on most non-Windows machines.
loader = TextLoader("Artificialus_Gaia_Terra_Simple_Format_CP1252.txt", encoding="cp1252")
text_documents = loader.load()
text_documents

[Document(page_content='# Discovery and Exploration of Artificialus Gaia Terra\n\n## Introduction: The Discovery of Artificialus Gaia Terra\n\nIn the year 2045, an international team of astronomers and astrophysicists embarked on a groundbreaking mission to explore the uncharted regions of the galaxy. Utilizing the latest advancements in deep space telescopy and interstellar probe technology, the team aimed to uncover new celestial bodies that might harbor life. After years of meticulous research and countless hours of observation, their efforts culminated in the discovery of a new planet, which they named Artificialus Gaia Terra (AGT).\n\nArtificialus Gaia Terra was first detected by the Jameson Deep Space Array, a network of powerful telescopes positioned in the outer reaches of our solar system. The initial data indicated that AGT was located approximately 50 light-years away from Earth, orbiting a star similar in size and composition to our Sun. The planet\'s orbital path placed it

In [115]:
# Split the text into smaller chunks.
# Neighbouring chunks overlap a little so each one carries some surrounding context.

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=25,
    chunk_size=250,
)

# Preview only the first five chunks.
text_splitter.split_documents(text_documents)[:5]

[Document(page_content='# Discovery and Exploration of Artificialus Gaia Terra\n\n## Introduction: The Discovery of Artificialus Gaia Terra', metadata={'source': 'Artificialus_Gaia_Terra_Simple_Format_CP1252.txt'}),
 Document(page_content='In the year 2045, an international team of astronomers and astrophysicists embarked on a groundbreaking mission to explore the uncharted regions of the galaxy. Utilizing the latest advancements in deep space telescopy and interstellar probe', metadata={'source': 'Artificialus_Gaia_Terra_Simple_Format_CP1252.txt'}),
 Document(page_content='and interstellar probe technology, the team aimed to uncover new celestial bodies that might harbor life. After years of meticulous research and countless hours of observation, their efforts culminated in the discovery of a new planet, which they', metadata={'source': 'Artificialus_Gaia_Terra_Simple_Format_CP1252.txt'}),
 Document(page_content='a new planet, which they named Artificialus Gaia Terra (AGT).', metadata

In [116]:
# Split the loaded document into the chunk list (`documents`) that gets
# embedded and stored in the vector stores.
# NOTE(review): this splitter uses chunk_overlap=20, while the preview cell
# uses chunk_overlap=25, so the previewed chunks are not exactly the ones
# produced here — confirm which overlap is intended.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=20)
documents = text_splitter.split_documents(text_documents)

In [117]:
# Find the relevant chunks for a question.
# OpenAI embeddings turn text into vectors that can be compared for similarity.

from langchain_openai.embeddings import OpenAIEmbeddings

# Pass the key explicitly, the same way the chat model is configured, instead
# of relying on the OPENAI_API_KEY environment variable being picked up implicitly.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
embedded_query = embeddings.embed_query("What is on Artificialus Gaia Terra?")

# Show the vector's dimensionality and its first ten components.
print(f"Embedding length: {len(embedded_query)}")
print(embedded_query[:10])

Embedding length: 1536
[0.012937129376419826, -0.024361308478119677, -0.008678452976855622, -0.010611668148098155, -0.0038734338881927124, 0.016194175805211694, -0.05483045387136096, -0.007095458397494795, -0.010989905251116865, -0.013329375433498963]


In [118]:
# Embed two example sentences, in order, to test the embeddings.

sentence1, sentence2 = (
    embeddings.embed_query(text)
    for text in (
        "what is Artificialus Gaia Terra?",
        "What is the purpose of the Artificialus Gaia Terra?",
    )
)

In [119]:
# Cosine similarity between the query embedding and each example sentence.

from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity returns a 1x1 matrix for each pair; [0][0] extracts the scalar.
query_sentence1_similarity, query_sentence2_similarity = (
    cosine_similarity([embedded_query], [candidate])[0][0]
    for candidate in (sentence1, sentence2)
)

query_sentence1_similarity, query_sentence2_similarity

(0.9764071802825847, 0.9369373139700932)

In [120]:
# Build a small in-memory test vector store from two sample sentences,
# embedded with `embeddings`.

from langchain_community.vectorstores import DocArrayInMemorySearch

sample_texts = [
    "Artificialus Gaia Terra is a newly discovered planet.",
    "The planet is located in the Andromeda galaxy.",
]
vectorstoreTest = DocArrayInMemorySearch.from_texts(sample_texts, embedding=embeddings)

We can now query the vector store to find the most similar embeddings to a given query:

In [121]:
# Ask for up to 3 matches together with their similarity scores. The test
# store holds only two texts, so two (document, score) pairs come back.
vectorstoreTest.similarity_search_with_score(query="What is on Artificialus Gaia Terra?", k=3)

[(Document(page_content='Artificialus Gaia Terra is a newly discovered planet.'),
  0.9119137523144009),
 (Document(page_content='The planet is located in the Andromeda galaxy.'),
  0.7778518144768889)]

In [122]:
# Connect the vector store to the model.
#
# A retriever wraps the vector store: it runs a similarity search for the
# question and passes the best-matching chunks (here the top 2) on to the
# next step of the chain.

retriever = vectorstoreTest.as_retriever(search_kwargs=dict(k=2))
retriever.invoke("What is on Artificialus Gaia Terra?")

[Document(page_content='Artificialus Gaia Terra is a newly discovered planet.'),
 Document(page_content='The planet is located in the Andromeda galaxy.')]

In [123]:
# The prompt expects two inputs, "context" and "question". The retriever finds
# the chunks that serve as the context for answering the question.
#
# RunnableParallel builds that two-key map from a single input: the retriever
# produces "context", and RunnablePassthrough forwards the input unchanged as
# "question".

from langchain_core.runnables import RunnableParallel, RunnablePassthrough

setup = RunnableParallel(
    {
        "context": retriever,
        "question": RunnablePassthrough(),
    }
)
setup.invoke("What is on Artificialus Gaia Terra??")

{'context': [Document(page_content='Artificialus Gaia Terra is a newly discovered planet.'),
  Document(page_content='The planet is located in the Andromeda galaxy.')],
 'question': 'What is on Artificialus Gaia Terra??'}

In [140]:
# Put the setup map in front of the chain, so the chain takes just the
# question and the retriever supplies the context for the prompt.

chain = setup | prompt | model | parser
chain.invoke("What is a Luminara Viventis")

'I have no knowledge of that.'

Let's invoke the chain again with the same question, this time ending in a question mark:

In [141]:
# The chain still retrieves from the two-sentence test store, which contains
# nothing about Luminara Viventis, so the prompt's fallback reply is returned.
chain.invoke("What is a Luminara Viventis?")

'I have no knowledge of that.'

In [142]:
# Create a second in-memory vector store, this time from the chunks of the
# loaded document (`documents`), embedded with `embeddings`.
vectorstoreFromDocEmbed = DocArrayInMemorySearch.from_documents(documents, embeddings)

In [143]:
# Rebuild the chain on top of the document-backed vector store. The dict is
# the RunnableParallel portion of the chain: it maps the incoming question to
# the "context" and "question" inputs the prompt expects.

retrieval_step = {
    "context": vectorstoreFromDocEmbed.as_retriever(),
    "question": RunnablePassthrough(),
}
chain = retrieval_step | prompt | model | parser
chain.invoke("What is a Luminara Viventis")

"The Luminara Viventis is a large, multicellular organism that thrives in the dense forests covering much of AGT's surface. It stands at an average height of 3 meters and possesses a semi-transparent, bioluminescent exoskeleton."

In [128]:
# Set up a Pinecone vector store from the document chunks, using the index
# named "rag-index".
# NOTE(review): no API key is passed here; presumably the client reads
# PINECONE_API_KEY from the environment — confirm.
# NOTE(review): re-running this cell presumably uploads the chunks again —
# verify whether that leaves duplicate entries in the index.

from langchain_pinecone import PineconeVectorStore

index_name = "rag-index"

pinecone = PineconeVectorStore.from_documents(
    documents, embeddings, index_name=index_name
)

In [129]:
# Test the Pinecone vector store: run a similarity search and keep only the first hit.
# NOTE(review): the recorded output is an empty list. Presumably the query ran
# before the freshly uploaded vectors became searchable — confirm and re-run.

pinecone.similarity_search("What is Artificialus Gaia Terra?")[:1]

[]

In [158]:
# Rebuild the chain so the context is retrieved from the Pinecone vector store.

chain = (
    dict(context=pinecone.as_retriever(), question=RunnablePassthrough())
    | prompt
    | model
    | parser
)

chain.invoke(
    "The atmosphere of AGT has higher oxygen levels than Earth's. Discuss how this affects the size and energy efficiency of organisms like the Luminara Viventis."
)

'The higher oxygen levels in the atmosphere of AGT contribute significantly to the size and energy efficiency of organisms like the Luminara Viventis. Oxygen is a critical element for the metabolic processes of most aerobic organisms, and higher concentrations of oxygen can enhance these processes. This means that organisms on AGT can support larger body sizes and more complex energy demands.\n\nFor the Luminara Viventis, which stands at an average height of 3 meters, the increased availability of oxygen allows for the development of a larger and more complex multicellular structure. The enhanced oxygen levels enable more efficient respiration and energy production, which is essential for sustaining larger organisms. Additionally, the bioluminescent exoskeleton of the Luminara Viventis might also be an adaptation that benefits from the higher oxygen availability, possibly aiding in more effective energy utilization and environmental interactions.\n\nIn summary, the elevated oxygen leve