# RAG Student Ambassador Training

## Step 0) Handle Imports

In [None]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain.embeddings import CohereEmbeddings
from langchain_pinecone import PineconeVectorStore

## Step 1) Create Pinecone Index

In [18]:
# Index this notebook reads from and writes to
index_name = "cohere-langchain-test"

# Pinecone client; the API key is looked up in the environment
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))

# Collect the names reported by list_indexes() and check whether ours is among them
existing_indexes = list(map(lambda idx: idx["name"], pc.list_indexes()))
index_exists = index_name in existing_indexes

In [19]:
# Provision the index only when the lookup above did not find it
if not index_exists:
    pc.create_index(
        name=index_name,
        metric="cosine",
        # NOTE(review): 384 presumably matches the vector size produced by the
        # embedding model configured in Step 2 -- confirm against the Cohere docs
        dimension=384,
        spec=ServerlessSpec(cloud='aws', region='us-west-2'),
    )

## Step 2) Initialize `CohereEmbeddings` and Vector Store

In [21]:
# Cohere embedding model; the API key is read from the environment
embeddings_model = CohereEmbeddings(
    cohere_api_key=os.environ.get("COHERE_API_KEY"),
    model="embed-english-light-v3.0",
)

# Vector store bound to the index named in Step 1, using the model above for embeddings
vectorstore_a = PineconeVectorStore(index_name=index_name, embedding=embeddings_model)

## Step 3) Declare RAG Documents

In [22]:
# Sentences about USC, grouped by topic. Each sentence is meant to be stored as
# its own document, so every list element must end with a comma: two adjacent
# string literals with no comma between them are silently joined into a single
# string (and therefore a single document).
usc_texts_by_topic = {
    "USC is good": [
        "USC is very good for its central location and access to cultural centers like Korea Town",
        "USC is very warm during the summer",
    ],
    "USC is bad": [
        "USC is known for being bad because it has many hard linguistics and CS classes",
    ],
    "USC Facts": [
        "USC has the best College of Letter's of Arts and Sciences called Dornsife Viterbi",
    ],
    "USC Ambassador Sentences": [
        "Although USC is known as the University of Spoiled Children, students 2/3 of students receive some form of financial aid",
        "Although USC is known as the University of Spoiled Children, 1/5 of all students are first generation",
    ],
}

# One add_texts call per topic, in the order the topics are declared above.
# NOTE(review): re-running this cell presumably inserts the same sentences
# again (no explicit ids are passed) -- confirm before re-executing.
for topic_texts in usc_texts_by_topic.values():
    vectorstore_a.add_texts(topic_texts)

ServiceException: (500)
Reason: Internal Server Error
HTTP response headers: HTTPHeaderDict({'Date': 'Mon, 04 Mar 2024 16:13:11 GMT', 'Content-Type': 'application/json', 'Content-Length': '202', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '30026', 'x-pinecone-request-id': '7330749176162217140', 'x-envoy-upstream-service-time': '30026', 'server': 'envoy'})
HTTP response body: {"code":2,"message":"pinecone-commons/src/request_log/store/postgres.rs:211 -\u003e An error occurred while trying to contact Postgres: pool timed out while waiting for an open connection","details":[]}


## Step 4) Construct LangChain Components
1) Cohere `Command` model for Question and Answering
2) Pinecone Vector Store Retriever using Semantic Search
3) Templated prompt with slots for `question` and `context`

In [3]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_community.llms import Cohere

# Prompt text with two placeholders: {question} and {context}
prompt_str = """
Question: {question}

Please answer the question above using only the context provided.
Context: {context}

Answer:
"""

# Retriever view over the vector store built in Step 2
retriever = vectorstore_a.as_retriever()

# Cohere "command" model used to generate the answer
llm = Cohere(model="command")

### Step 4a) Use LangChain Expression Language (LCEL) to connect components

In [None]:
def format_docs(docs):
    """Join the text of the retrieved documents into one plain-text block.

    Without this step the retriever's raw output (a list of Document objects)
    would be interpolated into the prompt as a Python repr instead of as text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


prompt = PromptTemplate.from_template(prompt_str)

# Fan the incoming question out to both prompt slots: passed through unchanged
# as "question", and used to retrieve and format documents as "context".
retrieve_question_and_context = RunnableParallel(
    {
        "question": RunnablePassthrough(),
        "context": retriever | format_docs,
    }
)

# question -> {question, context} -> filled prompt -> LLM -> plain string
chain = retrieve_question_and_context | prompt | llm | StrOutputParser()

## Step 5) Call the LCEL Chain with different questions

In [7]:
# Run the chain on a single question and print the text it returns
question = "Why is USC good?"
out = chain.invoke(question)
print(out)

NameError: name 'chain' is not defined

In [4]:
# Run the chain on a single question and print the text it returns
question = "Why is USC bad?"
out = chain.invoke(question)
print(out)

 Some reasons why some people think USC is bad are because of the difficulty of certain linguistics and CS classes and because of a confusing and complex accounting system. 

Overall, USC is a wonderful school, as it is described as "the best school in the world" and is "known for being awesome because it is very warm during the summer". 


In [6]:
# Run the chain on a single question and print the text it returns
question = "Tell me about USC's Dance Program?"
out = chain.invoke(question)
print(out)

 USC's dance program is the best in the world, according to the speaker. You can study dance at Kauffman, which is better than any other dance school (like Juliard, for example). 

I'm unsure which specific dances are taught at Kauffman, or what the program entails, as this is not mentioned in the context provided. 

I also found a mention of USC's Dornsife Viterbi, which is apparently the best college for Letters of Arts and Sciences. 


## Step 6) Cleanup the RAG Documents as needed (Optional)

In [None]:
# Cleanup: uncomment the line below to delete the index created in Step 1
# pc.delete_index(name=index_name)