LLM code

In [14]:
import dotenv
from langchain.chat_models import ChatOpenAI

# Load environment variables from the .env file
dotenv.load_dotenv()

chat = ChatOpenAI(
    openai_api_key=os.getenv("OPENAI_API_KEY"),  # Retrieve API key from environment variable
    model='gpt-3.5-turbo'
)

In [15]:
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)

messages = [
    SystemMessage(content="You are a helpful assistant which helps investors find social impact projects."),
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
    HumanMessage(content="I'd like to understand what are the corporate social responsibility guidelines for companies with various carbon footprints")
]

In [16]:
res = chat(messages)
res

AIMessage(content='Corporate social responsibility (CSR) guidelines for companies with varying carbon footprints can vary depending on the industry, company size, and geographical location. However, here are some general guidelines that companies can follow to reduce their carbon footprint and engage in responsible business practices:\n\n1. Measure and report carbon emissions: Companies should measure and report their carbon emissions to understand their environmental impact and identify areas for improvement.\n\n2. Set reduction targets: Companies should set ambitious targets to reduce their carbon emissions over time. This can help drive action and demonstrate a commitment to sustainability.\n\n3. Invest in renewable energy: Companies can reduce their carbon footprint by investing in renewable energy sources such as solar or wind power.\n\n4. Implement energy efficiency measures: Companies can improve their energy efficiency by upgrading equipment, optimizing operations, and implemen

In [17]:
print(res.content)


Corporate social responsibility (CSR) guidelines for companies with varying carbon footprints can vary depending on the industry, company size, and geographical location. However, here are some general guidelines that companies can follow to reduce their carbon footprint and engage in responsible business practices:

1. Measure and report carbon emissions: Companies should measure and report their carbon emissions to understand their environmental impact and identify areas for improvement.

2. Set reduction targets: Companies should set ambitious targets to reduce their carbon emissions over time. This can help drive action and demonstrate a commitment to sustainability.

3. Invest in renewable energy: Companies can reduce their carbon footprint by investing in renewable energy sources such as solar or wind power.

4. Implement energy efficiency measures: Companies can improve their energy efficiency by upgrading equipment, optimizing operations, and implementing energy-saving technolo

Pinecone trials

In [18]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="f5fa0fda-521f-4e2b-959f-0ba03ea28de3")

Pinecone G code

In [19]:
from pinecone import Pinecone

# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = os.getenv("PINECONE_API_KEY") or "YOUR_API_KEY"

# configure client
pc = Pinecone(api_key="f5fa0fda-521f-4e2b-959f-0ba03ea28de3")


In [20]:
from pinecone import ServerlessSpec

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

In [21]:
import time

index_name = 'hackbangalore-rag-trial-01'
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of ada 002
        metric='dotproduct',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4838}},
 'total_vector_count': 4838}

In [22]:
from langchain.embeddings.openai import OpenAIEmbeddings

embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")

In [23]:
"""texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]

res = embed_model.embed_documents(texts)
len(res), len(res[0])"""

"texts = [\n    'this is the first chunk of text',\n    'then another second chunk of text is here'\n]\n\nres = embed_model.embed_documents(texts)\nlen(res), len(res[0])"

In [None]:
"""from datasets import load_dataset

dataset = load_dataset(
    "jamescalam/llama-2-arxiv-papers-chunked",
    split="train"
)

dataset"""

In [24]:
"""data = dataset.to_pandas()"""

'data = dataset.to_pandas()'

In [None]:
"""import pandas as pd
data=pd.read_csv("/Users/Zaid/Downloads/HackBangalore/Sample_Application_Forms.csv")"""

In [57]:
from tqdm.auto import tqdm  # for progress bar

#data = dataset.to_pandas()  # this makes it easier to iterate over the dataset

batch_size = 100

for i in tqdm(range(0, len(data), batch_size)):
    i_end = min(len(data), i+batch_size)
    # get batch of data
    batch = data.iloc[i:i_end]
    # generate unique ids for each chunk
    ids = [f"{x['doi']}-{x['chunk-id']}" for i, x in batch.iterrows()]
    # get text to embed
    texts = [x['chunk'] for _, x in batch.iterrows()]
    # embed text
    embeds = embed_model.embed_documents(texts)
    # get metadata to store in Pinecone
    metadata = [
        {'text': x['chunk'],
         'source': x['source'],
         'title': x['title']} for i, x in batch.iterrows()
    ]
    # add to Pinecone
    index.upsert(vectors=zip(ids, embeds, metadata))

  0%|          | 0/49 [00:00<?, ?it/s]

In [28]:
index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4838}},
 'total_vector_count': 4838}

RAG

In [29]:
from langchain.vectorstores import Pinecone

text_field = "text"  # the metadata field that contains our text

# initialize the vector store object
vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)



In [30]:
"""query = "What is so special about Llama 2?"

vectorstore.similarity_search(query, k=3)
#index.query(vector=query,top_k=3,namespace='my_namespace')
"""

'query = "What is so special about Llama 2?"\n\nvectorstore.similarity_search(query, k=3)\n#index.query(vector=query,top_k=3,namespace=\'my_namespace\')\n'

In [31]:
"""from sentence_transformers import SentenceTransformer

# Load the pre-trained sentence encoder
encoder = SentenceTransformer('path/to/encoder/model')

# Encode the text query into a vector
query = "What is so special about Llama 2?"
query_vector = encoder.encode(query)

# Perform the similarity search using the vector representation
results = vectorstore.similarity_search(query_vector, k=3)"""

'from sentence_transformers import SentenceTransformer\n\n# Load the pre-trained sentence encoder\nencoder = SentenceTransformer(\'path/to/encoder/model\')\n\n# Encode the text query into a vector\nquery = "What is so special about Llama 2?"\nquery_vector = encoder.encode(query)\n\n# Perform the similarity search using the vector representation\nresults = vectorstore.similarity_search(query_vector, k=3)'

In [32]:
"""print(results)"""

'print(results)'

In [38]:
from langchain.embeddings.openai import OpenAIEmbeddings

embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")
query = 'greenbuild ?'

query_vector = embed_model.embed_documents(query)


In [45]:
query_vector

[[-0.015609775419365415,
  -0.014262462656874137,
  -0.002481981521689725,
  -0.02985862772754051,
  -0.03187279169692791,
  0.005661435206541875,
  -0.006250034358999427,
  -0.02342146737207503,
  -0.009553672231181563,
  -0.0300763765429181,
  0.031029020858106094,
  0.02645632367485517,
  0.023054017759524596,
  0.005004790364186114,
  -0.01228232083394388,
  0.014480209609606495,
  0.03200888400804694,
  0.01104388154781878,
  0.0054334809579465425,
  -0.025748643284507156,
  -0.002442855028809134,
  -0.009533258570778231,
  -0.010778502099930237,
  -0.00986668417422629,
  -0.0062466317718246666,
  0.016249407791508682,
  0.0202641291068749,
  -0.019433964891461666,
  0.01279266700064025,
  0.0002740981991740545,
  0.01951562139572022,
  -0.0014238645710804082,
  -0.020808496488705797,
  -0.010758088439526905,
  -0.0181546982845299,
  -0.010513122652041693,
  -0.011567837131891577,
  -0.031627827772087926,
  0.0011780481368015064,
  0.003616650048057517,
  0.003221982764907501,
  0

In [39]:

# Perform the similarity search using the vector representation
"""
results = vectorstore.similarity_search(query_vector, topk=3)"""

'\nresults = vectorstore.similarity_search(query_vector, topk=3)'

In [43]:
index.query(vector=query_vector,top_k=3,namespace='hackbangalore-rag-trial-01')

{'matches': [],
 'namespace': 'hackbangalore-rag-trial-01',
 'usage': {'read_units': 1}}