In [153]:
import os
import getpass
import pandas as pd

from langchain.vectorstores import Chroma
from langchain.text_splitter import TokenTextSplitter
from langchain.document_loaders import DataFrameLoader, JSONLoader, CSVLoader
from langchain.embeddings.openai import OpenAIEmbeddings

In [41]:
# Ask for the OpenAI key only when the environment does not already hold a
# non-empty one, so that "Restart & Run All" in a configured environment is not
# interrupted by the prompt and an existing key is never overwritten.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

In [139]:
# Read feed.csv and load it through DataFrameLoader, using the "project_name"
# column as the page content of each document.
df = pd.read_csv("feed.csv")
loader = DataFrameLoader(df, page_content_column="project_name")
data = loader.load()

# Split with a token-based splitter (chunk_size=1500, no overlap), then embed
# the resulting documents into a Chroma vector store with OpenAI embeddings.
text_splitter = TokenTextSplitter(chunk_overlap=0, chunk_size=1500)
documents = text_splitter.split_documents(data)
db = Chroma.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(),
)

In [178]:
# Number of rows in the loaded feed.
len(df.index)

503

In [146]:
# Load feed.json instead: the jq expression selects every element of the
# "_project" array. Note that `loader`, `data`, `documents` and `db` are
# reassigned here, replacing whatever an earlier cell stored in them.
loader = JSONLoader(
    file_path='feed.json',
    jq_schema='._project[]',
    text_content=False,
)
data = loader.load()

# Same token-based splitting and Chroma/OpenAI-embeddings indexing as for the CSV.
text_splitter = TokenTextSplitter(chunk_overlap=0, chunk_size=1500)
documents = text_splitter.split_documents(data)
db = Chroma.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(),
)

In [147]:
# LLM
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# Prompt for interest-based recommendations; {context} receives the retrieved
# documents and {question} the user's stated interests.
prompt_template = """Use the following pieces of context about projects information to give me recommendations based on my projects interests and tell me what projects are related to my interests. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

My projects interests are the following:
{question}

Answer here:"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
chain_type_kwargs = dict(prompt=PROMPT)

# RetrievalQA ("stuff" chain type) over the vector store's retriever, using
# the completion model at temperature 0 and the custom prompt above.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0),
    chain_type="stuff",
    retriever=db.as_retriever(),
    chain_type_kwargs=chain_type_kwargs,
)

query = "My interests are in microplastics, or something related with aquatic species"
qa.run(query)

' Based on your interests, I recommend the Florida Microplastic Awareness Project. This project is actively collecting coastal water samples and filtering them to look for the presence of microplastics, as well as educating stakeholders about the sources of and problems caused by microplastics. This project is focused on ecology and environment, nature and outdoors, and ocean/water and marine.'

In [174]:
# Prompt asking for the description of each named project, one per line;
# {question} carries the comma-separated project names.
prompt_template = """Use the following pieces of context to give me the project description of each project name I will provide to you, each project description should be in a new line. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

The project names of which I need their descriptions are:
{question}

Answer here:"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
chain_type_kwargs = dict(prompt=PROMPT)

# Rebuild the RetrievalQA chain so that it picks up the new prompt.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0),
    chain_type="stuff",
    retriever=db.as_retriever(),
    chain_type_kwargs=chain_type_kwargs,
)

response = qa.run("noaa nws skywarn, image detective, ecocast")

In [175]:
# Print (rather than echo) the answer so its newline-separated descriptions
# are shown on separate lines.
print(response)

 NOAA NWS SKYWARN® Weather Spotter Program is a volunteer program that helps to collect weather data and report it to the National Weather Service.
Image Detective is a project that uses artificial intelligence to detect and classify objects in images.
ECOCast is a project that uses satellite data to predict the future of the Earth's ecosystems.


In [176]:
# Generic question-answering prompt over the retrieved context.
# NOTE(review): every line after the first carries 16 leading spaces inside
# the string literal, so that indentation is sent to the model as part of the
# prompt — presumably unintended; confirm before changing the text.
prompt_template = """Use the following pieces of context about projects information to answer the question at the end. 
                If you don't know the answer, just say that you don't know, don't try to make up an answer.

                {context}

                Question: {question}

                Answer here:"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
chain_type_kwargs = dict(prompt=PROMPT)

# Rebuild the RetrievalQA chain with the generic prompt.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0),
    chain_type="stuff",
    retriever=db.as_retriever(),
    chain_type_kwargs=chain_type_kwargs,
)

# Alternative phrasing tried: "How many open science projects currently exist?"
response = qa.run("How many projects currently exist?")

In [177]:
# Echo the chain's answer to the counting question.
# NOTE(review): the recorded answer is " I don't know." — presumably because the
# chain only sees the documents returned by the retriever, so a corpus-wide
# count cannot be derived from its context; confirm.
response

" I don't know."

In [220]:
from langchain.agents import create_pandas_dataframe_agent
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_types import AgentType

In [182]:
from langchain.llms import OpenAI
import pandas as pd

# (Re)load feed.csv into `df`, the DataFrame handed to the pandas agent.
df = pd.read_csv("feed.csv")

In [221]:
# Pandas DataFrame agent over `df`, driven by a chat model at temperature 0
# through the OpenAI-functions agent type. verbose=True makes each run print
# its AgentExecutor trace.
chat_llm = ChatOpenAI(model="gpt-3.5-turbo-0613", temperature=0)
agent = create_pandas_dataframe_agent(
    chat_llm,
    df,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    verbose=True,
)

In [230]:
# Inspect the agent's `memory` attribute (no output is recorded for this cell).
agent.memory

In [223]:
# Recommendation prompt for the DataFrame agent. It has no {context} slot,
# only {question}. Written as explicit per-line literals so that the exact
# whitespace (including the trailing space on the first line) is visible.
prompt_template = (
    "Give me recommendations based on my projects interests and tell me what projects are related to my interests. \n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "\n"
    "My projects interests are the following:\n"
    "{question}\n"
    "\n"
    "Answer here:"
)

In [224]:
# Preview the fully rendered prompt before sending it to the agent.
# NOTE(review): the saved output of this cell starts with "Use the following
# pieces of context...", which does not match the current `prompt_template`
# ("Give me recommendations...") — the output looks stale; re-run to refresh.
prompt_template.format(question='My interests are in microplastics, or something related with aquatic species')

"Use the following pieces of context about projects information to give me recommendations based on my projects interests and tell me what projects are related to my interests. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nMy projects interests are the following:\nMy interests are in microplastics, or something related with aquatic species\n\nAnswer here:"

In [225]:
# Ask the DataFrame agent for recommendations matching the stated interests.
# NOTE(review): in the recorded trace the agent answers directly, with no
# `python_repl_ast` invocation, so the recommended projects may not come from
# feed.csv — verify the project names against `df`.
interests = 'My interests are in microplastics, or something related with aquatic species'
response = agent.run(prompt_template.format(question=interests))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mBased on your interests in microplastics and aquatic species, here are some project recommendations:

1. Project Name: "Plastic Tide"
   Description: This project focuses on monitoring and mapping the distribution of microplastics in coastal areas. Volunteers collect water samples and analyze them for microplastic contamination. The data collected helps researchers understand the extent of the problem and develop strategies to mitigate it.
   Project URL: [Plastic Tide](https://www.plastictide.org/)

2. Project Name: "Citizen Science Microplastics Monitoring"
   Description: This project aims to engage citizen scientists in monitoring microplastic pollution in rivers and lakes. Participants collect water samples, filter them, and analyze the filtered particles for microplastics. The data collected contributes to a larger database on microplastic pollution and helps raise awareness about the issue.
   Project URL: [Citizen Sci

In [226]:
# Echo the agent's full answer string.
response

'Based on your interests in microplastics and aquatic species, here are some project recommendations:\n\n1. Project Name: "Plastic Tide"\n   Description: This project focuses on monitoring and mapping the distribution of microplastics in coastal areas. Volunteers collect water samples and analyze them for microplastic contamination. The data collected helps researchers understand the extent of the problem and develop strategies to mitigate it.\n   Project URL: [Plastic Tide](https://www.plastictide.org/)\n\n2. Project Name: "Citizen Science Microplastics Monitoring"\n   Description: This project aims to engage citizen scientists in monitoring microplastic pollution in rivers and lakes. Participants collect water samples, filter them, and analyze the filtered particles for microplastics. The data collected contributes to a larger database on microplastic pollution and helps raise awareness about the issue.\n   Project URL: [Citizen Science Microplastics Monitoring](https://www.citizensc

In [229]:
# Follow-up question that refers back to the previous answer.
# NOTE(review): the recorded trace resolves "the second project" to
# df.loc[1, 'start_date'] (row label 1 of the DataFrame), not to the second
# project recommended earlier — the agent apparently has no access to its
# previous answer. Confirm before relying on this result.
followup_question = "What is the start date of the second project you recommended?"
agent.run(followup_question)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "df.loc[1, 'start_date']"}`


[0m[36;1m[1;3m2014-10-17[0m[32;1m[1;3mThe start date of the second project is October 17, 2014.[0m

[1m> Finished chain.[0m


'The start date of the second project is October 17, 2014.'

In [193]:
# Dataset question prompt for the DataFrame agent.
# NOTE(review): every line after the first carries 16 leading spaces inside
# the string literal, which become part of the prompt — presumably unintended.
prompt_template = """Based on the information in the dataset, answer the question at the end. 
                If you don't know the answer, just say that you don't know, don't try to make up an answer.

                Question: {question}

                Answer here:"""

# Alternative phrasing tried: "How many open science projects currently exist?"
question = "How many active projects currently exist?"
prompt = prompt_template.format(question=question)

# In the recorded trace the agent answers this by evaluating
# df[df['project_status'] == 'active'].shape[0].
response = agent.run(prompt)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "df[df['project_status'] == 'active'].shape[0]"}`


[0m[36;1m[1;3m238[0m[32;1m[1;3mThere are currently 238 active projects in the dataset.[0m

[1m> Finished chain.[0m


In [194]:
# Print the agent's final answer to the active-project count question.
print(response)

There are currently 238 active projects in the dataset.


In [None]:
# Prompt asking the DataFrame agent for the description of each named project.
# NOTE(review): every line after the first carries 16 leading spaces inside
# the string literal, which become part of the prompt — presumably unintended.
prompt_template = """Give me the project description of each project name I will provide to you, each project description should be in a new line. 
                If you don't know the answer, just say that you don't know, don't try to make up an answer.

                The project names of which I need their descriptions are:
                {project_names}

                Answer here:"""

# Comma-separated project names to look up.
project_names = "noaa nws skywarn, image detective, EcoCast"
prompt = prompt_template.format(project_names=project_names)
response = agent.run(prompt)