# Imports

In [94]:
import json
import os
import re
import sys

import openai
import pandas as pd
from dotenv import load_dotenv
from pydantic import BaseModel, Field, validator

from langchain import PromptTemplate,FewShotPromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

from langchain.output_parsers import PydanticOutputParser
from langchain.output_parsers import OutputFixingParser
from langchain.schema import OutputParserException
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
#from langchain.chains import VectorDBQA
from langchain.chains import RetrievalQA

from langchain.docstore.document import Document

from langchain.chains import LLMChain, LLMMathChain, TransformChain, SequentialChain
from langchain.callbacks import get_openai_callback

# Load Data

In [3]:
# Load environment variables from a local .env file (the recorded output
# `True` below is this call's return value).
load_dotenv()

True

In [4]:
# Load the news dataset. Later cells read news["BTC"]["articles"], whose
# entries expose "id" and "content" keys.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('../data/news.json', encoding='utf-8') as json_file:
    news = json.load(json_file)

# Initialize LLM

In [None]:
# Scratch cell: candidate generation settings. Nothing in the visible cells
# passes these names to a model constructor.
# NOTE(review): because of the trailing commas, the first three assignments
# bind 1-tuples (e.g. `max_tokens` becomes `(256,)`), not plain numbers; only
# `presence_penalty` is a float. Presumably these were copied from a keyword
# argument list - confirm before using them.
max_tokens=256,
top_p=1.0,
frequency_penalty=0.0,
presence_penalty=0.0

In [5]:
# LLM handle used by the cells below, built with the completion-style
# `OpenAI` wrapper and a temperature of 0.
llm = OpenAI(
    temperature=0.0,
    model_name="gpt-3.5-turbo",
)



In [None]:
# API key taken from the process environment (None when the variable is unset).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Chat-model variant of the LLM handle; running this cell rebinds `llm`.
llm = ChatOpenAI(
    model='gpt-3.5-turbo',
    temperature=0.0,
    openai_api_key=OPENAI_API_KEY,
)

In [None]:
# NOTE(review): `AutoTokenizer` is not imported in any visible cell
# (presumably it is `transformers.AutoTokenizer` - confirm), so this cell
# raises NameError as written. `tokenizer` is not referenced by any other
# visible cell.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Sentiment Analysis

- need to add document splitting

### 0 Shot Prompt Template

In [6]:
# Output schema for the sentiment prompts: a single string field `sentiment`.
# Instances are produced by `PydanticOutputParser(pydantic_object=Article)`.
# NOTE: the triple-quoted string inside the class body is not a docstring (it
# is not the first statement of the class); it is an inert validator template
# and refers to a field named 'setup' that this model does not define.
class Article(BaseModel):
    sentiment: str = Field(description="estimated sentiment")
    
    """
    # You can add custom validation logic easily with Pydantic.
    @validator('setup')
    def question_ends_with_question_mark(cls, field):
        if field[-1] != '?':
            raise ValueError("Badly formed question!")
        return field
    """

In [None]:
# Zero-shot sentiment extraction: split each article into token-bounded chunks,
# ask the LLM for the sentiment of every chunk, and parse the answer into an
# `Article` object.
query = "Extract the sentiment of the provided context. The sentiment can either be 'Positive', 'Negative' or 'Neutral'."

# Parses the raw completion into `Article` and supplies the format
# instructions that are baked into the prompt.
parser = PydanticOutputParser(pydantic_object=Article)

prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\nQuery: {query}\nContext: {context}\nAnswer:",
    input_variables=["query", "context"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=2000, chunk_overlap=50)

# Fallback parser that asks the LLM to repair malformed output. It is built
# once here rather than on every parse failure inside the loop.
fixing_parser = OutputFixingParser.from_llm(parser=parser, llm=llm)

# Maps article id -> list of parsed `Article` results, one per chunk.
chunk_sentiments = {}

for article in news["BTC"]["articles"]:

    article_sentiments = chunk_sentiments[article["id"]] = []

    for chunk in text_splitter.split_text(article["content"]):

        _input = prompt.format_prompt(query=query, context=chunk)
        output = llm(_input.to_string())

        try:
            parsed_output = parser.parse(output)
        except OutputParserException:
            # The completion did not match the schema; let the LLM fix it.
            print("Fixing output")
            parsed_output = fixing_parser.parse(output)

        article_sentiments.append(parsed_output)

    # Only the first article is processed while experimenting.
    break

In [None]:
# Notebook display of the article id -> list of parsed chunk sentiments mapping.
chunk_sentiments

### Few Shot Prompt Template

- add LengthBased selector or SemanticSimilarity selector

In [None]:
# Few-shot sentiment extraction: same chunk-by-chunk loop as the zero-shot
# cell, but the prompt is preceded by three hand-labelled example articles.
query = "Extract the sentiment of the provided context. The sentiment can either be 'Positive', 'Negative' or 'Neutral'."

parser = PydanticOutputParser(pydantic_object=Article)

# context is too long
cutoff = 2000

# (index into news["BTC"]["articles"], hand-assigned label) for each example.
labelled_examples = [(14, "Positive"), (4, "Negative"), (5, "Neutral")]

examples = [
    {
        "query": "Extract the sentiment of the provided context.",
        # Example articles are truncated to `cutoff` characters.
        "context": news["BTC"]["articles"][article_idx]["content"][:cutoff],
        "answer": label,
    }
    for article_idx, label in labelled_examples
]

example_template = """
Query: {query}
Context: {context}
Answer: {answer}
"""

example_prompt = PromptTemplate(
    input_variables=["query", "context", "answer"],
    template=example_template
)

prefix = """The following examples show extracted sentiment from
cryptocurrency news articles (context). The sentiment can either be 'positive',
'negative' or 'neutral'. Here are some examples: 
"""

suffix = """
Now answer the following user query. {format_instructions}\n
Query: {query}
Context: {context}
Answer:"""

prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["query", "context"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    example_separator="\n"
)

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=2000, chunk_overlap=50)

# Fallback parser that asks the LLM to repair malformed output. It is built
# once here rather than on every parse failure inside the loop.
fixing_parser = OutputFixingParser.from_llm(parser=parser, llm=llm)

# Maps article id -> list of parsed `Article` results, one per chunk.
chunk_sentiments = {}

for article in news["BTC"]["articles"]:

    article_sentiments = chunk_sentiments[article["id"]] = []

    for chunk in text_splitter.split_text(article["content"]):

        _input = prompt.format_prompt(query=query, context=chunk)
        output = llm(_input.to_string())

        try:
            parsed_output = parser.parse(output)
        except OutputParserException:
            # Only parser failures are repaired; any other error (API,
            # network, ...) now propagates instead of being retried blindly.
            print("Fixing output")
            parsed_output = fixing_parser.parse(output)

        article_sentiments.append(parsed_output)

    # Only the first article is processed while experimenting.
    break

In [None]:
# Notebook display of the article id -> list of parsed chunk sentiments mapping.
chunk_sentiments

### Few Shot Prompt Template with Semantic Similarity example selector - NOT for sentiment analysis (examples don't bring diversity)

- with gpt-3.5-turbo: the model's context window is too small to fit 3 full examples --> need to chunk or truncate them

In [49]:
# Load the labelled examples; each entry is read below through its "query",
# "context" and "answer" keys. The encoding is explicit so decoding does not
# depend on the platform's default locale encoding.
with open("../data/examples.json", "r", encoding="utf-8") as f:
    examples = json.load(f)

# One Document per example, rendered as a Query / Context / Answer text block.
examples_doc = [
    Document(page_content=f"""Query: {x['query']}\nContext: {x['context']}\n\nAnswer: {x['answer']}""")
    for x in examples
]

In [57]:
# Few-shot prompt whose examples are picked per request by semantic
# similarity (k=2) instead of being hard-coded.
query = "Extract the sentiment of the provided context. The sentiment can either be 'Positive', 'Negative' or 'Neutral'."

# Supplies the format instructions injected into the prompt suffix.
parser = PydanticOutputParser(pydantic_object=Article)

prefix = """The following examples show extracted sentiment from
cryptocurrency news articles (context). The sentiment can either be 'positive',
'negative' or 'neutral'. Here are some examples: 
"""

suffix = """
Now answer the following user query. {format_instructions}\n
Query: {query}
Context: {context}
Answer:"""

# Layout used to render each selected example.
example_template = """
Query: {query}
Context: {context}
Answer: {answer}
"""
example_prompt = PromptTemplate(template=example_template, input_variables=["query", "context", "answer"])

embeddings = OpenAIEmbeddings()

# NOTE(review): this store is not passed to the selector below, which is
# given the `Chroma` class and the raw `examples` instead - presumably the
# examples are therefore embedded twice; confirm whether `vectordb` is needed.
vectordb = Chroma.from_documents(examples_doc, embeddings)

example_selector = SemanticSimilarityExampleSelector.from_examples(
    examples,
    embeddings,
    Chroma,
    k=2,  # number of examples to produce
)

similar_prompt = FewShotPromptTemplate(
    prefix=prefix,
    example_selector=example_selector,
    example_prompt=example_prompt,
    example_separator="\n",
    suffix=suffix,
    input_variables=["query", "context"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

Running Chroma using direct local API.
Using DuckDB in-memory for database. Data will be transient.
Exiting: Cleaning up .chroma directory
Running Chroma using direct local API.
Using DuckDB in-memory for database. Data will be transient.
Exiting: Cleaning up .chroma directory


In [58]:
# Content of the first BTC article, used as the prompt context in the next cells.
context = news["BTC"]["articles"][0]["content"]

In [61]:
# Format the few-shot prompt for the article.
# NOTE(review): `context[:-2000]` keeps everything EXCEPT the last 2000
# characters. If the intent was to keep only the first 2000 characters (as
# `cutoff = 2000` does in the few-shot cell), this should be `context[:2000]`
# - confirm.
_input = similar_prompt.format_prompt(query=query, context=context[:-2000])

In [63]:
# Print the fully rendered prompt (prefix, selected examples, suffix) for inspection.
print(_input.to_string())

The following examples show extracted sentiment from
cryptocurrency news articles (context). The sentiment can either be 'positive',
'negative' or 'neutral'. Here are some examples: 


Query: What is the outlook for Bitcoin's market price?
Context: Key Insights:

The FTX collapse caused a record-breaking volume of BTC to move from centralized exchanges into self-custodial wallets, leading to a decrease in BTC total exchange balance to levels unseen since April 2018.
Bitcoin experienced an abnormally low correlation to equities this quarter while realized volatility continued to trend downwards.
On-chain activity ticked upwards this quarter with active addresses growing 2% QoQ and the number of transactions recorded reaching a new high for 2022, likely attributed to the FTX crisis.
Numerous public miners filed for bankruptcy or underwent significant restructurings during the quarter as their profit margins continued to be squeezed by the combination of rising hashrate, elevated power pr

In [72]:
# Send the few-shot prompt with the untruncated article to the LLM.
# NOTE: the recorded run of this cell failed with a context-length error
# (13491 tokens against a 4097-token limit), so the context has to be chunked
# or truncated before this call can succeed.
print(llm(similar_prompt.format(query=query, context=context)))

InvalidRequestError: This model's maximum context length is 4097 tokens. However, your messages resulted in 13491 tokens. Please reduce the length of the messages.

# Chaining with context cleaning (and text splitter)

- clean all context articles

In [None]:
def count_tokens(chain, query):
    """Run ``chain`` on ``query`` and report the tokens consumed.

    The call is wrapped in ``get_openai_callback`` so that the total token
    count of the run can be printed once it has finished.

    Args:
        chain: object exposing a ``run`` method.
        query: argument forwarded unchanged to ``chain.run``.

    Returns:
        Whatever ``chain.run(query)`` returns.
    """
    with get_openai_callback() as usage:
        answer = chain.run(query)

    print(f'Spent a total of {usage.total_tokens} tokens')
    return answer

def transform_func(inputs: dict) -> dict:
    """Normalise the whitespace of ``inputs["text"]``.

    Runs of two or more line breaks (CR LF, CR or LF) collapse into a single
    newline, and runs of spaces/tabs collapse into a single space.

    Args:
        inputs: mapping holding the raw text under the ``"text"`` key.

    Returns:
        A dict with the cleaned text under the ``"output_text"`` key.
    """
    # Line breaks first, then horizontal whitespace.
    collapsed_breaks = re.sub(r'(\r\n|\r|\n){2,}', r'\n', inputs["text"])
    cleaned = re.sub(r'[ \t]+', ' ', collapsed_breaks)
    return {"output_text": cleaned}

In [None]:
# Two-step pipeline: (1) clean whitespace in the chunk, (2) ask the LLM for its
# sentiment. Token usage is printed for every chunk via `count_tokens`.
clean_extra_spaces_chain = TransformChain(input_variables=["text"], output_variables=["output_text"], transform=transform_func)

query = "Extract the sentiment of the provided context. The sentiment can either be 'Positive', 'Negative' or 'Neutral'."

# NOTE: `parser` is the PydanticOutputParser created in an earlier cell.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\nContext: {output_text}\nQuery: {query}\nAnswer:",
    input_variables=["query", "output_text"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

sentiment_extractor_chain = LLMChain(llm=llm, prompt=prompt, output_key='sentiment')

# The cleaned text ("output_text") produced by the first chain feeds the
# prompt of the second one.
sequential_chain = SequentialChain(chains=[clean_extra_spaces_chain, sentiment_extractor_chain],
                                   input_variables=['query', 'text'], output_variables=['sentiment'])


text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=2000, chunk_overlap=50)

# Fallback parser that asks the LLM to repair malformed output. It is built
# once here rather than on every parse failure inside the loop.
fixing_parser = OutputFixingParser.from_llm(parser=parser, llm=llm)

# Maps article id -> list of parsed `Article` results, one per chunk.
chunk_sentiments = {}

for article in news["BTC"]["articles"]:

    article_sentiments = chunk_sentiments[article["id"]] = []

    for chunk in text_splitter.split_text(article["content"]):

        output = count_tokens(sequential_chain, {'text': chunk, 'query': query})

        try:
            parsed_output = parser.parse(output)
        except OutputParserException:
            # Only parser failures are repaired; any other error now
            # propagates instead of being retried blindly.
            print("Fixing output")
            parsed_output = fixing_parser.parse(output)

        article_sentiments.append(parsed_output)

    # Only the first article is processed while experimenting.
    break

In [None]:
# Notebook display of the article id -> list of parsed chunk sentiments mapping.
chunk_sentiments

# Content Extraction

# Semantic Search QA

- try loading custom embeddings for QA

In [82]:
# Wrap the content of every BTC article in a Document.
docs = [
    Document(page_content=entry["content"])
    for entry in news["BTC"]["articles"]
]

# Split the documents with chunk_size=1000 and no overlap between pieces.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
texts = text_splitter.split_documents(docs)

In [96]:
# Embed the article chunks and index them in a Chroma vector store.
embeddings = OpenAIEmbeddings()
vectordb = Chroma.from_documents(texts, embeddings)

# Question-answering chain over the indexed chunks.
# `VectorDBQA` is not imported (its import line is commented out), so the
# imported `RetrievalQA` is used instead; it takes a retriever rather than
# the vector store itself.
# try with map_reduce
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vectordb.as_retriever())

Running Chroma using direct local API.
Using DuckDB in-memory for database. Data will be transient.
Exiting: Cleaning up .chroma directory


In [88]:
# Ask a free-form question against the indexed article chunks.
# Note that this rebinds `query`, which earlier cells use for the sentiment
# extraction instruction.
query = "What is the sentiment towards Bitcoin?"
qa.run(query)

'The sentiment towards Bitcoin is mixed, with challenges and concerns such as environmental impact and mining difficulties, but also positive developments and potential for long-term investor confidence. Overall, it is seen as a distributed consensus-based, censorship-resistant, permissionless, peer-to-peer payment settlement network with stable monetary policy and store-of-value properties.'

In [97]:
# Verbatim copy of the answer recorded from the QA run; not referenced by any
# other visible cell (presumably kept for later comparison - confirm).
match = 'The sentiment towards Bitcoin is mixed, with challenges and concerns such as environmental impact and mining difficulties, but also positive developments and potential for long-term investor confidence. Overall, it is seen as a distributed consensus-based, censorship-resistant, permissionless, peer-to-peer payment settlement network with stable monetary policy and store-of-value properties.'

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

from langchain.document_loaders import TextLoader
# Reference example on a different corpus: load a text file, split it into
# 1000-character chunks without overlap, and index the chunks in Chroma.
# NOTE: running this cell rebinds `text_splitter`, `texts` and `embeddings`,
# which earlier cells also define.
loader = TextLoader('../../state_of_the_union.txt')
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()
docsearch = Chroma.from_documents(texts, embeddings)

Running Chroma using direct local API.
Using DuckDB in-memory for database. Data will be transient.

# Retrieval QA over the `docsearch` index, using a default-configured OpenAI
# LLM rather than the notebook's `llm`. Rebinds `qa` and `query`.
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=docsearch.as_retriever())

query = "What did the president say about Ketanji Brown Jackson"
qa.run(query)
