In [10]:
import numpy as np

import chromadb
from chromadb.utils import embedding_functions

from chromadb.config import Settings

# Connect to the Chroma server on localhost:8083 (resets allowed, telemetry off).
chroma_client = chromadb.HttpClient(
    host='localhost',
    port=8083,
    settings=Settings(allow_reset=True, anonymized_telemetry=False),
)

# Embedding function backed by the all-MiniLM-L6-v2 sentence-transformer model.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
)

# Open the transcript collection, creating it when it does not exist yet.
collection = chroma_client.get_or_create_collection(
    name="transcripts_mililm_l6_v2",
    embedding_function=sentence_transformer_ef,
)


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [11]:
# Semantic search over the transcript collection, restricted by metadata to
# entries whose `symbol` is "MSFT". `n_results=10` is the number requested; the
# output recorded below contains 8 ids.
results = collection.query(
    query_texts=["The microsoft transcript"],
    n_results=10,
    where={"symbol": "MSFT"}
)
# Bare expression so the notebook displays the full result dict.
results

{'ids': [['2022Q1MSFT',
   '2023Q2MSFT',
   '2022Q3MSFT',
   '2023Q3MSFT',
   '2023Q4MSFT',
   '2022Q2MSFT',
   '2023Q1MSFT',
   '2022Q4MSFT']],
 'distances': [[1.3223153352737427,
   1.3265554904937744,
   1.3389511108398438,
   1.3522377014160156,
   1.35268235206604,
   1.3562284708023071,
   1.3590455055236816,
   1.3921499252319336]],
 'embeddings': None,
 'metadatas': [[{'quarter': 1, 'symbol': 'MSFT', 'year': 2022},
   {'quarter': 2, 'symbol': 'MSFT', 'year': 2023},
   {'quarter': 3, 'symbol': 'MSFT', 'year': 2022},
   {'quarter': 3, 'symbol': 'MSFT', 'year': 2023},
   {'quarter': 4, 'symbol': 'MSFT', 'year': 2023},
   {'quarter': 2, 'symbol': 'MSFT', 'year': 2022},
   {'quarter': 1, 'symbol': 'MSFT', 'year': 2023},
   {'quarter': 4, 'symbol': 'MSFT', 'year': 2022}]],
 'documents': [["Operator: Greetings and welcome to the Microsoft Fiscal Year 2022, First-Quarter earnings conference call.  As a reminder, this conference is being recorded. It is now my pleasure to introduce your

### Summarization

#### 1. ```stuffing```

Stuffing is the simplest way to pass data to a language model: it 'stuffs' the text into a single prompt so that all of the relevant information can be processed by the model at once.

In [12]:
import pandas as pd
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import GPT4All
from langchain.prompts import PromptTemplate

In [13]:
# Prompt for the "stuff" summarization chain. The text to summarize is
# substituted for the single `{text}` placeholder, wrapped in triple backquotes.
prompt_template = """Write a concise summary of the following text delimited by triple backquotes.
              Return your response in bullet points which covers the key points of the text.
              ```{text}```
              BULLET POINT SUMMARY:
  """

# `input_variables` lists the placeholders that must be supplied when the
# template is formatted (here only `text`).
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

In [14]:
# Location of the local GPT4All model weights; replace with your own file path.
local_path = "../llm_models/gpt4all-falcon-q4_0.gguf"

In [15]:
# Callbacks support token-wise streaming; the handler used here is the stdout
# streaming handler imported from langchain.callbacks.streaming_stdout.
callbacks = [StreamingStdOutCallbackHandler()]

# Build the GPT4All LLM wrapper from the model file at `local_path`.
# Verbose is required to pass to the callback manager
llm = GPT4All(model=local_path, callbacks=callbacks, verbose=True)

In [16]:
# Summarization chain of type "stuff" that uses the custom bullet-point `prompt`.
stuff_chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)

In [17]:
results

{'ids': [['2022Q1MSFT',
   '2023Q2MSFT',
   '2022Q3MSFT',
   '2023Q3MSFT',
   '2023Q4MSFT',
   '2022Q2MSFT',
   '2023Q1MSFT',
   '2022Q4MSFT']],
 'distances': [[1.3223153352737427,
   1.3265554904937744,
   1.3389511108398438,
   1.3522377014160156,
   1.35268235206604,
   1.3562284708023071,
   1.3590455055236816,
   1.3921499252319336]],
 'embeddings': None,
 'metadatas': [[{'quarter': 1, 'symbol': 'MSFT', 'year': 2022},
   {'quarter': 2, 'symbol': 'MSFT', 'year': 2023},
   {'quarter': 3, 'symbol': 'MSFT', 'year': 2022},
   {'quarter': 3, 'symbol': 'MSFT', 'year': 2023},
   {'quarter': 4, 'symbol': 'MSFT', 'year': 2023},
   {'quarter': 2, 'symbol': 'MSFT', 'year': 2022},
   {'quarter': 1, 'symbol': 'MSFT', 'year': 2023},
   {'quarter': 4, 'symbol': 'MSFT', 'year': 2022}]],
 'documents': [["Operator: Greetings and welcome to the Microsoft Fiscal Year 2022, First-Quarter earnings conference call.  As a reminder, this conference is being recorded. It is now my pleasure to introduce your

In [18]:
# NOTE(review): despite the singular name, this slice keeps the LAST THREE
# documents of the first query's hit list, so `first_transcript` is a list of
# transcripts (2022 Q2, 2023 Q1 and 2022 Q4, going by the metadata order shown
# in the query output).
first_transcript = results['documents'][0][-3:]
first_transcript

['Operator: Greetings, and welcome to the Microsoft Fiscal Year 2022 Second Quarter Earnings Conference Call. At this time, all participants are in a listen-only mode. A question-and-answer session will follow the formal presentation. [Operator Instructions] As a reminder, this conference is being recorded. It is now my pleasure to introduce your host, Brett Iversen, General Manager, Investor Relations. Thank you. You may begin.\nBrett Iversen: Good afternoon, and thank you for joining us today. On the call with me are Satya Nadella, Chairman and Chief Executive Officer; Amy Hood, Chief Financial Officer; Alice Jolla, Chief Accounting Officer; and Keith Dolliver, Deputy General Counsel. On the Microsoft Investor Relations website, you can find our earnings press release and financial summary slide deck, which is intended to supplement our prepared remarks during today\'s call and provides the reconciliation of differences between GAAP and non-GAAP financial measures. Unless otherwise s

In [19]:
# Metadata dicts for the same three hits, taken with the same slice so they stay
# index-aligned with `first_transcript`.
first_transcript_meta = results['metadatas'][0][-3:]

In [20]:
first_transcript

['Operator: Greetings, and welcome to the Microsoft Fiscal Year 2022 Second Quarter Earnings Conference Call. At this time, all participants are in a listen-only mode. A question-and-answer session will follow the formal presentation. [Operator Instructions] As a reminder, this conference is being recorded. It is now my pleasure to introduce your host, Brett Iversen, General Manager, Investor Relations. Thank you. You may begin.\nBrett Iversen: Good afternoon, and thank you for joining us today. On the call with me are Satya Nadella, Chairman and Chief Executive Officer; Amy Hood, Chief Financial Officer; Alice Jolla, Chief Accounting Officer; and Keith Dolliver, Deputy General Counsel. On the Microsoft Investor Relations website, you can find our earnings press release and financial summary slide deck, which is intended to supplement our prepared remarks during today\'s call and provides the reconciliation of differences between GAAP and non-GAAP financial measures. Unless otherwise s

In [21]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.schema.document import Document

# Splitter used for the summarization experiments below.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in characters
# (the splitter's default length function) — confirm. The model's context window
# is reported in tokens further down, so the two limits are not directly comparable.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)

In [22]:
# Split the three selected transcripts into chunks. Each chunk becomes a
# Document that carries the metadata (quarter / symbol / year) of the transcript
# it was cut from. `create_documents` already performs the splitting, so a
# further `split_documents` pass over its output is not needed.
texts = text_splitter.create_documents(first_transcript, metadatas=first_transcript_meta)


In [23]:
type(texts)

list

In [34]:
# Spot-check the text of a single chunk (index 20 looks like an arbitrary pick).
texts[20].page_content

'strategic investments we are making to capture the tremendous opportunities ahead of us. In closing, digital technologies are increasingly essential to empowering every person and organization on the planet to achieve more and we are well positioned with innovative, high value products. Our diverse, yet connected portfolio of solutions spans end markets, customer sizes and business models, uniquely enabling us to deliver long-term revenue and profit growth.  With that, Brett, let’s go to Q&A'

In [25]:
texts

[Document(page_content='Operator: Greetings, and welcome to the Microsoft Fiscal Year 2022 Second Quarter Earnings Conference Call. At this time, all participants are in a listen-only mode. A question-and-answer session will follow the formal presentation. [Operator Instructions] As a reminder, this conference is being recorded. It is now my pleasure to introduce your host, Brett Iversen, General Manager, Investor Relations. Thank you. You may begin.', metadata={'quarter': 2, 'symbol': 'MSFT', 'year': 2022}),
 Document(page_content="Brett Iversen: Good afternoon, and thank you for joining us today. On the call with me are Satya Nadella, Chairman and Chief Executive Officer; Amy Hood, Chief Financial Officer; Alice Jolla, Chief Accounting Officer; and Keith Dolliver, Deputy General Counsel. On the Microsoft Investor Relations website, you can find our earnings press release and financial summary slide deck, which is intended to supplement our prepared remarks during today's call and pro

In [26]:
# Try to summarize all chunks in one "stuff" prompt. Any exception is caught so
# the notebook keeps running and the failure is printed instead of a traceback.
# NOTE(review): the message below attributes every exception to the context
# size, whatever its real cause. In the recorded output the prefix of this
# message does not appear, so the except branch apparently did not run there.
try:
    print(stuff_chain.run(texts))
except Exception as e:
    print(
        "The code failed since it won't be able to run inference on such a huge context and throws this exception: ",
        e,
    )

ERROR: The prompt size exceeds the context window size and cannot be processed.ERROR: The prompt size exceeds the context window size and cannot be processed.


LLaMA ERROR: The prompt is 36015 tokens and the context window is 2048!


#### 2. ```map_reduce```

Let’s unpack the map reduce approach. For this, we’ll first map each document to an individual summary using an ```LLMChain```. Then we’ll use a ```ReduceDocumentsChain``` to combine those summaries into a single global summary.

In [56]:
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.chains.llm import LLMChain

# Map
# NOTE(review): this first variant declares four prompt variables (symbol,
# quarter, year, page_content). `map_template`, `map_prompt` and `map_chain` are
# all reassigned by the next cell, which keeps only `page_content`.
map_template = """The following is a part of a transcript for company {symbol}. It will contain the financial performance for quarter {quarter} in {year} and the outlook for next quarter.
The documents will be seperated per quarter.
{page_content}
Based on this set of docs, please identify:
- the main themes
- biggest challenges
- biggest success
Helpful Answer:"""

map_prompt = PromptTemplate(input_variables=['symbol', 'quarter', 'year', 'page_content'], template=map_template)

#map_prompt = PromptTemplate.from_template(map_template)
# LLMChain pairing the model with the map prompt.
map_chain = LLMChain(llm=llm, prompt=map_prompt)


In [65]:
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.chains.llm import LLMChain

# Map step: this prompt is filled once per chunk. The chunk text is the only
# placeholder (`page_content`).
map_template = """The following is a part of a transcript for a company. It will contain the financial performance for a specific quarter in a specific year and the outlook for next quarter.
The documents will be seperated per quarter.
{page_content}
Based on this set of docs, please identify:
- the main themes
- biggest challenges
- biggest success
Helpful Answer:"""

# `from_template` derives the input variables (here just `page_content`) from
# the placeholders found in the template string.
map_prompt = PromptTemplate.from_template(map_template)

# LLMChain pairing the model with the map prompt.
map_chain = LLMChain(prompt=map_prompt, llm=llm)

In [66]:
from langchain import hub

#map_prompt = hub.pull("rlm/map-prompt")
# Rebuilds `map_chain` from the current `map_prompt`; the hub prompt above is
# left disabled, so this repeats the construction done in the previous cell.
map_chain = LLMChain(llm=llm, prompt=map_prompt)

In [67]:
# Reduce
# NOTE(review): this first variant declares four prompt variables (symbol,
# quarter, year, page_content). `reduce_template` and `reduce_prompt` are both
# reassigned by the next cell, which keeps only `page_content`.
reduce_template = """The following is set of summaries for company {symbol} for quarter {quarter} in {year}.
{page_content}
Take these and distill it into a evaluation of the main themes, challenges and successes throughout the different quarters.
Helpful Answer:"""

reduce_prompt = PromptTemplate(input_variables=['symbol', 'quarter', 'year', 'page_content'], template=reduce_template)


#reduce_prompt = PromptTemplate.from_template(reduce_template)

In [68]:
# Reduce
# Prompt for the reduce step: the summaries to combine are substituted for the
# single `{page_content}` placeholder.
reduce_template = """The following is set of summaries for a company for a specific quarter and year.
{page_content}
Take these and distill it into a evaluation of the main themes, challenges and successes throughout the different quarters.
Helpful Answer:"""

# Only `page_content` has to be supplied when this prompt is formatted.
reduce_prompt = PromptTemplate(input_variables=['page_content'], template=reduce_template)


#reduce_prompt = PromptTemplate.from_template(reduce_template)

In [69]:
# Note we can also get this from the prompt hub, as noted above
#reduce_prompt = hub.pull("rlm/map-prompt")

In [70]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain


# Run chain
# LLMChain pairing the model with `reduce_prompt`.
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
# (the combined string fills the `page_content` placeholder of `reduce_prompt`).
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="page_content",
    #metadata={'symbol':'symbol', 'quarter':'quarter', 'year':'year'}
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    # (the same stuff chain is reused for the collapse step).
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    # NOTE(review): 2000 leaves little headroom below the 2048-token context
    # window reported by the model earlier in this notebook — confirm it is
    # still enough once the prompt text itself is added.
    token_max=2000,
    #metadata={'symbol':'symbol', 'quarter':'quarter', 'year':'year'}
)

In [71]:
# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="page_content",
    # Return the results of the map steps in the output
    # (disabled here, so only the final combined text is returned).
    return_intermediate_steps=False,
    #metadata={'symbol':'symbol', 'quarter':'quarter', 'year':'year'}
)

# NOTE(review): `texts` was itself produced by `text_splitter`; this runs the
# same splitter over those chunks again — presumably a no-op, confirm.
split_docs = text_splitter.split_documents(texts)

In [72]:
# Run the map-reduce summarization over all chunks and print the returned text.
print(map_reduce_chain.run(split_docs))

 Based on the given documents, the main themes are financial performance for a specific quarter in a specific year and outlook for next quarter. The biggest challenges faced by the company include maintaining profitability, increasing market share, and managing costs. The biggest success is not mentioned in the given documents.
The main themes from the transcript are:

1. Microsoft's financial performance for Q2 2021, including revenue growth and increased cloud adoption.
2. Outlook for Q3 2021, with a focus on continued growth in cloud services and gaming.
3. Updates on key initiatives such as Teams, Surface, and Xbox.
4. Discussion of the impact of COVID-19 on Microsoft's business, including supply chain disruptions and changes in customer behavior.
5. Emphasis on the importance of innovation and R&D investments for future growth.
6. Updates on the company's sustainability initiatives, including reducing carbon emissions and promoting diversity and inclusion.
7. Discussion of the imp

In [None]:
from langchain.chains import VectorDBQA
from langchain.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# LangChain embedding wrapper; same model name as the embedding function used
# when the collection was opened at the top of the notebook.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# NOTE(review): `client_settings` is not used by the `Chroma(...)` call below,
# which is given the existing `chroma_client` instead.
client_settings = chromadb.Settings(chroma_server_host="localhost", chroma_server_http_port='8083')
# LangChain vector-store view over the existing transcript collection.
db = Chroma(client=chroma_client, embedding_function=embeddings, collection_name='transcripts_mililm_l6_v2')

In [None]:
db.get(where={'symbol':'MSFT'})

In [None]:
# Retriever configured with k=5 and a metadata filter on `symbol` == 'TTD'.
search = db.as_retriever(search_kwargs={"k": 5, 'filter': {'symbol': 'TTD'}})
# Bare expression so the retrieved documents are displayed as the cell output.
search.get_relevant_documents('transcripts for the trade desk')

In [None]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.llms import GPT4All
from langchain.prompts import PromptTemplate

In [None]:
# Location of the local GPT4All model weights; replace with your own file path.
local_path = "../llm_models/gpt4all-falcon-q4_0.gguf"

In [None]:
# Callbacks support token-wise streaming
callbacks = [StreamingStdOutCallbackHandler()]

# NOTE(review): this repeats the model-loading cell from the summarization
# section; `llm` is rebound to a newly constructed GPT4All instance.
# Verbose is required to pass to the callback manager
llm = GPT4All(model=local_path, callbacks=callbacks, verbose=True)

In [None]:
# Prompt for the retrieval chain below. The retrieved documents are substituted
# for the single `{context}` placeholder. The trailing backslashes join the
# instruction lines inside the string literal.
template = """
    You will be provided with multiple documents from the same company. \
    Your task is to define a topic title that is a good representation of all the listed documents, give a small summary in 20 words. \
    {context}
    SUMMARY:"""

# NOTE: this rebinds `prompt`, the name first used for the bullet-point
# summarization prompt earlier in the notebook.
prompt = PromptTemplate(
input_variables=["context"], template=template)

In [None]:
#retrieval 
from langchain.chains import RetrievalQA
from langchain.chains.summarize import load_summarize_chain

# Retrieval QA chain of type "stuff": the retriever is configured with k=5 and a
# metadata filter on `symbol` == 'TTD', and the custom `prompt` is handed to the
# underlying chain through `chain_type_kwargs`.
# NOTE(review): `prompt` only has a `{context}` placeholder, so the query string
# passed to `qa` presumably never reaches the model — confirm this is intended.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 5, 'filter':{'symbol':'TTD'}}),
    # Also return the retrieved documents next to the answer.
    return_source_documents=True,
    verbose=False,
    chain_type_kwargs={
        "verbose": False,
        "prompt": prompt
    }
)

In [None]:
qa({'query':'summarize the text'})

In [None]:
from langchain.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# LangChain embedding wrapper; same model name as the embedding function used
# when the collection was opened at the top of the notebook.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Kept so the name stays defined. It is not passed to `Chroma` because these
# settings only carry host/port and do not select the HTTP client
# implementation, so a store built from them would not talk to the server on
# localhost:8083.
client_settings = chromadb.Settings(chroma_server_host="localhost", chroma_server_http_port='8083')
# Reuse the already-connected HTTP client so `vectordb` reads the server-side
# collection (same construction as `db`).
vectordb = Chroma(client=chroma_client, embedding_function=embeddings, collection_name='transcripts_mililm_l6_v2')

In [None]:
# `qa` was built with return_source_documents=True, so it has two output keys
# ('result' and 'source_documents'). `Chain.run` only supports chains with
# exactly one output key, so call the chain directly and pick the answer text.
qa({'query': 'give the last transcripts on the trade desk'})['result']

In [None]:
# Retriever with k=5 and no metadata filter, queried with an empty string.
# NOTE(review): an empty query gives the embedding model nothing to match on —
# confirm this is only meant as a smoke test.
search = db.as_retriever(search_kwargs={"k": 5})
search.get_relevant_documents('')

In [None]:
!pwd

In [None]:
# Callbacks support token-wise streaming
callbacks = [StreamingStdOutCallbackHandler()]

# NOTE(review): third copy of the model-loading cell; `llm` is rebound to
# another newly constructed GPT4All instance.
# Verbose is required to pass to the callback manager
llm = GPT4All(model=local_path, callbacks=callbacks, verbose=True)

In [None]:
# Same instruction text as the earlier retrieval prompt, but the placeholder is
# now `{documents}` instead of `{context}`.
template = """
    You will be provided with multiple documents from the same company. \
    Your task is to define a topic title that is a good representation of all the listed documents, give a small summary in 20 words. \
    {documents}
    SUMMARY:"""

# NOTE: rebinds `prompt`. Chains built from it must feed their documents into
# the `documents` variable.
prompt = PromptTemplate(
input_variables=["documents"], template=template)

In [None]:
from langchain.chains.llm import LLMChain
from langchain.chains.summarize import load_summarize_chain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

# Wrap the model and the current `prompt` in a single-prompt chain.
llm_chain = LLMChain(llm=llm, prompt=prompt)
# `document_variable_name` must name a placeholder of the wrapped prompt. The
# prompt's only input variable is `documents`; any other name (such as "text")
# is rejected when the chain is constructed.
stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="documents")

In [None]:
# Retriever configured with k=5 and a metadata filter on `symbol` == 'TTD'.
search = db.as_retriever(search_kwargs={"k": 5, 'filter':{'symbol':'TTD'}})
# `load_summarize_chain` expects the language model itself and builds its own
# LLMChain internally, so pass `llm` rather than the `llm_chain` wrapper.
chain = load_summarize_chain(llm, chain_type="stuff")

In [None]:
# `input_documents` must be a list of Documents, not the retriever object, so
# fetch the documents first and summarize those.
ttd_documents = search.get_relevant_documents('transcripts for the trade desk')
summary = chain.run(input_documents=ttd_documents)

In [None]:
#retrieval 
from langchain.chains import RetrievalQA
from langchain.chains.summarize import load_summarize_chain

# Retrieval QA chain of type "stuff": the retriever is configured with k=5 and a
# metadata filter on `symbol` == 'TTD'.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 5, 'filter':{'symbol':'TTD'}}),
    # Also return the retrieved documents next to the answer.
    return_source_documents=True,
    verbose=False,
    chain_type_kwargs={
        "verbose": False,
        "prompt": prompt,
        # The current `prompt` exposes `{documents}`, while the stuff chain
        # defaults to a variable named `context`. Name the variable explicitly
        # so that chain construction does not fail validation.
        "document_variable_name": "documents",
    }
)

In [None]:
# NOTE(review): the query mentions Microsoft, but the retriever inside `qa` is
# filtered on symbol 'TTD' — confirm which company is intended.
qa("Look for Microsoft transcripts")

In [None]:
# Take a small sample (peek of 10) from the collection to experiment with
# re-splitting the stored transcripts.
sub_set = collection.peek(10)

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema.document import Document

# Rebinds `text_splitter` (previously the recursive splitter with 2000-size
# chunks) to a plain CharacterTextSplitter with smaller chunks.
# NOTE(review): no `separator` is passed, so the splitter's default is used —
# confirm it matches how the transcripts are delimited (the samples shown above
# contain single "\n" breaks between speakers).
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [None]:
sub_set.keys()

In [None]:
# `create_documents` takes a list of texts, so wrap the single transcript in a
# list (a bare string would be iterated character by character).
len(text_splitter.create_documents([sub_set['documents'][0]]))

In [None]:
# One entry per sampled transcript, each entry being the list of chunk Documents
# cut from that transcript.
ls = []

for transcript in sub_set['documents']:
    # `create_documents` expects a list of texts and already returns Document
    # objects, so the transcript is wrapped in a list and the result is stored
    # as-is (no second Document wrapper around each chunk).
    ls.append(text_splitter.create_documents([transcript]))

In [None]:
# Ids of the sampled entries, as returned by `peek` (one per stored entry, not
# one per chunk).
ids = sub_set['ids']

In [None]:
# Metadata of the sampled entries, as returned by `peek` (one per stored entry,
# not one per chunk).
metadatas = sub_set['metadatas']

In [None]:
# NOTE(review): this rebinds `chroma_client` (the HTTP client created at the top
# of the notebook) to `chromadb.Client()`; cells that use `chroma_client` will
# see this new client if they are re-run after this point.
chroma_client = chromadb.Client()
# NOTE(review): `client` is not referenced by any later cell shown here —
# confirm whether the new collection was meant to live in "../chroma_db/".
client = chromadb.PersistentClient(path="../chroma_db/")
# Same embedding function as at the top of the notebook, constructed again.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

In [None]:
# Idempotent on re-run: `create_collection` raises when the collection already
# exists, whereas `get_or_create_collection` returns the existing one.
collection = chroma_client.get_or_create_collection(name="split_test_mililm_l6_v2", embedding_function=sentence_transformer_ef)

In [None]:
# `collection.add` needs three flat, equally long lists: one string, one
# metadata entry and one unique id per stored item. The chunks are therefore
# derived directly from the sampled transcripts, so that every chunk gets its
# parent transcript's metadata and an id of the form "<transcript id>_<chunk no>".
chunk_texts, chunk_metadatas, chunk_ids = [], [], []
for transcript_id, transcript, metadata in zip(ids, sub_set['documents'], metadatas):
    for chunk_no, chunk in enumerate(text_splitter.split_text(transcript)):
        chunk_texts.append(chunk)
        chunk_metadatas.append(metadata)
        chunk_ids.append(f"{transcript_id}_{chunk_no}")

collection.add(
    documents=chunk_texts,
    metadatas=chunk_metadatas,
    ids=chunk_ids)