In [6]:
import numpy as np

import chromadb
from chromadb.utils import embedding_functions

from chromadb.config import Settings

# Connect to the Chroma server on localhost:8083 (reset allowed, telemetry off).
chroma_settings = Settings(allow_reset=True, anonymized_telemetry=False)
chroma_client = chromadb.HttpClient(
    host="localhost",
    port=8083,
    settings=chroma_settings,
)

# Embedding function backed by the all-MiniLM-L6-v2 sentence-transformer model.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

# NOTE(review): the name is spelled "mililm", not "minilm". get_or_create_collection
# creates a new (empty) collection when the name does not exist, so a misspelling here
# would not raise -- confirm this matches the collection the transcripts were loaded into.
collection = chroma_client.get_or_create_collection(
    name="transcripts_mililm_l6_v2",
    embedding_function=sentence_transformer_ef,
)


In [7]:
# Retrieve the single best match for the query text, restricted to entries whose
# `symbol` metadata equals ABNB; metadata, text and embeddings are all returned.
results = collection.query(
    include=["metadatas", "documents", "embeddings"],
    where={"symbol": "ABNB"},
    n_results=1,
    query_texts=["tesla"],
)

In [8]:
# Inspect the raw query response (one inner list per query text).
results

{'ids': [[]],
 'distances': None,
 'embeddings': [[]],
 'metadatas': [[]],
 'documents': [[]],
 'uris': None,
 'data': None}

In [4]:
# Ids of the matched entries; an empty inner list means the query matched nothing.
results['ids']

[[]]

In [5]:
# Take the text of the top hit for the first (and only) query text.
# The inner list is empty when the query matched nothing, so fail with an
# actionable message instead of a bare "list index out of range".
matched_documents = results['documents'][0]
if not matched_documents:
    raise IndexError(
        "Chroma query returned no documents: check the collection name, the "
        "`where` filter, and that the collection has actually been populated."
    )
document = matched_documents[0]

IndexError: list index out of range

In [25]:
# Inspect the retrieved transcript text.
document

'Operator: Good afternoon and thank you for joining Airbnb\'s Earnings Conference Call for the Third Quarter of 2023. As a reminder, this conference call is being recorded and will be available for replay from the Investor Relations section of Airbnb\'s website following this call. I will now hand the call over to Elli Mertz, VP of Finance. Please go ahead.\nEllie Mertz: Thank you. Good afternoon and welcome to Airbnb\'s third quarter of 2023 earnings call. Thank you for joining us today. On the call today, we have Airbnb\'s Co-Founder and CEO, Brian Chesky and our Chief Financial Officer, Dave Stephenson. Earlier today, we issued a shareholder letter with our financial results and commentary for our third quarter of 2023. These items were also posted on the Investor Relations section of Airbnb\'s website. During the call, we\'ll make brief opening remarks and then spend the remainder of time on Q&A. Before I turn it over to Brian, I would like to remind everyone that we\'ll be making 

In [79]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.schema.document import Document

# Splitter for the raw transcript: chunk_size 1024, no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=0,
)

# Wrap the transcript string into Document objects, then run the splitter over them.
texts = text_splitter.create_documents(texts=[document])
split_texts = text_splitter.split_documents(documents=texts)


In [80]:
# Plain-text content of every chunk, in order.
split_ls = [chunk.page_content for chunk in split_texts]

In [81]:
# Inspect the chunk texts.
split_ls

["Operator: Good afternoon and thank you for joining Airbnb's Earnings Conference Call for the Third Quarter of 2023. As a reminder, this conference call is being recorded and will be available for replay from the Investor Relations section of Airbnb's website following this call. I will now hand the call over to Elli Mertz, VP of Finance. Please go ahead.",
 "Ellie Mertz: Thank you. Good afternoon and welcome to Airbnb's third quarter of 2023 earnings call. Thank you for joining us today. On the call today, we have Airbnb's Co-Founder and CEO, Brian Chesky and our Chief Financial Officer, Dave Stephenson. Earlier today, we issued a shareholder letter with our financial results and commentary for our third quarter of 2023. These items were also posted on the Investor Relations section of Airbnb's website. During the call, we'll make brief opening remarks and then spend the remainder of time on Q&A. Before I turn it over to Brian, I would like to remind everyone that we'll be making for

In [83]:
# Number of chunks the transcript was split into.
len(split_ls)

72

In [84]:
from transformers import pipeline
from tqdm import tqdm

# Build the BART summarization pipeline once, outside the loop. Constructing it
# per chunk reloads the model on every iteration for no benefit.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# One entry per chunk, kept in the pipeline's own return format
# ([{'summary_text': ...}]) so later cells can read entry[0]['summary_text'].
summ_ls = []

for sub in tqdm(split_ls):
    # `chunk_summary` rather than `sum`, so the builtin is not shadowed in the kernel.
    chunk_summary = summarizer(sub, max_length=150, min_length=30, do_sample=False)
    summ_ls.append(chunk_summary)

  0%|          | 0/72 [00:00<?, ?it/s]Your max_length is set to 150, but your input_length is only 76. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)
  3%|▎         | 2/72 [00:24<14:43, 12.62s/it]Your max_length is set to 150, but your input_length is only 125. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=62)
 10%|▉         | 7/72 [01:18<11:14, 10.38s/it]Your max_length is set to 150, but your input_length is only 142. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=71)
 11%|█         | 8/72 [01:32<12:16, 11.50s/it]Your max_length is set to 150, but your input_length is only 140. Since thi

In [102]:
# Inspect the per-chunk summaries (each entry is a one-element list of dicts).
summ_ls

[[{'summary_text': "This conference call is being recorded and will be available for replay from the Investor Relations section of Airbnb's website following this call. I will now hand the call over to Elli Mertz, VP of Finance. Please go ahead."}],
 [{'summary_text': "Ellie Mertz: Thank you. Good afternoon and welcome to Airbnb's third quarter of 2023 earnings call. On the call today, we have Airbnb's Co-Founder and CEO, Brian Chesky and our Chief Financial Officer, Dave Stephenson. During the call, we'll make brief opening remarks and then spend the remainder of time on Q&A."}],
 [{'summary_text': 'During this call, we will discuss some non-GAAP financial measures. These measures are not intended to be a substitute for our GAAP results. You should be aware that these statements should be considered estimates only.'}],
 [{'summary_text': 'Brian Chesky: Q3 was another strong quarter for Airbnb. We had over 113 million Nights and Experiences Booked. Revenue of $3.4 billion grew 18% year

In [106]:
# Concatenate the per-chunk summaries into one space-separated string.
summarized_text = " ".join(entry[0]['summary_text'] for entry in summ_ls)

In [110]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.schema.document import Document

# Splitter for the condensed text: chunk_size 2000 with an overlap of 100.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
)

# Wrap the summarized text into Document objects, then run the splitter over them.
texts = text_splitter.create_documents(texts=[summarized_text])
split_texts = text_splitter.split_documents(documents=texts)

In [1]:
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.chains.llm import LLMChain

from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import GPT4All
from langchain.prompts import PromptTemplate
# Local model file for GPT4All (replace with your desired local file path).
local_path = "../llm_models/gpt4all-falcon-q4_0.gguf"

# Stream generated tokens to stdout as they are produced.
callbacks = [StreamingStdOutCallbackHandler()]

# verbose=True is required for the callbacks to be passed to the callback manager.
llm = GPT4All(
    model=local_path,
    callbacks=callbacks,
    verbose=True,
)

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import os

from langchain.llms import HuggingFacePipeline

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# The repo is gated: the token (or a prior `huggingface-cli login`) must belong to an
# account that has been granted access, otherwise from_pretrained raises OSError.
hf_token = os.getenv('HUGGINGFACE_TOKEN')

llama_tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
llama_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=hf_token,
)

# The summarization chains below take `llm` as a LangChain LLM. A bare transformers
# model is not one, so wrap it in a text-generation pipeline and hand that to
# LangChain instead of rebinding `llm` to the raw model.
# max_new_tokens is a tunable cap on the length of each generated summary.
text_generator = pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    max_new_tokens=512,
)
llm = HuggingFacePipeline(pipeline=text_generator)

OSError: You are trying to access a gated repo.
Make sure to request access at https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct and pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`.

In [None]:
from langchain_core.prompts import PromptTemplate

from langchain.chains.summarize import load_summarize_chain

# Summarization chain of type "refine", with verbose output enabled.
chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    verbose=True,
)

In [116]:
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain

# Map step: prompt applied to each chunk; {docs} receives the chunk text.
map_template = (
    "The following is a set of summarized transcripts\n"
    "{docs}\n"
    "Based on this list of docs, generate a summary of the main points discussed.\n"
    "Don't return conversation between people, just distill the main points.\n"
    "Helpful Answer:"
)
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(prompt=map_prompt, llm=llm)

In [117]:
# Reduce step: prompt that consolidates the mapped summaries; {docs} receives them.
reduce_template = (
    "The following is set of summaries of transcripts:\n"
    "{docs}\n"
    "Take these and distill it into a final, consolidated summary of the main challenges and successes. \n"
    "Helpful Answer:"
)
reduce_prompt = PromptTemplate.from_template(reduce_template)

In [118]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain


# Assemble the reduce side of the pipeline (nothing is executed in this cell).
reduce_chain = LLMChain(prompt=reduce_prompt, llm=llm)

# Joins a list of documents into one string and feeds it to `reduce_chain`
# through the "docs" prompt variable.
combine_documents_chain = StuffDocumentsChain(
    document_variable_name="docs",
    llm_chain=reduce_chain,
)

# Iteratively reduces the mapped documents. The same stuff-chain is used both for
# the final combine and for collapsing groups that exceed the context; documents
# are grouped into batches of at most `token_max` tokens.
reduce_documents_chain = ReduceDocumentsChain(
    token_max=2000,
    collapse_documents_chain=combine_documents_chain,
    combine_documents_chain=combine_documents_chain,
)

In [123]:
# Full map-reduce pipeline: `map_chain` runs over every document, then
# `reduce_documents_chain` merges the mapped outputs. Documents are injected into
# the map prompt via the "docs" variable; intermediate map results are not returned.
map_reduce_chain = MapReduceDocumentsChain(
    document_variable_name="docs",
    return_intermediate_steps=False,
    reduce_documents_chain=reduce_documents_chain,
    llm_chain=map_chain,
)

ValidationError: 1 validation error for MapReduceDocumentsChain
token_max
  extra fields not permitted (type=value_error.extra)

In [124]:
# Run the map-reduce summarization over the chunked summary documents.
result = map_reduce_chain.run(split_texts)

Exception ignored on calling ctypes callback function: <function LLModel._callback_decoder.<locals>._raw_callback at 0x29e4ec160>
Traceback (most recent call last):
  File "/Users/michieldekoninck/.pyenv/versions/3.10.6/envs/finance/lib/python3.10/site-packages/gpt4all/pyllmodel.py", line 438, in _raw_callback
    def _raw_callback(token_id: int, response: bytes) -> bool:
KeyboardInterrupt: 




1. Airbnb's Q3 2023 earnings call was held on November 8, 202

In [122]:
# Final consolidated summary produced by the chain.
result

"\nAirbnb's main challenges include increasing supply and improving search experience, while its main successes include focusing on home rentals, innovating on the platform, and expanding into new markets. The company is also exploring opportunities in other areas like vacation rentals and experiences."