In [19]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
import os
from langchain.output_parsers import CommaSeparatedListOutputParser
# Respect a key that is already exported in the environment. The placeholder is
# only used when nothing is configured, so a real key is never clobbered and
# never has to be pasted into the notebook source.
os.environ.setdefault("OPENAI_API_KEY", 'your-openai-api-key')
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS 
from pydantic import BaseModel, Field, conlist
from typing import List
from langchain.output_parsers import PydanticOutputParser
from langchain.retrievers.multi_query import MultiQueryRetriever

In [20]:
# Split the transcript into overlapping chunks that are small enough to index.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,  # neighbouring chunks may share up to 200 units of text
    length_function=len,  # chunk sizes are measured with len(), i.e. in characters
)

# Decode explicitly instead of relying on the platform's default encoding, which
# differs between machines (assumes transcript.txt was saved as UTF-8 -- verify).
with open('transcript.txt', 'r', encoding='utf-8') as f:
    transcript = f.read()

# Chunking only needs the in-memory string, so it runs after the file is closed.
texts = text_splitter.split_text(transcript)

In [57]:
# Structured result the LLM is asked to return for a transcript: a short summary
# plus a chapter breakdown.
# NOTE: documented with comments rather than a class docstring, because pydantic
# copies a model docstring into the generated schema and that would change the
# format instructions derived from this class.
class TranscriptionChapter(BaseModel):
    # The descriptions are plain strings on purpose. Inside the class body the
    # names `number_of_chapters` and `chapter` are bound to the objects returned
    # by Field(...), so interpolating them with an f-string would inject their
    # repr into the description instead of a number or a field name.
    summarization: str = Field(description="A short summarization of the transcript")
    number_of_chapters: int = Field(description="Number of chapters of main contents in this transcript")
    # One entry per chapter, formatted as 'Chapter name:Sentence'.
    chapter: List[str] = Field(description="List of number_of_chapters short chapter names of main contents and the sentence that each chapter starts with in the transcript, using this format 'Chapter name:Sentence'")
    # One entry per chapter: its main content and opening sentence.
    sentences: List[str] = Field(description="List of number_of_chapters main contents in the transcript and the first sentence each chapter begins with")

In [58]:
# Parser that converts the model's reply into a TranscriptionChapter instance.
output_parser = PydanticOutputParser(pydantic_object=TranscriptionChapter)

# Prompt text. The whitespace inside the literal is part of what gets sent.
chapter_template = '''
                        As a content curator, summarize the content in the following transcript with in chapter with sentences that each chapter begin in using this {format_instructions}
                        {transcript}
                        '''

# The format instructions are fixed up front; only {transcript} is left for callers.
format_instructions = output_parser.get_format_instructions()
prompt = PromptTemplate(
    template=chapter_template,
    input_variables=['transcript'],
    partial_variables={"format_instructions": format_instructions},
)

# Chat model configured with temperature=0; the key is read from the environment.
llm = ChatOpenAI(
    temperature=0,
    openai_api_key=os.environ['OPENAI_API_KEY'],
)

# Chain that feeds the prompt above into the chat model.
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Run



In [None]:
# Embed every transcript chunk and index the vectors in a FAISS store.
embeddings = OpenAIEmbeddings()
docsearch = FAISS.from_texts(texts, embeddings)

# Wrap the plain vector-store retriever in a MultiQueryRetriever driven by `llm`.
base_retriever = docsearch.as_retriever()
retriever = MultiQueryRetriever.from_llm(retriever=base_retriever, llm=llm)

# Fetch the chunks the retriever considers relevant to the summarization request.
summary_query = 'Summarize the main contents concise and detailed'
docs = retriever.get_relevant_documents(query=summary_query)

In [60]:

# Show what the retriever returned.
print(docs)

# Join the retrieved chunks into the single text that fills {transcript}.
transcript_context = "\n---\n".join([d.page_content for d in docs])

# Render the full prompt purely for inspection.
message = prompt.format_prompt(transcript=transcript_context)
print(message)

# llm_chain applies `prompt` itself, so it must receive only the raw context.
# Passing the rendered prompt (message.to_string()) would wrap the template and
# the format instructions around themselves a second time.
result = llm_chain({"transcript": transcript_context})

# Validate the model's reply against the TranscriptionChapter schema.
parsed = output_parser.parse(result['text'])

[Document(page_content="424\n00:00:1629\nareas of higher thought, areas for empathy or other sort of aspects of everything from personality\n425\n00:00:1634\nto processing.\n426\n00:00:1636\nDo you think that the transformer architectures are the main thing that will just keep going\n427\n00:00:1640\nand get us there?\n428\n00:00:1641\nDo you think we'll need other architectures over time?\n429\n00:00:1643\nSo I have to, I understand precisely what you're saying and I have to answer to this question.\n430\n00:00:1650\nThe first is that in my opinion, the best way to think about the question of architecture\n431\n00:00:1655\nis not in terms of a binary, is it enough?\n432\n00:00:1659\nBut how much effort, what will be the cost of using this particular architecture?\n433\n00:00:1667\nLike at this point, I don't think anyone doubts that the transformer architecture can do\n434\n00:00:1672\namazing things, but maybe something else, maybe some modification could have some computer\n435\n00:

In [62]:
# Build the report first, then print it: summary, chapter count, chapter names
# and chapter opening sentences, one per line.
chapter_report = f'''
        {parsed.summarization}
        {parsed.number_of_chapters}
        {parsed.chapter}
        {parsed.sentences}
    '''
print(chapter_report)


        The transcript discusses various topics related to transformer architectures, reliability of models, the role of open source in the ecosystem, and the potential of super intelligent data centers. It also mentions the value of larger models for unlocking new applications and the importance of considering computer efficiency. The discussion highlights the complexity of the human brain's specialized regions.
        5
        ['Chapter 1: areas of higher thought, areas for empathy or other sort of aspects of everything from personality to processing.', 'Chapter 2: The definition of reliability and its similarity to the self-driving situation.', 'Chapter 3: The role of open source in the ecosystem and the demand for open source models.', 'Chapter 4: The potential of super intelligent data centers and the unpredictability of their impact.', 'Chapter 5: The value of larger models for unlocking new and valuable applications and the consideration of computer efficiency.']
        ['Ch