In [8]:
from langchain import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
import os

# Splitter configured to break only on "." with chunk_size=10000 and
# chunk_overlap=500 (sizes are measured by the splitter's length function).
text_splitter = RecursiveCharacterTextSplitter(separators=["."], chunk_size=10000, chunk_overlap=500)

# Read with an explicit encoding so decoding does not depend on the platform's
# default locale (e.g. cp1252 on Windows).
with open('transcript_text.txt', 'r', encoding='utf-8') as f:
    transcript = f.read()

# Splitting only needs the in-memory string, so it runs after the file is closed.
docs = text_splitter.create_documents([transcript])

docs

[Document(page_content="OpenAI, a company that we all know now, but only a year ago was 100 people, is changing the world. Their research is leading the charge to AGI. Since ChatGPT captured consumer attention last November, they show no signs of slowing down. This week, a lot of nice sit down with Ilya Sutskmer, co-founder and chief scientist at OpenAI to discuss the state of AI research, where we'll hit limits, the future of AGI, and what's going to take to reach super alignment. Ilya, welcome to no priors. Thank you. Let's go to be here. Let's start at the beginning. Pre-AlexNet, nothing in deep learning was really working. And then given that environment, you guys took a very unique bet, what motivated you to go in this direction? Indeed, in those dark ages, AI was not an area where people had hope and people who were not accustomed to any kind of success at all. And because there wasn't, there hasn't been any success, there was a lot of debate, and there were different schools of 

In [9]:
# Read the API key from the environment instead of embedding it in the notebook,
# so the secret never ends up in version control or shared copies.
# A KeyError here means OPENAI_API_KEY has not been set for the kernel.
llm = ChatOpenAI(temperature=0, openai_api_key=os.environ["OPENAI_API_KEY"])

# Number of chunks produced by the splitter.
num_docs = len(docs)

# Token count of the first chunk as reported by llm.get_num_tokens.
num_tokens_first_doc = llm.get_num_tokens(docs[0].page_content)

print(f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")

Now we have 5 documents and the first one has 2103 tokens


In [11]:
from pydantic import BaseModel, Field, conlist
from typing import List
from langchain.output_parsers import PydanticOutputParser
class MainContent(BaseModel):
    # Schema for one extracted topic; output_parser is built from this model.
    # Documented with comments only: a class docstring could end up in the
    # generated schema and change the prompt -- TODO confirm before adding one.

    # The topic itself, phrased as a question.
    main_content: str = Field(
        description="Summarize the main concept of the document with a question",
    )
    # The sentence of the source text where that topic begins.
    sentence: str = Field(
        description="The sentence in the document that started this concept",
    )

# Container schema for the reduce step: combine_output_parser is built from this
# model, and the final chain output is parsed into it.
class MainContentList(BaseModel):
    # All MainContent entries gathered into a single list.
    content_list : List[MainContent]

# One parser per stage: a single MainContent for the map prompt, and the
# aggregated MainContentList for the combine prompt / final parsing.
output_parser = PydanticOutputParser(pydantic_object=MainContent)
combine_output_parser = PydanticOutputParser(pydantic_object=MainContentList)

# Map-step prompt: asks for the main concept of one chunk, phrased as a question.
# {format_instructions} is pre-filled from output_parser; {text} is supplied per
# document at run time.
# Fixed: the instruction previously read "map concept" (typo for "main concept")
# and ran two sentences together.
map_prompt = PromptTemplate(
    template='''
    Summarize the main concept of this text using a question. The text is delimited by ```
    {format_instructions}
    ```
    {text}
    ```
    ''',
    input_variables=['text'],
    partial_variables={'format_instructions' : output_parser.get_format_instructions()}
)
# Reduce-step prompt: takes the text injected as {question} and asks for a single
# list of main contents. {format_instructions} is pre-filled from
# combine_output_parser, i.e. the MainContentList schema.
# NOTE(review): the template's leading indentation is part of the string sent to
# the model; left untouched here on purpose.
combine_prompt = PromptTemplate(
                        template='''
                        From these questions and sentence create a list of main contents
                        {format_instructions}
                        {question}
                        ''',
                        input_variables=['question'],
                        partial_variables={"format_instructions": combine_output_parser.get_format_instructions()})

In [15]:
from langchain import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
# LLM chain bound to the map prompt (one call per document chunk once it is
# plugged into the map-reduce chain).
map_chain = LLMChain(
    prompt=map_prompt,
    llm=llm,
)

# LLM chain bound to the combine prompt, used for the reduce side.
reduce_chain = LLMChain(
    prompt=combine_prompt,
    llm=llm,
)

In [17]:
# Feeds the documents it receives into reduce_chain through the prompt's
# {question} variable.
combine_documents_chain = StuffDocumentsChain(
    document_variable_name="question",
    llm_chain=reduce_chain,
)

# The same stuff chain serves both as the final combine step and as the collapse
# step, with token_max set to 4000.
# NOTE(review): 4000 is close to a 4k-context model's limit once the prompt and
# format instructions are added -- confirm against the model actually in use.
reduce_documents_chain = ReduceDocumentsChain(
    token_max=4000,
    collapse_documents_chain=combine_documents_chain,
    combine_documents_chain=combine_documents_chain,
)

# Full pipeline: map_chain receives each document as {text}; the results go
# through reduce_documents_chain. Intermediate map outputs are not returned.
map_reduce_chain = MapReduceDocumentsChain(
    document_variable_name="text",
    return_intermediate_steps=False,
    reduce_documents_chain=reduce_documents_chain,
    llm_chain=map_chain,
)

In [18]:
# Run the map-reduce pipeline over all chunks; `output` is the raw result of the
# final combine step.
output = map_reduce_chain.run(docs)
# Parse that result into a MainContentList.
# NOTE(review): parse() presumably raises if the model's reply does not match the
# schema -- confirm and handle if this cell is run unattended.
new_topics = combine_output_parser.parse(output)

In [22]:
# Display the parsed MainContent entries (question + originating sentence).
new_topics.content_list

[MainContent(main_content='What motivated OpenAI to go in the direction of neural networks?', sentence='Pre-AlexNet, nothing in deep learning was really working. And then given that environment, you guys took a very unique bet, what motivated you to go in this direction?'),
 MainContent(main_content='How did the research agenda evolve and what drove it towards transformer-based models and other forms of learning?', sentence='And then there was some video game-related work, which was really cutting edge. How did you think about how the research agenda evolved and what really drove it down this path of transformer-based models and other forms of learning?'),
 MainContent(main_content='What is the role of open source models in the ecosystem?', sentence="Well, open source is complicated. I'll describe to you my mental picture. I think that in the near term, open source is just helping companies produce useful."),
 MainContent(main_content='What is the goal of the super alignment project an