In [40]:
%%capture 
%pip install python-dotenv pandas numpy

In [41]:
%%capture
%pip install langchain langchain-community langchain-core langchain-openai  openai

In [42]:
%pip list | grep langchain

langchain                 0.2.16
langchain-community       0.2.16
langchain-core            0.2.38
langchain-openai          0.1.23
langchain-text-splitters  0.2.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [43]:
%%capture
%pip install jupyterlab_execute_time

In [44]:
def save_string_to_file(content, filename):
    with open(filename, 'w') as file:
        file.write(content)

In [45]:
import dotenv
import os
dotenv.load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

from langchain.chains import RefineDocumentsChain, MapReduceDocumentsChain, LLMChain, ReduceDocumentsChain, StuffDocumentsChain
from langchain.prompts import PromptTemplate
#from langchain.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.schema.runnable import RunnableSequence
from langchain.schema import StrOutputParser



In [46]:
def split_file(filepath):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=50000, chunk_overlap=200)
    with open(filepath, 'r') as file:
        text = file.read()
    docs = text_splitter.create_documents([text])
    print(f'{filepath} docs:{len(docs)}')
    return docs

In [51]:
user_command="""
Provide a detailed analysis of the Lex Fridman Podcast episode. 
Your summary should be comprehensive yet well-structured, covering the breadth and depth of the discussion. 
Organize your response under the following headings:

## The main arguments 

---
instruction for this section
- List at least 5 key arguments or points made during the episode
- provide a brief explanation and its significance in the context of the discussion
- Highlight any counterarguments or alternative perspectives mentioned
---

## Any notable quotes 

---
instruction for this section
- Include at least 5 direct quotes that encapsulate important ideas or memorable moments
- provide a sentence of context explaining its relevance or impact
---

## Relevant topics or themes

---
instruction for this section
- Identify and explain at least 5 major themes or topics discussed in the episode
- provide examples of how it was explored in the conversation
- Note any connections between different themes or how they relate to broader societal issues
---

Additional Guidelines:

Use clear and concise bullet points under each heading
Provide brief explanations for any specialized terms, events, or concepts mentioned
Highlight any unique aspects of the episode, such as the interviewing style or guest's background
Include a mix of factual information, personal anecdotes shared by the guest, and any hypothetical scenarios discussed
If applicable, note any evolution in the guest's perspectives throughout the conversation
"""

In [52]:
llm = ChatOpenAI(temperature=0, model_name='gpt-4o-mini')
def summerise(docs):
    document_prompt = PromptTemplate(input_variables=["page_content"], template="{page_content}")
    initial_prompt = PromptTemplate.from_template(user_command + ": {context}")
    initial_chain = LLMChain(llm=llm, prompt=initial_prompt)
    
    refine_prompt = PromptTemplate.from_template(
        "Here's a summary so far: {prev_response}\n" +
        "Now refine it with this additional context: {context}\n" +
        user_command
    )
    refine_chain = LLMChain(llm=llm, prompt=refine_prompt)
    
    chain = RefineDocumentsChain(
        initial_llm_chain=initial_chain,
        refine_llm_chain=refine_chain,
        document_prompt=document_prompt,
        document_variable_name="context",
        initial_response_name="prev_response",
        verbose=True
    )
    
    # Generate the summary
    summary_refine = chain.invoke(docs)
    
    print(f'summary: {len(summary_refine['output_text'])}')
    return summary_refine['output_text']

## Summerise in batch

In [53]:
folder_path = "../1-raw/transcripts-podcast/"

# Iterate through all files in the folder
for filename in os.listdir(folder_path)[:5]:
    file_path = os.path.join(folder_path, filename)
    print(filename)

    # invoke the summary
    docs=split_file(file_path)
    summary=summerise(docs)

    # save to file
    summary_filename=filename.replace('.txt','_summary.md')
    save_string_to_file(summary, f"./transcript-summary/{summary_filename}")

walter_isaacson_transcript.txt
../1-raw/transcripts-podcast/walter_isaacson_transcript.txt docs:3


[1m> Entering new RefineDocumentsChain chain...[0m

[1m> Finished chain.[0m
summary: 6336
bassem_youssef_transcript.txt
../1-raw/transcripts-podcast/bassem_youssef_transcript.txt docs:4


[1m> Entering new RefineDocumentsChain chain...[0m

[1m> Finished chain.[0m
summary: 7061
jared_kushner_transcript.txt
../1-raw/transcripts-podcast/jared_kushner_transcript.txt docs:6


[1m> Entering new RefineDocumentsChain chain...[0m

[1m> Finished chain.[0m
summary: 6831
george_hotz_3_transcript.txt
../1-raw/transcripts-podcast/george_hotz_3_transcript.txt docs:6


[1m> Entering new RefineDocumentsChain chain...[0m

[1m> Finished chain.[0m
summary: 6210
james_sexton_transcript.txt
../1-raw/transcripts-podcast/james_sexton_transcript.txt docs:5


[1m> Entering new RefineDocumentsChain chain...[0m

[1m> Finished chain.[0m
summary: 6090
