In [1]:
%%capture 
%pip install python-dotenv pandas numpy

In [2]:
%%capture
%pip install langchain langchain-community langchain-core langchain-openai  openai

In [3]:
%pip list | grep langchain

langchain                 0.2.16
langchain-community       0.2.16
langchain-core            0.2.38
langchain-openai          0.1.23
langchain-text-splitters  0.2.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [26]:
%%capture
%pip install jupyterlab_execute_time

In [4]:
def save_string_to_file(content, filename):
    with open(filename, 'w') as file:
        file.write(content)

In [10]:
import dotenv
import os
dotenv.load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

from langchain.chains import RefineDocumentsChain, MapReduceDocumentsChain, LLMChain, ReduceDocumentsChain, StuffDocumentsChain
from langchain.prompts import PromptTemplate
#from langchain.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.schema.runnable import RunnableSequence
from langchain.schema import StrOutputParser



In [22]:
def split_file(filepath):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=50000, chunk_overlap=200)
    with open(filepath, 'r') as file:
        text = file.read()
    docs = text_splitter.create_documents([text])
    print(f'{filepath} docs:{len(docs)}')
    return docs

In [12]:
user_command="""
Analysis the podcast episode from the Lex Fridman Podcast series. Summarise the key points discussed in the episode with a focus on: 
## The main arguments presented 
## Any notable quotes from the guest 
## Relevant topics or themes covered
Ensure the summary is clear and concise, with bullet points under the specified headings
"""

In [37]:
llm = ChatOpenAI(temperature=0, model_name='gpt-4o-mini')
def summerise(docs):
    document_prompt = PromptTemplate(input_variables=["page_content"], template="{page_content}")
    initial_prompt = PromptTemplate.from_template(user_command + ": {context}")
    initial_chain = LLMChain(llm=llm, prompt=initial_prompt)
    
    refine_prompt = PromptTemplate.from_template(
        "Here's a summary so far: {prev_response}\n" +
        "Now refine it with this additional context: {context}\n" +
        user_command
    )
    refine_chain = LLMChain(llm=llm, prompt=refine_prompt)
    
    chain = RefineDocumentsChain(
        initial_llm_chain=initial_chain,
        refine_llm_chain=refine_chain,
        document_prompt=document_prompt,
        document_variable_name="context",
        initial_response_name="prev_response",
        verbose=True
    )
    
    # Generate the summary
    summary_refine = chain.invoke(docs)
    
    print(f'summary: {len(summary_refine['output_text'])}')
    return summary_refine['output_text']

## Summerise in batch

In [39]:
folder_path = "../1-raw/transcripts-podcast/"

# Iterate through all files in the folder
for filename in os.listdir(folder_path)[:5]:
    file_path = os.path.join(folder_path, filename)
    print(filename)

    # invoke the summary
    docs=split_file(file_path)
    summary=summerise(docs)

    # save to file
    summary_filename=filename.replace('.txt','_summary.md')
    save_string_to_file(summary, f"./{summary_filename}.md")

walter_isaacson_transcript.txt
../1-raw/transcripts-podcast/walter_isaacson_transcript.txt: 3


[1m> Entering new RefineDocumentsChain chain...[0m

[1m> Finished chain.[0m
summary: 2928
bassem_youssef_transcript.txt
../1-raw/transcripts-podcast/bassem_youssef_transcript.txt: 4


[1m> Entering new RefineDocumentsChain chain...[0m

[1m> Finished chain.[0m
summary: 3258
jared_kushner_transcript.txt
../1-raw/transcripts-podcast/jared_kushner_transcript.txt: 6


[1m> Entering new RefineDocumentsChain chain...[0m

[1m> Finished chain.[0m
summary: 3378
george_hotz_3_transcript.txt
../1-raw/transcripts-podcast/george_hotz_3_transcript.txt: 6


[1m> Entering new RefineDocumentsChain chain...[0m

[1m> Finished chain.[0m
summary: 3953
james_sexton_transcript.txt
../1-raw/transcripts-podcast/james_sexton_transcript.txt: 5


[1m> Entering new RefineDocumentsChain chain...[0m

[1m> Finished chain.[0m
summary: 3519
