In [2]:
# Loaders
from langchain.schema import Document

# Splitters
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Model
from langchain.chat_models import ChatOpenAI

# Embedding Support
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

# Summarizer we'll use for Map Reduce
from langchain.chains.summarize import load_summarize_chain

# Data Science
import numpy as np
from sklearn.cluster import KMeans
import os

# Never hardcode credentials in a notebook: anything committed here is readable by
# everyone with access to the file (any key that was previously stored in this cell
# should be treated as compromised and revoked).
# Use the key already exported in the environment, or prompt for it without echoing.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [5]:

# Read the key from the environment instead of embedding the secret in the notebook.
# Fails fast with a KeyError if OPENAI_API_KEY has not been set.
llm = ChatOpenAI(temperature=0, openai_api_key=os.environ["OPENAI_API_KEY"])

# Split on periods into large chunks that overlap, so context is shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(separators=["."], chunk_size=10000, chunk_overlap=3000)

# Explicit encoding so the result does not depend on the platform default
# (assumes the transcript file is UTF-8 — adjust if it is not).
with open('transcript_text.txt', 'r', encoding='utf-8') as f:
    transcript = f.read()

# The file handle is no longer needed once the text is in memory, so split outside the `with`.
docs = text_splitter.create_documents([transcript])
if not docs:
    # Give a clear message instead of an IndexError on docs[0] below.
    raise ValueError("transcript_text.txt produced no documents; is the file empty?")

num_docs = len(docs)

num_tokens_first_doc = llm.get_num_tokens(docs[0].page_content)

print(f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")

Now we have 6 documents and the first one has 2103 tokens


In [6]:
# Embed the text of every chunk; `vectors` is what the clustering step consumes.
embeddings = OpenAIEmbeddings()

vectors = embeddings.embed_documents(
    [chunk.page_content for chunk in docs]
)

In [8]:
# `vectors` is the list of chunk embeddings produced by `embeddings.embed_documents`
# (assumed to be 1536-dimensional — TODO confirm for the embedding model in use).

# Choose the number of clusters; this can be adjusted based on the content.
# For a long text such as a book, ~10 clusters worked well; this transcript only
# yields a few chunks, so a smaller value is used here.
# KMeans cannot fit more clusters than there are samples, so cap it at the chunk count.
num_clusters = min(5, len(vectors))

# Perform K-means clustering.
# n_init is set explicitly: it keeps the current behaviour (10 initialisations) and
# silences the FutureWarning about the default changing in newer scikit-learn releases.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42).fit(vectors)

  super()._check_params_vs_input(X, default_n_init=10)


In [9]:
# Find the chunk whose embedding lies closest to each cluster centroid

# Convert the list of embeddings to an array once, rather than implicitly on every
# loop iteration when it is subtracted from a centroid.
vectors_array = np.asarray(vectors)

# Positions (into `docs`) of the chunk nearest to each centroid
closest_indices = []

for centroid in kmeans.cluster_centers_:

    # Euclidean distance from every chunk embedding to this centroid
    distances = np.linalg.norm(vectors_array - centroid, axis=1)

    # argmin gives the position of the smallest distance; store it as a plain int
    # so the indices display and compare like ordinary Python integers
    closest_indices.append(int(np.argmin(distances)))

# Keep document order, and drop duplicates so a chunk that happens to be nearest to
# two centroids is only summarised once.
selected_indices = sorted(set(closest_indices))
selected_indices

[0, 1, 2, 4, 5]

In [27]:
from pydantic import BaseModel, Field, conlist
from typing import List
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain import LLMChain

# Structured result requested from the LLM for each chunk. The class is handed to
# PydanticOutputParser, which uses it to build the format instructions and to parse replies.
# NOTE: documented with `#` comments on purpose — pydantic includes a class docstring in the
# generated schema, which would change the format instructions sent to the model.
class MainContent(BaseModel):
    # Short, chapter-title-like phrase naming the chunk's main concept.
    main_content : str = Field(description='Summarize the main concept of the document in a short phrase (3 to 5 words) , like a book chapter')
    # The sentence in the chunk at which discussion of that concept begins.
    sentence : str = Field(description="The sentence in the document that started discussing this concept")

# Parser that turns the raw LLM reply into a MainContent instance
output_parser = PydanticOutputParser(pydantic_object=MainContent)

# Prompt applied to each selected chunk.
# Fixes relative to the earlier wording: "map concept" -> "main concept", the run-on
# sentence is split, and the backtick fences now enclose the text itself (previously
# they wrapped the format instructions and left the text outside the delimiters).
# NOTE(review): "using a question" is kept from the original wording, but the output
# schema asks for a short phrase — confirm which of the two is intended.
map_prompt = PromptTemplate(
    template='''
    Summarize the main concept of this text using a question. The text is delimited by triple backticks.
    {format_instructions}
    ```
    {text}
    ```
    ''',
    input_variables=['text'],
    partial_variables={'format_instructions' : output_parser.get_format_instructions()}
)

llm_chain = LLMChain(llm=llm, prompt=map_prompt)

# Pick the chunks chosen by the clustering step (selected_indices holds positions in docs)
selected_docs = [docs[idx] for idx in selected_indices]

In [28]:
# Wrap the per-chunk LLM chain so it can be run on a list of documents;
# the document content is supplied to the prompt through its "text" variable.
combine_documents_chain = StuffDocumentsChain(
    document_variable_name="text",
    llm_chain=llm_chain,
)

In [29]:
# Collect one parsed summary per selected chunk
summary_list = []

# Iterate over the selected docs directly; no positional index is needed
for doc in selected_docs:

    # Run the prompt on this single chunk (the chain is called with a list of documents)
    chunk_summary = combine_documents_chain.run([doc])
    # Parse the raw LLM text into a MainContent instance
    output = output_parser.parse(chunk_summary)
    # Append the parsed summary to the list
    summary_list.append(output)

In [30]:
# Display the parsed MainContent results, one per selected chunk
summary_list

[MainContent(main_content="OpenAI's goal and evolution", sentence='The goal of open AI from the very beginning has been to make sure that artificial general intelligence by which remain autonomous systems, AI that can actually do most of the jobs and activities and tasks that people do, benefits all of humanity.'),
 MainContent(main_content="OpenAI's goal of benefiting humanity through artificial general intelligence", sentence='The goal of open AI from the very beginning has been to make sure that artificial general intelligence by which remain autonomous systems, AI that can actually do most of the jobs and activities and tasks that people do, benefits all of humanity.'),
 MainContent(main_content='Role of open source models in the ecosystem', sentence="Well, open source is complicated. I'll describe to you my mental picture. I think that in the near term, open source is just helping companies produce useful."),
 MainContent(main_content='The concept of super alignment and the import