In [6]:
import os

os.environ["OPENAI_API_KEY"] = ''
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS 

In [7]:
# Splitting up the text into smaller chunks for indexing
text_splitter = CharacterTextSplitter(        
    separator = "\n",
    chunk_size = 1000,
    chunk_overlap  = 200, #striding over the text
    length_function = len,
)

with open('transcript.txt','r') as f:
    transcript = f.read()
    texts = text_splitter.split_text(transcript)


In [8]:
embeddings = OpenAIEmbeddings()
docsearch = FAISS.from_texts(texts, embeddings)
docsearch.embedding_function

<bound method OpenAIEmbeddings.embed_query of OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base='', openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization='', allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=6, request_timeout=None, headers=None, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False)>

In [9]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
# set up FAISS as a generic retriever 
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":4})

# create the chain to answer questions 
rqa = RetrievalQA.from_chain_type(llm=OpenAI(), 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)


In [12]:
query = "What is the main content of the transcript and summarize it using the the timestamps of each important section in the transcript"
rqa(query)['result']

' The transcript is about Artificial Intelligence (AI) and the progress it has made. It discusses how AI will affect us in the future and how people are working together to ensure that AI will never go rogue. It begins from 0:00:04 to 0:02:57 discussing the progress of AI and how it is becoming widespread. From 0:02:58 to 0:10:41, the speaker talks about how he got into AI and how he is working to ensure that AI will not go rogue. From 0:10:43 to 0:09:37 the speaker discusses the excitement and investment in the AI field.'

In [13]:
import os
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List

# Define a new Pydantic model with field descriptions and tailored for Twitter.
class TranscriptionChapter(BaseModel):
    summarization: str = Field(description="A short summarization of the transcript")
    chapter: List[str] = Field(description="List of main contents in the transcript")
    time_stamps: List[str] = Field(description="List of timestamps based on the main contents")
    


In [21]:
# Instantiate the parser with the new model.
output_parser = PydanticOutputParser(pydantic_object=TranscriptionChapter)

# Update the prompt to match the new query and desired format.
prompt = ChatPromptTemplate(
    messages=[
        HumanMessagePromptTemplate.from_template(
            "{question} in chapters and also mark the chapter with the correct timestamps from the transcript"
        )
    ],
    input_variables=['question'],
    partial_variables={
        "format_instructions": output_parser.get_format_instructions(),
    },
)

llm = ChatOpenAI(temperature=0, openai_api_key=os.environ['OPENAI_API_KEY'])

_input = prompt.format_prompt(question='Summarize the transcript')

# create the chain to answer questions 
rqa = RetrievalQA.from_chain_type(llm=llm, 
                                  chain_type="stuff", 
                                  retriever=retriever,
                                  return_source_documents=True)
type(_input.to_messages())
#rqa(_input.to_messages())['result']

list