In [1]:
import os
os.environ["OPENAI_API_KEY"] = "GET YOUR OWN KEY"

In [2]:
!ls md_transcripts

01. How Your Nervous System Works & Changes.md
02. Master Your Sleep & Be More Alert When Awake.md
03. Using Science to Optimize Sleep, Learning & Metabolism.md
04. Find Your Temperature Minimum to Defeat Jetlag, Shift Work & Sleeplessness .md
05. Understanding and Using Dreams to Learn and to Forget.md
06. How to Focus to Change Your Brain.md
07. Using Failures, Movement & Balance to Learn Faster.md
08. Optimize Your Learning & Creativity with Science-based Tools.md
09. Control Pain & Heal Faster with Your Brain.md
10. Tools for Managing Stress & Anxiety.md
11. How Foods and Nutrients Control Our Moods.md
12. How to Increase Motivation & Drive.md
13. The Science of Emotions & Relationships.md
14. Biological Influences on Sex, Sex Differences & Preferences.md
15. The Science of How to Optimize Testosterone & Estrogen.md
16. How Our Hormones Control Our Hunger, Eating & Satiety.md
17. How to Control Your Metabolism by Thyroid & Growth Hormone.md
18. Using Cortisol & Adrenaline to Boost 

In [6]:
from langchain import __version__
print(__version__)

0.0.161


In [7]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate

In [8]:
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
import re

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
def count_tokens(text):
    return len(encoding.encode(text))

class TranscriptLoader(object):

    def __init__(self, path, chunk_size=1000, 
                 chunk_overlap=None, model_name="gpt-3.5-turbo"):
        self.path = path
        self.chunk_size = chunk_size
        if chunk_overlap is None:
            chunk_overlap = chunk_size // 2
        self.chunk_overlap = chunk_overlap
        self.model_name = model_name
        self.markdown_text = None
        self.texts = None
        self.title = None
        self.metadatas = None

    def parse_text(self):
        """
        Load the text from the markdown file, parse it into 
        individual chunks, and parse metadata for each chunk.
        Return list of Documents.
        """
        with open(self.path, 'r') as f:
            text = f.read()
        self.markdown_text = text
        self.texts = re.findall(r'</summary>(.*?)</details>', 
                                self.markdown_text, re.DOTALL)
        match = re.search(r'\*\*(.*?)\*\*', text)
        if match:
            self.title = match.group(1)
        else:
            self.title = ""
        chunk_summaries = re.findall(r'<summary>(.*?)</summary>', 
                                     self.markdown_text, re.DOTALL)
        self.metadatas = [{
                            "title": self.title,
                            "summary": summary,
                            "source": f"{self.title} {summary}"
                          }
                          for summary in chunk_summaries]
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_size // 2,
            length_function=count_tokens,
        )
        return splitter.create_documents(self.texts, self.metadatas)
        
class DocSearch(object):
    """
    Given a path to a folder of markdown files, load the files,
    parse them into chunks, and index them for search.
    """
    def __init__(self, path, chunk_size=1000, db_persist_dir=None):
        self.path = path
        self.chunk_size = chunk_size
        self.db_persist_dir = db_persist_dir
        self.documents = []
        self.docstore = None
        self.vectorstore = None
        self.embeddings = None
        self.splitter = None
        self.searcher = None

    def load_documents(self):
        """
        Load the documents from the markdown files.
        """
        self.documents = []
        for filename in os.listdir(self.path):
            if filename.endswith(".md"):
                loader = TranscriptLoader(os.path.join(self.path, filename))
                loader.parse_text()
                self.documents.extend(loader.parse_text())
        print(f"Loaded {len(self.documents)} documents.")
    
    def index_documents(self):
        """
        Index the documents for search.
        """
        embeddings = OpenAIEmbeddings()
        # try loading the persisted database from disk first
        if self.db_persist_dir and os.path.exists(self.db_persist_dir):
            self.db = Chroma(persist_directory=self.db_persist_dir,
                             embedding_function=embeddings)
            return
        # otherwise, index the documents
        if self.documents is None:
            self.load_documents()
        self.db = Chroma.from_documents(self.documents, embeddings, 
            persist_directory=self.db_persist_dir)
        if self.db_persist_dir:
            self.db.persist()

    def search(self, query, k=4, search_type="mmr"):
        """
        Search the documents for the query.
        Parameters:
            query: str
                The query to search for.
            n: int
                The number of results to return.
            search_type: str
                The type of search to perform. Options are:
                    "mmr": Maximal Marginal Relevance
                    "similarity": Similarity search
        """
        if self.db is None:
            self.index_documents()
        return self.db.search(query, k=k, search_type=search_type)

        
# docsearch = DocSearch("md_transcripts", db_persist_dir="db")
docsearch = DocSearch("md_transcripts", chunk_size=480)
docsearch.load_documents()
docsearch.index_documents()
# docsearch.search("What are some nootropics that would help me learn a language faster?")
        

Loaded 2203 documents.


Using embedded DuckDB without persistence: data will be transient


In [9]:
foo = docsearch.search("How would caffeine, alpha GPC, L-tyrosine, Adderall, and Modafinil\
 affect my ability to learn a language?", k=20)

print(sum([count_tokens(doc.page_content) for doc in foo]))

11278


In [16]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import StdOutCallbackHandler

class QA(object):

    # def __init__(self, docsearch: DocSearch):
    #     self.docsearch = docsearch
    #     # create a chain of the following:
    #     # 1. pull relevant documents from the database
    #     # 2. preprocess them a little bit - ask to remove superfluous stuff
    #     # not relevant to the question
    #     # 3. sort them in order of most to least relevant
    #     # 4. stuff them into the prompt of the GPT-3.5 model
    #     assert False

    # def answer(self, question, k=4):
    #     # search the database for relevant documents
    #     docs = self.docsearch.search(question, k=k)
    #     # stuff them into the prompt of the GPT-3.5 model
    #     # return the answer
    #     assert False

    def __init__(self, docsearch: DocSearch, model_name="gpt-3.5-turbo",
                 max_tokens_limit=4097):
        self.docsearch = docsearch
        self.chain = RetrievalQAWithSourcesChain.from_chain_type(
            OpenAI(
            # ChatOpenAI(
                # model_name="gpt-3.5-turbo",
                model_name=model_name,
                # model_name="gpt-4-0314",
                temperature=0,
            ), 
            # chain_type="map_reduce",
            chain_type="stuff",
            retriever=docsearch.db.as_retriever(),
            # reduce_k_below_max_tokens=False,
            reduce_k_below_max_tokens=True,
            max_tokens_limit=max_tokens_limit,
            verbose=True,
        )

    def answer(self, question):
        return self.chain({
            "question": question,
        }, callbacks=[StdOutCallbackHandler()])
    
# qa = QA(docsearch, "text-davinci-003")
qa = QA(docsearch, "gpt-3.5-turbo")
# qa = QA(docsearch, "davinci", max_tokens_limit=2048)
result = qa.answer("What are some nootropics that would \
help me learn a language faster? Please include an \
explanation for each one.")
print(result["answer"])
print(result["sources"])





[1m> Entering new RetrievalQAWithSourcesChain chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following extracted parts of a long document and a question, create a final answer with references ("SOURCES"). 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: Which state/country's law governs the interpretation of the contract?
Content: This Agreement is governed by English law and the parties submit to the exclusive jurisdiction of the English courts in  relation to any dispute (contractual or non-contractual) concerning this Agreement save that either party may apply to any court for an  injunction or other relief to protect its Intellectual Property Rights.
Source: 28-pl
Content: No Waiver. Failure or delay in exercising any right or remedy under this Agreement shall not const

In [24]:
docsearch.search("The acetylcholine stimulation traditionally comes from Coleen donors or alpha GPC, things of that sort. And then you would want to have some sort of off switch, because anything that's going to really stimulate your alertness, that then provides a crash.", search_type="similarity")

[Document(page_content="traditionally from caffeine stimulation. The acetylcholine stimulation traditionally comes from Coleen donors or alpha GPC, things of that sort. And then you would want to have some sort of off switch, because anything that's going to really stimulate your alertness, that then provides a crash. That crash is not a crash into the deep kind of restful slumber that you would want for learning, it's a crash into the kind of, let's just call it lopsided sleep, meaning it's deep sleep but it lacks certain spindles and other elements of the physiology sleep spindles, that really engage the learning process and the reconfiguration of synopsis. So right now, my stance on nootropics is that maybe, maybe for occasional use, provided it's safe for you, I'm not recommending it, but in general it tends to use more of a shotgun approach than is probably going to be useful for learning and memory in the long run. A lot of people ask about Modafinil or armodafinil which was desi

In [17]:
import pprint

pprint.pprint(result["answer"])
pprint.pprint(result["sources"])

('There are several nootropics that may help with learning a language faster, '
 'including caffeine for increased focus, alpha GPC for increased '
 'acetylcholine, and L-tyrosine for increased dopamine. However, it is '
 'important to approach these with caution and to prioritize sleep and deep '
 'rest for optimal learning and plasticity. There is no specific pill or '
 'chemical that can allow for instant language learning, and it is important '
 'to focus on behavioral tools and structured learning bouts. \n')
('Using Science to Optimize Sleep, Learning & Metabolism | Huberman Lab '
 'Podcast #3 (00:54:05) Smart Drugs, Using Failures, Movement & Balance to '
 'Learn Faster | Huberman Lab Podcast #7 (1:19:25) Learning French and Other '
 'Things Faster')


In [20]:
result = qa.answer("What is L-tyrosine, what does it do, and how does it work?")
# result = qa.answer("What is L-tyrosine?")
pprint.pprint(result["answer"])
pprint.pprint(result["sources"])



[1m> Entering new RetrievalQAWithSourcesChain chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following extracted parts of a long document and a question, create a final answer with references ("SOURCES"). 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: Which state/country's law governs the interpretation of the contract?
Content: This Agreement is governed by English law and the parties submit to the exclusive jurisdiction of the English courts in  relation to any dispute (contractual or non-contractual) concerning this Agreement save that either party may apply to any court for an  injunction or other relief to protect its Intellectual Property Rights.
Source: 28-pl
Content: No Waiver. Failure or delay in exercising any right or remedy under this Agreement shall not const

In [25]:
foo = qa.answer("Why is it so important to have sunlight exposure in the morning?")
pprint.pprint(foo["answer"])
pprint.pprint(foo["sources"])



[1m> Entering new RetrievalQAWithSourcesChain chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following extracted parts of a long document and a question, create a final answer with references ("SOURCES"). 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: Which state/country's law governs the interpretation of the contract?
Content: This Agreement is governed by English law and the parties submit to the exclusive jurisdiction of the English courts in  relation to any dispute (contractual or non-contractual) concerning this Agreement save that either party may apply to any court for an  injunction or other relief to protect its Intellectual Property Rights.
Source: 28-pl
Content: No Waiver. Failure or delay in exercising any right or remedy under this Agreement shall not const

In [21]:
pprint.pprint(result["sources"])

''


In [None]:
import pprint

pprint.pprint(result["answer"])


('There are some nootropics that can increase focus and acetylcholine '
 'stimulation, such as caffeine, Coleen donors, and alpha GPC, but there is no '
 'specific pill or chemical that will allow one to download more information '
 'more quickly. Most nootropics tend to bundle a bunch of things together, '
 'including stimulants and acetylcholine stimulators, but they tend to use '
 'more of a shotgun approach than is probably going to be useful for learning '
 'and memory in the long run. Modafinil or armodafinil can improve learning '
 'and memory, but they are expensive and designed as stimulants for the '
 'treatment of narcolepsy. The use of nootropics should be approached with '
 'caution and occasional use, if safe for the individual. \n')


In [133]:
qa.answer("What are some nootropics that would \
help me learn a language faster?")

{'question': 'What are some nootropics that would help me learn a language faster?',
 'answer': ' Nootropics that may help with learning a language faster include caffeine, alpha GPC, L-tyrosine, Adderall, and Modafinil. However, these should be approached with caution and only used occasionally.\n',
 'sources': 'Using Failures, Movement & Balance to Learn Faster | Huberman Lab Podcast #7 (1:19:25) Learning French and Other Things Faster\nUsing Science to Optimize Sleep, Learning & Metabolism | Huberman Lab Podcast #3 (00:54:05) Smart Drugs'}

In [132]:
import pprint
result = qa.answer("How would caffeine, alpha GPC, L-tyrosine, Adderall, and Modafinil\
 affect my ability to learn a language?")

pprint.pprint(result["answer"])
pprint.pprint(result["sources"])

InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 4391 tokens (4135 in your prompt; 256 for the completion). Please reduce your prompt; or completion length.

In [128]:

for d in docsearch.search("What is Alpha GPC?"):
    print(count_tokens(d.page_content))

sum([count_tokens(d.page_content) for d in docsearch.search("What is Alpha GPC?")])
# sum([count_tokens(str(d.metadata)) for d in docsearch.search("What is Alpha GPC?")])

978
621
514
525


2638

In [131]:
import pprint

pprint.pprint(docsearch.search("What is Alpha GPC?")[0].metadata)

{'source': 'How to Learn Skills Faster | Huberman Lab Podcast #20 (01:39:00) '
           'Ingestible Compounds That Support Skill Learning: Motivation, '
           'Repetitions, Alpha-GPC',
 'summary': '(01:39:00) Ingestible Compounds That Support Skill Learning: '
            'Motivation, Repetitions, Alpha-GPC',
 'title': 'How to Learn Skills Faster | Huberman Lab Podcast #20'}


In [117]:
result = qa.answer("What is Alpha GPC?")
print(result["answer"])
print(result["sources"])

InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 4541 tokens (4285 in your prompt; 256 for the completion). Please reduce your prompt; or completion length.

In [89]:
docs = docsearch.search("What are some nootropics that would help me learn a language faster?")

docs[0].metadata["summary"]

 Document(page_content="\n-\n  \nWhich brings us to the next thing about learning and plasticity which is nootropics, AKA smart drugs. [sighs] This is a big topic that sigh was a sigh of concern about how to address nootropics in a thorough enough, but thoughtful enough way. Look, I have a lot of thoughts about nootropics. First of all, it means smart drugs, I believe. And I don't like that phrase because let's just take a step back and think about exercise. You just say, I want to be more physically fit. What does that mean? Does it mean I would ask for more specificity, I'd say, Do you want to be stronger? Okay, maybe you need to lift heavier objects progressively. Do you want more endurance very different protocol to access endurance. Do you want flexibility? Do you want explosiveness or suppleness? Huge range of things that we call physical fitness. Maybe you want all of those. If we were talking about emotional fitness we would say, well, inability to feel empathy but probably als

In [59]:
# </details>

# <details>

from langchain.text_splitter import CharacterTextSplitter

markdown_path = "md_transcripts/01. How Your \
Nervous System Works & Changes.md"
with open(markdown_path) as f:
    markdown_text = f.read()

separator_str = "</details>\n\n<details>"

splitter = CharacterTextSplitter(chunk_size=1000,
                                         chunk_overlap=100,
                                         separator=separator_str)
# docs = splitter.create_documents([markdown_text])
texts = splitter.split_text(markdown_text)

embeddings = OpenAIEmbeddings()

Created a chunk of size 5267, which is longer than the specified 1000
Created a chunk of size 3991, which is longer than the specified 1000
Created a chunk of size 2007, which is longer than the specified 1000
Created a chunk of size 2573, which is longer than the specified 1000
Created a chunk of size 1121, which is longer than the specified 1000
Created a chunk of size 3077, which is longer than the specified 1000
Created a chunk of size 1054, which is longer than the specified 1000
Created a chunk of size 1638, which is longer than the specified 1000
Created a chunk of size 1105, which is longer than the specified 1000
Created a chunk of size 2601, which is longer than the specified 1000
Created a chunk of size 3258, which is longer than the specified 1000
Created a chunk of size 1047, which is longer than the specified 1000
Created a chunk of size 4688, which is longer than the specified 1000
Created a chunk of size 3415, which is longer than the specified 1000
Created a chunk of s

In [60]:
# from langchain.text_splitter import MarkdownTextSplitter

# markdown_path = "md_transcripts/01. How Your \
# Nervous System Works & Changes.md"
# with open(markdown_path) as f:
#     markdown_text = f.read()

# markdown_splitter = MarkdownTextSplitter(chunk_size=10**4,
#                                          chunk_overlap=100)
# docs = markdown_splitter.create_documents([markdown_text])

# embeddings = OpenAIEmbeddings()

In [70]:
print(texts[0] + texts[1])

**How Your Nervous System Works & Changes | Huberman Lab Podcast #1**

[**https://www.youtube.com/watch?v=H-XfCl-HpRM&t=2s**](https://www.youtube.com/watch?v=H-XfCl-HpRM&t=2s)

**TRANSCRIPT**

<details>
<summary>(00:00) Introduction</summary>
-
    
Welcome to the Huberman Lab Podcast where we discuss science and science-based tools for everyday life. (upbeat guitar music) I'm Andrew Huberman and I'm a professor of neurobiology and ophthalmology at Stanford School of Medicine. For today's podcast we're going to talk about the parts list of the nervous system. Now that might sound boring, but these are the bits and pieces that together make up everything about your experience of life, from what you think about to what you feel, what you imagine, and what you accomplish from the day you're born until the day you die. That parts list is really incredible because it has a history associated with it that really provides a window into all sorts of things like engineering, warfare, religion, 

In [62]:
docsearch = Chroma.from_texts(texts, embeddings, 
    metadatas=[{"source": str(i)} for i in range(len(texts))]
)

Using embedded DuckDB without persistence: data will be transient


In [63]:
query = "What is Deja Vu?"
docs = docsearch.similarity_search(query)
docs

[Document(page_content="<summary>(08:55) Deja Vu</summary>\n-\n    \nBut the language of the nervous system is just electricity. It's just like a Morse code of some sort or the syllables and words and consonants and vowels of language. It just depends on how they're assembled, what order. And so that brings us to the issue of how the nervous system works. The way to think about how the nervous system works is that our experiences, our memories, everything is sort of like the keys on a piano being played in a particular order, right? If I play the keys on a piano in a particular order and with a particular intensity, that's a given song. We can make that analogous to a given experience. It's not really that the key, you know, A sharp or E flat is the song. It's just one component of the song. So when you hear that, you know, for instance, there's a brain area called the hippocampus, which there is, that's involved in memory. Well, it's involved in memory, but it's not that memories are 

In [64]:
len(texts), len(docs)

(20, 4)

In [68]:
sum([len(t) for t in texts])/len(texts)

3212.35

In [65]:
markdown_text.count("<details>")

21

In [66]:
print(texts[0])
print("=====================================")
print(texts[1])

**How Your Nervous System Works & Changes | Huberman Lab Podcast #1**

[**https://www.youtube.com/watch?v=H-XfCl-HpRM&t=2s**](https://www.youtube.com/watch?v=H-XfCl-HpRM&t=2s)

**TRANSCRIPT**

<details>
<summary>(00:00) Introduction</summary>
-
    
Welcome to the Huberman Lab Podcast where we discuss science and science-based tools for everyday life. (upbeat guitar music) I'm Andrew Huberman and I'm a professor of neurobiology and ophthalmology at Stanford School of Medicine. For today's podcast we're going to talk about the parts list of the nervous system. Now that might sound boring, but these are the bits and pieces that together make up everything about your experience of life, from what you think about to what you feel, what you imagine, and what you accomplish from the day you're born until the day you die. That parts list is really incredible because it has a history associated with it that really provides a window into all sorts of things like engineering, warfare, religion, 

In [4]:
from langchain.document_loaders import UnstructuredMarkdownLoader

markdown_path = "md_transcripts/01. How Your Nervous System Works & Changes.md"
loader = UnstructuredMarkdownLoader(markdown_path)

data = loader.load()
print(data)

[nltk_data] Downloading package punkt to /Users/john/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/john/nltk_data...


[Document(page_content='How Your Nervous System Works & Changes | Huberman Lab Podcast #1\n\nhttps://www.youtube.com/watch?v=H-XfCl-HpRM&t=2s\n\nTRANSCRIPT\n\nWritten with StackEdit.', metadata={'source': 'md_transcripts/01. How Your Nervous System Works & Changes.md'})]


[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [10]:
from langchain.indexes import VectorstoreIndexCreator
index = VectorstoreIndexCreator().from_loaders([loader])

Using embedded DuckDB without persistence: data will be transient


In [11]:
query = "What is this episode about?"
index.query(query)

Chroma collection langchain contains fewer than 4 elements.
Chroma collection langchain contains fewer than 3 elements.
Chroma collection langchain contains fewer than 2 elements.


' This episode is about how the nervous system works and changes.'