# Setup and Initial Data Handling

### Hyperparameters

In [None]:
import torch

# Notebook-wide training configuration.
# batch_size / block_size are reassigned by later cells before they are used.
batch_size, block_size = 32, 8
# NOTE(review): max_iter, eval_interval, eval_iters, learning_rate and device are
# not referenced by the cells shown below - presumably reserved for a fuller
# training/evaluation loop; confirm.
max_iter, eval_interval, eval_iters = 3000, 100, 200
learning_rate = 1e-2
# Pick the GPU when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## Import the data 

In [74]:
import pandas as pd

# Read the single parquet shard of the dataset through the hf:// URL.
# NOTE(review): this needs network access and presumably an installed
# filesystem handler for the hf:// scheme - confirm the environment.
df = pd.read_parquet(
    "hf://datasets/dexaai/huberman_on_exercise/data/data-00000-of-00001-8e5e40fbf9236004.parquet"
)
# Preview the first rows.
df.head()

Unnamed: 0,id,embedding,metadata,document
0,chunk_57169,"[-0.03923165, -0.0016832227, 0.014283108, -0.0...","{'chunkTitle': 'Muscle Hypertrophy', 'episodeT...",Andrew Huberman: And that leads me to a questi...
1,chunk_57178,"[-0.02918509, -0.018408643, 0.017951787, -0.03...",{'chunkTitle': 'Hypertrophy Training Overview'...,Andrew Huberman: So as you point out before an...
2,chunk_57177,"[-0.030115174, -0.01321885, 0.02527428, -0.005...","{'chunkTitle': 'Fitness and Hypertrophy', 'epi...",Andy Galpin: Sure. So we have a lot less infor...
3,chunk_57176,"[-0.028269125, 0.006626837, 0.022055835, -0.00...","{'chunkTitle': 'Cardio and Hypertrophy', 'epis...",Andrew Huberman: How do other forms of exercis...
4,chunk_57175,"[-0.025013695, 0.00445241, 0.029148076, -0.009...","{'chunkTitle': 'Indicators and Adjustments', '...",Andy Galpin: You could also look at things lik...


## Remove Embedding and Preprocess Data

### Clean data

In [75]:
# Drop the precomputed embedding vectors; the cells below only read 'document'.
df = df.drop(labels='embedding', axis='columns')

import re
# Optional punctuation stripping (would keep digits and letter case).
# It is left disabled, so punctuation stays in the corpus.
# df['document'] = df['document'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df.head()

Unnamed: 0,id,metadata,document
0,chunk_57169,"{'chunkTitle': 'Muscle Hypertrophy', 'episodeT...",Andrew Huberman: And that leads me to a questi...
1,chunk_57178,{'chunkTitle': 'Hypertrophy Training Overview'...,Andrew Huberman: So as you point out before an...
2,chunk_57177,"{'chunkTitle': 'Fitness and Hypertrophy', 'epi...",Andy Galpin: Sure. So we have a lot less infor...
3,chunk_57176,"{'chunkTitle': 'Cardio and Hypertrophy', 'epis...",Andrew Huberman: How do other forms of exercis...
4,chunk_57175,"{'chunkTitle': 'Indicators and Adjustments', '...",Andy Galpin: You could also look at things lik...


### Tokenize Data

In [76]:
# Pull every transcript chunk out of the frame as a plain Python list,
# so the documents can be concatenated in the next cell.
corpus = df['document'].to_list()
# Peek at the first five documents.
corpus[0:5]

["Andrew Huberman: And that leads me to a question that is based on findings that I've heard discussed on social media, which means very little, if anything, unless it's in the context of people who really know exercise science. And you're one such person. And that's this idea that because resistance training can evoke a protein synthesis adaptation response, but that adaptation response lasts about 48 hours before it starts to taper off. That the ideal. In quotes, frequency for training a given muscle group for hypertrophy is about every 48 hours. Is that true?\n\nAndy Galpin: Yes and no. So a couple of things there remember, in order to grow a muscle, there's multiple steps here. So you have the signaling response, which actually happens within seconds of exercise and can last, depending on the marker, up to an hour or 2 hours. Step number two then is gene expression, and we see that that's typically peaked around two to 6 hours post exercise. And then you have following that protein

In [77]:
# Flatten the corpus into one long string, one space between documents.
text = ' '.join(corpus)
# Total number of characters in the combined corpus.
print(len(text))

812479


In [None]:
# Character-level vocabulary: every distinct character of the corpus, sorted so
# that the character -> index mapping built from it is deterministic.
# sorted() accepts any iterable and returns a list, so no intermediate list() is needed.
vocabulary = sorted(set(text))
vocabulary_size = len(vocabulary)

print('Vocabulary Size: ', vocabulary_size) # The number of unique characters in the text
print('Vocabulary in text: ', ''.join(vocabulary))

Vocabulary Size: 76
Vocabulary in text:  
 "$%',-./0123456789:?@ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz£


### Text Vectorization

#### Convert tokens into numerical representations, starting with:

- Bag-of-Words (BoW)
- TF-IDF


In [108]:
import tiktoken
# Available encodings can be listed with tiktoken.list_encoding_names().
encoder = tiktoken.get_encoding('o200k_base')
# Round-trip smoke test: decoding the encoded sample should print it back unchanged.
# NOTE(review): `encoder` is not used by the character-level cells shown below.
sample_ids = encoder.encode("hello world")
print(encoder.decode(sample_ids))

hello world


In [None]:
# Character <-> integer lookup tables; indices follow the sorted order of `vocabulary`.
stoi = {c: i for i, c in enumerate(vocabulary)}
itos = dict(enumerate(vocabulary))


def encode(x):
    """Return the list of vocabulary indices for the characters of string ``x``.

    Raises:
        KeyError: if ``x`` contains a character that is not in ``vocabulary``.
    """
    return [stoi[c] for c in x]


def decode(s):
    """Return the string spelled out by the iterable of vocabulary indices ``s``.

    Raises:
        KeyError: if an index in ``s`` is not in the vocabulary.
    """
    return ''.join(itos[i] for i in s)


# Round trip: decoding an encoded string gives the original string back.
print('Encoded:', encode("andrew huberman"))
print('Decoded:', decode(encode("andrew huberman")))

Encoded: [49, 62, 52, 66, 53, 71, 1, 56, 69, 50, 53, 66, 61, 49, 62]
Decoded: andrew huberman


In [None]:
# Encode the entire text with torch
import torch

# Character indices of the whole corpus as one 64-bit integer tensor.
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.shape, data.type())
# Show the first 1000 indices.
print(data[0:1000])

torch.Size([812479]) torch.LongTensor
tensor([23, 62, 52, 66, 53, 71,  1, 30, 69, 50, 53, 66, 61, 49, 62, 20,  1, 23,
        62, 52,  1, 68, 56, 49, 68,  1, 60, 53, 49, 52, 67,  1, 61, 53,  1, 68,
        63,  1, 49,  1, 65, 69, 53, 67, 68, 57, 63, 62,  1, 68, 56, 49, 68,  1,
        57, 67,  1, 50, 49, 67, 53, 52,  1, 63, 62,  1, 54, 57, 62, 52, 57, 62,
        55, 67,  1, 68, 56, 49, 68,  1, 31,  5, 70, 53,  1, 56, 53, 49, 66, 52,
         1, 52, 57, 67, 51, 69, 67, 67, 53, 52,  1, 63, 62,  1, 67, 63, 51, 57,
        49, 60,  1, 61, 53, 52, 57, 49,  6,  1, 71, 56, 57, 51, 56,  1, 61, 53,
        49, 62, 67,  1, 70, 53, 66, 73,  1, 60, 57, 68, 68, 60, 53,  6,  1, 57,
        54,  1, 49, 62, 73, 68, 56, 57, 62, 55,  6,  1, 69, 62, 60, 53, 67, 67,
         1, 57, 68,  5, 67,  1, 57, 62,  1, 68, 56, 53,  1, 51, 63, 62, 68, 53,
        72, 68,  1, 63, 54,  1, 64, 53, 63, 64, 60, 53,  1, 71, 56, 63,  1, 66,
        53, 49, 60, 60, 73,  1, 59, 62, 63, 71,  1, 53, 72, 53, 66, 51, 57, 67,
  

In [None]:
# Hold out the last 10% of the encoded corpus for validation; the first 90% is for training.
train_size = int(len(data) * 0.9)
train_data, val_data = data[:train_size], data[train_size:]
print('Train size:', len(train_data))
print('Val size:', len(val_data))

Train size: 731231
Val size: 81248


In [114]:
# Context length used for the walkthrough in the next cell.
block_size = 8
# First `block_size` indices of the training split.
train_data[0:block_size]

tensor([23, 62, 52, 66, 53, 71,  1, 30])

In [115]:
# y is x shifted one position to the right, so y[t] is the index that follows x[:t + 1].
x, y = train_data[:block_size], train_data[1:block_size + 1]
# One chunk of block_size indices yields block_size (context, target) training pairs.
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"when input is {context} the target is {target}")

when input is tensor([23]) the target is 62
when input is tensor([23, 62]) the target is 52
when input is tensor([23, 62, 52]) the target is 66
when input is tensor([23, 62, 52, 66]) the target is 53
when input is tensor([23, 62, 52, 66, 53]) the target is 71
when input is tensor([23, 62, 52, 66, 53, 71]) the target is 1
when input is tensor([23, 62, 52, 66, 53, 71,  1]) the target is 30
when input is tensor([23, 62, 52, 66, 53, 71,  1, 30]) the target is 69


In [122]:
# Seed the RNG so the batch sampled below is reproducible.
torch.manual_seed(47)
# batch_size: number of sequences in a batch, processed in parallel.
# block_size: length of a sequence, i.e. the max context length for predictions.
batch_size, block_size = 4, 8

def get_batch(split):
    """Sample a random batch of input/target sequences from one data split.

    Args:
        split: 'train' reads from ``train_data``; any other value reads from
            ``val_data``.

    Returns:
        A pair ``(x, y)`` of stacked tensors with ``batch_size`` rows of
        ``block_size`` indices each. ``y`` is ``x`` shifted one position to the
        right, so ``y[b, t]`` is the index that follows ``x[b, :t + 1]``.
    """
    source = train_data if split == 'train' else val_data
    # The upper bound leaves room for a full block plus the one-step-ahead target.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = [source[s:s + block_size] for s in starts]
    targets = [source[s + 1:s + block_size + 1] for s in starts]
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show the inputs next to their shifted targets.
xb, yb = get_batch('train')
print('Inputs: ', xb.shape)
print(xb)
print('Targets: ', yb.shape)
print(yb)

print('------')

# Spell out every (context, target) pair contained in the batch.
for b in range(batch_size):
    for t in range(block_size):
        context, target = xb[b, :t + 1], yb[b, t]
        print(f"when input is {context.tolist()} the target is {target}")
    


Inputs:  torch.Size([4, 8])
tensor([[63, 66, 59,  1, 63, 66,  1, 67],
        [68, 63,  1, 52, 63,  1, 68, 56],
        [53,  1, 49,  1, 67, 53, 62, 67],
        [68, 53, 57, 62,  1, 49, 62, 52]])
Targets:  torch.Size([4, 8])
tensor([[66, 59,  1, 63, 66,  1, 67, 57],
        [63,  1, 52, 63,  1, 68, 56, 66],
        [ 1, 49,  1, 67, 53, 62, 67, 53],
        [53, 57, 62,  1, 49, 62, 52,  1]])
------
when input is [63] the target is 66
when input is [63, 66] the target is 59
when input is [63, 66, 59] the target is 1
when input is [63, 66, 59, 1] the target is 63
when input is [63, 66, 59, 1, 63] the target is 66
when input is [63, 66, 59, 1, 63, 66] the target is 1
when input is [63, 66, 59, 1, 63, 66, 1] the target is 67
when input is [63, 66, 59, 1, 63, 66, 1, 67] the target is 57
when input is [68] the target is 63
when input is [68, 63] the target is 1
when input is [68, 63, 1] the target is 52
when input is [68, 63, 1, 52] the target is 63
when input is [68, 63, 1, 52, 63] the targ

In [123]:
print(xb)

tensor([[63, 66, 59,  1, 63, 66,  1, 67],
        [68, 63,  1, 52, 63,  1, 68, 56],
        [53,  1, 49,  1, 67, 53, 62, 67],
        [68, 53, 57, 62,  1, 49, 62, 52]])


In [136]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token scores are looked up from the current token alone.

    The only parameter is a ``vocabulary_size x vocabulary_size`` embedding
    table; the row for token ``i`` holds the logits for the token after ``i``.
    """

    def __init__(self, vocabulary_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocabulary_size, vocabulary_size)

    def forward(self, idx, targets=None):
        """Score the next token at every position of ``idx``.

        Args:
            idx: tensor of token indices, shape (B, T).
            targets: optional tensor of next-token indices holding B * T elements.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape (B, T, C)
            and ``loss`` is None. With targets, ``logits`` is flattened to
            (B * T, C) and ``loss`` is the cross-entropy against ``targets``.
        """
        logits = self.token_embedding_table(idx)
        if targets is None:
            return logits, None

        batch, time, channels = logits.shape
        # F.cross_entropy takes (N, C) scores and (N,) class indices,
        # so the batch and time dimensions are folded together.
        logits = logits.view(batch * time, channels)
        loss = F.cross_entropy(logits, targets.view(batch * time))
        return logits, loss

    def generation(self, idx, max_new_tokens):
        """Extend ``idx`` of shape (B, T) by ``max_new_tokens`` sampled tokens.

        Returns the (B, T + max_new_tokens) tensor of the given context followed
        by the sampled continuation.
        """
        for _ in range(max_new_tokens):
            # Only the logits are needed; no targets are passed, so the loss is None.
            logits = self(idx)[0]
            # Keep the scores of the last time step only -> (B, C).
            last_step = logits[:, -1, :]
            # Turn scores into probabilities and draw one token per sequence.
            probs = F.softmax(last_step, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # Append the sampled token to the running sequence -> (B, T + 1).
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


# Run the untrained model on the sample batch.
model = BigramLanguageModel(vocabulary_size)
logits, loss = model(xb, yb)
print(logits.shape)
# Reference: a uniform guess scores -ln(1/vocabulary_size) = -ln(1/76) = 4.3307
print(loss)

# Sample 100 new tokens from the untrained model, starting from a single index 0.
print(
    decode(
        model.generation(
            idx=torch.zeros((1, 1), dtype=torch.long),
            max_new_tokens=100,
        )[0].tolist()
    )
)

torch.Size([32, 76])
tensor(4.9399, grad_fn=<NllLossBackward0>)

@IDh?MJoCJwVsrbMqbJq£gQMGUMFPAQVj?UpX£w0QpF.$xunu3BlM8qt8Vzy$9xJCPl2"MI'sxXY%I:SHbGR%q@D3U'd,w?dYzww


In [138]:
# Adam optimizer over all model parameters, learning rate 0.001.
# NOTE(review): the `learning_rate` hyperparameter (1e-2) defined in the first
# cell is not used here - confirm which value is intended.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
# Train with larger batches than the 4 used in the walkthrough cells.
batch_size = 32

# NOTE(review): the step count is hard-coded; the `max_iter` hyperparameter
# (3000) from the first cell is not used here - confirm which is intended.
for steps in range(10000):

    # Sample a fresh random batch of training sequences.
    xb, yb = get_batch('train')

    # Forward pass: logits and loss for this batch.
    logits, loss = model(xb, yb)
    # `set_to_none` is a bool flag; the previous value None was falsy and
    # silently took the zero-fill path. True releases the old gradients.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final mini-batch only, not an average over the run.
print(loss.item())

2.434229612350464


In [148]:
# Sample 300 new tokens from the model after training, starting from a single index 0.
print(
    decode(
        model.generation(
            idx=torch.zeros((1, 1), dtype=torch.long),
            max_new_tokens=300,
        )[0].tolist()
    )
)


And st? Hus the vefe cey e oreagor y mik a hin f w ypind o w chet t izou mike, I 24 axhacoubest s bit owass nd ciatho te fat, ry, lins nteimar wevevaft'rik 7n or ove. rart e me the s qu byboves m rimasthaindim Sorand swhe thenlss, s st dicestup: s o trig I cesof toitoulinc t m, wiboing t. s, t, tong


## GUI

In [45]:
import tkinter as tk

def generate_response(user_input):
    """Return the bot's reply to ``user_input``.

    Placeholder implementation: the reply is the input string reversed and
    prefixed with "Bot: ". The real response generator is meant to be plugged
    in here later.
    """
    reversed_text = "".join(reversed(user_input))
    return "Bot: " + reversed_text

def respond():
    """Handle a click on the submit button.

    Reads the entry box, ignores blank input and the untouched placeholder,
    appends the question and the generated reply to the output textbox, then
    clears the entry box and restores the placeholder.
    """
    user_input = entry.get()
    is_blank = not user_input.strip()
    if is_blank or user_input == placeholder_text:
        return  # nothing to answer
    response = generate_response(user_input)
    output.insert(tk.END, f"You: {user_input}\n{response}\n")
    # Clear the entry box for the next question.
    entry.delete(0, tk.END)
    # Shift focus to the main window so the placeholder logic can run.
    window.focus()
    # Re-add the placeholder immediately.
    add_placeholder()

def add_placeholder():
    """Show the grey placeholder text when the entry box is empty."""
    if entry.get():
        return  # the box already holds text; leave it untouched
    entry.insert(0, placeholder_text)
    entry.config(fg="grey")

def remove_placeholder(event=None):
    """Clear the placeholder text when the entry box gains focus.

    ``event`` is accepted so the function can be bound as a Tk event handler;
    it is not used.
    """
    if entry.get() != placeholder_text:
        return  # the user's own text is in the box; keep it
    entry.delete(0, tk.END)
    # Switch the text color for actual input.
    # NOTE(review): white text presumably assumes a dark entry background;
    # on a light background the typed text would be hard to see - confirm.
    entry.config(fg="white")

# Main window setup
# Builds the chat window: an output textbox on row 0, and an entry box plus
# submit button on row 1, then hands control to the Tk event loop.
window = tk.Tk()
window.title("Chatbot")
window.geometry("400x300")

# Hint text shown in the entry box while it is empty; respond() ignores it as input
placeholder_text = "Ask a question"

# Output textbox (aligned at the top)
output = tk.Text(window, wrap="word", height=15)
output.grid(row=0, column=0, columnspan=2, sticky="nsew", padx=10, pady=(10, 0))

# Entry box with placeholder
entry = tk.Entry(window, fg="grey")
entry.grid(row=1, column=0, sticky="ew", padx=(10, 5), pady=10)
entry.insert(0, placeholder_text)  # Add placeholder initially
# Hide the placeholder on focus, restore it (if the box is empty) on blur
entry.bind("<FocusIn>", remove_placeholder)
entry.bind("<FocusOut>", lambda event: add_placeholder())

# Submit button
submit = tk.Button(window, text="Ask", command=respond)
submit.grid(row=1, column=1, sticky="ew", padx=(5, 10), pady=10)

# Configure grid layout
window.grid_rowconfigure(0, weight=1)  # Let the textbox expand vertically
window.grid_columnconfigure(0, weight=1)  # Let the entry box expand horizontally

# Start the Tk event loop
window.mainloop()


: 