In [1]:
# Mount Google Drive at /content/drive/ (this import is only available inside Colab).
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
%cd /content/drive/MyDrive/double_degree/nlp/T-725-MALV-Natural-Language-Processing/assignments/assignment_2/section_3

/content/drive/MyDrive/double_degree/nlp/T-725-MALV-Natural-Language-Processing/assignments/assignment_2/section_3


In [None]:
! ls -als

In [None]:
! git pull

# Extract Data from dataset and create corpus

Use the corpora distributed by ConvoKit (https://convokit.cornell.edu/) as training data to build a chatbot.<br/>
Documentation: https://convokit.cornell.edu/documentation/corpus.html

In [None]:
! pip install convokit

In [None]:
# Imports grouped first, then the one-off download of the NLTK 'punkt' resource.
import nltk
from convokit import Corpus, download

nltk.download('punkt')

In [None]:
# Fetch each ConvoKit dataset by name and load it as a Corpus.
# The generator is consumed left to right, so the order is supreme, friends, parliament.
supreme_corpus, friends_corpus, parliament_corpus = (
    Corpus(filename=download(dataset_name))
    for dataset_name in ('supreme-corpus', 'friends-corpus', 'parliament-corpus')
)

Downloading supreme-corpus to /root/.convokit/downloads/supreme-corpus
Downloading supreme-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/supreme-corpus/supreme-corpus.zip (1255.8MB)... Done
No configuration file found at /root/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
default_backend: mem
Downloading friends-corpus to /root/.convokit/downloads/friends-corpus
Downloading friends-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/friends-corpus/friends-corpus.zip (6.1MB)... Done
Downloading parliament-corpus to /root/.convokit/downloads/parliament-corpus
Downloading parliament-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/parliament-corpus/parliament-corpus.zip (368.2MB)... Done


In [None]:
def save_corpus(corpus: "Corpus", save_path: str) -> float:
    """Dump a corpus to a plain-text file and return the mean dialogue length.

    Every utterance whose text is not blank is written as one
    ``<speaker id>: <text>`` line, and an empty line is written after each
    conversation.

    Args:
        corpus: Object exposing ``print_summary_stats()`` and
            ``iter_conversations()``; every conversation must expose
            ``iter_utterances()`` yielding items with ``text`` and
            ``speaker.id`` attributes.
        save_path: Destination file. It is overwritten if it exists, and
            missing parent directories are created.

    Returns:
        The average number of characters written per conversation (speaker
        prefix and trailing newline included, the blank separator line
        excluded), or ``0.0`` when the corpus has no conversations.
    """
    import os  # local import: keeps the cell self-contained

    corpus.print_summary_stats()

    # Create the target directory up front so open() cannot fail on a fresh checkout.
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    conversation_lengths = []
    # Explicit UTF-8: the corpora contain non-ASCII characters and the platform
    # default encoding is not guaranteed to be able to represent them.
    with open(save_path, 'w', encoding='utf-8') as f:
        for conversation in corpus.iter_conversations():
            conversation_length = 0
            for utt in conversation.iter_utterances():
                if utt.text.strip() != '':
                    sentence = utt.speaker.id + ': ' + utt.text + '\n'
                    f.write(sentence)
                    conversation_length += len(sentence)
            conversation_lengths.append(conversation_length)
            # Blank line marks the end of a conversation.
            f.write('\n')

    # Guard against an empty corpus instead of raising ZeroDivisionError.
    if not conversation_lengths:
        return 0.0
    return sum(conversation_lengths) / len(conversation_lengths)

In [None]:
# Write each corpus to its own text file and keep the average dialogue length
# (characters per conversation) returned by save_corpus.
avg_friends_dialogue_len = save_corpus(friends_corpus, 'personal_corpus/friends.txt')
avg_parliament_dialogue_len = save_corpus(parliament_corpus, 'personal_corpus/parliament.txt')
avg_supreme_dialogue_len = save_corpus(supreme_corpus, 'personal_corpus/supreme.txt')

Number of Speakers: 700
Number of Utterances: 67373
Number of Conversations: 3107
Number of Speakers: 1978
Number of Utterances: 433787
Number of Conversations: 216894
Number of Speakers: 8979
Number of Utterances: 1700789
Number of Conversations: 7817


In [None]:
! wc -lc personal_corpus/friends.txt
! wc -lc personal_corpus/supreme.txt
! wc -lc personal_corpus/parliament.txt

  64417 4130533 personal_corpus/friends.txt
  3824952 438125462 personal_corpus/supreme.txt
   650681 194258984 personal_corpus/parliament.txt


In [None]:
# Drop the references to the three corpora to release some memory.
del friends_corpus, parliament_corpus, supreme_corpus

In [32]:
# Precomputed fallbacks so the averages need not be recomputed:
# each name is bound only when it is not already defined in this session.
globals().setdefault('avg_friends_dialogue_len', 1328.4280656581911)
globals().setdefault('avg_parliament_dialogue_len', 893.0366861231754)
globals().setdefault('avg_supreme_dialogue_len', 56007.29346296533)

In [None]:
# Report the average dialogue length of each corpus.
print("Friends avg dialogue len", avg_friends_dialogue_len)
print("Parliament avg dialogue len", avg_parliament_dialogue_len)
print("Supreme Court avg dialogue len", avg_supreme_dialogue_len)

Friends avg dialogue len 1328.4280656581911
Parliament avg dialogue len 893.0366861231754
Supreme Court avg dialogue len 56007.29346296533


In [33]:
import math
# Largest power of two that does not exceed each average: 2 ** floor(log2(avg)).
# NOTE(review): despite "nearest" in the names this always rounds DOWN
# (e.g. 893.0 -> 512 although 1024 is closer) — confirm that is the intent.
nearest_power_of_two_friends_dialogue_len = 2**math.floor(math.log2(avg_friends_dialogue_len))
nearest_power_of_two_parliament_dialogue_len = 2**math.floor(math.log2(avg_parliament_dialogue_len))
nearest_power_of_two_supreme_dialogue_len = 2**math.floor(math.log2(avg_supreme_dialogue_len))

In [None]:
# Report the power-of-two length derived from each corpus average.
print("Nearest Friends avg dialogue len", nearest_power_of_two_friends_dialogue_len)
print("Nearest Parliament avg dialogue len", nearest_power_of_two_parliament_dialogue_len)
print("Nearest Supreme Court avg dialogue len", nearest_power_of_two_supreme_dialogue_len)

Nearest Friends avg dialogue len 1024
Nearest Parliament avg dialogue len 512
Nearest Supreme Court avg dialogue len 32768


# Starting Training/Testing part

In [44]:
def perform_train_extract_sample(prepare_path: str, max_new_tokens=None, **kwargs):
    def _kwargs_to_str(**kwargs):
        out = ' '.join([f'--{key}={value}' for key, value in kwargs.items()])
        return out

    if 'compile' not in kwargs.keys():
        kwargs['compile'] = False
    if 'eval_iters' not in kwargs.keys():
        kwargs['eval_iters'] = 20
    if 'log_interval' not in kwargs.keys():
        kwargs['log_interval'] = 1
    if 'block_size' not in kwargs.keys():
        kwargs['block_size'] = 64
    if 'batch_size' not in kwargs.keys():
        kwargs['batch_size'] = 12
    if 'n_layer' not in kwargs.keys():
        kwargs['n_layer'] = 4
    if 'n_head' not in kwargs.keys():
        kwargs['n_head'] = 4
    if 'n_embd' not in kwargs.keys():
        kwargs['n_embd'] = 128
    if 'max_iters' not in kwargs.keys():
        kwargs['max_iters'] = 2000
    if 'lr_decay_iters' not in kwargs.keys():
        kwargs['lr_decay_iters'] = 2000
    if 'dropout' not in kwargs.keys():
        kwargs['dropout'] = 0.0

    kwargs_str = _kwargs_to_str(**kwargs)

    ! python data/{prepare_path}/prepare.py
    ! python train.py config/train_{prepare_path}.py {kwargs_str}
    try:
        import tiktoken
    except:
        ! pip install tiktoken
    ! python sample.py --out_dir=out_{prepare_path} {'--max_new_tokens=' + str(max_new_tokens) if max_new_tokens is not None else ''}

## Default Parameters:

In [42]:
perform_train_extract_sample('friends_char')

length of dataset in characters: 4,130,533
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}
vocab size: 89
train has 3,717,479 tokens
val has 413,054 tokens
Overriding config with config/train_friends_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_friends_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'friends_char'
wandb_run_name = 'mini-gpt'

dataset = 'friends_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with ba

In [5]:
perform_train_extract_sample('parliament_char')

length of dataset in characters: 193,911,193
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}¢£§°³·¹º¼½¾ÀÁÉÓÔÖÙÜßàáâäåæçèéêëíîïñòóôõöøùúüÿāćėęġśžǵȓȔʷ˙ߞߪ୰–—‖‘’“”•․…⁁€℄⅓⅔⅚⅛▸
vocab size: 171
train has 174,520,073 tokens
val has 19,391,120 tokens
Overriding config with config/train_parliament_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_parliament_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'parliament_char'
wandb_run_name = 'mini-gpt'

dataset = 'parliament_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters


In [6]:
perform_train_extract_sample('supreme_char')

length of dataset in characters: 437,816,830
all the unique characters: 	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_abcdefghijklmnopqrstuvwxyz{|} £§©½¾àáâçèéêíïñöüćń‑–—‘’“”…′″‵‶⋅〝ﬁ�
vocab size: 131
train has 394,035,147 tokens
val has 43,781,683 tokens
Overriding config with config/train_supreme_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_supreme_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'supreme_char'
wandb_run_name = 'mini-gpt'

dataset = 'supreme_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd =

Results:
- Friends:
  - Train Loss: 1.2799
  - Validation Loss: 1.3709
- Parliament:
  - Train Loss: 1.4926
  - Validation Loss: 1.5541
- Supreme Court:
  - Train Loss: 1.5803
  - Validation Loss: 1.6075

## Increasing Iterations Number:

In [4]:
perform_train_extract_sample('friends_char', max_iters=4000)

length of dataset in characters: 4,130,533
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}
vocab size: 89
train has 3,717,479 tokens
val has 413,054 tokens
Overriding config with config/train_friends_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_friends_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'friends_char'
wandb_run_name = 'mini-gpt'

dataset = 'friends_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with ba

In [5]:
perform_train_extract_sample('parliament_char', max_iters=4000)

length of dataset in characters: 193,911,193
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}¢£§°³·¹º¼½¾ÀÁÉÓÔÖÙÜßàáâäåæçèéêëíîïñòóôõöøùúüÿāćėęġśžǵȓȔʷ˙ߞߪ୰–—‖‘’“”•․…⁁€℄⅓⅔⅚⅛▸
vocab size: 171
train has 174,520,073 tokens
val has 19,391,120 tokens
Overriding config with config/train_parliament_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_parliament_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'parliament_char'
wandb_run_name = 'mini-gpt'

dataset = 'parliament_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters


In [6]:
perform_train_extract_sample('supreme_char', max_iters=4000)

length of dataset in characters: 437,816,830
all the unique characters: 	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_abcdefghijklmnopqrstuvwxyz{|} £§©½¾àáâçèéêíïñöüćń‑–—‘’“”…′″‵‶⋅〝ﬁ�
vocab size: 131
train has 394,035,147 tokens
val has 43,781,683 tokens
Overriding config with config/train_supreme_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_supreme_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'supreme_char'
wandb_run_name = 'mini-gpt'

dataset = 'supreme_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd =

Results:
- Friends:
  - Train Loss: 1.2934
  - Validation Loss: 1.3602
- Parliament:
  - Train Loss: 1.4199
  - Validation Loss: 1.4212
- Supreme Court:
  - Train Loss: 1.4577
  - Validation Loss: 1.5110

## Increasing Iterations Number (again):

In [13]:
perform_train_extract_sample('friends_char', max_iters=8000)

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
iter 3110: loss 1.5077, time 9.40ms, mfu 0.13%
iter 3111: loss 1.2722, time 9.73ms, mfu 0.13%
iter 3112: loss 1.1705, time 9.25ms, mfu 0.13%
iter 3113: loss 1.3487, time 9.23ms, mfu 0.13%
iter 3114: loss 1.4331, time 9.57ms, mfu 0.13%
iter 3115: loss 1.3165, time 9.32ms, mfu 0.13%
iter 3116: loss 1.3067, time 9.99ms, mfu 0.13%
iter 3117: loss 1.3523, time 9.21ms, mfu 0.13%
iter 3118: loss 1.3133, time 9.41ms, mfu 0.13%
iter 3119: loss 1.2790, time 9.28ms, mfu 0.13%
iter 3120: loss 1.5437, time 9.42ms, mfu 0.13%
iter 3121: loss 1.2943, time 10.47ms, mfu 0.13%
iter 3122: loss 1.2241, time 9.40ms, mfu 0.13%
iter 3123: loss 1.4211, time 9.29ms, mfu 0.13%
iter 3124: loss 1.2886, time 11.07ms, mfu 0.13%
iter 3125: loss 1.3518, time 10.67ms, mfu 0.13%
iter 3126: loss 1.3428, time 9.45ms, mfu 0.13%
iter 3127: loss 1.3257, time 9.38ms, mfu 0.13%
iter 3128: loss 1.2603, time 9.46ms, mfu 0.13%
iter 3129: loss 1.4031, time 9.63ms, mfu

In [14]:
perform_train_extract_sample('parliament_char', max_iters=8000)

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
iter 3068: loss 1.4298, time 9.43ms, mfu 0.13%
iter 3069: loss 1.4521, time 9.31ms, mfu 0.13%
iter 3070: loss 1.5576, time 9.92ms, mfu 0.13%
iter 3071: loss 1.3216, time 9.85ms, mfu 0.13%
iter 3072: loss 1.4296, time 10.24ms, mfu 0.13%
iter 3073: loss 1.4231, time 9.62ms, mfu 0.13%
iter 3074: loss 1.4759, time 9.75ms, mfu 0.13%
iter 3075: loss 1.5012, time 12.14ms, mfu 0.13%
iter 3076: loss 1.5058, time 15.48ms, mfu 0.12%
iter 3077: loss 1.3546, time 9.97ms, mfu 0.12%
iter 3078: loss 1.3904, time 10.04ms, mfu 0.12%
iter 3079: loss 1.5504, time 10.34ms, mfu 0.12%
iter 3080: loss 1.3601, time 9.78ms, mfu 0.13%
iter 3081: loss 1.4302, time 9.53ms, mfu 0.13%
iter 3082: loss 1.5254, time 9.88ms, mfu 0.13%
iter 3083: loss 1.4947, time 9.52ms, mfu 0.13%
iter 3084: loss 1.5865, time 9.53ms, mfu 0.13%
iter 3085: loss 1.4491, time 11.21ms, mfu 0.13%
iter 3086: loss 1.4157, time 14.26ms, mfu 0.12%
iter 3087: loss 1.4970, time 11.98ms

In [15]:
perform_train_extract_sample('supreme_char', max_iters=8000)

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
iter 3084: loss 1.4609, time 10.51ms, mfu 0.13%
iter 3085: loss 1.5057, time 9.71ms, mfu 0.13%
iter 3086: loss 1.4716, time 9.97ms, mfu 0.13%
iter 3087: loss 1.4115, time 9.62ms, mfu 0.13%
iter 3088: loss 1.5866, time 9.81ms, mfu 0.13%
iter 3089: loss 1.4633, time 9.89ms, mfu 0.13%
iter 3090: loss 1.6621, time 10.17ms, mfu 0.13%
iter 3091: loss 1.6163, time 9.70ms, mfu 0.13%
iter 3092: loss 1.3417, time 10.34ms, mfu 0.13%
iter 3093: loss 1.5681, time 10.16ms, mfu 0.13%
iter 3094: loss 1.5552, time 10.81ms, mfu 0.13%
iter 3095: loss 1.4127, time 9.86ms, mfu 0.13%
iter 3096: loss 1.5481, time 14.29ms, mfu 0.12%
iter 3097: loss 1.4162, time 10.63ms, mfu 0.12%
iter 3098: loss 1.5568, time 10.23ms, mfu 0.12%
iter 3099: loss 1.3754, time 10.00ms, mfu 0.12%
iter 3100: loss 1.5145, time 10.11ms, mfu 0.12%
iter 3101: loss 1.4563, time 10.13ms, mfu 0.12%
iter 3102: loss 1.5431, time 10.00ms, mfu 0.12%
iter 3103: loss 1.5219, time 15

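Results (NOTE: these figures are identical to the 4000-iteration results above — verify them against the training logs of the 8000-iteration runs):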
- Friends:
  - Train Loss: 1.2934
  - Validation Loss: 1.3602
- Parliament:
  - Train Loss: 1.4199
  - Validation Loss: 1.4212
- Supreme Court:
  - Train Loss: 1.4577
  - Validation Loss: 1.5110

## Increasing Iterations Number and Dropout Rate:

In [10]:
perform_train_extract_sample('friends_char', dropout=0.5, max_iters=4000)

length of dataset in characters: 4,130,533
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}
vocab size: 89
train has 3,717,479 tokens
val has 413,054 tokens
Overriding config with config/train_friends_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_friends_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'friends_char'
wandb_run_name = 'mini-gpt'

dataset = 'friends_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with ba

In [11]:
perform_train_extract_sample('parliament_char', dropout=0.5, max_iters=4000)

length of dataset in characters: 193,911,193
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}¢£§°³·¹º¼½¾ÀÁÉÓÔÖÙÜßàáâäåæçèéêëíîïñòóôõöøùúüÿāćėęġśžǵȓȔʷ˙ߞߪ୰–—‖‘’“”•․…⁁€℄⅓⅔⅚⅛▸
vocab size: 171
train has 174,520,073 tokens
val has 19,391,120 tokens
Overriding config with config/train_parliament_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_parliament_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'parliament_char'
wandb_run_name = 'mini-gpt'

dataset = 'parliament_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters


In [12]:
perform_train_extract_sample('supreme_char', dropout=0.5, max_iters=4000)

length of dataset in characters: 437,816,830
all the unique characters: 	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_abcdefghijklmnopqrstuvwxyz{|} £§©½¾àáâçèéêíïñöüćń‑–—‘’“”…′″‵‶⋅〝ﬁ�
vocab size: 131
train has 394,035,147 tokens
val has 43,781,683 tokens
Overriding config with config/train_supreme_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_supreme_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'supreme_char'
wandb_run_name = 'mini-gpt'

dataset = 'supreme_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd =

Results:
- Friends:
  - Train Loss: 1.8903
  - Validation Loss: 1.9501
- Parliament:
  - Train Loss: 2.1667
  - Validation Loss: 2.1701
- Supreme Court:
  - Train Loss: 2.1789
  - Validation Loss: 2.2104

## Increasing Embedding Number, Layer Number and Iterations Number:

In [7]:
perform_train_extract_sample('friends_char', max_iters=4000, n_layer=6, n_embd=256)

length of dataset in characters: 4,130,533
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}
vocab size: 89
train has 3,717,479 tokens
val has 413,054 tokens
Overriding config with config/train_friends_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_friends_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'friends_char'
wandb_run_name = 'mini-gpt'

dataset = 'friends_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with ba

In [8]:
perform_train_extract_sample('parliament_char', max_iters=4000, n_layer=6, n_embd=256)

length of dataset in characters: 193,911,193
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}¢£§°³·¹º¼½¾ÀÁÉÓÔÖÙÜßàáâäåæçèéêëíîïñòóôõöøùúüÿāćėęġśžǵȓȔʷ˙ߞߪ୰–—‖‘’“”•․…⁁€℄⅓⅔⅚⅛▸
vocab size: 171
train has 174,520,073 tokens
val has 19,391,120 tokens
Overriding config with config/train_parliament_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_parliament_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'parliament_char'
wandb_run_name = 'mini-gpt'

dataset = 'parliament_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters


In [9]:
perform_train_extract_sample('supreme_char', max_iters=4000, n_layer=6, n_embd=256)

length of dataset in characters: 437,816,830
all the unique characters: 	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_abcdefghijklmnopqrstuvwxyz{|} £§©½¾àáâçèéêíïñöüćń‑–—‘’“”…′″‵‶⋅〝ﬁ�
vocab size: 131
train has 394,035,147 tokens
val has 43,781,683 tokens
Overriding config with config/train_supreme_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_supreme_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'supreme_char'
wandb_run_name = 'mini-gpt'

dataset = 'supreme_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd =

Results:
- Friends:
  - Train Loss: 1.1975
  - Validation Loss: 1.2273
- Parliament:
  - Train Loss: 1.2480
  - Validation Loss: 1.2934
- Supreme Court:
  - Train Loss: 1.3529
  - Validation Loss: 1.3744


## Increasing Embedding Number, Layer Number, Iterations Number, Attention Heads Number:

In [16]:
perform_train_extract_sample('friends_char', max_iters=4000, n_layer=6, n_embd=256, n_head=8)

length of dataset in characters: 4,130,533
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}
vocab size: 89
train has 3,717,479 tokens
val has 413,054 tokens
Overriding config with config/train_friends_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_friends_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'friends_char'
wandb_run_name = 'mini-gpt'

dataset = 'friends_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with ba

In [17]:
perform_train_extract_sample('parliament_char', max_iters=4000, n_layer=6, n_embd=256, n_head=8)

length of dataset in characters: 193,911,193
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}¢£§°³·¹º¼½¾ÀÁÉÓÔÖÙÜßàáâäåæçèéêëíîïñòóôõöøùúüÿāćėęġśžǵȓȔʷ˙ߞߪ୰–—‖‘’“”•․…⁁€℄⅓⅔⅚⅛▸
vocab size: 171
train has 174,520,073 tokens
val has 19,391,120 tokens
Overriding config with config/train_parliament_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_parliament_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'parliament_char'
wandb_run_name = 'mini-gpt'

dataset = 'parliament_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters


In [18]:
perform_train_extract_sample('supreme_char', max_iters=4000, n_layer=6, n_embd=256, n_head=8)

length of dataset in characters: 437,816,830
all the unique characters: 	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_abcdefghijklmnopqrstuvwxyz{|} £§©½¾àáâçèéêíïñöüćń‑–—‘’“”…′″‵‶⋅〝ﬁ�
vocab size: 131
train has 394,035,147 tokens
val has 43,781,683 tokens
Overriding config with config/train_supreme_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_supreme_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'supreme_char'
wandb_run_name = 'mini-gpt'

dataset = 'supreme_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd =

Results:
- Friends:
  - Train Loss: 1.2059
  - Validation Loss: 1.2001
- Parliament:
  - Train Loss: 1.2842
  - Validation Loss: 1.2966
- Supreme Court:
  - Train Loss: 1.3423
  - Validation Loss: 1.3572

## Increasing Block Size and Batch Size:

In [19]:
perform_train_extract_sample('friends_char', block_size=128, batch_size=24)

length of dataset in characters: 4,130,533
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}
vocab size: 89
train has 3,717,479 tokens
val has 413,054 tokens
Overriding config with config/train_friends_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_friends_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'friends_char'
wandb_run_name = 'mini-gpt'

dataset = 'friends_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with ba

In [20]:
perform_train_extract_sample('parliament_char', block_size=128, batch_size=24)

length of dataset in characters: 193,911,193
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}¢£§°³·¹º¼½¾ÀÁÉÓÔÖÙÜßàáâäåæçèéêëíîïñòóôõöøùúüÿāćėęġśžǵȓȔʷ˙ߞߪ୰–—‖‘’“”•․…⁁€℄⅓⅔⅚⅛▸
vocab size: 171
train has 174,520,073 tokens
val has 19,391,120 tokens
Overriding config with config/train_parliament_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_parliament_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'parliament_char'
wandb_run_name = 'mini-gpt'

dataset = 'parliament_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters


In [21]:
perform_train_extract_sample('supreme_char', block_size=128, batch_size=24)

length of dataset in characters: 437,816,830
all the unique characters: 	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_abcdefghijklmnopqrstuvwxyz{|} £§©½¾àáâçèéêíïñöüćń‑–—‘’“”…′″‵‶⋅〝ﬁ�
vocab size: 131
train has 394,035,147 tokens
val has 43,781,683 tokens
Overriding config with config/train_supreme_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_supreme_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'supreme_char'
wandb_run_name = 'mini-gpt'

dataset = 'supreme_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd =

Results:
- Friends:
  - Train Loss: 1.1944
  - Validation Loss: 1.2431
- Parliament:
  - Train Loss: 1.2844
  - Validation Loss: 1.2800
- Supreme Court:
  - Train Loss: 1.3216
  - Validation Loss: 1.3746

## Increasing Block Size, Batch Size, Embedding Number, Layer Number, Iterations Number:

In [25]:
perform_train_extract_sample('friends_char', max_iters=4000, n_layer=6, n_embd=256, block_size=128, batch_size=24)

length of dataset in characters: 4,130,533
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}
vocab size: 89
train has 3,717,479 tokens
val has 413,054 tokens
Overriding config with config/train_friends_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_friends_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'friends_char'
wandb_run_name = 'mini-gpt'

dataset = 'friends_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with ba

In [26]:
perform_train_extract_sample('parliament_char', max_iters=4000, n_layer=6, n_embd=256, block_size=128, batch_size=24)

length of dataset in characters: 193,911,193
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}¢£§°³·¹º¼½¾ÀÁÉÓÔÖÙÜßàáâäåæçèéêëíîïñòóôõöøùúüÿāćėęġśžǵȓȔʷ˙ߞߪ୰–—‖‘’“”•․…⁁€℄⅓⅔⅚⅛▸
vocab size: 171
train has 174,520,073 tokens
val has 19,391,120 tokens
Overriding config with config/train_parliament_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_parliament_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'parliament_char'
wandb_run_name = 'mini-gpt'

dataset = 'parliament_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters


In [27]:
# Supreme Court run: 4000 iterations, 6 layers, 256-dim embeddings,
# block size of 128 and batch size of 24.
perform_train_extract_sample(
    'supreme_char',
    max_iters=4000,
    n_layer=6,
    n_embd=256,
    block_size=128,
    batch_size=24,
)

length of dataset in characters: 437,816,830
all the unique characters: 	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_abcdefghijklmnopqrstuvwxyz{|} £§©½¾àáâçèéêíïñöüćń‑–—‘’“”…′″‵‶⋅〝ﬁ�
vocab size: 131
train has 394,035,147 tokens
val has 43,781,683 tokens
Overriding config with config/train_supreme_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_supreme_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'supreme_char'
wandb_run_name = 'mini-gpt'

dataset = 'supreme_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd =

Results:
- Friends:
  - Train Loss: 1.0147
  - Validation Loss: 1.0715
- Parliament:
  - Train Loss: 1.0312
  - Validation Loss: 1.0488
- Supreme Court:
  - Train Loss: 1.1081
  - Validation Loss: 1.1598

## Increasing Block Size, Batch Size, Embedding Size, Number of Layers, Number of Iterations, and Number of Attention Heads:

In [22]:
# Friends run with 8 attention heads: 4000 iterations, 6 layers,
# 256-dim embeddings, block size of 128 and batch size of 24.
perform_train_extract_sample(
    'friends_char',
    max_iters=4000,
    n_layer=6,
    n_embd=256,
    n_head=8,
    block_size=128,
    batch_size=24,
)

length of dataset in characters: 4,130,533
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}
vocab size: 89
train has 3,717,479 tokens
val has 413,054 tokens
Overriding config with config/train_friends_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_friends_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'friends_char'
wandb_run_name = 'mini-gpt'

dataset = 'friends_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with ba

In [23]:
# Parliament run with 8 attention heads: 4000 iterations, 6 layers,
# 256-dim embeddings, block size of 128 and batch size of 24.
perform_train_extract_sample(
    'parliament_char',
    max_iters=4000,
    n_layer=6,
    n_embd=256,
    n_head=8,
    block_size=128,
    batch_size=24,
)

length of dataset in characters: 193,911,193
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}¢£§°³·¹º¼½¾ÀÁÉÓÔÖÙÜßàáâäåæçèéêëíîïñòóôõöøùúüÿāćėęġśžǵȓȔʷ˙ߞߪ୰–—‖‘’“”•․…⁁€℄⅓⅔⅚⅛▸
vocab size: 171
train has 174,520,073 tokens
val has 19,391,120 tokens
Overriding config with config/train_parliament_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_parliament_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'parliament_char'
wandb_run_name = 'mini-gpt'

dataset = 'parliament_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters


In [24]:
# Supreme Court run with 8 attention heads: 4000 iterations, 6 layers,
# 256-dim embeddings, block size of 128 and batch size of 24.
perform_train_extract_sample(
    'supreme_char',
    max_iters=4000,
    n_layer=6,
    n_embd=256,
    n_head=8,
    block_size=128,
    batch_size=24,
)

length of dataset in characters: 437,816,830
all the unique characters: 	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_abcdefghijklmnopqrstuvwxyz{|} £§©½¾àáâçèéêíïñöüćń‑–—‘’“”…′″‵‶⋅〝ﬁ�
vocab size: 131
train has 394,035,147 tokens
val has 43,781,683 tokens
Overriding config with config/train_supreme_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_supreme_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'supreme_char'
wandb_run_name = 'mini-gpt'

dataset = 'supreme_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd =

Results:
- Friends:
  - Train Loss: 1.0119
  - Validation Loss: 1.0482
- Parliament:
  - Train Loss: 1.0345
  - Validation Loss: 1.0453
- Supreme Court:
  - Train Loss: 1.0863
  - Validation Loss: 1.1745

## Increasing Block Size, Batch Size, Embedding Size, Number of Layers, and Number of Iterations (again):

In [46]:
# Friends run: 7 layers, 256-dim embeddings, batch size of 48, 4000 iterations.
# block_size and max_new_tokens are both set to
# nearest_power_of_two_friends_dialogue_len (defined outside this cell).
perform_train_extract_sample(
    'friends_char',
    max_new_tokens=nearest_power_of_two_friends_dialogue_len,
    max_iters=4000,
    n_layer=7,
    n_embd=256,
    block_size=nearest_power_of_two_friends_dialogue_len,
    batch_size=48,
)

length of dataset in characters: 4,130,533
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}
vocab size: 89
train has 3,717,479 tokens
val has 413,054 tokens
Overriding config with config/train_friends_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_friends_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'friends_char'
wandb_run_name = 'mini-gpt'

dataset = 'friends_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with ba

In [47]:
# Parliament run: 7 layers, 256-dim embeddings, batch size of 48, 4000 iterations.
# block_size and max_new_tokens are both set to
# nearest_power_of_two_parliament_dialogue_len (defined outside this cell).
perform_train_extract_sample(
    'parliament_char',
    max_new_tokens=nearest_power_of_two_parliament_dialogue_len,
    max_iters=4000,
    n_layer=7,
    n_embd=256,
    block_size=nearest_power_of_two_parliament_dialogue_len,
    batch_size=48,
)

length of dataset in characters: 193,911,193
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}¢£§°³·¹º¼½¾ÀÁÉÓÔÖÙÜßàáâäåæçèéêëíîïñòóôõöøùúüÿāćėęġśžǵȓȔʷ˙ߞߪ୰–—‖‘’“”•․…⁁€℄⅓⅔⅚⅛▸
vocab size: 171
train has 174,520,073 tokens
val has 19,391,120 tokens
Overriding config with config/train_parliament_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_parliament_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'parliament_char'
wandb_run_name = 'mini-gpt'

dataset = 'parliament_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters


In [45]:
# Supreme Court run: 7 layers, 256-dim embeddings, batch size of 48, 4000 iterations.
# block_size and max_new_tokens are both set to
# nearest_power_of_two_supreme_dialogue_len (defined outside this cell).
perform_train_extract_sample(
    'supreme_char',
    max_new_tokens=nearest_power_of_two_supreme_dialogue_len,
    max_iters=4000,
    n_layer=7,
    n_embd=256,
    block_size=nearest_power_of_two_supreme_dialogue_len,
    batch_size=48,
)

length of dataset in characters: 437,816,830
all the unique characters: 	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_abcdefghijklmnopqrstuvwxyz{|} £§©½¾àáâçèéêíïñöüćń‑–—‘’“”…′″‵‶⋅〝ﬁ�
vocab size: 131
train has 394,035,147 tokens
val has 43,781,683 tokens
Overriding config with config/train_supreme_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_supreme_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'supreme_char'
wandb_run_name = 'mini-gpt'

dataset = 'supreme_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd =