In [16]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [17]:
%cd /content/drive/MyDrive/double_degree/nlp/T-725-MALV-Natural-Language-Processing/assignments/assignment_2/section_3

/content/drive/MyDrive/double_degree/nlp/T-725-MALV-Natural-Language-Processing/assignments/assignment_2/section_3


In [None]:
! ls -als

In [None]:
! git pull

# Extract Data from dataset and create corpus

Use the corpora published by ConvoKit (https://convokit.cornell.edu/) as training data to build a chatbot.<br/>
Documentation: https://convokit.cornell.edu/documentation/corpus.html

In [None]:
! pip install convokit

In [None]:
import nltk

nltk.download('punkt')
from convokit import Corpus, download

In [None]:
# Download the three ConvoKit datasets and load each one as a Corpus.
# Sizes reported by the download log: supreme-corpus 1255.8MB,
# friends-corpus 6.1MB, parliament-corpus 368.2MB.
supreme_corpus = Corpus(filename=download('supreme-corpus'))
friends_corpus = Corpus(filename=download('friends-corpus'))
parliament_corpus = Corpus(filename=download("parliament-corpus"))

Downloading supreme-corpus to /root/.convokit/downloads/supreme-corpus
Downloading supreme-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/supreme-corpus/supreme-corpus.zip (1255.8MB)... Done
No configuration file found at /root/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
default_backend: mem
Downloading friends-corpus to /root/.convokit/downloads/friends-corpus
Downloading friends-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/friends-corpus/friends-corpus.zip (6.1MB)... Done
Downloading parliament-corpus to /root/.convokit/downloads/parliament-corpus
Downloading parliament-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/parliament-corpus/parliament-corpus.zip (368.2MB)... Done


In [None]:
def save_corpus(corpus: "Corpus", save_path: str) -> float:
    """Write a corpus to a plain-text file and return its average dialogue length.

    Every utterance whose text is not blank is written as one
    ``"<speaker id>: <text>"`` line; an empty line is written after each
    conversation. The corpus summary statistics are printed first.

    Args:
        corpus: Object exposing ``print_summary_stats()`` and
            ``iter_conversations()``; each conversation must expose
            ``iter_utterances()`` yielding items with ``.text`` and
            ``.speaker.id``.
        save_path: Destination text file. Missing parent directories are
            created; an existing file is overwritten.

    Returns:
        The mean number of characters written per conversation (speaker
        prefix and trailing newline included, blank separator line excluded),
        or ``0.0`` when the corpus contains no conversations.
    """
    from pathlib import Path

    corpus.print_summary_stats()

    target = Path(save_path)
    # open() does not create directories, so make sure the folder exists.
    target.parent.mkdir(parents=True, exist_ok=True)

    total_chars = 0
    conversation_count = 0
    # Explicit UTF-8: the corpora contain non-ASCII characters and the
    # platform default encoding is not guaranteed to handle them.
    with target.open('w', encoding='utf-8') as f:
        for conversation in corpus.iter_conversations():
            for utt in conversation.iter_utterances():
                if utt.text.strip() != '':
                    sentence = utt.speaker.id + ': ' + utt.text + '\n'
                    f.write(sentence)
                    total_chars += len(sentence)
            conversation_count += 1
            f.write('\n')

    # Avoid ZeroDivisionError on an empty corpus.
    if conversation_count == 0:
        return 0.0
    return total_chars / conversation_count

In [None]:
# Export each corpus to personal_corpus/<name>.txt and keep the value
# returned by save_corpus: the average number of characters written per
# conversation.
avg_friends_dialogue_len = save_corpus(friends_corpus, 'personal_corpus/friends.txt')
avg_parliament_dialogue_len = save_corpus(parliament_corpus, 'personal_corpus/parliament.txt')
avg_supreme_dialogue_len = save_corpus(supreme_corpus, 'personal_corpus/supreme.txt')

Number of Speakers: 700
Number of Utterances: 67373
Number of Conversations: 3107
Number of Speakers: 1978
Number of Utterances: 433787
Number of Conversations: 216894
Number of Speakers: 8979
Number of Utterances: 1700789
Number of Conversations: 7817


In [None]:
! wc -lc personal_corpus/friends.txt
! wc -lc personal_corpus/supreme.txt
! wc -lc personal_corpus/parliament.txt

  64417 4130533 personal_corpus/friends.txt
  3824952 438125462 personal_corpus/supreme.txt
   650681 194258984 personal_corpus/parliament.txt


In [None]:
# The corpora have been exported to text files; drop the in-memory objects
# so their memory can be reclaimed.
del friends_corpus, parliament_corpus, supreme_corpus

In [None]:
# Fallback values from a previous run: each average keeps its current value
# when it is already defined and otherwise takes the hardcoded number, so the
# corpus export does not have to be executed again.
avg_friends_dialogue_len = locals().get('avg_friends_dialogue_len', 1328.4280656581911)
avg_parliament_dialogue_len = locals().get('avg_parliament_dialogue_len', 893.0366861231754)
avg_supreme_dialogue_len = locals().get('avg_supreme_dialogue_len', 56007.29346296533)

In [None]:
# Report the average dialogue length of each corpus, one line per corpus.
for corpus_label, avg_len in (
    ("Friends", avg_friends_dialogue_len),
    ("Parliament", avg_parliament_dialogue_len),
    ("Supreme Court", avg_supreme_dialogue_len),
):
    print(f"{corpus_label} avg dialogue len", avg_len)

Friends avg dialogue len 1328.4280656581911
Parliament avg dialogue len 893.0366861231754
Supreme Court avg dialogue len 56007.29346296533


In [None]:
import math


def _floor_power_of_two(value):
    """Return 2 raised to the floor of log2(value).

    Note: this rounds *down* (e.g. 893 -> 512), so the result is the largest
    power of two not exceeding ``value`` rather than the closest one.
    """
    exponent = math.floor(math.log2(value))
    return 2 ** exponent


nearest_power_of_two_friends_dialogue_len = _floor_power_of_two(avg_friends_dialogue_len)
nearest_power_of_two_parliament_dialogue_len = _floor_power_of_two(avg_parliament_dialogue_len)
nearest_power_of_two_supreme_dialogue_len = _floor_power_of_two(avg_supreme_dialogue_len)

In [None]:
# Report the power-of-two length derived from each corpus average.
for corpus_label, pow2_len in (
    ("Friends", nearest_power_of_two_friends_dialogue_len),
    ("Parliament", nearest_power_of_two_parliament_dialogue_len),
    ("Supreme Court", nearest_power_of_two_supreme_dialogue_len),
):
    print(f"Nearest {corpus_label} avg dialogue len", pow2_len)

Nearest Friends avg dialogue len 1024
Nearest Parliament avg dialogue len 512
Nearest Supreme Court avg dialogue len 32768


# Starting Training/Testing part

In [None]:
def perform_train_extract_sample(prepare_path: str, max_new_tokens=None,
                                 **kwargs):
    def _kwargs_to_str(**kwargs):
        out = ' '.join([f'--{key}={value}' for key, value in
                        kwargs.items()])
        return out

    if 'compile' not in kwargs.keys():
        kwargs['compile'] = False
    if 'eval_iters' not in kwargs.keys():
        kwargs['eval_iters'] = 20
    if 'log_interval' not in kwargs.keys():
        kwargs['log_interval'] = 1
    if 'block_size' not in kwargs.keys():
        kwargs['block_size'] = 64
    if 'batch_size' not in kwargs.keys():
        kwargs['batch_size'] = 12
    if 'n_layer' not in kwargs.keys():
        kwargs['n_layer'] = 4
    if 'n_head' not in kwargs.keys():
        kwargs['n_head'] = 4
    if 'n_embd' not in kwargs.keys():
        kwargs['n_embd'] = 128
    if 'max_iters' not in kwargs.keys():
        kwargs['max_iters'] = 2000
    if 'lr_decay_iters' not in kwargs.keys():
        kwargs['lr_decay_iters'] = 2000
    if 'dropout' not in kwargs.keys():
        kwargs['dropout'] = 0.0

    kwargs_str = _kwargs_to_str(**kwargs)

    ! python data/{prepare_path}/prepare.py
    ! python train.py config/train_{prepare_path}.py {kwargs_str}
    try:
        import tiktoken
    except:
        ! pip install tiktoken
    ! python sample.py --out_dir=out_{prepare_path} \
    {'--max_new_tokens=' + max_new_tokens if max_new_tokens is not None else ''}

## Run with default parameters:

In [None]:
perform_train_extract_sample('friends_char')

length of dataset in characters: 4,130,533
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}
vocab size: 89
train has 3,717,479 tokens
val has 413,054 tokens
Overriding config with config/train_friends_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_friends_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'friends_char'
wandb_run_name = 'mini-gpt'

dataset = 'friends_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with ba

In [None]:
perform_train_extract_sample('parliament_char')

In [None]:
perform_train_extract_sample('supreme_char')

Results:
- Friends:
  - Train Loss: 1.3386
  - Val Loss: 1.3790
- Parliament:
  - Train Loss: 1.5115
  - Val Loss: 1.5358
- Supreme Court:
  - Train Loss: 1.5975
  - Val Loss: 1.6135

## Increasing Iterations number and Dropout rate:

In [None]:
perform_train_extract_sample('friends_char', dropout=0.5, max_iters=4000)

In [None]:
perform_train_extract_sample('parliament_char', dropout=0.5, max_iters=4000)

In [None]:
perform_train_extract_sample('supreme_char', dropout=0.5, max_iters=4000)

Results:
- Friends:
  - Train Loss: 1.9003
  - Val Loss: 1.8885
- Parliament:
  - Train Loss: 2.1533
  - Val Loss: 2.1757
- Supreme Court:
  - Train Loss: 2.2130
  - Val Loss: 2.2233

## Increasing Iterations number:

In [None]:
perform_train_extract_sample('friends_char', max_iters=4000)

In [None]:
perform_train_extract_sample('parliament_char', max_iters=4000)

In [None]:
perform_train_extract_sample('supreme_char', max_iters=4000)

Results:
- Friends:
  - Train Loss: 1.3182
  - Val Loss: 1.3161
- Parliament:
  - Train Loss: 1.4093
  - Val Loss: 1.4069
- Supreme Court:
  - Train Loss: 1.4783
  - Val Loss: 1.5146

## Increasing Layers Number, Embeddings Number and Iterations Number:

In [None]:
perform_train_extract_sample('friends_char', max_iters=4000, n_layer=6, n_embd=256)

In [None]:
perform_train_extract_sample('parliament_char', max_iters=4000, n_layer=6, n_embd=256)

In [None]:
perform_train_extract_sample('supreme_char', max_iters=4000, n_layer=6, n_embd=256)

Results:
- Friends:
  - Train Loss: 1.1975
  - Val Loss: 1.2273
- Parliament:
  - Train Loss: 1.2480
  - Val Loss: 1.2934
- Supreme Court:
  - Train Loss: 1.3529
  - Val Loss: 1.3744


## Increasing Embeddings Number, Layer Number, Iterations Number, Attention Heads Number:

In [None]:
perform_train_extract_sample('friends_char', max_iters=4000, n_layer=6, n_embd=256, n_head=8)

In [None]:
perform_train_extract_sample('parliament_char', max_iters=4000, n_layer=6, n_embd=256, n_head=8)

In [None]:
perform_train_extract_sample('supreme_char', max_iters=4000, n_layer=6, n_embd=256, n_head=8)

Results:
- Friends:
  - Train Loss: 1.2020
  - Val Loss: 1.2263
- Parliament:
  - Train Loss: 1.2558
  - Val Loss: 1.2934
- Supreme Court:
  - Train Loss: 1.3423
  - Val Loss: 1.3572

## Increasing Block Size and Batch Size:

In [None]:
perform_train_extract_sample('friends_char', block_size=128, batch_size=24)

length of dataset in characters: 4,127,426
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}
vocab size: 89
train has 3,714,683 tokens
val has 412,743 tokens
Overriding config with config/train_friends_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_friends_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'friends_char'
wandb_run_name = 'mini-gpt'

dataset = 'friends_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with ba

In [None]:
perform_train_extract_sample('parliament_char', block_size=128, batch_size=24)

length of dataset in characters: 193,694,299
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}¢£§°³·¹º¼½¾ÀÁÉÓÔÖÙÜßàáâäåæçèéêëíîïñòóôõöøùúüÿāćėęġśžǵȓȔʷ˙ߞߪ୰–—‖‘’“”•․…⁁€℄⅓⅔⅚⅛▸
vocab size: 171
train has 174,324,869 tokens
val has 19,369,430 tokens
Overriding config with config/train_parliament_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_parliament_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'parliament_char'
wandb_run_name = 'mini-gpt'

dataset = 'parliament_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters


In [None]:
perform_train_extract_sample('supreme_char', block_size=128, batch_size=24)

length of dataset in characters: 437,809,013
all the unique characters: 	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_abcdefghijklmnopqrstuvwxyz{|} £§©½¾àáâçèéêíïñöüćń‑–—‘’“”…′″‵‶⋅〝ﬁ�
vocab size: 131
train has 394,028,111 tokens
val has 43,780,902 tokens
Overriding config with config/train_supreme_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_supreme_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'supreme_char'
wandb_run_name = 'mini-gpt'

dataset = 'supreme_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd =

Results:
- Friends:
  - Train Loss: 1.1839
  - Val Loss: 1.2173
- Parliament:
  - Train Loss: 1.2596
  - Val Loss: 1.2652
- Supreme Court:
  - Train Loss: 1.3090
  - Val Loss: 1.3737

## Increasing Block Size, Batch Size, Embedding Number, Layer Number, Iterations Number, Attention Heads Number:

In [None]:
# Friends: larger model, longer training, larger block and batch size.
perform_train_extract_sample(
    'friends_char',
    max_iters=4000,
    n_layer=6,
    n_embd=256,
    n_head=8,
    block_size=128,
    batch_size=24,
)

length of dataset in characters: 4,130,533
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}
vocab size: 89
train has 3,717,479 tokens
val has 413,054 tokens
Overriding config with config/train_friends_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_friends_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'friends_char'
wandb_run_name = 'mini-gpt'

dataset = 'friends_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with ba

In [None]:
# Parliament: larger model, longer training, larger block and batch size.
perform_train_extract_sample(
    'parliament_char',
    max_iters=4000,
    n_layer=6,
    n_embd=256,
    n_head=8,
    block_size=128,
    batch_size=24,
)

In [None]:
# Supreme Court: larger model, longer training, larger block and batch size.
perform_train_extract_sample(
    'supreme_char',
    max_iters=4000,
    n_layer=6,
    n_embd=256,
    n_head=8,
    block_size=128,
    batch_size=24,
)

length of dataset in characters: 437,816,830
all the unique characters: 	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_abcdefghijklmnopqrstuvwxyz{|} £§©½¾àáâçèéêíïñöüćń‑–—‘’“”…′″‵‶⋅〝ﬁ�
vocab size: 131
train has 394,035,147 tokens
val has 43,781,683 tokens
Overriding config with config/train_supreme_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_supreme_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'supreme_char'
wandb_run_name = 'mini-gpt'

dataset = 'supreme_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd =

Results:
- Friends:
  - Train Loss: 1.0147
  - Val Loss: 1.0715
- Parliament:
  - Train Loss: 1.0312
  - Val Loss: 1.0488
- Supreme Court:
  - Train Loss: 1.1081
  - Val Loss: 1.1598

## Increasing the same parameters more

In [None]:
# Friends: block size and sample length both set to the power-of-two
# dialogue length computed earlier in the notebook.
# (A comma was missing after the max_new_tokens argument, which made this
# cell a SyntaxError.)
perform_train_extract_sample(
    'friends_char',
    max_new_tokens=nearest_power_of_two_friends_dialogue_len,
    max_iters=4000,
    n_layer=7,
    n_embd=256,
    n_head=32,
    block_size=nearest_power_of_two_friends_dialogue_len,
    batch_size=48,
)

length of dataset in characters: 4,130,533
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}
vocab size: 89
train has 3,717,479 tokens
val has 413,054 tokens
Overriding config with config/train_friends_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_friends_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'friends_char'
wandb_run_name = 'mini-gpt'

dataset = 'friends_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with ba

In [None]:
# Parliament: block size and sample length both set to the power-of-two
# dialogue length computed earlier in the notebook.
perform_train_extract_sample(
    'parliament_char',
    max_new_tokens=nearest_power_of_two_parliament_dialogue_len,
    max_iters=4000,
    n_layer=6,
    n_embd=256,
    n_head=32,
    block_size=nearest_power_of_two_parliament_dialogue_len,
    batch_size=48,
)

length of dataset in characters: 193,911,193
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{}¢£§°³·¹º¼½¾ÀÁÉÓÔÖÙÜßàáâäåæçèéêëíîïñòóôõöøùúüÿāćėęġśžǵȓȔʷ˙ߞߪ୰–—‖‘’“”•․…⁁€℄⅓⅔⅚⅛▸
vocab size: 171
train has 174,520,073 tokens
val has 19,391,120 tokens
Overriding config with config/train_parliament_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out_parliament_char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'parliament_char'
wandb_run_name = 'mini-gpt'

dataset = 'parliament_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters


In [None]:
# Supreme Court: block size and sample length both set to half of the
# power-of-two dialogue length; training is requested on the CPU.
perform_train_extract_sample(
    'supreme_char',
    max_new_tokens=nearest_power_of_two_supreme_dialogue_len // 2,
    max_iters=4000,
    n_layer=6,
    n_embd=256,
    n_head=32,
    block_size=nearest_power_of_two_supreme_dialogue_len // 2,
    batch_size=48,
    device='cpu',
)