# Notebook to play with the text datasets

In [72]:
from __future__ import print_function

import statistics as stats
from timeit import default_timer as timer

import matplotlib.pyplot as plt
import numpy as np
import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchbraid
import torchbraid.utils
from torchvision import datasets, transforms
import sys

from network_architecture import parse_args, ParallelNet
from mpi4py import MPI


In [73]:
## Partially taken from Karpathy's github: [url]

import os
import torch
# from transformers import GPT2Tokenizer, GPT2Model  <-- below

def obtain_data(data_dir, input_text, tokenization):
  """Read a text corpus, tokenize it, and split it into train/validation data.

  Args:
    data_dir: Directory that contains the corpus file.
    input_text: Corpus file name without the '.txt' extension.
    tokenization: 'character' builds a character-level vocabulary from the
      corpus itself; 'gpt2' uses the tokenizer returned by
      GPT2Tokenizer.from_pretrained('gpt2-tokenizer').

  Returns:
    A tuple (train_data, val_data, decode, vocab_size): the first 90% and the
    remaining 10% of the token ids as 1-D torch.long tensors, a callable that
    turns a sequence of token ids back into a string, and the vocabulary size.

  Raises:
    ValueError: if `tokenization` is neither 'character' nor 'gpt2'.
  """
  data_path = os.path.join(data_dir, input_text + '.txt')

  print('1.1 Reading text')
  with open(data_path, 'r', encoding='utf-8') as f:
    text = f.read()

  if tokenization == 'character':
    print('1.2 Building character-level tokenizer')
    # All unique characters that occur in the text, in sorted order
    chars = sorted(set(text))
    vocab_size = len(chars)
    # Character <-> integer lookup tables
    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = dict(enumerate(chars))

    def encode(s):
      """Take a string, output a list of integers."""
      return [stoi[c] for c in s]

    def decode(ids):
      """Take a sequence of integers (list or 1-D tensor), output a string."""
      # int() accepts plain ints as well as 0-d tensors, so slices of the
      # returned train/val tensors can be decoded directly, exactly like
      # with the gpt2 tokenizer's decode.
      return ''.join(itos[int(i)] for i in ids)

    print('1.3 Encoding data')
    data = torch.tensor(encode(text), dtype=torch.long)

  elif tokenization == 'gpt2':
    # Imported lazily so the character-level path does not need transformers
    from transformers import GPT2Tokenizer

    print('1.2 Obtaining gpt2 tokenizer')
    # tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-tokenizer')
    # tokenizer.pad_token = '<pad>'
    decode = tokenizer.decode
    vocab_size = tokenizer.vocab_size

    print('1.3 Encoding data')
    data = tokenizer(text)['input_ids']
    data = torch.tensor(data, dtype=torch.long)
    print(data.shape)

  else:
    raise ValueError(
      f"Unsupported tokenization {tokenization!r}; "
      "expected 'character' or 'gpt2'"
    )

  print('1.4 Splitting data into training and validation data')
  # First 90% of the tokens for training, the rest for validation
  n = int(.9*len(data))
  train_data, val_data = data[:n], data[n:]

  return train_data, val_data, decode, vocab_size

In [74]:
# Load ./shakespeare.txt, tokenize it with the gpt2 tokenizer and split it 90/10.
train_data, val_data, decode, vocabulary_size = \
    obtain_data('.', 'shakespeare', 'gpt2')


1.1 Reading text
1.2 Obtaining gpt2 tokenizer
1.3 Encoding data


Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors


torch.Size([338025])
1.4 Splitting data into training and validation data


In [75]:
# Sanity check: decode the tail of the validation split back into text.
# The slice [-30:-1] stops one short of the end, so the very last token is not shown.
decode(val_data[-30:-1])

"\nNoble Sebastian,\nThou let'st thy fortune sleep--die, rather; wink'st\nWhiles thou art waking."

In [76]:
# Inspect the token ids of the training split.
train_data

tensor([ 5962, 22307,    25,  ...,  3398,  9399,    25])

In [78]:
def get_batch(data, context_window, batch_size, device):
  """Sample a random batch of input windows and their next-token targets.

  `batch_size` start offsets are drawn uniformly at random; each input is the
  `context_window` tokens starting at an offset, and each target is the same
  window shifted one token to the right. Both stacked tensors are moved to
  `device` before being returned as (x, y).
  """
  starts = torch.randint(len(data) - context_window, (batch_size,))
  inputs, targets = [], []
  for start in starts:
    stop = start + context_window
    inputs.append(data[start:stop])
    targets.append(data[start + 1:stop + 1])
  x = torch.stack(inputs).to(device)
  y = torch.stack(targets).to(device)
  return x, y

# Draw one random batch as a sanity check. Fall back to the CPU when no GPU is
# visible so the cell also runs on CPU-only machines.
x, y = get_batch(train_data, 256, 32, 'cuda' if torch.cuda.is_available() else 'cpu')

In [79]:
# Expected shape: (batch_size, context_window) = (32, 256).
x.shape

torch.Size([32, 256])

# Okay I think the data is correct. 

In [53]:
from torch.utils.data import Dataset, DataLoader


In [95]:
context_window = 256
batch_size = 32

# Number of tokens per epoch that are covered by whole context windows. One
# window is held back so the one-token-shifted targets never run past the data.
num_samples_per_epoch_train = context_window * (len(train_data) // context_window - 1)
num_samples_per_epoch_val = context_window * (len(val_data) // context_window - 1)

# Check that the truncated training tokens tile into rows of exactly one
# context window each (one row per dataset sample), matching how the
# dataset class below slices the data.
train_data[0:num_samples_per_epoch_train].reshape((-1, context_window))


class TextDataset(Dataset):
    """Map-style dataset of non-overlapping, fixed-length token windows.

    Item `idx` is the pair (x, y), where x is
    data[idx * context_window : (idx + 1) * context_window] and y is the same
    window shifted one token to the right (the next-token targets).

    Args:
        data: 1-D sequence of token ids supporting len() and slicing.
        context_window: Number of tokens per sample; must be positive.

    Raises:
        ValueError: if `context_window` is not positive.
    """

    def __init__(self, data, context_window = 256):
        if context_window <= 0:
            raise ValueError(f"context_window must be positive, got {context_window}")

        # One window is held back so the shifted target slice always fits.
        # Clamp at zero so len() stays valid when the data is shorter than
        # two windows.
        self.length = max(len(data) // context_window - 1, 0)

        self.data = data
        self.context_window = context_window

    def __len__(self):
        """Return the number of (input, target) windows."""
        return self.length

    def __getitem__(self, idx):
        """Return the (input, target) pair for window `idx`.

        Negative indices count from the end. An out-of-range index raises
        IndexError instead of silently returning short or empty slices.
        """
        if idx < 0:
            idx = idx + self.length
        if not 0 <= idx < self.length:
            raise IndexError(f"index out of range for TextDataset of length {self.length}")

        start = idx * self.context_window
        stop = start + self.context_window
        return self.data[start:stop], self.data[start + 1:stop + 1]



In [96]:
# Pass the notebook-level context_window explicitly so the dataset cannot drift
# from the configured window size if that value is changed.
train_dataset = TextDataset(train_data, context_window)

In [97]:
# Number of windows in the dataset: len(train_data) // 256 - 1.
print(len(train_dataset))

1187


In [98]:
# Fetch one (input, target) pair. Note that this rebinds x and y, which
# previously held the batch returned by get_batch.
x, y = train_dataset[10]

In [99]:
# A single sample is one context window of tokens.
print(x.shape)

torch.Size([256])


In [100]:
# shuffle=False: batches are drawn in dataset index order, i.e. in corpus order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

In [50]:
# NOTE(review): this cell's execution count (50) predates the cells above, so
# `x` was presumably a batched 2-D tensor when it ran — confirm before relying
# on the output shown below.
x[0].shape

torch.Size([256])

In [28]:
# Drop the ragged tail so the length is a whole multiple of context_window,
# then reshape into a 2-D tensor.
# NOTE(review): reshape((context_window, -1)) produces context_window rows
# rather than one row per window; reshape((-1, context_window)) may be what
# was intended — confirm.
train_data[0:
    context_window * (len(train_data) // context_window)
].reshape((context_window, -1))

tensor([[ 5962, 22307,    25,  ...,    11,   198,  9590],
        [ 6508, 27794,   262,  ...,  1867,   338,   511],
        [ 6095,    30,   198,  ...,    32,  1295,  2174],
        ...,
        [  314, 12472,    13,  ...,     0, 44012, 13676],
        [  428, 35831,   594,  ...,   198,  1135,   481],
        [  467,  2513,   257,  ...,   345, 12891,    25]])

In [18]:
# NOTE: without parentheses, `*` and `//` evaluate left to right, so the bound
# is (context_window * len(train_data)) // context_window == len(train_data)
# and nothing is truncated. The next cell shows the parenthesised form.
len(train_data[0:
    context_window * len(train_data) // context_window
])

304222

In [26]:
# Largest multiple of context_window that fits in the training split.
context_window * (len(train_data) // context_window)

304128

In [27]:
# Number of whole context windows in that truncated length (value copied from the previous cell's output).
304128 / context_window

1188.0

# Tokenize the Wikipedia dataset

In [40]:
from transformers import GPT2TokenizerFast
# Load the GPT2TokenizerFast tokenizer from 'gpt2-tokenizer'; the alternative
# below is kept for reference.
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2-tokenizer')
# tokenizer.pad_token = '<pad>'
# Expose the tokenizer's decode function and vocabulary size as notebook-level
# names (this rebinds `decode` from the Shakespeare cell).
decode = tokenizer.decode
vocab_size = tokenizer.vocab_size


In [41]:
from tqdm.notebook import tqdm

In [46]:
data = []
counter = 0  # not used by the loop below
# The tqdm total is the file's line count, measured on Linux: 2966378
with open('../data/wikipedia.txt', 'r', encoding='utf-8') as f:
  # tqdm wraps the file iterator to display tokenization progress
  for line in tqdm(f, desc="Tokenizing Wikipedia", total=2966378):
    # Remove surrounding whitespace, including the trailing newline
    text = line.strip()

    # Blank (whitespace-only) lines contribute no tokens
    if text:
      # Append this line's token ids to the flat token list
      data.extend(tokenizer(text)['input_ids'])


Tokenizing Wikipedia:   0%|          | 0/2966378 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1059 > 1024). Running this sequence through the model will result in indexing errors


In [49]:
# Decode the first 100 token ids to spot-check the tokenized Wikipedia text.
decode(data[0:100])

'= Valkyria Chronicles III =Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3, lit. Valkyria of the Battlefield 3 ), commonly referred to as Valkyria Chronicles III outside Japan, is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable. Released in January 2011 in Japan, it is the third game in the Valkyria'