In [1]:
!pip install transformers tokenizers

Collecting transformers
  Downloading transformers-4.12.3-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 6.7 MB/s 
[?25hCollecting tokenizers
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 51.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 48.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.0-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 7.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 36.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting un

In [2]:
# This allows direct access to files in Google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import gc

import torch
import numpy as np
from tqdm import tqdm
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader

torch.cuda.memory_summary(device=None, abbreviated=None)

In [4]:
ITER = 7
DIVS = 10
root_folder = '/content/drive/MyDrive/EEP/TCC'
data_folder = f'{root_folder}/Data/python_data_03'
output_folder = f'{root_folder}/Model/GPT2/{ITER:02}'
model_folder = f'{root_folder}/Model/GPT2/{ITER-1:02}' if ITER > 0 else 'gpt2'
files = [f'data_{l:02}.txt' for l in range(1, 21)]

In [5]:
SPLIT = 0

# Load Dataset
full_data = open(f'{data_folder}/{files[ITER]}').read().splitlines()

split_data = np.array_split(full_data, DIVS)
ds_sequences = split_data[SPLIT]

len(ds_sequences)

40004

In [6]:
BOS_TOKEN = '<|startoftext|>'
EOS_TOKEN = '<|endoftext|>'
PAD_TOKEN = '<|pad|>'

tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2',
    bos_token = BOS_TOKEN,
    eos_token = EOS_TOKEN,
    pad_token = PAD_TOKEN
)

model = GPT2LMHeadModel.from_pretrained(model_folder).cuda()
model.resize_token_embeddings(len(tokenizer))

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 768)

In [None]:
class CodeDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=1024):

        self.input_ids = []
        self.attn_masks = []
        self.labels = []

        for row in tqdm(dataset):
            encodings = tokenizer(
                BOS_TOKEN + row + EOS_TOKEN,
                truncation=True,
                max_length=max_length,
                padding='max_length'
            )
            self.input_ids.append(torch.tensor(encodings['input_ids']))
            self.attn_masks.append(torch.tensor(encodings['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

torch.cuda.memory_summary(device=None, abbreviated=None)

In [8]:
gpt_dataset = CodeDataset(ds_sequences, tokenizer)

100%|██████████| 40004/40004 [01:02<00:00, 635.61it/s]


In [9]:
print(gpt_dataset[0])

(tensor([50257,   220,   220,  ..., 50258, 50258, 50258]), tensor([1, 1, 1,  ..., 0, 0, 0]))


In [10]:
args = TrainingArguments(
    output_dir=output_folder,
    num_train_epochs=1,
    logging_steps=1000,
    save_steps=15000,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3
)

data_collator = lambda data: {
    'input_ids': torch.stack([f[0] for f in data]),
    'attention_mask': torch.stack([f[1] for f in data]),
    'labels': torch.stack([f[0] for f in data])
}

trainer = Trainer(
    args=args,
    model=model,
    data_collator=data_collator,
    train_dataset=gpt_dataset,
    tokenizer=tokenizer
)

In [11]:
del ds_sequences
gc.collect()

50

In [12]:
torch.cuda.empty_cache()

In [None]:
torch.cuda.memory_summary(device=None, abbreviated=None)

In [14]:
trainer.train()

***** Running training *****
  Num examples = 40004
  Num Epochs = 1
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 3
  Gradient Accumulation steps = 1
  Total optimization steps = 13335


Step,Training Loss
1000,0.2907
2000,0.2805
3000,0.2729
4000,0.2719
5000,0.2608
6000,0.262
7000,0.261
8000,0.2619
9000,0.261
10000,0.2563




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=13335, training_loss=0.26401428577378755, metrics={'train_runtime': 8136.0038, 'train_samples_per_second': 4.917, 'train_steps_per_second': 1.639, 'total_flos': 2.0905452896256e+16, 'train_loss': 0.26401428577378755, 'epoch': 1.0})

In [15]:
trainer.save_model()

Saving model checkpoint to /content/drive/MyDrive/EEP/TCC/Model/GPT2/07
Configuration saved in /content/drive/MyDrive/EEP/TCC/Model/GPT2/07/config.json
Model weights saved in /content/drive/MyDrive/EEP/TCC/Model/GPT2/07/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/EEP/TCC/Model/GPT2/07/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/EEP/TCC/Model/GPT2/07/special_tokens_map.json
added tokens file saved in /content/drive/MyDrive/EEP/TCC/Model/GPT2/07/added_tokens.json
