<a href="https://colab.research.google.com/github/MuhammedUmerNazir/NLP_MATH_GPT/blob/main/NLP_MATH_gpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://people.eecs.berkeley.edu/~hendrycks/MATH.tar

--2023-05-21 07:01:41--  https://people.eecs.berkeley.edu/~hendrycks/MATH.tar
Resolving people.eecs.berkeley.edu (people.eecs.berkeley.edu)... 128.32.244.190
Connecting to people.eecs.berkeley.edu (people.eecs.berkeley.edu)|128.32.244.190|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20327936 (19M) [application/x-tar]
Saving to: ‘MATH.tar’


2023-05-21 07:01:44 (9.79 MB/s) - ‘MATH.tar’ saved [20327936/20327936]



In [2]:
#untar the dataset
!tar -xf MATH.tar

In [3]:
#install relevant dependencies
!pip install transformers torch torchvision numpy pandas tqdm natsort wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m87.1 MB/s[0m eta [36m0:00:00[0m
Collecting wandb
  Downloading wandb-0.15.3-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m88.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m118.6 MB/s[0m e

In [4]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from torch.utils.data import Dataset, DataLoader
import glob
import os
import numpy as np
import pandas as pd
import json
from torch.optim import Adam
from tqdm import tqdm
from natsort import natsorted # for natural sort order (just for debugging)
    
import wandb



In [10]:
wandb.login(key = "3f4303f9afca2d65e2496142599869bef467a34a")

[34m[1mwandb[0m: Currently logged in as: [33mm-umernazir17[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [11]:
wandb.init(project="math dataset gpt2") # initialize wandb for logging

In [12]:
device = "cuda" if torch.cuda.is_available() else "cpu"  # device

In [13]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")  # tokenizer for GPT2
tokenizer.add_special_tokens({  # special tokens that we will add to the dataset
    "pad_token": "<pad>",  # for padding 
    "bos_token": "<startofstring>",  # beginning of string
    "eos_token": "<endofstring>",  # end of string
})

model = GPT2LMHeadModel.from_pretrained("gpt2")  # pretrained model
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [14]:
class MathDataset(Dataset):
  def __init__(self, base_dir, tokenizer):
    self.tokenizer = tokenizer
    self.filelist = natsorted(glob.glob(base_dir + "/*/*.json"))
    self.filelist = self.filelist[:round(0.2*len(self.filelist))]  # loading a subset (20) of the whole dataset
    self.problems = []
    self.solutions = []
    for filename in self.filelist:  # load all the problems and their solutions
      with open(filename, "r") as f:
        tt = f.read()
      file_json = json.loads(tt)
      self.problems.append(file_json["problem"])
      self.solutions.append(file_json["solution"])

  def __len__(self):
    return len(self.problems)

  def __getitem__(self, index):  # load one file
    input = self.problems[index]  
    label = self.solutions[index]

    input = "<startofstring>"+self.problems[index]+"<endofstring>"
    input_encoded = self.tokenizer(input, truncation=True, padding="max_length", max_length=1000,  return_tensors="pt")  # encode with GPT2 Tokenizer and return pytorch tensors
    input_id = input_encoded["input_ids"]
    input_attention_mask = input_encoded["attention_mask"]
    
    output = "<startofstring>"+self.solutions[index]+"<endofstring"
    output_encoded = self.tokenizer(input, truncation=True, padding="max_length", max_length=1000,  return_tensors="pt")
    output_id = output_encoded["input_ids"]
    output_attention_mask = output_encoded["attention_mask"]

    return {"input_id": input_id, "attention_mask": input_attention_mask, "label": output_id}

In [15]:
optimizer = Adam(model.parameters(), lr=1e-3)
batch_size = 4
max_epochs = 50

In [16]:
# create data loaders
train_dataset = MathDataset("/content/MATH/train", tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

val_dataset = MathDataset("/content/MATH/test", tokenizer)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [17]:
best_loss = np.inf
val_step = 2  # perform validation after val_step number of epochs

for epoch in range(max_epochs):
    train_loss = 0
    train_total_loss = 0
    train_total_tokens = 0
    
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        X = batch["input_id"].to(device)
        a = batch["attention_mask"].to(device)
        y = batch["label"].to(device)
        loss = model(X, attention_mask=a, labels=y).loss
        wandb.log({"train_loss": loss.item()})  # log to wandb
        train_loss += loss.item()
        
        train_total_loss += loss.item() * X.size(0)
        train_total_tokens += torch.sum(a).item()
        
        loss.backward()
        optimizer.step()
    
    train_loss = train_loss / len(train_loader)  # average loss per batch
    train_avg_loss = train_total_loss / train_total_tokens
    train_perplexity = torch.exp(torch.tensor(train_avg_loss)).item()
    wandb.log({"train_perplexity": train_perplexity})
    
    if epoch % val_step == 0:
        model.eval()
        with torch.no_grad():
            val_loss = 0
            val_total_loss = 0
            val_total_tokens = 0
            
            for batch in tqdm(val_loader):
                X = batch["input_id"].to(device)
                a = batch["attention_mask"].to(device)
                y = batch["label"].to(device)
                loss = model(X, attention_mask=a, labels=y).loss
                wandb.log({"val_loss": loss.item()})
                val_loss += loss.item()
                
                val_total_loss += loss.item() * X.size(0)
                val_total_tokens += torch.sum(a).item()
            
            val_loss = val_loss / len(val_loader)
            val_avg_loss = val_total_loss / val_total_tokens
            val_perplexity = torch.exp(torch.tensor(val_avg_loss)).item()
            wandb.log({"val_perplexity": train_perplexity})
            
            if val_loss < best_loss:
                torch.save(model, "math_gpt2_best.pt")
                wandb.save("math_gpt2_best.pt")
                best_loss = val_loss
                print(f"new model saved at epoch {epoch}")
                
            print(f"epoch: {epoch} | train loss: {round(train_loss, 4)} | train perplexity: {round(train_perplexity, 4)} | val loss: {round(val_loss, 4)} | val perplexity: {round(val_perplexity, 4)}")
    else:
        print(f"epoch: {epoch} | train loss: {round(train_loss, 4)} | train perplexity: {round(train_perplexity, 4)}")

100%|██████████| 375/375 [08:14<00:00,  1.32s/it]
100%|██████████| 250/250 [01:53<00:00,  2.21it/s]


new model saved at epoch 0
epoch: 0 | train loss: 0.6452 | train perplexity: 1.0108 | val loss: 0.2158 | val perplexity: 1.004


100%|██████████| 375/375 [08:18<00:00,  1.33s/it]


epoch: 1 | train loss: 0.2054 | train perplexity: 1.0034


100%|██████████| 375/375 [08:11<00:00,  1.31s/it]
100%|██████████| 250/250 [01:53<00:00,  2.20it/s]


new model saved at epoch 2
epoch: 2 | train loss: 0.1468 | train perplexity: 1.0025 | val loss: 0.1396 | val perplexity: 1.0026


100%|██████████| 375/375 [08:10<00:00,  1.31s/it]


epoch: 3 | train loss: 0.1102 | train perplexity: 1.0018


100%|██████████| 375/375 [08:09<00:00,  1.30s/it]
100%|██████████| 250/250 [01:53<00:00,  2.20it/s]


new model saved at epoch 4
epoch: 4 | train loss: 0.0856 | train perplexity: 1.0014 | val loss: 0.1275 | val perplexity: 1.0024


100%|██████████| 375/375 [08:09<00:00,  1.30s/it]


epoch: 5 | train loss: 0.0706 | train perplexity: 1.0012


100%|██████████| 375/375 [08:10<00:00,  1.31s/it]
100%|██████████| 250/250 [01:53<00:00,  2.20it/s]


epoch: 6 | train loss: 0.0571 | train perplexity: 1.001 | val loss: 0.1454 | val perplexity: 1.0027


100%|██████████| 375/375 [08:09<00:00,  1.30s/it]


epoch: 7 | train loss: 0.0491 | train perplexity: 1.0008


100%|██████████| 375/375 [08:09<00:00,  1.31s/it]
100%|██████████| 250/250 [01:54<00:00,  2.19it/s]


epoch: 8 | train loss: 0.0425 | train perplexity: 1.0007 | val loss: 0.1552 | val perplexity: 1.0029


100%|██████████| 375/375 [08:10<00:00,  1.31s/it]


epoch: 9 | train loss: 0.0377 | train perplexity: 1.0006


100%|██████████| 375/375 [08:10<00:00,  1.31s/it]
100%|██████████| 250/250 [01:54<00:00,  2.19it/s]


epoch: 10 | train loss: 0.0342 | train perplexity: 1.0006 | val loss: 0.175 | val perplexity: 1.0033


100%|██████████| 375/375 [08:09<00:00,  1.31s/it]


epoch: 11 | train loss: 0.0316 | train perplexity: 1.0005


100%|██████████| 375/375 [08:15<00:00,  1.32s/it]
100%|██████████| 250/250 [01:52<00:00,  2.23it/s]


epoch: 12 | train loss: 0.2926 | train perplexity: 1.0049 | val loss: 0.2564 | val perplexity: 1.0048


  5%|▍         | 17/375 [00:23<08:22,  1.40s/it]


KeyboardInterrupt: ignored

In [18]:
def infer(input):
  tokens = tokenizer("<startofstring>"+input+"<endofstring>", truncation=True, padding="max_length", max_length=1000,  return_tensors="pt").to(device)
  output = model.generate(**tokens, max_length=1001)
  output = tokenizer.decode(output[0])
  return output

In [19]:
wandb.finish()

VBox(children=(Label(value='486.799 MB of 486.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.99…

0,1
train_loss,▃▃▃▂▃▂▂▂▅█▁▁▂▂▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▅▂▄
train_perplexity,█▃▂▂▂▁▁▁▁▁▁▁▄
val_loss,▄▄▃▃▃▃▁▃▂▁▁▄▂▂▃▂▂▂▁▃▄▃▃▂▂▃▂▂▁█▆▄▂▂▃▇▃▃▅▅
val_perplexity,█▂▂▁▁▁▄

0,1
train_loss,0.19566
train_perplexity,1.0049
val_loss,0.15617
val_perplexity,1.0049


In [20]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:
import os

#set the working directory 

root_dir = "/content/drive/My Drive/"

#choose the project folder
project_folder = "Colab Notebooks/Project_folder/"

#define a function to create and set the working directory
def create_and_set_working_directory(project_folder):
  #check if the project folder exists. if not, make one.
  if os.path.isdir(root_dir + project_folder) == False:
    os.mkdir(root_dir + project_folder)
    print(root_dir + project_folder + 'did not existed and was created.')
  
  #change the OS path to project folder as working directory
  os.chdir(root_dir + project_folder)

  #create a test file in the working directory and see if it shows up at the right place
  !touch 'new_file_test.txt'
  print('working directory' + root_dir + project_folder + \
        "empty text file created. You can also run !pwd command to confirm working directory.")

create_and_set_working_directory(project_folder)



/content/drive/My Drive/Colab Notebooks/Project_folder/did not existed and was created.
working directory/content/drive/My Drive/Colab Notebooks/Project_folder/empty text file created. You can also run !pwd command to confirm working directory.


In [22]:
ls

new_file_test.txt


In [23]:
os.chdir('/content')

In [24]:
ls

[0m[01;34mdrive[0m/  [01;34mMATH[0m/  math_gpt2_best.pt  MATH.tar  [01;34msample_data[0m/  [01;34mwandb[0m/


In [27]:
!cp math_gpt2_best.pt MATH.tar "/content/drive/My Drive/Colab Notebooks/Project_folder/"

In [28]:
!cp -r wandb MATH "/content/drive/My Drive/Colab Notebooks/Project_folder/"

In [29]:
os.chdir(root_dir + project_folder)

In [30]:
ls

[0m[01;34mMATH[0m/  math_gpt2_best.pt  MATH.tar  new_file_test.txt  [01;34mwandb[0m/
