In [1]:
import math_dataset 
from math_dataset import MathDatasetManager
import torch
import torch.optim as optim
from torch.utils import data
from math_dataset import (
    question_answer_to_position_batch_collate_fn
)
import model_process


import utils

%matplotlib notebook  

print("Torch Version", torch.__version__)

%load_ext autoreload
%autoreload 2

Torch Version 1.5.0+cu101


## Math Dataset Manager

This class is just a Numpy/Pytorch helper to manage all files in Math Dataset v1.0 and select different parts of it by categories or modules to generate a Pytorch dataset for training. Pytorch Datasets created doesn't mount all questions/answers in memory and use Pandas limited streaming features to bufferize data. It allows loading huge files quite fast while keeping memory print reasonable. It also caches lazy datasets and allows fast re-using previously created ones.

Here are the main features provided right now.

### Initialize Math Dataset Manager

In [2]:
mdsmgr = MathDatasetManager(
  "C:\\Users\\Jesús\\Documents\\PC2\\TorchDemo\\hs-math-nlp\\mathematics_dataset-v1.0\\mathematics_dataset-v1.0\\"
)

initialized MultiFilesMathDataset with categories ['algebra', 'arithmetic', 'calculus', 'comparison', 'measurement', 'numbers', 'polynomials', 'probability'] and types ['train-easy', 'train-medium', 'train-hard', 'interpolate', 'extrapolate']


### Check availables types (difficulties + interpolate + extrapolate)

In [3]:
print("types", list(mdsmgr.get_types()))

types ['train-easy', 'train-medium', 'train-hard', 'interpolate', 'extrapolate']


### Check availables problem categories

In [4]:
print("categories", list(mdsmgr.get_categories()))

categories ['algebra', 'arithmetic', 'calculus', 'comparison', 'measurement', 'numbers', 'polynomials', 'probability']


## Pytorch Initialization

In [4]:
seed = 1
torch.manual_seed(seed)
device = torch.device("cuda")
print("device", device)

device cuda


## Train on Algebra Linear_1d in Easy mode

### Create an experiment with a name and a unique ID

In [5]:
exp_name = "linear_algebra_2045" # "math_ds_algebra_linear_1d_easy"
unique_id = "2021-07-34" # "2019-05-25_0900"

### Build Dataset for training

#### Train-easy dataset

In [6]:
ds = mdsmgr.build_dataset_from_module(
    'algebra', 'linear_1d', 'train-easy', 1000
)
print("train-easy dataset size", len(ds))

train-easy dataset size 1000


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


#### Interpolate dataset

In [7]:
ds_interpolate = mdsmgr.build_dataset_from_module(
    'algebra', 'linear_1d', 'interpolate', 1000
)
print("interpolate dataset size", len(ds_interpolate))

interpolate dataset size 1000


### Create default Transformer model

Here we test the best model found in the paper: a multi-head self-attention transformer to give a default sample.


In [8]:
model = utils.build_transformer()

### Create basic optimizer

In [10]:
optimizer = optim.Adam(model.parameters(), lr=6e-6, betas=(0.9, 0.995), eps=1e-9)

### Create Pytorch dataloaders

In [11]:
# here we split data in 90/10% for train/validation and use interpolate for test
train_ds, val_ds = math_dataset.random_split_dataset(ds, split_rate=0.9)

# we provide the function question_answer_to_position_batch_collate_fn that collates
# all questions/answers into transformer format enhanced with char positioning
train_loader = data.DataLoader(
    train_ds, batch_size=128, shuffle=True, num_workers=8,
    collate_fn=question_answer_to_position_batch_collate_fn)

val_loader = data.DataLoader(
    val_ds, batch_size=128, shuffle=False, num_workers=8,
    collate_fn=question_answer_to_position_batch_collate_fn)

interpolate_loader = data.DataLoader(
    ds_interpolate, batch_size=128, shuffle=False, num_workers=8,
    collate_fn=question_answer_to_position_batch_collate_fn)


In [12]:
model = model.to(device)


In [13]:
model_process.train(
    name = exp_name +"-" + unique_id,
    model = model,
    training_data= train_loader,
    validation_data = val_loader,
    interpolate_data=interpolate_loader,
    optimizer = optimizer,
    device = device,
    epochs=8,
    tb=None,
    log_interval=100)

~~~ Beginning Training ~~~~
Start epoch: 0, Start batch: 0, Max batch: None
[ Epoch: 0 / 8, Run Batch: 0 / None]


KeyboardInterrupt: 

### Plotting Training from Tensorboard data

#### Restore best model for this experience

In [9]:
import checkpoints


# build default transformer model
model = utils.build_transformer()

#model_exp_name = "linear_algebra" # "math_ds_algebra_linear_1d_easy"
#model_unique_id  = "2020-07-22" # "2019-05-25_0900"
#model_exp_name = 'math_ds_algebra_linear_1d_easy'
#model_unique_id = '2019-10-27_2300'
# restore best validation model from checkpoint
_ = checkpoints.restore_checkpoint(".\\checkpoints\\linear_algebra_2045-2021-07-34_4688_training_best.pth","", model=model)


Attempting to extract state from .\checkpoints\linear_algebra_2045-2021-07-34_4688_training_best.pth...
Loading model state_dict from state found in .\checkpoints\linear_algebra_2045-2021-07-34_4688_training_best.pth


#### Loading tensorboard events

> As we can see, loss per char on validation dataset has a nice optimization curve but for interpolate, it's not the case. It's quite normal, interpolate contains more difficult and general cases. 

#### Accuracy Evolution during training

In [16]:
plt.rcParams['figure.figsize'] = [10, 6]

fig, ax = plt.subplots()

ax.plot(
    list(map(lambda l: l.step, valid_accuracy)),
    list(map(lambda l: l.value, valid_accuracy)),
    marker='+', label='Validation Accuracy'
)
ax.plot(
    list(map(lambda l: l.step, interpolate_accuracy)),
    list(map(lambda l: l.value, interpolate_accuracy)),
    marker='+', label='Interpolate Accuracy'
)
plt.title('Algebra/Linear_1d Accuracy')
ax.legend(loc='upper left', frameon=False)
plt.xticks(np.arange(0, 20, step=1.0))
plt.yticks(np.arange(0.3, 1.0, step=0.1))
plt.show()


NameError: name 'plt' is not defined

> Accuracy for validation dataset is growing constantly up to 85% while for interpolate dataset, it doesn't change much. Interpolate dataset contains too complicated and generic problems compared to training set.

### Test Model

In [17]:
model_process.predict_single("Solve 5*w + 3 = -2 for w.", model, device, n_best=1)


[{'resp': '-1', 'score': 0.0}]

In [18]:
model_process.predict_single("Solve 212 = 56*z + 44 for z.", model, device, n_best=1)


[{'resp': '1', 'score': -0.130828857421875}]

In [19]:
model_process.predict_single("Solve 2514*m = 2508*m - 24 for m.", model, device, n_best=1)

[{'resp': '-4', 'score': -0.0047149658203125}]

In [10]:
model_process.predict_single("Solve 5*w + 3 = -2 for w.", model, device, n_best=1)




[{'resp': '-1', 'score': 0.0}]

In [13]:
model_process.predict_single("Solve 212 = 56*z + 44 for z.", model, device, n_best=1)


[{'resp': '1', 'score': -0.130828857421875}]

In [12]:
model_process.predict_single("Solve 2514*m = 2508*m - 24 for m.", model, device, n_best=1)

[{'resp': '-4', 'score': -0.0047149658203125}]

In [14]:
model_process.predict_single("Solve 36*u - 20 = 52 for u.", model, device, n_best=1)



[{'resp': '2', 'score': 0.0}]

In [15]:
model_process.predict_single("Solve 36*a - 20 = 52 for a.", model, device, n_best=1)


[{'resp': '2', 'score': 0.0}]

In [16]:
model_process.predict_single("Find 36*x - 20 = 52 for x.", model, device, n_best=1)


[{'resp': '5', 'score': -0.0032958984375}]

In [22]:
model_process.predict_single("Solve x - 0 = 0  for x.", model, device, n_best=1)


[{'resp': '0', 'score': 0.0}]

In [23]:
model_process.predict_single("Solve 14*w - 85*w + 17*w + 20*w + 272 = 0 for w.", model, device, n_best=1)



[{'resp': '-8', 'score': -0.19140625}]

In [None]:
#Solve 91*b - 4030 - 1453 = -188*b + 1492 for b.