# Environment

In [1]:
# Create a directory named 'nmt'
!mkdir nmt

# Change the current working directory to the newly created 'nmt' directory
%cd nmt

# Inside the 'nmt' directory, create another directory named 'nmtmodel'
!mkdir nmtmodel


/content/nmt


In [2]:
# Install OpenNMT-py together with pinned versions of torchvision (0.14.1) and torchaudio (0.13.1).
# NOTE: OpenNMT-py itself is NOT pinned here, so pip picks whatever release is current.
# TODO(review): pin OpenNMT-py to the version this notebook was validated with for reproducibility.
# '> /dev/null' only discards stdout; pip's stderr (e.g. dependency-conflict messages) is still shown.
!pip install OpenNMT-py torchvision==0.14.1 torchaudio==0.13.1 > /dev/null


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchdata 0.6.1 requires torch==2.0.1, but you have torch 1.13.1 which is incompatible.
torchtext 0.15.2 requires torch==2.0.1, but you have torch 1.13.1 which is incompatible.[0m[31m
[0m

In [3]:
# Sanity check: print the current working directory.
# It should be the project directory (/content/nmt), because config.yaml is later
# written to and read from the working directory via a relative path.
!pwd


/content/nmt


# Prepare Dataset

In [4]:
# Copy the 'monument50.zip' file from Google Drive to the current directory
!cp /content/drive/MyDrive/monument50.zip ./

# Unzip the contents of the 'monument50.zip' file into the current directory
# The '-d' flag specifies the destination directory for the extracted files
!unzip ./monument50.zip -d ./

# Remove the original 'monument50.zip' file from the 'nmt' directory
!rm /content/nmt/monument50.zip

Archive:  ./monument50.zip
   creating: ./monument50/
  inflating: ./__MACOSX/._monument50  
  inflating: ./monument50/dev.en     
  inflating: ./__MACOSX/monument50/._dev.en  
  inflating: ./monument50/dev.sparql  
  inflating: ./__MACOSX/monument50/._dev.sparql  
  inflating: ./monument50/.DS_Store  
  inflating: ./__MACOSX/monument50/._.DS_Store  
  inflating: ./monument50/train.sparql  
  inflating: ./__MACOSX/monument50/._train.sparql  
  inflating: ./monument50/train.en   
  inflating: ./__MACOSX/monument50/._train.en  
  inflating: ./monument50/test.sparql  
  inflating: ./__MACOSX/monument50/._test.sparql  
  inflating: ./monument50/test.en    
  inflating: ./__MACOSX/monument50/._test.en  


In [None]:
# List the contents of the current directory to confirm the dataset was extracted
# (the 'monument50' directory should be present next to 'nmtmodel')
!ls


# Create the Training Configuration File

In [6]:
# Define the path for the model root directory.
# This variable is interpolated into config.yaml (vocabulary, checkpoint and log paths)
# and into later shell commands via IPython's '{model_root}' substitution.
model_root = '/content/nmt/nmtmodel'

# Create the model root directory if it doesn't exist
# The '-p' flag ensures that intermediate directories are also created if needed,
# and that the command does not fail when the directory is already there
!mkdir -p '{model_root}'


In [7]:
# Define the content of the OpenNMT-py configuration file as a formatted string.
# '{model_root}' is the only interpolated value; dataset paths are hard-coded.
#
# Notes on the settings below:
# - 'src_vocab' and 'tgt_vocab' point at the same file on purpose: with
#   'share_vocab: true' a single joint vocabulary is used for both sides.
# - NOTE(review): with 'train_steps: 1200' and 'valid_steps: 400' only three validations
#   appear to run, so 'early_stopping: 4' presumably can never trigger - confirm whether
#   that is intended.
# - 'keep_checkpoint' is commented out, so every checkpoint written each
#   'save_checkpoint_steps' (100) steps is kept on disk.
config = f'''# config.yaml
## Where the samples will be written
save_data: {model_root}

## Where the vocab(s) will be written
# Vocabulary files, generated by onmt_build_vocab
src_vocab: {model_root}/src.vocab
tgt_vocab: {model_root}/src.vocab

# Vocabulary size - should be the same as in sentence piece
src_vocab_size: 5000
tgt_vocab_size: 5000
share_vocab: true

# Training files
data:
    train:
        path_src: /content/nmt/monument50/train.en
        path_tgt: /content/nmt/monument50/train.sparql
    valid:
        path_src: /content/nmt/monument50/dev.en
        path_tgt: /content/nmt/monument50/dev.sparql

# Where to save the checkpoints
save_model: {model_root}/model
log_file: {model_root}/train.log
save_checkpoint_steps: 100
train_steps: 1200
valid_steps: 400

# Stop training if it does not imporve after n validations
early_stopping: 4

# To save space, limit checkpoints to last n
# keep_checkpoint: 3

seed: 4242

# Number of GPUs, and IDs of GPUs
world_size: 1
gpu_ranks: [0]

# Batching
# queue_size: 100
bucket_size: 262144
num_workers: 0  # Default: 2, set to 0 when RAM out of memory
batch_type: "tokens"
batch_size: 4096   # Tokens per batch, change when CUDA out of memory
valid_batch_size: 2048
# world_size: 1
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]

# Optimization
# model_dtype: "fp16"
optim: "adam"
# learning_rate: 2
warmup_steps: 500
decay_method: "noam"
adam_beta1: 0.9
adam_beta2: 0.98
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"

# Model
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 6
dec_layers: 6
heads: 8
hidden_size: 512
word_vec_size: 512
transformer_ff: 2048
# dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]
'''

# Write the configuration string to "config.yaml" in the current working directory.
# The file is only written, so plain "w" is enough (the original "w+" also opened it for
# reading); the encoding is explicit so the result does not depend on the locale.
with open("config.yaml", "w", encoding="utf-8") as config_file:
    config_file.write(config)

# Build Vocabulary

In [8]:
# Import the 'os' module to interact with the operating system
import os

# Check if the source vocabulary file does not exist in the model root directory
if not os.path.exists(os.path.join(model_root, 'src.vocab')):
    # Use the 'onmt_build_vocab' command to generate vocabulary files
    # The '-config' flag specifies the configuration file
    # The '--n_sample -1' flag ensures that all training data samples are used
    # The '|| true' part ensures that the command continues running even if it encounters an error
    ! onmt_build_vocab -config config.yaml --n_sample -1 || true


Corpus train's weight should be given. We default it to 1 for you.
[2023-07-09 16:07:53,437 INFO] Counter vocab from -1 samples.
[2023-07-09 16:07:53,437 INFO] n_sample=-1: Build vocab on full datasets.
[2023-07-09 16:07:53,725 INFO] Counters src: 2035
[2023-07-09 16:07:53,725 INFO] Counters tgt: 1635
[2023-07-09 16:07:53,726 INFO] Counters after share:3657


# Check GPU

In [9]:
# Display GPU information using the 'nvidia-smi' command
!nvidia-smi

# Print a separator for better readability
print('\n\n$*****************************************************************************$')

# Display information about the available GPUs using the 'nvidia-smi -L' command
print('GPU:')
!nvidia-smi -L

# Print a separator for better readability
print('$*****************************************************************************$')

# Check if the GPU is available for PyTorch by importing the 'torch' module
import torch

# Print whether CUDA (GPU support) is available
print(torch.cuda.is_available())

# Print the name of the GPU device (if available)
print(torch.cuda.get_device_name(0))

# Get GPU memory information
gpu_memory = torch.cuda.mem_get_info(0)
print("Free GPU memory:", gpu_memory[0]/1024**2, "out of:", gpu_memory[1]/1024**2)

# Print a separator for better readability
print('$*****************************************************************************$')

# Display information about the Linux distribution using 'lsb_release -a'
!lsb_release -a

# Print a separator for better readability
print('$*****************************************************************************$')

# Display the Linux kernel version using 'uname -r'
!uname -r

# Print a separator for better readability
print('$*****************************************************************************$')

# Display the version information for the CUDA toolkit using 'nvcc --version'
!nvcc --version

# Print a separator for better readability
print('$*****************************************************************************$')

# Display the version information for PyTorch using 'torch.__version__'
import torch
print(torch.__version__)

# Print a separator for better readability
print('$*****************************************************************************$')

# Display CPU information using 'cat /proc/cpuinfo | grep model\ name'
!cat /proc/cpuinfo | grep model\ name

# Print a separator for better readability
print('$*****************************************************************************$')

# Display total system memory information using 'cat /proc/meminfo | grep MemTotal'
!cat /proc/meminfo | grep MemTotal


Sun Jul  9 16:07:54 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Training

In [10]:
# Train the NMT model using the 'onmt_train' command and provide the configuration file 'config.yaml'
# 'config.yaml' is resolved relative to the current working directory, so this cell must
# run from the directory in which the configuration cell wrote the file.
# Per that configuration, checkpoints are saved under the model root every 100 steps
# up to step 1200, and the training log is written to 'train.log' in the same directory.
!onmt_train -config config.yaml


[2023-07-09 16:07:58,501 INFO] Missing transforms field for train data, set to default: [].
[2023-07-09 16:07:58,502 INFO] Missing transforms field for valid data, set to default: [].
[2023-07-09 16:07:58,502 INFO] Parsed 2 corpora from -data.
[2023-07-09 16:07:58,502 INFO] Get special vocabs from Transforms: {'src': [], 'tgt': []}.
[2023-07-09 16:07:58,519 INFO] The first 10 tokens of the vocabs are:['<unk>', '<blank>', '<s>', '</s>', 'var_a', 'where', 'brack_open', 'brack_close', 'select', 'is']
[2023-07-09 16:07:58,519 INFO] The decoder start token is: <s>
[2023-07-09 16:07:58,519 INFO] Building model...
[2023-07-09 16:07:58,996 INFO] Switching model to float32 for amp/apex_amp
[2023-07-09 16:07:58,996 INFO] Non quantized layer compute is fp32
[2023-07-09 16:07:59,990 INFO] NMTModel(
  (encoder): TransformerEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(3664, 512, padding_idx=1)
        )
        (

In [11]:
# List the contents of the 'model_root' directory where the NMT model and related files are saved
# Expected entries: the checkpoints ('model_step_<N>.pt'), the shared vocabulary ('src.vocab')
# and the training log ('train.log'). '{model_root}' is substituted by IPython before the
# command is handed to the shell.
!ls '{model_root}'


model_step_1000.pt  model_step_200.pt  model_step_600.pt  src.vocab
model_step_100.pt   model_step_300.pt  model_step_700.pt  train.log
model_step_1100.pt  model_step_400.pt  model_step_800.pt
model_step_1200.pt  model_step_500.pt  model_step_900.pt


# Translate

In [12]:
# Perform translation using the trained NMT model
# Arguments:
# --model: Path to the trained NMT model checkpoint file
# --src: Path to the source input file containing sentences to be translated
# --output: Path to the output file to save translated sentences
# -beam_size: Beam size used during translation
!onmt_translate --model '/content/nmt/nmtmodel/model_step_1200.pt' \
                --src /content/nmt/monument50/test.en \
                --output /content/nmt/monument50/trans_test.sparql \
                -beam_size 4


[2023-07-09 16:36:39,435 INFO] Loading checkpoint from /content/nmt/nmtmodel/model_step_1200.pt
[2023-07-09 16:36:39,982 INFO] Loading data into the model
[2023-07-09 16:49:52,704 INFO] PRED SCORE: -0.1017, PRED PPL: 1.11 NB SENTENCES: 5916


In [None]:
# Display the first 5 lines of the English test data file (the translation input),
# for a side-by-side look with the prediction and reference files shown in the next cells
# Arguments:
# -n 5: Display the first 5 lines
# /content/nmt/monument50/test.en: Path to the test data file in English
!head -n 5 /content/nmt/monument50/test.en


In [None]:
# Display the first 5 lines of the translated output file (the model's SPARQL predictions
# written by the onmt_translate cell)
# Arguments:
# -n 5: Display the first 5 lines
# /content/nmt/monument50/trans_test.sparql: Path to the translated output file
!head -n 5 /content/nmt/monument50/trans_test.sparql


In [None]:
# Display the first 5 lines of the reference target file (the gold SPARQL queries
# the predictions are evaluated against)
# Arguments:
# -n 5: Display the first 5 lines
# /content/nmt/monument50/test.sparql: Path to the reference target file
!head -n 5 /content/nmt/monument50/test.sparql


# Evaluate

In [16]:
# Confirm the working directory before evaluation: the scoring scripts are copied into
# and run from the current directory (expected: /content/nmt)
!pwd

/content/nmt


In [17]:
# Copy the 'compute-accuracy.py' script from Google Drive to the current directory
# (requires Google Drive to be mounted at /content/drive)
!cp /content/drive/MyDrive/compute-accuracy.py ./

# Evaluate the translation using accuracy
# Arguments (reference first, model output second):
# /content/nmt/monument50/test.sparql: Path to the reference target file
# /content/nmt/monument50/trans_test.sparql: Path to the translated output file
!python compute-accuracy.py /content/nmt/monument50/test.sparql /content/nmt/monument50/trans_test.sparql


Reference 1st sentence: select var_a where brack_open var_a rdf_type dbr_Cristo_del_Otero sep_dot var_a dbp_height var_b brack_close order by desc par_open var_b par_close  limit 1
MTed 1st sentence: select var_a where brack_open dbr_Cristo_del_Otero dbo_abstract var_a brack_close
Accuracy:  0.9038187840012769


In [18]:
# Install the sacrebleu library using pip (stdout is discarded; errors are still shown)
# NOTE: the version is not pinned, so the installed release may differ between runs.
!pip install sacrebleu > /dev/null

# Copy the 'compute-bleu.py' script from Google Drive to the current directory
# (requires Google Drive to be mounted at /content/drive)
!cp /content/drive/MyDrive/compute-bleu.py ./

# Evaluate the translation using BLEU score
# Arguments (reference first, model output second):
# /content/nmt/monument50/test.sparql: Path to the reference target file
# /content/nmt/monument50/trans_test.sparql: Path to the translated output file
!python compute-bleu.py /content/nmt/monument50/test.sparql /content/nmt/monument50/trans_test.sparql


Reference 1st sentence: select var_a where brack_open var_a rdf_type dbr_Cristo_del_Otero sep_dot var_a dbp_height var_b brack_close order by desc par_open var_b par_close  limit 1
MTed 1st sentence: select var_a where brack_open dbr_Cristo_del_Otero dbo_abstract var_a brack_close
BLEU:  95.57343218895528


In [19]:
# Install the Rouge library using pip (stdout is discarded; errors are still shown)
# NOTE: the version is not pinned, so the installed release may differ between runs.
!pip install rouge > /dev/null

# Copy the 'compute-rouge-l.py' script from Google Drive to the current directory
# (requires Google Drive to be mounted at /content/drive)
!cp /content/drive/MyDrive/compute-rouge-l.py ./

# Evaluate the translation using Rouge-L score
# Arguments (reference first, model output second):
# /content/nmt/monument50/test.sparql: Path to the reference target file
# /content/nmt/monument50/trans_test.sparql: Path to the translated output file
!python compute-rouge-l.py /content/nmt/monument50/test.sparql /content/nmt/monument50/trans_test.sparql


Reference 1st sentence: select var_a where brack_open var_a rdf_type dbr_Cristo_del_Otero sep_dot var_a dbp_height var_b brack_close order by desc par_open var_b par_close  limit 1
MTed 1st sentence: select var_a where brack_open dbr_Cristo_del_Otero dbo_abstract var_a brack_close
Rouge-L:  0.9821216708983418


In [20]:
# Copy the trained NMT model directory to Google Drive for backup
# Source: /content/nmt/nmtmodel (trained NMT model)
# Destination: /content/drive/MyDrive/NMT_models (Google Drive directory)
!cp -r /content/nmt/nmtmodel /content/drive/MyDrive/NMT_models


# Test

In [None]:
def write_questions(sentences, path='questions.en'):
    """Write questions to ``path``, one per line, ready for onmt_translate.

    Each question is whitespace-normalised first: leading/trailing whitespace is removed
    and any internal run of whitespace (including line breaks from triple-quoted strings)
    is collapsed to a single space, so one question always occupies exactly one line.
    Entries that are empty after normalisation are skipped.

    Args:
        sentences: Iterable of question strings.
        path: Destination file; it is overwritten.

    Returns:
        The number of questions written.
    """
    normalised = (' '.join(sentence.split()) for sentence in sentences)
    lines = [line for line in normalised if line]
    with open(path, 'w', encoding='utf-8') as fp:
        # Terminate every line (including the last) with a newline
        fp.write(''.join(line + '\n' for line in lines))
    return len(lines)


# Questions to translate; replace the blank placeholders with real English questions
sentences = [
    ''' ''',
    ''' '''
]

num_questions = write_questions(sentences, 'questions.en')
print(f"Wrote {num_questions} question(s) to questions.en")

In [None]:
# Translate the ad-hoc questions with the final checkpoint (step 1200)
# --src questions.en: the file written by the previous cell (relative to the working directory)
# --output pred.sparql: where the predicted SPARQL queries are saved, one line per input line
! onmt_translate --model '/content/nmt/nmtmodel/model_step_1200.pt' --src questions.en --output pred.sparql

In [None]:
# Print the predicted SPARQL queries produced for the ad-hoc questions
! cat pred.sparql