# Environment

In [1]:
# Create the working directory 'nmt' together with its 'nmtmodel' subdirectory.
# 'mkdir -p' creates missing parents and does nothing when the directories already
# exist, so this cell can be re-run without "File exists" errors.
!mkdir -p /content/nmt/nmtmodel

# Change into the working directory via its absolute path. A relative 'cd nmt'
# would descend into a nested 'nmt/nmt' if this cell were executed a second time.
%cd /content/nmt


/content/nmt


In [2]:
# Install OpenNMT-py (whatever release pip resolves; it is not version-pinned here)
# together with the pinned torchvision 0.14.1 and torchaudio 0.13.1.
# '%pip' installs into the environment of the running kernel; '-q' keeps the output
# short while still showing errors and dependency-conflict warnings.
%pip install -q OpenNMT-py torchvision==0.14.1 torchaudio==0.13.1


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchdata 0.6.1 requires torch==2.0.1, but you have torch 1.13.1 which is incompatible.
torchtext 0.15.2 requires torch==2.0.1, but you have torch 1.13.1 which is incompatible.[0m[31m
[0m

In [3]:
# Print the current working directory to confirm which directory the following
# relative paths (e.g. './', 'config.yaml') will resolve against
!pwd


/content/nmt


# Prepare Dataset

In [4]:
# Copy the dataset archive "monument.zip" from Google Drive into the current directory.
# NOTE(review): this assumes Google Drive is already mounted at /content/drive; no mount
# step is visible in this part of the notebook — confirm it happens before this cell.
!cp /content/drive/MyDrive/monument.zip ./

# Extract the archive into the current directory.
# -d sets the target directory for extraction
# -o overwrites existing files without asking, so re-running the cell does not stall
#    on unzip's interactive "replace?" prompt
!unzip -o ./monument.zip -d ./

# Remove the local copy of the archive once it has been extracted
# (same relative path as used by the cp and unzip commands above)
!rm ./monument.zip


Archive:  ./monument.zip
   creating: ./monument/
  inflating: ./__MACOSX/._monument   
  inflating: ./monument/dev.en       
  inflating: ./__MACOSX/monument/._dev.en  
  inflating: ./monument/dev.sparql   
  inflating: ./__MACOSX/monument/._dev.sparql  
  inflating: ./monument/.DS_Store    
  inflating: ./__MACOSX/monument/._.DS_Store  
  inflating: ./monument/train.sparql  
  inflating: ./__MACOSX/monument/._train.sparql  
  inflating: ./monument/train.en     
  inflating: ./__MACOSX/monument/._train.en  
  inflating: ./monument/test.sparql  
  inflating: ./__MACOSX/monument/._test.sparql  
  inflating: ./monument/test.en      
  inflating: ./__MACOSX/monument/._test.en  


In [None]:
# List the contents of the current directory to see what is present after extraction
!ls


__MACOSX  monument600-dataset  nmtmodel


# Create the Training Configuration File

In [5]:
# Directory used for the model artifacts; later cells reference this variable when
# building the config paths, checking for the vocabulary file and listing the directory
model_root = '/content/nmt/nmtmodel'

# Make sure the directory exists: 'mkdir -p' also creates missing parent directories
# and does not fail when the directory is already there.
# '{model_root}' is expanded by IPython to the value of the Python variable.
!mkdir -p '{model_root}'


In [6]:
# Build the text of the OpenNMT-py configuration file 'config.yaml'.
# This is an f-string: every '{model_root}' below is replaced by the value of the
# Python variable 'model_root', so that variable must be defined before this cell runs.
# Lines starting with '#' inside the string are YAML comments that are written to the
# file as-is. Note that src_vocab and tgt_vocab both name the same file and that the
# config sets 'share_vocab: true'.
config = f'''# config.yaml
## Where the samples will be written
save_data: {model_root}

## Where the vocab(s) will be written
# Vocabulary files, generated by onmt_build_vocab
src_vocab: {model_root}/src.vocab
tgt_vocab: {model_root}/src.vocab

# Vocabulary size - should be the same as in sentence piece
src_vocab_size: 5000
tgt_vocab_size: 5000
share_vocab: true

# Training files
data:
    train:
        path_src: /content/nmt/monument/train.en
        path_tgt: /content/nmt/monument/train.sparql
    valid:
        path_src: /content/nmt/monument/dev.en
        path_tgt: /content/nmt/monument/dev.sparql

# Where to save the checkpoints
save_model: {model_root}/model
log_file: {model_root}/train.log
save_checkpoint_steps: 100
train_steps: 1200
valid_steps: 400

# Stop training if it does not imporve after n validations
early_stopping: 4

# To save space, limit checkpoints to last n
# keep_checkpoint: 3

seed: 4242

# Number of GPUs, and IDs of GPUs
world_size: 1
gpu_ranks: [0]

# Batching
# queue_size: 100
bucket_size: 262144
num_workers: 0  # Default: 2, set to 0 when RAM out of memory
batch_type: "tokens"
batch_size: 4096   # Tokens per batch, change when CUDA out of memory
valid_batch_size: 2048
# world_size: 1
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]

# Optimization
# model_dtype: "fp16"
optim: "adam"
# learning_rate: 2
warmup_steps: 500
decay_method: "noam"
adam_beta1: 0.9
adam_beta2: 0.98
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"

# Model
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 6
dec_layers: 6
heads: 8
hidden_size: 512
word_vec_size: 512
transformer_ff: 2048
# dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]
'''

# Write the configuration text to 'config.yaml' in the current working directory.
# Mode "w" is sufficient because the file is only written, never read back through
# this handle, and the encoding is fixed to UTF-8 so the result does not depend on
# the platform's default encoding.
with open("config.yaml", "w", encoding="utf-8") as config_yaml:
    config_yaml.write(config)

# Build Vocabulary

In [7]:
# 'os' supplies the path helpers used to look for an existing vocabulary file
import os

# Path at which the vocabulary file is looked up inside the model root directory
vocab_file = os.path.join(model_root, 'src.vocab')

# Only build the vocabulary when no file exists at that path yet
if not os.path.exists(vocab_file):
    # Run onmt_build_vocab with the configuration file written earlier
    # --n_sample -1: build the vocabulary on the full training dataset
    # '|| true' makes the shell line report success even if onmt_build_vocab fails
    !onmt_build_vocab -config config.yaml --n_sample -1 || true


Corpus train's weight should be given. We default it to 1 for you.
[2023-07-09 15:33:20,706 INFO] Counter vocab from -1 samples.
[2023-07-09 15:33:20,706 INFO] n_sample=-1: Build vocab on full datasets.
[2023-07-09 15:33:20,987 INFO] Counters src: 2569
[2023-07-09 15:33:20,987 INFO] Counters tgt: 2058
[2023-07-09 15:33:20,988 INFO] Counters after share:4609


# Check GPU

In [8]:
# Separator line printed between the individual diagnostics below
separator = '$*****************************************************************************$'

# Display GPU information using the nvidia-smi command
!nvidia-smi

print('\n\n' + separator)

# List the GPUs using the nvidia-smi command with the -L flag
print('GPU:')
!nvidia-smi -L

print(separator)

print('\n\n' + separator)

# Check if the GPU is visible and available for PyTorch
import torch

# True when CUDA (GPU support for PyTorch) is usable in this runtime
cuda_available = torch.cuda.is_available()
print(cuda_available)

if cuda_available:
    # Name of the GPU device at index 0
    print(torch.cuda.get_device_name(0))

    # torch.cuda.mem_get_info() yields (free, total) in bytes; both are shown in MiB
    gpu_memory = torch.cuda.mem_get_info(0)
    print("Free GPU memory:", gpu_memory[0]/1024**2, "out of:", gpu_memory[1]/1024**2)
else:
    # Querying device 0 without CUDA raises an error, which would abort the rest of
    # this cell; report the situation and carry on with the remaining diagnostics
    print("No CUDA device is available to PyTorch; skipping GPU name and memory report.")

print(separator)

# Display Linux distribution information using the lsb_release command
!lsb_release -a

print(separator)

# Display the Linux kernel version using the uname -r command
!uname -r

print(separator)

# Display the CUDA compiler version using the nvcc --version command
!nvcc --version

print(separator)

# Display the version of PyTorch using the torch.__version__ attribute
print(torch.__version__)

print(separator)

# Display CPU information by searching for the "model name" in /proc/cpuinfo
!cat /proc/cpuinfo | grep model\ name

print(separator)

# Display total memory information using the meminfo file in /proc
!cat /proc/meminfo | grep MemTotal


Sun Jul  9 15:33:21 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Training

In [9]:
# Train the neural machine translation (NMT) model with the settings from 'config.yaml'
# (written by an earlier cell); that file also names where checkpoints and the log go
!onmt_train -config config.yaml


[2023-07-09 15:33:24,830 INFO] Missing transforms field for train data, set to default: [].
[2023-07-09 15:33:24,830 INFO] Missing transforms field for valid data, set to default: [].
[2023-07-09 15:33:24,830 INFO] Parsed 2 corpora from -data.
[2023-07-09 15:33:24,831 INFO] Get special vocabs from Transforms: {'src': [], 'tgt': []}.
[2023-07-09 15:33:24,856 INFO] The first 10 tokens of the vocabs are:['<unk>', '<blank>', '<s>', '</s>', 'var_a', 'where', 'brack_open', 'brack_close', 'select', 'is']
[2023-07-09 15:33:24,856 INFO] The decoder start token is: <s>
[2023-07-09 15:33:24,856 INFO] Building model...
[2023-07-09 15:33:25,627 INFO] Switching model to float32 for amp/apex_amp
[2023-07-09 15:33:25,627 INFO] Non quantized layer compute is fp32
[2023-07-09 15:33:26,607 INFO] NMTModel(
  (encoder): TransformerEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(4616, 512, padding_idx=1)
        )
        (

In [10]:
# List the contents of the model root directory;
# '{model_root}' is expanded by IPython to the value of the Python variable
!ls '{model_root}'

model_step_1200.pt  model_step_600.pt  src.vocab
model_step_300.pt   model_step_900.pt  train.log


# Translate

In [11]:
# Use the trained NMT model to translate the test questions
# --model: Path to the trained model checkpoint (here the one saved at step 1200)
# --src: Path to the source (input) file for translation
# --output: Path where the translated output is written
# -beam_size: Beam size for beam search decoding
# NOTE(review): no GPU option is passed, so this presumably runs on the CPU — confirm
!onmt_translate --model '/content/nmt/nmtmodel/model_step_1200.pt' --src /content/nmt/monument/test.en --output /content/nmt/monument/trans_test.sparql -beam_size 4


[2023-07-09 16:02:41,282 INFO] Loading checkpoint from /content/nmt/nmtmodel/model_step_1200.pt
[2023-07-09 16:02:41,839 INFO] Loading data into the model
[2023-07-09 16:02:52,859 INFO] PRED SCORE: -0.0955, PRED PPL: 1.10 NB SENTENCES: 100


In [12]:
# Show the first 5 lines of the source file 'test.en' (the input given to onmt_translate)
!head -n 5 /content/nmt/monument/test.en


which is longer los angeles police department memorial for fallen officers or national war memorial
how many monument does böyük tağlar have
location of mint clock tower
how many place does foshan have
is ramagrama stupa a monument


In [13]:
# Show the first 5 lines of 'trans_test.sparql' (the output file written by onmt_translate)
!head -n 5 /content/nmt/monument/trans_test.sparql


select var_a where brack_open var_a dbp_length var_b sep_dot FILTER par_open var_a = dbr_Los_Angeles_Police_Department_Memorial_for_Fallen_Officers || var_a = dbr_National_War_Memorial_ par_open Canada par_close brack_close order by var_b limit 1
select count par_open wildcard par_close where brack_open var_a rdf_type dbo_Monument sep_dot var_a dbo_location dbr_Böyük_Tağlar brack_close group by var_a
select var_a where brack_open dbr_Mint_Clock_Tower,_Chennai dbo_location var_a brack_close
select count par_open wildcard par_close where brack_open var_a rdf_type dbo_Place sep_dot var_a dbo_location dbr_Foshan brack_close group by var_a
ask where brack_open dbr_Ramagrama_stupa rdf_type dbo_Monument brack_close


In [14]:
# Show the first 5 lines of 'test.sparql' (the file later passed to the evaluation
# scripts as the reference) for comparison with the translated output
!head -n 5 /content/nmt/monument/test.sparql


select var_a where brack_open var_a dbp_length var_b sep_dot FILTER par_open var_a = dbr_Los_Angeles_Police_Department_Memorial_for_Fallen_Officers || var_a = dbr_National_War_Memorial_ par_open Canada par_close  par_close  brack_close order by var_b limit 1
select count par_open wildcard par_close  where brack_open var_a rdf_type dbo_Monument sep_dot var_a dbo_location dbr_Böyük_Tağlar brack_close group by var_a
select var_a where brack_open dbr_Mint_Clock_Tower,_Chennai dbo_location var_a brack_close
select count par_open wildcard par_close  where brack_open var_a rdf_type dbo_Place sep_dot var_a dbo_location dbr_Foshan brack_close group by var_a
ask where brack_open dbr_Ramagrama_stupa rdf_type dbo_Monument brack_close


# Evaluate

In [15]:
# Print the current working directory; the evaluation scripts below are copied
# into it and run from it via relative paths
!pwd


/content/nmt


In [16]:
# Copy the 'compute-accuracy.py' script from Google Drive to the current directory
# NOTE(review): assumes Google Drive is mounted at /content/drive — confirm
!cp /content/drive/MyDrive/compute-accuracy.py ./

# Evaluate the translation quality using accuracy
# - The script 'compute-accuracy.py' is called with two command-line arguments:
#   first the reference file, then the generated translation file.
# - Its implementation is not part of this notebook; see the script itself for how
#   the accuracy figure is computed.
!python compute-accuracy.py /content/nmt/monument/test.sparql /content/nmt/monument/trans_test.sparql


Reference 1st sentence: select var_a where brack_open var_a dbp_length var_b sep_dot FILTER par_open var_a = dbr_Los_Angeles_Police_Department_Memorial_for_Fallen_Officers || var_a = dbr_National_War_Memorial_ par_open Canada par_close  par_close  brack_close order by var_b limit 1
MTed 1st sentence: select var_a where brack_open var_a dbp_length var_b sep_dot FILTER par_open var_a = dbr_Los_Angeles_Police_Department_Memorial_for_Fallen_Officers || var_a = dbr_National_War_Memorial_ par_open Canada par_close brack_close order by var_b limit 1
Accuracy:  0.9547657512116317


In [17]:
# Install the 'sacrebleu' library into the environment of the running kernel
# ('%pip' targets the kernel's environment; '-q' hides the routine install output
# while still showing errors)
%pip install -q sacrebleu

# Copy the 'compute-bleu.py' script from Google Drive to the current directory
!cp /content/drive/MyDrive/compute-bleu.py ./

# Evaluate the translation quality using BLEU
# - The script 'compute-bleu.py' is called with two command-line arguments:
#   first the reference file, then the generated translation file.
!python compute-bleu.py /content/nmt/monument/test.sparql /content/nmt/monument/trans_test.sparql


Reference 1st sentence: select var_a where brack_open var_a dbp_length var_b sep_dot FILTER par_open var_a = dbr_Los_Angeles_Police_Department_Memorial_for_Fallen_Officers || var_a = dbr_National_War_Memorial_ par_open Canada par_close  par_close  brack_close order by var_b limit 1
MTed 1st sentence: select var_a where brack_open var_a dbp_length var_b sep_dot FILTER par_open var_a = dbr_Los_Angeles_Police_Department_Memorial_for_Fallen_Officers || var_a = dbr_National_War_Memorial_ par_open Canada par_close brack_close order by var_b limit 1
BLEU:  98.09234898730233


In [18]:
# Install the 'rouge' library into the environment of the running kernel
# ('%pip' targets the kernel's environment; '-q' hides the routine install output
# while still showing errors)
%pip install -q rouge

# Copy the 'compute-rouge-l.py' script from Google Drive to the current directory
!cp /content/drive/MyDrive/compute-rouge-l.py ./

# Evaluate the translation quality using Rouge-L
# - The script 'compute-rouge-l.py' is called with two command-line arguments:
#   first the reference file, then the generated translation file.
!python compute-rouge-l.py /content/nmt/monument/test.sparql /content/nmt/monument/trans_test.sparql


Reference 1st sentence: select var_a where brack_open var_a dbp_length var_b sep_dot FILTER par_open var_a = dbr_Los_Angeles_Police_Department_Memorial_for_Fallen_Officers || var_a = dbr_National_War_Memorial_ par_open Canada par_close  par_close  brack_close order by var_b limit 1
MTed 1st sentence: select var_a where brack_open var_a dbp_length var_b sep_dot FILTER par_open var_a = dbr_Los_Angeles_Police_Department_Memorial_for_Fallen_Officers || var_a = dbr_National_War_Memorial_ par_open Canada par_close brack_close order by var_b limit 1
Rouge-L:  0.9907949980610772


In [20]:
# Copy the trained model directory 'nmtmodel' into the 'NMT_models' folder on Google Drive.
# The destination folder is created first: without it, 'cp -r' would copy 'nmtmodel' *as*
# 'NMT_models' when the folder is missing but *into* it when the folder exists, so the
# resulting layout would depend on earlier runs.
!mkdir -p /content/drive/MyDrive/NMT_models
!cp -r /content/nmt/nmtmodel /content/drive/MyDrive/NMT_models/


# Test

In [None]:
# Questions to translate, one list entry per question (replace the placeholders)
sentences = [
    ''' ''',
    ''' '''
]


def normalize_question(text):
    """Return `text` with every run of whitespace collapsed to a single space.

    Leading and trailing whitespace is removed and embedded newlines are replaced,
    so a question pasted into a triple-quoted string always ends up on one line.
    """
    return ' '.join(text.split())


# Write one question per line so that line i of the translation output corresponds
# to entry i of 'sentences'. Every line, including the last, ends with a newline,
# and the file is written as UTF-8 because the questions may contain non-ASCII
# characters.
with open('questions.en', 'w', encoding='utf-8') as fp:
    fp.writelines(normalize_question(sentence) + '\n' for sentence in sentences)

In [None]:
# Translate the questions in 'questions.en' with the checkpoint saved at step 1200
# and write the predictions to 'pred.sparql' in the current directory
! onmt_translate --model '/content/nmt/nmtmodel/model_step_1200.pt' --src questions.en --output pred.sparql

In [None]:
# Print the contents of 'pred.sparql', the output file of the translation cell above
! cat pred.sparql