# Environment

In [1]:
# Create the project directory and its 'nmtmodel' subdirectory in one step.
# Absolute paths plus -p keep this cell idempotent: re-running it, even when the
# kernel is already inside /content/nmt, neither fails on an existing directory
# nor ends up in a nested 'nmt/nmt'.
!mkdir -p /content/nmt/nmtmodel

# Make the project directory the working directory for the following cells
%cd /content/nmt


/content/nmt


In [2]:
# Install OpenNMT-py together with pinned torchvision/torchaudio releases
# (the resolver output of this cell reports torch 1.13.1 being installed with them).
# %pip (rather than !pip) installs into the environment of the running kernel;
# -q keeps the log short while warnings and errors are still shown.
# NOTE(review): OpenNMT-py itself is unpinned — consider pinning the release that was
# actually used so the run stays reproducible.
%pip install -q OpenNMT-py torchvision==0.14.1 torchaudio==0.13.1


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchdata 0.6.1 requires torch==2.0.1, but you have torch 1.13.1 which is incompatible.
torchtext 0.15.2 requires torch==2.0.1, but you have torch 1.13.1 which is incompatible.[0m[31m
[0m

In [3]:
# Sanity check: print the working directory, which the later cells that use
# relative paths (config.yaml, ./lcquad.zip) depend on
!pwd

/content/nmt


# Prepare Dataset

In [4]:
# Copy the dataset archive from Google Drive into the current directory.
# NOTE(review): no cell in this notebook mounts Drive; presumably /content/drive
# has to be mounted before this cell is run — confirm.
!cp /content/drive/MyDrive/lcquad.zip ./

# Extract the archive into the current directory.
# -o overwrites existing files without prompting, so the cell can be re-run unattended.
!unzip -o ./lcquad.zip -d ./

# Remove the local copy of the archive; -f keeps a re-run from failing when it is already gone
!rm -f /content/nmt/lcquad.zip

Archive:  ./lcquad.zip
   creating: ./lcquad/
  inflating: ./__MACOSX/._lcquad     
  inflating: ./lcquad/dev.en         
  inflating: ./__MACOSX/lcquad/._dev.en  
  inflating: ./lcquad/dev.sparql     
  inflating: ./__MACOSX/lcquad/._dev.sparql  
  inflating: ./lcquad/.DS_Store      
  inflating: ./__MACOSX/lcquad/._.DS_Store  
  inflating: ./lcquad/train.sparql   
  inflating: ./__MACOSX/lcquad/._train.sparql  
  inflating: ./lcquad/train.en       
  inflating: ./__MACOSX/lcquad/._train.en  
  inflating: ./lcquad/test.sparql    
  inflating: ./__MACOSX/lcquad/._test.sparql  
  inflating: ./lcquad/test.en        
  inflating: ./__MACOSX/lcquad/._test.en  


In [5]:
# List the working directory to confirm the archive was extracted
# (the 'lcquad' dataset folder should now sit next to 'nmtmodel')
!ls


lcquad	__MACOSX  nmtmodel


# Create the Training Configuration File

In [6]:
# Root directory for the artefacts of the NMT run; the configuration cell derives
# the vocabulary, checkpoint and log paths from it
model_root = "/content/nmt/nmtmodel"

# Make sure the directory exists: --parents creates any missing ancestors
# and does nothing when the directory is already there
!mkdir --parents "{model_root}"


In [7]:
# Training configuration for OpenNMT-py, rendered with the paths derived from model_root.
# Everything between the triple quotes is written verbatim to config.yaml.
# NOTE(review): with train_steps 1200 and valid_steps 400 only three validations run,
# so "early_stopping: 4" presumably can never trigger — confirm this is intended.
# NOTE(review): tgt_vocab points at src.vocab; this looks deliberate given
# "share_vocab: true" — confirm.
config = f'''# config.yaml
## Where the samples will be written
save_data: {model_root}

## Where the vocab(s) will be written
# Vocabulary files, generated by onmt_build_vocab
src_vocab: {model_root}/src.vocab
tgt_vocab: {model_root}/src.vocab

# Vocabulary size - should be the same as in sentence piece
src_vocab_size: 11000
tgt_vocab_size: 11000
share_vocab: true

# Training files
data:
    train:
        path_src: /content/nmt/lcquad/train.en
        path_tgt: /content/nmt/lcquad/train.sparql
    valid:
        path_src: /content/nmt/lcquad/dev.en
        path_tgt: /content/nmt/lcquad/dev.sparql

# Where to save the checkpoints
save_model: {model_root}/model
log_file: {model_root}/train.log
save_checkpoint_steps: 100
train_steps: 1200
valid_steps: 400

# Stop training if it does not imporve after n validations
early_stopping: 4

# To save space, limit checkpoints to last n
# keep_checkpoint: 3

seed: 4242

# Number of GPUs, and IDs of GPUs
world_size: 1
gpu_ranks: [0]

# Batching
# queue_size: 100
bucket_size: 262144
num_workers: 0  # Default: 2, set to 0 when RAM out of memory
batch_type: "tokens"
batch_size: 4096   # Tokens per batch, change when CUDA out of memory
valid_batch_size: 2048
# world_size: 1
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]

# Optimization
# model_dtype: "fp16"
optim: "adam"
# learning_rate: 2
warmup_steps: 500
decay_method: "noam"
adam_beta1: 0.9
adam_beta2: 0.98
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"

# Model
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 6
dec_layers: 6
heads: 8
hidden_size: 512
word_vec_size: 512
transformer_ff: 2048
# dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]
'''

# Persist the configuration in the working directory.
# Plain write mode is sufficient (nothing is read back through this handle), and the
# explicit encoding keeps the file independent of the platform's default encoding.
with open("config.yaml", "w", encoding="utf-8") as config_yaml:
    config_yaml.write(config)

# Build Vocabulary

In [8]:
import os

# The vocabulary is built only once: the step is skipped when src.vocab already exists
vocab_path = os.path.join(model_root, 'src.vocab')
if not os.path.exists(vocab_path):
    # Build the vocabulary from the corpora declared in config.yaml.
    # --n_sample -1 builds it on the full datasets rather than on a sample.
    # "|| true" forces a zero exit status, so the build stays best-effort ...
    !onmt_build_vocab -config config.yaml --n_sample -1 || true

    # ... but a failed build should not go unnoticed: report it instead of letting
    # a later cell fail on the missing file.
    if not os.path.exists(vocab_path):
        print(f'WARNING: onmt_build_vocab did not create {vocab_path}; check the output above.')


Corpus train's weight should be given. We default it to 1 for you.
[2023-07-09 17:31:46,273 INFO] Counter vocab from -1 samples.
[2023-07-09 17:31:46,273 INFO] n_sample=-1: Build vocab on full datasets.
[2023-07-09 17:31:46,451 INFO] Counters src: 6405
[2023-07-09 17:31:46,451 INFO] Counters tgt: 4412
[2023-07-09 17:31:46,452 INFO] Counters after share:10760


# Check GPU

In [9]:
# Report on the hardware and software environment: GPU, PyTorch/CUDA, OS, kernel,
# CUDA compiler, CPU model and total memory.
import torch

# Separator line printed between the sections of the report
SEPARATOR = '$*****************************************************************************$'

# Driver, CUDA version and per-GPU utilisation
!nvidia-smi

print('\n\n' + SEPARATOR)
print('GPU:')

# List the GPU devices
!nvidia-smi -L

print(SEPARATOR)

print('\n\n' + SEPARATOR)

# Check whether PyTorch can see a CUDA device
cuda_available = torch.cuda.is_available()
print(cuda_available)

if cuda_available:
    # Name of the first CUDA device
    print(torch.cuda.get_device_name(0))

    # mem_get_info returns (free, total) in bytes; convert both to MiB for display
    free_bytes, total_bytes = torch.cuda.mem_get_info(0)
    print("Free GPU memory:", free_bytes/1024**2, "out of:", total_bytes/1024**2)
else:
    # The device queries above raise without a CUDA device, so skip them
    # and keep the rest of the report running
    print('No CUDA device visible to PyTorch; skipping device name and memory report.')

print(SEPARATOR)

# Distribution information
!lsb_release -a

print(SEPARATOR)

# Kernel release
!uname -r

print(SEPARATOR)

# NVCC (NVIDIA CUDA Compiler) version
!nvcc --version

print(SEPARATOR)

# PyTorch version
print(torch.__version__)

print(SEPARATOR)

# CPU model name(s)
!cat /proc/cpuinfo | grep model\ name

print(SEPARATOR)

# Total system memory
!cat /proc/meminfo | grep MemTotal


Sun Jul  9 17:31:47 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Training

In [10]:
# Train the Transformer model described in 'config.yaml'.
# Checkpoints and the training log go to the save_model / log_file paths set in that file.
!onmt_train -config config.yaml


[2023-07-09 17:31:54,023 INFO] Missing transforms field for train data, set to default: [].
[2023-07-09 17:31:54,023 INFO] Missing transforms field for valid data, set to default: [].
[2023-07-09 17:31:54,023 INFO] Parsed 2 corpora from -data.
[2023-07-09 17:31:54,024 INFO] Get special vocabs from Transforms: {'src': [], 'tgt': []}.
[2023-07-09 17:31:54,056 INFO] The first 10 tokens of the vocabs are:['<unk>', '<blank>', '<s>', '</s>', 'var_uri', 'sep_dot', 'WHERE', 'brack_open', 'brack_close', 'the']
[2023-07-09 17:31:54,056 INFO] The decoder start token is: <s>
[2023-07-09 17:31:54,056 INFO] Building model...
[2023-07-09 17:31:54,690 INFO] Switching model to float32 for amp/apex_amp
[2023-07-09 17:31:54,691 INFO] Non quantized layer compute is fp32
[2023-07-09 17:31:55,676 INFO] NMTModel(
  (encoder): TransformerEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(10768, 512, padding_idx=1)
        )
    

In [12]:
# List the contents of the 'model_root' directory to see which checkpoints were saved.
# NOTE(review): this cell carries execution count 12 but is placed before cell 11,
# i.e. the notebook was not executed top to bottom — re-run with Restart & Run All.
!ls '{model_root}'


model_step_1000.pt  model_step_200.pt  model_step_600.pt  src.vocab
model_step_100.pt   model_step_300.pt  model_step_700.pt  train.log
model_step_1100.pt  model_step_400.pt  model_step_800.pt
model_step_1200.pt  model_step_500.pt  model_step_900.pt


# Translate

In [11]:
# Translate the test questions with the trained NMT model
# --model specifies the path to the trained model checkpoint
#   (step 1200 equals train_steps in config.yaml, i.e. the last checkpoint written)
# --src specifies the path to the source text file to be translated
# --output specifies the path to save the translated output
# -beam_size specifies the beam size for beam search
# NOTE(review): the checkpoint path is hardcoded; it has to be updated by hand
# whenever model_root or train_steps changes.
! onmt_translate --model '/content/nmt/nmtmodel/model_step_1200.pt' --src /content/nmt/lcquad/test.en --output /content/nmt/lcquad/trans_test.sparql -beam_size 4

[2023-07-09 18:06:24,314 INFO] Loading checkpoint from /content/nmt/nmtmodel/model_step_1200.pt
[2023-07-09 18:06:25,023 INFO] Loading data into the model
[2023-07-09 18:07:29,066 INFO] PRED SCORE: -0.3570, PRED PPL: 1.43 NB SENTENCES: 500


In [13]:
# Show the first 5 source questions from 'test.en' (the input given to onmt_translate)
!head -n 5 /content/nmt/lcquad/test.en

list the products of the company which published tweenies: game time .
name the common sports played at polytechnic university of philippines san juan and islamic azad university ?
which company developed both dart and go ?
count the total number of launch site of the rockets which have been launched form cape canaveral air force station ?
to which settlement does elliot bay belong to ?


In [14]:
# Show the first 5 lines of 'trans_test.sparql', the output file written by onmt_translate
!head -n 5 /content/nmt/lcquad/trans_test.sparql

SELECT DISTINCT var_uri WHERE brack_open var_x <dbo_product> <dbr_Postbanken> sep_dot var_x <dbp_products> var_uri sep_dot var_x <rdf_type> <dbo_Company> brack_close
SELECT DISTINCT var_uri WHERE brack_open <dbr_Advocate_Nasiruddin> <dbo_knownFor> var_uri sep_dot <dbr_Polytechnic_University_of_the_Philippines_San_Juan> <dbo_sport> var_uri brack_close
SELECT DISTINCT var_uri WHERE brack_open <dbr_Dart_ attr_open programming_language attr_close math_gt <dbp_developer> var_uri sep_dot brack_close
SELECT DISTINCT COUNT attr_open var_uri attr_close WHERE brack_open var_x <dbo_launchSite> <dbr_Cape_Canaveral_Air_Force_Station> sep_dot var_x <dbo_manufacturer> var_uri sep_dot brack_close
SELECT DISTINCT var_uri WHERE brack_open <dbr_Clark_Daniel_Stearns> <dbo_militaryBranch> var_uri brack_close


In [15]:
# Show the first 5 lines of 'test.sparql', the reference file the evaluation scripts
# below compare 'trans_test.sparql' against
!head -n 5 /content/nmt/lcquad/test.sparql

SELECT DISTINCT var_uri WHERE brack_open <dbr_Tweenies:_Game_Time> <dbp_publisher> var_x sep_dot var_x <dbp_products> var_uri sep_dot brack_close
SELECT DISTINCT var_uri WHERE brack_open <dbr_Polytechnic_University_of_the_Philippines_San_Juan> <dbo_sport> var_uri sep_dot <dbr_Islamic_Azad_University_Central_Tehran_Branch> <dbo_sport> var_uri sep_dot brack_close
SELECT DISTINCT var_uri WHERE brack_open <dbr_Dart_ attr_open programming_language attr_close math_gt <dbo_developer> var_uri sep_dot <dbr_Go_ attr_open programming_language attr_close math_gt <dbo_developer> var_uri sep_dot brack_close
SELECT DISTINCT COUNT attr_open var_uri attr_close WHERE brack_open var_x <dbo_launchSite> <dbr_Cape_Canaveral_Air_Force_Station> sep_dot var_x <dbo_launchSite> var_uri brack_close
SELECT DISTINCT var_uri WHERE brack_open <dbr_Elliott_Bay> <dbp_cities> var_uri brack_close


# Evaluate

In [16]:
# Print the working directory: the evaluation scripts are copied to './' and
# run from there in the next cells
!pwd

/content/nmt


In [17]:
# Copy the file "compute-accuracy.py" from "/content/drive/MyDrive/" to the current directory
# NOTE(review): the script is not part of this notebook; how it defines "accuracy"
# cannot be told from here — check the script itself.
!cp /content/drive/MyDrive/compute-accuracy.py ./

# Evaluate the translation using the provided accuracy computation script
# - The first argument is the path to the reference (gold standard) sparql file
# - The second argument is the path to the translated sparql file
!python compute-accuracy.py /content/nmt/lcquad/test.sparql /content/nmt/lcquad/trans_test.sparql


Reference 1st sentence: SELECT DISTINCT var_uri WHERE brack_open <dbr_Tweenies:_Game_Time> <dbp_publisher> var_x sep_dot var_x <dbp_products> var_uri sep_dot brack_close
MTed 1st sentence: SELECT DISTINCT var_uri WHERE brack_open var_x <dbo_product> <dbr_Postbanken> sep_dot var_x <dbp_products> var_uri sep_dot var_x <rdf_type> <dbo_Company> brack_close
Accuracy:  0.63043784316486


In [18]:
# Install the sacrebleu library.
# %pip (rather than !pip) installs into the environment of the running kernel;
# -q keeps the log short while warnings and errors are still shown.
# NOTE(review): the version is unpinned — consider pinning it for reproducibility.
%pip install -q sacrebleu

# Copy the file "compute-bleu.py" from "/content/drive/MyDrive/" to the current directory
!cp /content/drive/MyDrive/compute-bleu.py ./

# Evaluate the translation using the BLEU score calculation script
# - The first argument is the path to the reference (gold standard) sparql file
# - The second argument is the path to the translated sparql file
!python compute-bleu.py /content/nmt/lcquad/test.sparql /content/nmt/lcquad/trans_test.sparql


Reference 1st sentence: SELECT DISTINCT var_uri WHERE brack_open <dbr_Tweenies:_Game_Time> <dbp_publisher> var_x sep_dot var_x <dbp_products> var_uri sep_dot brack_close
MTed 1st sentence: SELECT DISTINCT var_uri WHERE brack_open var_x <dbo_product> <dbr_Postbanken> sep_dot var_x <dbp_products> var_uri sep_dot var_x <rdf_type> <dbo_Company> brack_close
BLEU:  67.0482631851078


In [19]:
# Install the rouge library.
# %pip (rather than !pip) installs into the environment of the running kernel;
# -q keeps the log short while warnings and errors are still shown.
# NOTE(review): the version is unpinned — consider pinning it for reproducibility.
%pip install -q rouge

# Copy the file "compute-rouge-l.py" from "/content/drive/MyDrive/" to the current directory
!cp /content/drive/MyDrive/compute-rouge-l.py ./

# Evaluate the translation using the Rouge-L score calculation script
# - The first argument is the path to the reference (gold standard) sparql file
# - The second argument is the path to the translated sparql file
!python compute-rouge-l.py /content/nmt/lcquad/test.sparql /content/nmt/lcquad/trans_test.sparql


Reference 1st sentence: SELECT DISTINCT var_uri WHERE brack_open <dbr_Tweenies:_Game_Time> <dbp_publisher> var_x sep_dot var_x <dbp_products> var_uri sep_dot brack_close
MTed 1st sentence: SELECT DISTINCT var_uri WHERE brack_open var_x <dbo_product> <dbr_Postbanken> sep_dot var_x <dbp_products> var_uri sep_dot var_x <rdf_type> <dbo_Company> brack_close
Rouge-L:  0.7283995887106266


In [20]:
# Back up the whole project directory (dataset, config, checkpoints, translations)
# by copying '/content/nmt' recursively into '/content/drive/MyDrive'
!cp -r /content/nmt /content/drive/MyDrive


# Test

In [None]:
# Questions to translate, one entry per question (placeholders to be filled in)
sentences = [
    ''' ''',
    ''' ''',
]

# One question per line, without a trailing newline after the last one
questions_text = '\n'.join(''.join(question) for question in sentences)

# Write the questions to the source file read by the next cell
with open('questions.en', 'w') as questions_file:
    questions_file.write(questions_text)

In [None]:
# Translate the hand-written questions in 'questions.en' with the step-1200 checkpoint
# and write the predictions to 'pred.sparql' in the working directory.
# No beam size is passed here, unlike the test-set translation above.
! onmt_translate --model '/content/nmt/nmtmodel/model_step_1200.pt' --src questions.en --output pred.sparql

In [None]:
# Print the predictions written to 'pred.sparql' by the previous cell
! cat pred.sparql