# Environment

In [1]:
# Create a new directory named "nmt"
!mkdir nmt

# Change the current working directory to the newly created "nmt" directory
%cd nmt

# Create a subdirectory named "nmtmodel" within the "nmt" directory
!mkdir nmtmodel


/content/nmt


In [2]:
# Install the required Python packages
# This command uses pip to install OpenNMT-py and specific versions of torchvision and torchaudio
# The "> /dev/null" part is used to suppress the output and keep the installation process clean
!pip install OpenNMT-py torchvision==0.14.1 torchaudio==0.13.1 > /dev/null


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchdata 0.6.1 requires torch==2.0.1, but you have torch 1.13.1 which is incompatible.
torchtext 0.15.2 requires torch==2.0.1, but you have torch 1.13.1 which is incompatible.[0m[31m
[0m

In [3]:
!pwd

/content/nmt


# Prepare Dataset

In [4]:
# Copy the file "monument80.zip" from a specific directory on Google Drive to the current directory
!cp /content/drive/MyDrive/monument80.zip ./

# Unzip the "monument80.zip" file and extract its contents to the current directory
# The "-d" flag specifies the directory where the contents will be extracted
!unzip ./monument80.zip -d ./

# Remove the original "monument80.zip" file from the source directory
!rm /content/nmt/monument80.zip


Archive:  ./monument80.zip
   creating: ./monument80/
  inflating: ./__MACOSX/._monument80  
  inflating: ./monument80/dev.en     
  inflating: ./__MACOSX/monument80/._dev.en  
  inflating: ./monument80/dev.sparql  
  inflating: ./__MACOSX/monument80/._dev.sparql  
  inflating: ./monument80/train.sparql  
  inflating: ./__MACOSX/monument80/._train.sparql  
  inflating: ./monument80/train.en   
  inflating: ./__MACOSX/monument80/._train.en  
  inflating: ./monument80/test.sparql  
  inflating: ./__MACOSX/monument80/._test.sparql  
  inflating: ./monument80/test.en    
  inflating: ./__MACOSX/monument80/._test.en  


In [5]:
!ls

__MACOSX  monument80  nmtmodel


# GloVe

In [6]:
# Create a new directory named "glove_dir"
!mkdir "glove_dir"

# Use wget to download the "glove.6B.zip" file from the specified URL
# The "--no-check-certificate" flag is used to bypass SSL certificate verification
# This ensures that wget doesn't refuse to download due to invalid SSL certificates
!wget --no-check-certificate http://nlp.stanford.edu/data/glove.6B.zip


--2023-07-10 17:19:21--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-07-10 17:19:22--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-07-10 17:19:22--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [7]:
# Use the unzip command to extract the contents of "glove.6B.zip" into the "glove_dir" directory
# The "-d" flag specifies the destination directory for extraction
!unzip glove.6B.zip -d "glove_dir"

# Remove the original "glove.6B.zip" file from the "/content/nmt/" directory
# This is done using the rm (remove) command
!rm /content/nmt/glove.6B.zip


Archive:  glove.6B.zip
  inflating: glove_dir/glove.6B.50d.txt  
  inflating: glove_dir/glove.6B.100d.txt  
  inflating: glove_dir/glove.6B.200d.txt  
  inflating: glove_dir/glove.6B.300d.txt  


# Create the Training Configuration File

In [9]:
model_root = '/content/nmt/nmtmodel'

# Create a directory named 'nmtmodel' using the model_root path
!mkdir -p '{model_root}'

In [10]:
# Define the configuration as a formatted string
config = f'''# config.yaml
# GloVe:
# this means embeddings will be used for both encoder and decoder sides
both_embeddings: /content/nmt/glove_dir/glove.6B.300d.txt

# supported types: GloVe, word2vec
embeddings_type: "GloVe"

# word_vec_size need to match with the pretrained embeddings dimensions
word_vec_size: 300

## Where the samples will be written
save_data: {model_root}

## Where the vocab(s) will be written
# Vocabulary files, generated by onmt_build_vocab
src_vocab: {model_root}/src.vocab
tgt_vocab: {model_root}/src.vocab

# Vocabulary size - should be the same as in sentence piece
src_vocab_size: 5000
tgt_vocab_size: 5000
share_vocab: true

# Training files
data:
    train:
        path_src: /content/nmt/monument80/train.en
        path_tgt: /content/nmt/monument80/train.sparql
    valid:
        path_src: /content/nmt/monument80/dev.en
        path_tgt: /content/nmt/monument80/dev.sparql

# Where to save the checkpoints
save_model: {model_root}/model
log_file: {model_root}/train.log
save_checkpoint_steps: 100
train_steps: 1200
valid_steps: 400

# Stop training if it does not imporve after n validations
early_stopping: 4

# To save space, limit checkpoints to last n
# keep_checkpoint: 3

seed: 4242

# Number of GPUs, and IDs of GPUs
world_size: 1
gpu_ranks: [0]

# Batching
# queue_size: 100
bucket_size: 262144
num_workers: 0  # Default: 2, set to 0 when RAM out of memory
batch_type: "tokens"
batch_size: 4096   # Tokens per batch, change when CUDA out of memory
valid_batch_size: 2048
# world_size: 1
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]

# Optimization
# model_dtype: "fp16"
optim: "adam"
# learning_rate: 2
warmup_steps: 500 ######
decay_method: "noam"
adam_beta1: 0.9
adam_beta2: 0.98
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"

# Model
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 6
dec_layers: 6
heads: 8
hidden_size: 512
word_vec_size: 512
transformer_ff: 2048
# dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]
'''

# Write the configuration to a "config.yaml" file
with open("config.yaml", "w+") as config_yaml:
  config_yaml.write(config)

# Build Vocabulary

In [11]:
import os

# Check if the source vocabulary file doesn't exist in the 'model_root' directory
if not os.path.exists(os.path.join(model_root, 'src.vocab')):
    # Build the source vocabulary using the onmt_build_vocab command and the provided config.yaml
    # The --n_sample option is used to indicate the number of samples to consider for building the vocabulary
    # The "|| true" at the end ensures that the command won't stop the script even if there's an error
    !onmt_build_vocab -config config.yaml --n_sample -1 || true


Corpus train's weight should be given. We default it to 1 for you.
[2023-07-10 17:22:29,927 INFO] Counter vocab from -1 samples.
[2023-07-10 17:22:29,927 INFO] n_sample=-1: Build vocab on full datasets.
[2023-07-10 17:22:30,169 INFO] Counters src: 2388
[2023-07-10 17:22:30,169 INFO] Counters tgt: 1910
[2023-07-10 17:22:30,170 INFO] Counters after share:4282


# Check GPU

In [12]:
# Check NVIDIA GPU information using the 'nvidia-smi' command
!nvidia-smi

# Print a separator line and heading for the GPU information
print('\n\n$*****************************************************************************$')
print('GPU:')

# Check and display the GPU devices using 'nvidia-smi -L'
!nvidia-smi -L

# Print a separator line
print('$*****************************************************************************$')

# Print a separator line and heading for GPU-related checks
print('\n\n$*****************************************************************************$')

# Check if CUDA-enabled GPU is available for PyTorch
import torch
print(torch.cuda.is_available())

# Print the name of the first CUDA-enabled GPU
print(torch.cuda.get_device_name(0))

# Get GPU memory information
gpu_memory = torch.cuda.mem_get_info(0)
print("Free GPU memory:", gpu_memory[0]/1024**2, "out of:", gpu_memory[1]/1024**2)

# Print a separator line
print('$*****************************************************************************$')

# Print system information using 'lsb_release -a'
!lsb_release -a

# Print a separator line
print('$*****************************************************************************$')

# Print kernel version using 'uname -r'
!uname -r

# Print a separator line
print('$*****************************************************************************$')

# Print NVCC (NVIDIA CUDA Compiler) version using 'nvcc --version'
!nvcc --version

# Print a separator line
print('$*****************************************************************************$')

# Check Torch version using Python import
import torch
print(torch.__version__)

# Print a separator line
print('$*****************************************************************************$')

# Display CPU information using 'cat /proc/cpuinfo | grep model\ name'
!cat /proc/cpuinfo | grep model\ name

# Print a separator line
print('$*****************************************************************************$')

# Display total memory information using 'cat /proc/meminfo | grep MemTotal'
!cat /proc/meminfo | grep MemTotal


Mon Jul 10 17:22:31 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Training

In [13]:
# Train the NMT model using the configuration defined in 'config.yaml'
!onmt_train -config config.yaml


[2023-07-10 17:22:34,659 INFO] Missing transforms field for train data, set to default: [].
[2023-07-10 17:22:34,659 INFO] Missing transforms field for valid data, set to default: [].
[2023-07-10 17:22:34,659 INFO] Parsed 2 corpora from -data.
[2023-07-10 17:22:34,660 INFO] Get special vocabs from Transforms: {'src': [], 'tgt': []}.
[2023-07-10 17:22:34,674 INFO] Reading encoder and decoder embeddings from /content/nmt/glove_dir/glove.6B.300d.txt
[2023-07-10 17:22:41,102 INFO] 	Found 400000 total vectors in file
[2023-07-10 17:22:41,103 INFO] After filtering to vectors in vocab:
[2023-07-10 17:22:41,104 INFO] 	* enc: 2179 match, 2109 missing, (50.82%)
[2023-07-10 17:22:41,105 INFO] 	* dec: 2179 match, 2109 missing, (50.82%)
[2023-07-10 17:22:41,105 INFO] 
Saving encoder embeddings as:
	* enc: /content/nmt/nmtmodel.enc_embeddings.pt
[2023-07-10 17:22:42,772 INFO] 
Saving decoder embeddings as:
	* dec: /content/nmt/nmtmodel.dec_embeddings.pt
[2023-07-10 17:22:44,554 INFO] The first 10 to

In [14]:
# List the contents of the 'model_root' directory
!ls '{model_root}'


model_step_1000.pt  model_step_200.pt  model_step_600.pt  src.vocab
model_step_100.pt   model_step_300.pt  model_step_700.pt  train.log
model_step_1100.pt  model_step_400.pt  model_step_800.pt
model_step_1200.pt  model_step_500.pt  model_step_900.pt


# Translate

In [15]:
# Perform translation using the trained NMT model
# --model specifies the path to the trained model checkpoint
# --src specifies the path to the source text file to be translated
# --output specifies the path to save the translated output
# -beam_size specifies the beam size for beam search
!onmt_translate --model '/content/nmt/nmtmodel/model_step_1200.pt' --src /content/nmt/monument80/test.en --output /content/nmt/monument80/trans_test.sparql -beam_size 4


[2023-07-10 17:52:15,445 INFO] Loading checkpoint from /content/nmt/nmtmodel/model_step_1200.pt
[2023-07-10 17:52:15,999 INFO] Loading data into the model
[2023-07-10 17:53:45,616 INFO] PRED SCORE: -0.0998, PRED PPL: 1.10 NB SENTENCES: 1479


In [16]:
# Display the first 5 lines of the file 'test.en'
!head -n 5 /content/nmt/monument80/test.en


building date of villa la reine jeanne
what do nelson's column and the patchwork girl of oz have in common
where can one find rizal monument
what's the oldest monument of răzeni
how many place are there in kandy


In [17]:
# Display the first 5 lines of the file 'test.en'
!head -n 5 /content/nmt/monument80/trans_test.sparql

select var_a where brack_open dbr_Villa_La_Reine_Jeanne dbp_complete var_a brack_close
select wildcard where brack_open brack_open dbr_Nelson's_Column,_Montreal var_a var_b sep_dot dbr_Vladimir_Atlasov var_a var_b brack_close UNION brack_open var_c var_d dbr_Nelson's_Column,_Montreal sep_dot var_c var_d dbr_Vladimir_Atlasov brack_close brack_close
select var_a where brack_open dbr_Rizal_Monument_ par_open Calamba par_close dbo_location var_a brack_close
select var_a where brack_open var_a rdf_type dbo_Monument sep_dot var_a dbo_location dbr_Răzeni sep_dot var_a dbp_complete var_c brack_close order by var_c limit 1
select count par_open wildcard par_close where brack_open var_a rdf_type dbo_Place sep_dot var_a dbo_location dbr_Kandy brack_close group by var_a


In [18]:
# Display the first 5 lines of the file 'test.sparql'
!head -n 5 /content/nmt/monument80/test.sparql

select var_a where brack_open dbr_Villa_La_Reine_Jeanne dbp_complete var_a brack_close
select wildcard where brack_open brack_open dbr_Nelson's_Column,_Montreal var_a var_b sep_dot dbr_The_Patchwork_Girl_of_Oz var_a var_b brack_close UNION brack_open var_c var_d dbr_Nelson's_Column,_Montreal sep_dot var_c var_d dbr_The_Patchwork_Girl_of_Oz brack_close brack_close
select var_a where brack_open dbr_Rizal_Monument_ par_open Calamba par_close  dbo_location var_a brack_close
select var_a where brack_open var_a rdf_type dbo_Monument sep_dot var_a dbo_location dbr_Răzeni sep_dot var_a dbp_complete var_c brack_close order by var_c limit 1
select count par_open wildcard par_close  where brack_open var_a rdf_type dbo_Place sep_dot var_a dbo_location dbr_Kandy brack_close group by var_a


# Evaluate

In [19]:
# Print the current working directory
!pwd

/content/nmt


In [20]:
# Copy the file "compute-accuracy.py" from "/content/drive/MyDrive/" to the current directory
!cp /content/drive/MyDrive/compute-accuracy.py ./

# Evaluate the translation using the provided accuracy computation script
# - The first argument is the path to the reference (gold standard) sparql file
# - The second argument is the path to the translated sparql file
!python compute-accuracy.py /content/nmt/monument80/test.sparql /content/nmt/monument80/trans_test.sparql


Reference 1st sentence: select var_a where brack_open dbr_Villa_La_Reine_Jeanne dbp_complete var_a brack_close
MTed 1st sentence: select var_a where brack_open dbr_Villa_La_Reine_Jeanne dbp_complete var_a brack_close
Accuracy:  0.9626578505230069


In [21]:
# Install the sacrebleu library using pip
!pip install sacrebleu > /dev/null

# Copy the file "compute-bleu.py" from "/content/drive/MyDrive/" to the current directory
!cp /content/drive/MyDrive/compute-bleu.py ./

# Evaluate the translation using BLEU score calculation script
# - The first argument is the path to the reference (gold standard) sparql file
# - The second argument is the path to the translated sparql file
!python compute-bleu.py /content/nmt/monument80/test.sparql /content/nmt/monument80/trans_test.sparql


Reference 1st sentence: select var_a where brack_open dbr_Villa_La_Reine_Jeanne dbp_complete var_a brack_close
MTed 1st sentence: select var_a where brack_open dbr_Villa_La_Reine_Jeanne dbp_complete var_a brack_close
BLEU:  97.76084846468864


In [22]:
# Install the rouge library using pip
!pip install rouge > /dev/null

# Copy the file "compute-rouge-l.py" from "/content/drive/MyDrive/" to the current directory
!cp /content/drive/MyDrive/compute-rouge-l.py ./

# Evaluate the translation using Rouge-L score calculation script
# - The first argument is the path to the reference (gold standard) sparql file
# - The second argument is the path to the translated sparql file
!python compute-rouge-l.py /content/nmt/monument80/test.sparql /content/nmt/monument80/trans_test.sparql


Reference 1st sentence: select var_a where brack_open dbr_Villa_La_Reine_Jeanne dbp_complete var_a brack_close
MTed 1st sentence: select var_a where brack_open dbr_Villa_La_Reine_Jeanne dbp_complete var_a brack_close
Rouge-L:  0.9872806081210852


In [23]:
# Copy the directory 'nmt' and its contents from '/content/nmt' to '/content/drive/MyDrive'
!cp -r /content/nmt/nmtmodel /content/drive/MyDrive/NMT_models

# Test

In [None]:
sentences = [
    ''' ''',
    ''' '''
]
with open('questions.en', 'w') as fp:
    t = [''.join(x) for x in sentences]
    t = '\n'.join(t)
    fp.write(t)

In [None]:
! onmt_translate --model '/content/nmt/nmtmodel/model_step_1200.pt' --src questions.en --output pred.sparql

In [None]:
! cat pred.sparql