# Environment

In [1]:
# Create a directory named 'nmt'
!mkdir nmt

# Change the current working directory to the newly created 'nmt' directory
%cd nmt

# Create a subdirectory named 'nmtmodel' inside the 'nmt' directory
!mkdir nmtmodel


/content/nmt


In [2]:
# Install OpenNMT-py along with specific versions of torchvision and torchaudio.
# Only stdout is redirected to /dev/null, so anything pip writes to stderr
# (warnings, dependency-resolver conflicts) still appears in the cell output.
# NOTE(review): OpenNMT-py itself is not version-pinned, so a fresh run may pull
# a different release — consider pinning it once the working version is known.
! pip install OpenNMT-py torchvision==0.14.1 torchaudio==0.13.1 > /dev/null


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchdata 0.6.1 requires torch==2.0.1, but you have torch 1.13.1 which is incompatible.
torchtext 0.15.2 requires torch==2.0.1, but you have torch 1.13.1 which is incompatible.[0m[31m
[0m

In [3]:
# Sanity check: print the current working directory.
# Later cells use paths relative to it (e.g. config.yaml, ./monument.zip).
!pwd

/content/nmt


# Prepare Dataset

In [4]:
# Copy the file 'monument.zip' from '/content/drive/MyDrive/' to the current directory
!cp /content/drive/MyDrive/monument.zip ./

# Unzip the file 'monument.zip' and extract its contents to the current directory
!unzip ./monument.zip -d ./

# Remove the original zip file 'monument.zip' from '/content/nmt/'
!rm /content/nmt/monument.zip

Archive:  ./monument.zip
   creating: ./monument/
  inflating: ./__MACOSX/._monument   
  inflating: ./monument/dev.en       
  inflating: ./__MACOSX/monument/._dev.en  
  inflating: ./monument/dev.sparql   
  inflating: ./__MACOSX/monument/._dev.sparql  
  inflating: ./monument/.DS_Store    
  inflating: ./__MACOSX/monument/._.DS_Store  
  inflating: ./monument/train.sparql  
  inflating: ./__MACOSX/monument/._train.sparql  
  inflating: ./monument/train.en     
  inflating: ./__MACOSX/monument/._train.en  
  inflating: ./monument/test.sparql  
  inflating: ./__MACOSX/monument/._test.sparql  
  inflating: ./monument/test.en      
  inflating: ./__MACOSX/monument/._test.en  


In [5]:
# List the contents of the current directory to see what the archive extracted
!ls


__MACOSX  monument  nmtmodel


# Knowledge Graph Embedding

In [6]:
# Create a new directory named "graph_embedding_dir" using the mkdir command
!mkdir "graph_embedding_dir"

# Copy the file "embedding.vec" from the source directory in Google Drive
# to the destination directory "/content/nmt/graph_embedding_dir"
!cp /content/drive/MyDrive/embedding.vec /content/nmt/graph_embedding_dir


# Create the Training Configuration File

In [8]:
# Directory for the vocabulary, checkpoints and training log; it is
# interpolated into config.yaml and reused by later cells.
model_root = '/content/nmt/nmtmodel'

# Make sure the directory exists (-p: no error if it is already there).
# IPython substitutes {model_root} before the line is handed to the shell.
!mkdir -p '{model_root}'

In [None]:
# Build the OpenNMT-py configuration as YAML text and write it to ./config.yaml.
# model_root (defined in an earlier cell) is interpolated into the output paths.
#
# Fix: `word_vec_size` used to be defined twice (300 next to the embedding
# settings and 512 in the Model section). Duplicate keys are ambiguous, and
# loaders that accept them (e.g. PyYAML) keep the last one, so only the
# Model-section entry (512, equal to hidden_size) is kept here.
# NOTE(review): if embedding.vec holds 300-dimensional vectors, confirm that 512
# is really the intended embedding size.
# NOTE(review): the training log shows the converted embeddings being saved as
# "<save_data>.enc_embeddings.pt", i.e. save_data is used as a file prefix and
# those files end up next to, not inside, the nmtmodel directory.
config = f'''# config.yaml
# GloVe:
# this means embeddings will be used for both encoder and decoder sides
both_embeddings: /content/nmt/graph_embedding_dir/embedding.vec

# supported types: GloVe, word2vec
embeddings_type: "word2vec"

# word_vec_size is set once, in the Model section below

## Where the samples will be written
save_data: {model_root}

## Where the vocab(s) will be written
# Vocabulary files, generated by onmt_build_vocab
src_vocab: {model_root}/src.vocab
tgt_vocab: {model_root}/src.vocab

# Vocabulary size - should be the same as in sentence piece
src_vocab_size: 5000
tgt_vocab_size: 5000
share_vocab: true

# Training files
data:
    train:
        path_src: /content/nmt/monument/train.en
        path_tgt: /content/nmt/monument/train.sparql
    valid:
        path_src: /content/nmt/monument/dev.en
        path_tgt: /content/nmt/monument/dev.sparql

# Where to save the checkpoints
save_model: {model_root}/model
log_file: {model_root}/train.log
save_checkpoint_steps: 100
train_steps: 1200
valid_steps: 400

# Stop training if it does not imporve after n validations
early_stopping: 4

# To save space, limit checkpoints to last n
# keep_checkpoint: 3

seed: 4242

# Number of GPUs, and IDs of GPUs
world_size: 1
gpu_ranks: [0]

# Batching
# queue_size: 100
bucket_size: 262144
num_workers: 0  # Default: 2, set to 0 when RAM out of memory
batch_type: "tokens"
batch_size: 4096   # Tokens per batch, change when CUDA out of memory
valid_batch_size: 2048
# world_size: 1
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]

# Optimization
# model_dtype: "fp16"
optim: "adam"
# learning_rate: 2
warmup_steps: 500
decay_method: "noam"
adam_beta1: 0.9
adam_beta2: 0.98
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"

# Model
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 6
dec_layers: 6
heads: 8
hidden_size: 512
word_vec_size: 512
transformer_ff: 2048
# dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]
'''

# Write the configuration to "config.yaml" in the current working directory.
# Plain write mode is enough: the file is only written here, never read back.
with open("config.yaml", "w") as config_yaml:
    config_yaml.write(config)

# Build Vocabulary

In [10]:
import os

# Check if the source vocabulary file doesn't exist in the 'model_root' directory
if not os.path.exists(os.path.join(model_root, 'src.vocab')):
    # Build the source vocabulary using the onmt_build_vocab command and the provided config.yaml
    # The --n_sample option is used to indicate the number of samples to consider for building the vocabulary
    # The "|| true" at the end ensures that the command won't stop the script even if there's an error
    !onmt_build_vocab -config config.yaml --n_sample -1 || true


Corpus train's weight should be given. We default it to 1 for you.
[2023-07-10 10:30:21,221 INFO] Counter vocab from -1 samples.
[2023-07-10 10:30:21,221 INFO] n_sample=-1: Build vocab on full datasets.
[2023-07-10 10:30:21,530 INFO] Counters src: 2569
[2023-07-10 10:30:21,530 INFO] Counters tgt: 2058
[2023-07-10 10:30:21,531 INFO] Counters after share:4609


# Check GPU

In [11]:
# Check NVIDIA GPU information using the 'nvidia-smi' command
!nvidia-smi

# Print a separator line and heading for the GPU information
print('\n\n$*****************************************************************************$')
print('GPU:')

# Check and display the GPU devices using 'nvidia-smi -L'
!nvidia-smi -L

# Print a separator line
print('$*****************************************************************************$')

# Print a separator line and heading for GPU-related checks
print('\n\n$*****************************************************************************$')

# Check if CUDA-enabled GPU is available for PyTorch
import torch
print(torch.cuda.is_available())

# Print the name of the first CUDA-enabled GPU
print(torch.cuda.get_device_name(0))

# Get GPU memory information
gpu_memory = torch.cuda.mem_get_info(0)
print("Free GPU memory:", gpu_memory[0]/1024**2, "out of:", gpu_memory[1]/1024**2)

# Print a separator line
print('$*****************************************************************************$')

# Print system information using 'lsb_release -a'
!lsb_release -a

# Print a separator line
print('$*****************************************************************************$')

# Print kernel version using 'uname -r'
!uname -r

# Print a separator line
print('$*****************************************************************************$')

# Print NVCC (NVIDIA CUDA Compiler) version using 'nvcc --version'
!nvcc --version

# Print a separator line
print('$*****************************************************************************$')

# Check Torch version using Python import
import torch
print(torch.__version__)

# Print a separator line
print('$*****************************************************************************$')

# Display CPU information using 'cat /proc/cpuinfo | grep model\ name'
!cat /proc/cpuinfo | grep model\ name

# Print a separator line
print('$*****************************************************************************$')

# Display total memory information using 'cat /proc/meminfo | grep MemTotal'
!cat /proc/meminfo | grep MemTotal


Mon Jul 10 10:30:23 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Training

In [12]:
# Train the NMT model using the configuration defined in 'config.yaml'.
# config.yaml is referenced by relative path, so this has to run from the
# directory it was written to; checkpoints and the log file go to the
# save_model / log_file locations set in that file.
!onmt_train -config config.yaml


[2023-07-10 10:30:26,906 INFO] Missing transforms field for train data, set to default: [].
[2023-07-10 10:30:26,906 INFO] Missing transforms field for valid data, set to default: [].
[2023-07-10 10:30:26,906 INFO] Parsed 2 corpora from -data.
[2023-07-10 10:30:26,906 INFO] Get special vocabs from Transforms: {'src': [], 'tgt': []}.
[2023-07-10 10:30:26,925 INFO] Reading encoder and decoder embeddings from /content/nmt/graph_embedding_dir/embedding.vec
[2023-07-10 10:34:05,695 INFO] 	Found 8541203 total vectors in file
[2023-07-10 10:34:05,696 INFO] After filtering to vectors in vocab:
[2023-07-10 10:34:05,698 INFO] 	* enc: 3522 match, 1094 missing, (76.30%)
[2023-07-10 10:34:05,699 INFO] 	* dec: 3522 match, 1094 missing, (76.30%)
[2023-07-10 10:34:05,699 INFO] 
Saving encoder embeddings as:
	* enc: /content/nmt/nmtmodel.enc_embeddings.pt
[2023-07-10 10:34:08,889 INFO] 
Saving decoder embeddings as:
	* dec: /content/nmt/nmtmodel.dec_embeddings.pt
[2023-07-10 10:34:12,058 INFO] The firs

In [13]:
# List the contents of the 'model_root' directory
# (saved checkpoints, the vocabulary file and the training log)
!ls '{model_root}'


model_step_1000.pt  model_step_200.pt  model_step_600.pt  src.vocab
model_step_100.pt   model_step_300.pt  model_step_700.pt  train.log
model_step_1100.pt  model_step_400.pt  model_step_800.pt
model_step_1200.pt  model_step_500.pt  model_step_900.pt


# Translate

In [14]:
# Perform translation of the test questions using the trained NMT model
# --model specifies the path to the trained model checkpoint
#         (model_step_1200.pt is the checkpoint of the final step, train_steps: 1200)
# --src specifies the path to the source text file to be translated
# --output specifies the path to save the translated output
# -beam_size specifies the beam size for beam search
# NOTE(review): no GPU option is passed here — verify which device onmt_translate
# uses by default if translation speed matters.
!onmt_translate --model '/content/nmt/nmtmodel/model_step_1200.pt' --src /content/nmt/monument/test.en --output /content/nmt/monument/trans_test.sparql -beam_size 4


[2023-07-10 11:03:22,971 INFO] Loading checkpoint from /content/nmt/nmtmodel/model_step_1200.pt
[2023-07-10 11:03:23,590 INFO] Loading data into the model
[2023-07-10 11:03:31,934 INFO] PRED SCORE: -0.0938, PRED PPL: 1.10 NB SENTENCES: 100


In [15]:
# Display the first 5 lines of the file 'test.en'
# (the source questions that were passed to onmt_translate via --src)
!head -n 5 /content/nmt/monument/test.en


which is longer los angeles police department memorial for fallen officers or national war memorial
how many monument does böyük tağlar have
location of mint clock tower
how many place does foshan have
is ramagrama stupa a monument


In [16]:
# Display the first 5 lines of the file 'trans_test.sparql'
# (the model output written by onmt_translate via --output)
!head -n 5 /content/nmt/monument/trans_test.sparql


select var_a where brack_open var_a dbp_length var_b sep_dot FILTER par_open var_a = dbr_Los_Angeles_Police_Department_Memorial_for_Fallen_Officers || var_a = dbr_National_War_Memorial_ par_open Canada par_close par_close brack_close order by var_b limit 1
select count par_open wildcard par_close where brack_open var_a rdf_type dbo_Monument sep_dot var_a dbo_location dbr_Böyük_Tağlar brack_close group by var_a
select var_a where brack_open dbr_Mint_Clock_Tower,_Chennai dbo_location var_a brack_close
select count par_open wildcard par_close where brack_open var_a rdf_type dbo_Place sep_dot var_a dbo_location dbr_Foshan brack_close group by var_a
ask where brack_open dbr_Ramagrama_stupa rdf_type dbo_Monument brack_close


In [17]:
# Display the first 5 lines of the file 'test.sparql'
# (the reference queries, for side-by-side comparison with the predictions)
!head -n 5 /content/nmt/monument/test.sparql

select var_a where brack_open var_a dbp_length var_b sep_dot FILTER par_open var_a = dbr_Los_Angeles_Police_Department_Memorial_for_Fallen_Officers || var_a = dbr_National_War_Memorial_ par_open Canada par_close  par_close  brack_close order by var_b limit 1
select count par_open wildcard par_close  where brack_open var_a rdf_type dbo_Monument sep_dot var_a dbo_location dbr_Böyük_Tağlar brack_close group by var_a
select var_a where brack_open dbr_Mint_Clock_Tower,_Chennai dbo_location var_a brack_close
select count par_open wildcard par_close  where brack_open var_a rdf_type dbo_Place sep_dot var_a dbo_location dbr_Foshan brack_close group by var_a
ask where brack_open dbr_Ramagrama_stupa rdf_type dbo_Monument brack_close


# Evaluate

In [18]:
# Print the current working directory; the evaluation scripts below are
# copied into it and run from it
!pwd

/content/nmt


In [19]:
# Copy the file "compute-accuracy.py" from "/content/drive/MyDrive/" to the current directory
# (the script is not part of this notebook, so its exact metric definition is not visible here)
!cp /content/drive/MyDrive/compute-accuracy.py ./

# Evaluate the translation using the provided accuracy computation script
# - The first argument is the path to the reference (gold standard) sparql file
# - The second argument is the path to the translated sparql file
!python compute-accuracy.py /content/nmt/monument/test.sparql /content/nmt/monument/trans_test.sparql


Reference 1st sentence: select var_a where brack_open var_a dbp_length var_b sep_dot FILTER par_open var_a = dbr_Los_Angeles_Police_Department_Memorial_for_Fallen_Officers || var_a = dbr_National_War_Memorial_ par_open Canada par_close  par_close  brack_close order by var_b limit 1
MTed 1st sentence: select var_a where brack_open var_a dbp_length var_b sep_dot FILTER par_open var_a = dbr_Los_Angeles_Police_Department_Memorial_for_Fallen_Officers || var_a = dbr_National_War_Memorial_ par_open Canada par_close par_close brack_close order by var_b limit 1
Accuracy:  0.9604200323101777


In [20]:
# Install the sacrebleu library using pip (stdout is discarded; errors still show).
# NOTE(review): the version is not pinned, so results may vary across runs if
# the library changes.
!pip install sacrebleu > /dev/null

# Copy the file "compute-bleu.py" from "/content/drive/MyDrive/" to the current directory
!cp /content/drive/MyDrive/compute-bleu.py ./

# Evaluate the translation using BLEU score calculation script
# - The first argument is the path to the reference (gold standard) sparql file
# - The second argument is the path to the translated sparql file
!python compute-bleu.py /content/nmt/monument/test.sparql /content/nmt/monument/trans_test.sparql


Reference 1st sentence: select var_a where brack_open var_a dbp_length var_b sep_dot FILTER par_open var_a = dbr_Los_Angeles_Police_Department_Memorial_for_Fallen_Officers || var_a = dbr_National_War_Memorial_ par_open Canada par_close  par_close  brack_close order by var_b limit 1
MTed 1st sentence: select var_a where brack_open var_a dbp_length var_b sep_dot FILTER par_open var_a = dbr_Los_Angeles_Police_Department_Memorial_for_Fallen_Officers || var_a = dbr_National_War_Memorial_ par_open Canada par_close par_close brack_close order by var_b limit 1
BLEU:  98.18742831478944


In [21]:
# Install the rouge library using pip (stdout is discarded; errors still show).
# NOTE(review): the version is not pinned, so results may vary across runs if
# the library changes.
!pip install rouge > /dev/null

# Copy the file "compute-rouge-l.py" from "/content/drive/MyDrive/" to the current directory
!cp /content/drive/MyDrive/compute-rouge-l.py ./

# Evaluate the translation using Rouge-L score calculation script
# - The first argument is the path to the reference (gold standard) sparql file
# - The second argument is the path to the translated sparql file
!python compute-rouge-l.py /content/nmt/monument/test.sparql /content/nmt/monument/trans_test.sparql


Reference 1st sentence: select var_a where brack_open var_a dbp_length var_b sep_dot FILTER par_open var_a = dbr_Los_Angeles_Police_Department_Memorial_for_Fallen_Officers || var_a = dbr_National_War_Memorial_ par_open Canada par_close  par_close  brack_close order by var_b limit 1
MTed 1st sentence: select var_a where brack_open var_a dbp_length var_b sep_dot FILTER par_open var_a = dbr_Los_Angeles_Police_Department_Memorial_for_Fallen_Officers || var_a = dbr_National_War_Memorial_ par_open Canada par_close par_close brack_close order by var_b limit 1
Rouge-L:  0.9914127404721674


In [22]:
# Copy the directory 'nmt' and its contents from '/content/nmt' to '/content/drive/MyDrive'.
# This is a recursive copy of the whole workspace, so it includes every saved
# checkpoint under nmtmodel/ and the embedding.vec copy under graph_embedding_dir/.
!cp -r /content/nmt /content/drive/MyDrive


# Test

In [None]:
def normalize_question(text):
    """Collapse a pasted question onto a single line.

    Triple-quoted strings easily pick up leading/trailing blanks and embedded
    line breaks. Every run of whitespace is replaced by one space and the ends
    are trimmed, so each entry occupies exactly one line of the output file
    (the same one-question-per-line layout as test.en).
    """
    return ' '.join(text.split())


# Questions to translate, one list entry per question (placeholders to fill in).
sentences = [
    ''' ''',
    ''' '''
]

# Write one normalized question per line, each terminated by a newline.
# UTF-8 is explicit because questions may contain non-ASCII characters
# (test.en contains e.g. "böyük tağlar").
with open('questions.en', 'w', encoding='utf-8') as fp:
    fp.writelines(normalize_question(sentence) + '\n' for sentence in sentences)

In [None]:
# Translate the hand-written questions in questions.en with the step-1200
# checkpoint and write the predictions to pred.sparql (both relative to the
# current working directory)
! onmt_translate --model '/content/nmt/nmtmodel/model_step_1200.pt' --src questions.en --output pred.sparql

In [None]:
# Print the predicted queries written by the previous translation command
! cat pred.sparql