# Prepare

In [None]:
!nvidia-smi

In [None]:
#@title Parameters

#@markdown ### Choose which language to process
language = "de" #@param ["en", "de"] {type:"string"}

#@markdown ### Choose source data file path
source_dir = 'wmt14' #@param {type:"string"}
source_dir = '/content/gdrive/MyDrive/essay_data/' + source_dir

#@markdown ### Choose output data file path
target_dir = '/content/gdrive/MyDrive/essay_data/data_gcn' #@param ["/content/gdrive/MyDrive/essay_data/data_gcn", "data_gcn"]

#@markdown ### Choose which parts to process
first_part = 0 #@param {type:"number"}
last_part =  1789 #@param {type:"number"}

# Iteration direction over the parts: +1 when counting up, -1 when counting
# down. Written as a comparison (not a division by the distance) so that
# selecting a single part (first_part == last_part) does not divide by zero.
step = 1 if last_part >= first_part else -1

# Per-language state, keyed by language code and filled in by later cells.
# NOTE(review): later cells also read `languages`, `raw_data_dir` and
# `data_dir`, none of which is assigned in this cell — confirm where they are set.
token = {}
vocab = {}
dep = {}


# Install packages

In [None]:
!pip install mosestokenizer
!pip install stanza

In [None]:
import os
import stanza

In [None]:
stanza.download(language)

In [None]:
import os
if not os.path.exists(language):
  !split -l 2500 -de /{raw_data_dir}/data.{language} ./{language}/{language}_
else:
  print("Data have been downloaded")

Downloading...
From: https://drive.google.com/uc?id=1-48YG87WajVneWP1t362CQyn_iqohqr5
To: /content/raw_data_gcn.zip
100% 468M/468M [00:02<00:00, 201MB/s]
Archive:  raw_data_gcn.zip
   creating: raw_data_gcn/
   creating: raw_data_gcn/.ipynb_checkpoints/
  inflating: raw_data_gcn/data.de    
  inflating: raw_data_gcn/data.en    
  inflating: raw_data_gcn/vocab.de   
  inflating: raw_data_gcn/vocab.en   


In [None]:
# Load the Drive helper and mount
# The paths chosen in the Parameters cell live under /content/gdrive, so the
# mount has to succeed before any of them is read or written.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Moses Tokenizer
According to the [WMT14 Translation task](https://www.statmt.org/wmt14/translation-task.html), the WMT14 data may be tokenized with ***Moses***.\\
The following cells patch the installed Moses tokenizer and build the tokenization pipeline used on the data.

In [None]:
# vim /usr/local/lib/python3.7/dist-packages/mosestokenizer/tokenizer-v1.1.perl
#   - @-@ -> ##AT##-##AT##
# Patch the installed Moses tokenizer script in place: the `@-@` marker in the
# Perl source is rewritten to `##AT##-##AT##`, the form the helper functions
# in this notebook look for.
# NOTE(review): the path is hard-coded to a python3.7 dist-packages directory —
# confirm it matches the runtime, otherwise sed fails and nothing is patched.
!sed -i 's/\\\@-\\\@/\\\#\\\#AT\\\#\\\#-\\\#\\\#AT\\\#\\\#/' /usr/local/lib/python3.7/dist-packages/mosestokenizer/tokenizer-v1.1.perl

In [None]:
from mosestokenizer import *

In [None]:
def normalize(text):
  """Undo the escaping found in the raw corpus text.

  Rewrites the ` ##AT##-##AT## ` hyphen marker to a bare `-`, turns the
  HTML-style entities back into their literal characters, and restores the
  `##STAR##`, `##UNDERSCORE##` and `##AT##` placeholders.

  Args:
    text: one line of raw text.

  Returns:
    The unescaped line.
  """
  # Must run before the bare '##AT##' replacement further down, otherwise
  # the hyphen marker would be turned into ' @-@ '.
  text = text.replace(' ##AT##-##AT## ', '-')

  # '&amp;' is deliberately handled last: unescaping it first would turn a
  # literal '&amp;lt;' into '&lt;' and then, wrongly, into '<'.
  entities = (
    ('&#124;', '|'),
    ('&lt;', '<'),
    ('&gt;', '>'),
    ('&apos;', "'"),
    ('&quot;', '"'),
    ('&#91;', '['),
    ('&#93;', ']'),
    ('&amp;', '&'),
  )
  for escaped, literal in entities:
    text = text.replace(escaped, literal)

  text = text.replace('##STAR##', '*')
  text = text.replace('##UNDERSCORE##', '_')
  text = text.replace('##AT##', '@')
  return text

def remove_rich_text_format(lang, filename):
  """Read `filename` as UTF-8 and return its lines passed through `normalize`.

  `lang` is accepted so the function fits the `(lang, input)` step signature
  used by `sequence_transform`; it is not used here. Undecodable bytes are
  dropped (`errors='ignore'`). Line endings are kept as read.
  """
  with open(filename, encoding='utf-8', errors='ignore') as source:
    return [normalize(raw_line) for raw_line in source]

def tokenize_step(lang, document_split):
  """Tokenize every sentence of `document_split` with a `MosesTokenizer`.

  A single tokenizer for `lang` is opened for the whole batch and closed when
  the batch is done. Returns one tokenizer result per input sentence, in
  input order.
  """
  with MosesTokenizer(lang) as moses_tokenize:
    return [moses_tokenize(sentence) for sentence in document_split]

def split_sentence(lang, document):
  """Run `MosesSentenceSplitter` for `lang` on `document`.

  The document is handed to the splitter wrapped in a one-element list, and
  whatever the splitter returns is passed back unchanged.
  """
  with MosesSentenceSplitter(lang) as splitter:
    sentences = splitter([document])
  return sentences

def punctuation_normalize(lang, document_split):
  """Apply `MosesPunctuationNormalizer` for `lang` to each sentence.

  Returns the normalized sentences as a list, in input order.
  """
  with MosesPunctuationNormalizer(lang) as fix_punctuation:
    return [fix_punctuation(sentence) for sentence in document_split]

def sequence_transform(*steps):
  """Chain `steps` into a single `(lang, input)` callable.

  Each step is called as `step(lang, value)`; the value returned by one step
  is the input of the next. With no steps the input is returned unchanged.
  """
  def func(lang, input):
    value = input
    for transform in steps:
      value = transform(lang, value)
    return value
  return func

# Preprocessing pipeline, called as tokenizer(lang, filename):
# read + unescape the file, normalize punctuation, then Moses-tokenize.
_tokenizer_steps = (
  remove_rich_text_format,
  punctuation_normalize,
  tokenize_step,
)
tokenizer = sequence_transform(*_tokenizer_steps)

# NLP Parser

In [None]:
# One Stanza pipeline per language, keyed by language code. The input is
# already tokenized by the Moses pipeline, hence tokenize_pretokenized=True.
nlp = {
  lang_code: stanza.Pipeline(lang=lang_code,
                             processors='tokenize,pos,lemma,depparse',
                             tokenize_pretokenized=True)
  for lang_code in ('en', 'de')
}

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2022-07-01 08:01:17 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2022-07-01 08:01:17 INFO: Use device: gpu
2022-07-01 08:01:17 INFO: Loading: tokenize
2022-07-01 08:01:17 INFO: Loading: pos
2022-07-01 08:01:17 INFO: Loading: lemma
2022-07-01 08:01:17 INFO: Loading: depparse
2022-07-01 08:01:17 INFO: Done loading processors!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2022-07-01 08:01:18 INFO: Loading these models for language: de (German):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |
| depparse  | gsd     |

2022-07-01 08:01:18 INFO: Use device: gpu
2022-07-01 08:01:18 INFO: Loading: tokenize
2022-07-01 08:01:18 INFO: Loading: mwt
2022-07-01 08:01:18 INFO: Loading: pos
2022-07-01 08:01:18 INFO: Loading: lemma
2022-07-01 08:01:18 INFO: Loading: depparse
2022-07-01 08:01:18 INFO: Done loading processors!


# Voc2id

In [None]:
def get_token(lang):
  """Load the vocabulary file for `lang` into the global `token[lang]` list.

  Reads `<raw_data_dir>/vocab.<lang>` (a global path), appending one stripped
  line per entry to `token[lang]`, which must already exist. Afterwards the
  first entry is overwritten with 'UNK' and the sentence markers are removed.

  Raises:
    IndexError: if the list is still empty after loading.
    ValueError: if '<s>' or '</s>' is not present.
  """
  vocab_path = f'{raw_data_dir}/vocab.{lang}'
  with open(vocab_path, encoding='utf-8', errors='ignore') as vocab_file:
    entries = token[lang]
    entries.extend(entry.strip() for entry in vocab_file)

  # Presumably the first vocabulary line is the <unk> entry — verify against
  # the vocab file; whatever it is, it is replaced by 'UNK'.
  token[lang][0] = 'UNK'
  # Drop the sentence-boundary markers: '<s>' (start), then '</s>' (end).
  for marker in ('<s>', '</s>'):
    token[lang].remove(marker)

def replace(doc, word1, word2):
  """Replace, in place, every token equal to `word1` with `word2`.

  `doc` is a list of sentences, each a mutable list of tokens. Only whole
  tokens are compared; substrings are not touched. Returns the same `doc`
  object that was passed in.
  """
  for sentence in doc:
    for position in range(len(sentence)):
      if sentence[position] == word1:
        sentence[position] = word2
  return doc

import copy
# Whole-token escapes and their literal form. Kept in line with the entity
# list handled by `normalize`, including '&lt;' and '&gt;'.
_RAW_TOKEN_UNESCAPES = {
  '##AT##-##AT##': '-',
  '&amp;': '&',
  '&#124;': '|',
  '&lt;': '<',
  '&gt;': '>',
  '&apos;': "'",
  '&quot;': '"',
  '&#91;': '[',
  '&#93;': ']',
  '##STAR##': '*',
  '##UNDERSCORE##': '_',
  '##AT##': '@',
}

def raw(text1):
  """Return a copy of the tokenized document with escaped tokens unescaped.

  Args:
    text1: list of sentences, each a list of string tokens. It is not
      modified.

  Returns:
    A new list of lists in which every token that is exactly one of the
    known escapes is replaced by its literal character; all other tokens
    (including ones that merely contain an escape) are copied unchanged.
  """
  return [
    [_RAW_TOKEN_UNESCAPES.get(word, word) for word in sentence]
    for sentence in text1
  ]

def build_vocab(token, vocab):
  """Fill `vocab` from the token list.

  Every token maps to a fresh record `{'id': <position in token>, 'freq': 0}`.
  If a token occurs more than once, the record of its last position wins.
  `vocab` is updated in place; nothing is returned.
  """
  for index, word in enumerate(token):
    vocab[word] = {'id': index, 'freq': 0}

In [None]:
def voc2id(vocab, output):
  """Write `<output>/voc2id.txt`, one `word<TAB>id` line per vocabulary entry.

  Args:
    vocab: mapping word -> record with an 'id' key.
    output: existing directory to write into.

  The file is written as UTF-8 explicitly: the vocabulary is read as UTF-8
  and contains non-ASCII words, so relying on the locale default encoding
  could fail or produce a file that does not round-trip.
  """
  filepath = os.path.join(output, 'voc2id.txt')
  with open(filepath, 'w', encoding='utf-8') as f:
    for word, record in vocab.items():
      word_id = record['id']
      f.write(f'{word}\t{word_id}\n')

# Frequency

In [None]:
def get_frequency(document, vocab):
  """Count token occurrences into `vocab[...]['freq']`.

  Args:
    document: iterable of sentences, each an iterable of tokens. A token may
      be a plain string or an object exposing its string as `.text`.
    vocab: mapping word -> record with a 'freq' counter; must contain 'UNK'.

  Tokens missing from `vocab` are counted under 'UNK'. `vocab` is updated in
  place; nothing is returned.
  """
  for sent in document:
    for word in sent:
      # Use the same string for the membership test and for the lookup; the
      # previous code tested `word.text` but indexed with `word` itself.
      text = getattr(word, 'text', word)
      key = text if text in vocab else 'UNK'
      vocab[key]['freq'] += 1

def id2freq(vocab, output):
  """Write `<output>/id2freq.txt`, one `id<TAB>freq` line per vocabulary entry.

  Lines follow the iteration order of `vocab`. `output` must be an existing
  directory.
  """
  target = os.path.join(output, 'id2freq.txt')
  with open(target, 'w') as out_file:
    for record in vocab.values():
      out_file.write('{}\t{}\n'.format(record['id'], record['freq']))

# Dependencies

In [None]:
def get_deprel(dep, path='/content/gdrive/MyDrive/dep.txt'):
  """Load dependency-relation labels from a text file into `dep`.

  Args:
    dep: dict updated in place; each stripped line of the file becomes a key
      whose value is the zero-based number of the line it was read from.
    path: label file, one label per line. Defaults to the Drive location
      this notebook has always used, so existing calls are unaffected.

  A label that appears on several lines keeps the number of its last line.
  """
  with open(path, encoding='utf-8', errors='ignore') as f:
    for line_number, line in enumerate(f):
      dep[line.strip()] = line_number

def de2id(dep, output):
  """Write `<output>/de2id.txt`: an `UNK<TAB>-1` line, then `label<TAB>id` lines.

  The label lines follow the iteration order of `dep`. `output` must be an
  existing directory.
  """
  target = os.path.join(output, 'de2id.txt')
  with open(target, 'w') as out_file:
    out_file.write('UNK\t-1\n')
    for label, label_id in dep.items():
      out_file.write(f'{label}\t{label_id}\n')

def get_dep(dep, text, de2id_path=None):
  """Return the id of dependency label `text`, registering it if it is new.

  Args:
    dep: mapping label -> id, updated in place for unseen labels.
    text: the dependency label to look up.
    de2id_path: file to which a newly registered `label<TAB>id` line is
      appended. When omitted, the historical location
      `data_gcn/<languages[0]>/de2id.txt` is used.

  Returns:
    The id stored in `dep` for `text`. A new label gets `len(dep)` as its id.
  """
  if text in dep:
    return dep[text]

  dep[text] = len(dep)
  if de2id_path is None:
    # NOTE(review): this default is relative to the working directory and
    # always uses the first entry of the global `languages`; it may differ
    # from the directory the other de2id.txt was written to — confirm.
    de2id_path = os.path.join('data_gcn', languages[0], 'de2id.txt')

  # Opening in append mode does not create missing directories, so make
  # sure the parent exists instead of failing on the first unseen label.
  parent = os.path.dirname(de2id_path)
  if parent:
    os.makedirs(parent, exist_ok=True)
  with open(de2id_path, 'a') as f:
    f.write(f'{text}\t{dep[text]}\n')
  return dep[text]

# Data

In [None]:
def data(document, token_text, vocab, dep, output):
  """Write one line per parsed sentence to the file `output`.

  Line layout (space separated, with a trailing space before the newline):
    <number of words> <number of words - 1> <word ids...> <edges...>
  where each edge is `head_index|word_index|dependency_id` with zero-based
  indices, emitted only for words whose `head` is greater than 0.

  Args:
    document: parsed document exposing `.sentences`; each sentence exposes
      `.words`, and each word exposes `.head`, `.id` and `.deprel`.
    token_text: the tokens that were parsed, indexed as
      `token_text[sentence][word]`; used for the vocabulary lookup.
    vocab: mapping token -> record with an 'id' key.
    dep: mapping dependency label -> id, extended through `get_dep`.
    output: path of the file to (over)write.
  """
  with open(output, 'w', encoding='utf-8', errors='ignore') as f:
    for i, sent in enumerate(document.sentences):
      # number of nodes and edges
      f.write(f'{len(sent.words)} {len(sent.words) - 1} ')

      # Word ids. A token that is missing from the vocabulary (KeyError) or
      # has no counterpart in token_text (IndexError) is written as id 0.
      # Only those two errors are absorbed, so interrupts and genuine bugs
      # are no longer silently turned into 0.
      for j in range(len(sent.words)):
        try:
          word_id = vocab[token_text[i][j]]['id']
        except (KeyError, IndexError):
          word_id = 0
        f.write(f'{word_id} ')

      # Edges
      for word in sent.words:
        if word.head > 0:
          f.write(f'{word.head-1}|{word.id - 1}|{get_dep(dep, word.deprel)} ')
      f.write('\n')

def create_data(filename, lang, output):
  """Tokenize, parse and export one input part.

  `filename` is run through the global `tokenizer` pipeline; an unescaped
  copy of the tokens (`raw`) is parsed with `nlp[lang]`; the parse is then
  written to `output` by `data`, using the original (still escaped) tokens
  for the vocabulary lookup.
  """
  tokens = tokenizer(lang, filename)
  parsed = nlp[lang](raw(tokens))

  # get_frequency(tokens, vocab[lang]) is not invoked here (the call was
  # commented out), so this function leaves the 'freq' counters untouched.
  data(parsed, tokens, vocab[lang], dep[lang], output)

# Process data

In [None]:
def reset_data():
  """Reset the per-language global state for every entry of `languages`.

  `token[lang]` becomes an empty list; `vocab[lang]` and `dep[lang]` become
  empty dicts. Entries for other languages are left alone.
  """
  for lang in languages:
    for store, empty in ((token, []), (vocab, {}), (dep, {})):
      store[lang] = empty

In [None]:
def data_process(input, output):
  """Build the vocabulary, label and data files for every configured language.

  For each language in the global `languages`, reads the split parts from
  `<input>/<lang>/` and writes into `<output>/<lang>/`:
    voc2id.txt, de2id.txt, one `data_<i>` file per processed part, and
    id2freq.txt.
  The parts processed are those at indices `first_part` .. `last_part`
  (inclusive, walked in direction `step`) of the sorted directory listing.

  Args:
    input: directory containing one sub-directory of parts per language.
    output: directory under which the per-language results are written.

  Raises:
    IndexError: if a requested part index is outside the directory listing.
  """
  reset_data()
  for lang in languages:
    print(f'Processing {lang}')

    # Prepare input and output directory
    input_dir = os.path.join(input, lang)
    output_dir = os.path.join(output, lang)
    os.makedirs(output_dir, exist_ok=True)

    # Get vocab and dep
    get_token(lang)

    # voc2id
    print('- voc2id')
    build_vocab(token[lang], vocab[lang])
    voc2id(vocab[lang], output_dir)

    # de2id
    print('- de2id')
    get_deprel(dep[lang])
    de2id(dep[lang], output_dir)

    print('- data')
    # os.listdir returns entries in arbitrary order; sort them so that index
    # i always refers to the same part and data_<i> is reproducible across
    # runs and machines.
    filenames = sorted(os.listdir(input_dir))
    for i in range(first_part, last_part + step, step):
      in_filepath = os.path.join(input_dir, filenames[i])
      out_filepath = os.path.join(output_dir, f'data_{i}')
      print(f'Part ({i}/{len(filenames) - 1}): {in_filepath}')
      create_data(in_filepath, lang, out_filepath)

    # id2freq
    print('- id2freq')
    id2freq(vocab[lang], output_dir)

In [None]:
# Run the whole pipeline: read the split parts under raw_data_dir and write
# the results under data_dir.
# NOTE(review): `raw_data_dir` and `data_dir` are not assigned in any cell
# visible here (the Parameters cell sets `source_dir` / `target_dir`) —
# confirm where they come from before relying on Restart & Run All.
data_process(raw_data_dir, data_dir)

# Fix unknown token

In [None]:
def fix_unknown_token():
  for lang in ['en', 'de']:
    input_dir = os.path.join(raw_data_dir, lang)
    output_dir = os.path.join(data_dir, lang)
    