In [1]:
#import sys
#sys.path.append('/scratch/anaconda3/lib/python3.6/site-packages')
#sys.path.append('/scratch/michael_git/indic_nlp_library/src')
import spacy
import torch
from torchtext.data import Field, BucketIterator, TabularDataset
import torchtext.data as data
import pandas as pd
import re
from nltk.tokenize import ToktokTokenizer
import glob
import numpy as np
import scipy.stats
from scipy import stats
import matplotlib.pyplot as plt
import sentencepiece as spm
from araNorm import araNorm
pd.__version__

'0.25.0'

In [2]:
def train_BPE_model(bi, mono, lang, vocab_size=20000, savedir='./bpe_models/', add_params=''):
    if not isinstance(bi, list):
        bi = [bi]
    if not isinstance(mono, list):
        mono = [mono]
    
    files = ','.join(bi +  mono)
    print(files)
    m = savedir + lang #no idea if this will work....
    v  = vocab_size
    inputs = '--input={} --model_prefix={} --vocab_size={} --model_type=bpe'.format(files,m, v)
    inputs = inputs + add_params
    print('starting to train ')
    spm.SentencePieceTrainer.Train(inputs) #you have to look at terminal to see output
    print('finished training, trying to load')
    sp = spm.SentencePieceProcessor()
    sp.Load(m + '.model')
    return sp

def convertToBPE(model, file, savefile):
    to_save = open(savefile, modje='w', encoding='utf-8')
    with open(file, mode='r', encoding='utf-8') as f:
        print("Processing {}".format(file))
        for line in f:
            line = model.EncodeAsPieces(line.strip())
            to_save.write(" ".join(line) + '\n')
    to_save.close()
    
def convertFilesToBPE(model, files):
    for f in files:
        name = f.split('/')[-1]
        pth = f.split(name)[0]
        convertToBPE(model, f, pth + 'bpe-' + name )

def loadBPEModel(m):
    sp = spm.SentencePieceProcessor()
    sp.Load(m + '.model')
    return sp

def collect_stats(values):
    return np.mean(values), np.std(values), scipy.stats.mode(values)[0], max(values), min(values)

def CollectStatistics(pth, model):
    sent_lens = [] #defined as white spaces
    bpe_lens = [] 
    
    with open(pth, mode='r', encoding='utf-8') as file:
        i = 0
        for l in file:
            l = l.strip()
            l_split = l.split()
            bpe_split =model.EncodeAsPieces(l)
            sent_lens.append(len(l_split))
            bpe_lens.append(len(bpe_split))
            if i < 10:
                print(l_split)
                print(bpe_split)
            i += 1
    print("count: {}".format(i))                  
    sent_lens = np.array(sent_lens)
    mean, std, mode, max, min = collect_stats(sent_lens)
    s = "mean: {},std: {}, mode: {}, max: {}, min: {}".format(mean, std, mode, max, min)
    print("sentence stats: " + s)
    cap_tok =60
    print("Number of sentences  <= {} tokens: {}".format(cap_tok, np.sum(sent_lens <= cap_tok)))
    bpe_lens = np.array(bpe_lens)
    mean, std, mode, max, min = collect_stats(bpe_lens)
    s = "mean: {},std: {}, mode: {}, max: {}, min: {}".format(mean, std, mode, max, min)
    print("bpe stats: " + s)
    print("Number of bpe  <= {} tokens: {}".format(cap_tok, np.sum(bpe_lens <= cap_tok)))
    
    return sent_lens, bpe_lens

def removeDiacritics(file, directory='./bpe_models/'):
    #this is written for a specific file setup...
    normalizer = araNorm()
    new_file = open(directory + 'no-diacritics' + file, mode='w', encoding='utf-8')
    with open(directory + file, mode='r', encoding='utf-8') as f:
        for line in f:
            line = normalizer.run(line)
            new_file.write(line + '\n')
    new_file.close()
    print('done')


bpe_path = './bpe_models/'

In [3]:
#De -> En
#Separate because training the bpe model takes time
#Ar -> En

b_pth = '../.data/iwslt/de-en/train.de-en.{}'
m_pth = '../.data/iwslt/de-en/train.{}'

# German
b_files = [b_pth.format('de')]
m_files = [] #no arabic
German = train_BPE_model(b_files, m_files, 'german', vocab_size=10000, savedir='../.data/bpe_models/')

# English
b_files = [b_pth.format('en') ]
m_files = []
de_English = train_BPE_model(b_files, m_files, 'de_english', vocab_size=10000, savedir='../.data/bpe_models/')

../.data/iwslt/de-en/train.de-en.de
starting to train 
finished training, trying to load
../.data/iwslt/de-en/train.de-en.en
starting to train 
finished training, trying to load


In [14]:

b_pth = '../.data/iwslt/de-en/train.de-en.{}'
m_pth = '../.data/iwslt/de-en/train.{}'

print('German')
CollectStatistics(b_pth.format('de'), German)
print('English')
CollectStatistics(b_pth.format('en'), de_English)  

German
['David', 'Gallo:', 'Das', 'ist', 'Bill', 'Lange.', 'Ich', 'bin', 'Dave', 'Gallo.']
['▁David', '▁Gall', 'o', ':', '▁Das', '▁ist', '▁Bill', '▁Lange', '.', '▁Ich', '▁bin', '▁Da', 've', '▁Gall', 'o', '.']
['Wir', 'werden', 'Ihnen', 'einige', 'Geschichten', 'über', 'das', 'Meer', 'in', 'Videoform', 'erzählen.']
['▁Wir', '▁werden', '▁Ihnen', '▁einige', '▁Geschichten', '▁über', '▁das', '▁Meer', '▁in', '▁Video', 'form', '▁erzählen', '.']
['Wir', 'haben', 'ein', 'paar', 'der', 'unglaublichsten', 'Aufnahmen', 'der', 'Titanic,', 'die', 'man', 'je', 'gesehen', 'hat,,', 'und', 'wir', 'werden', 'Ihnen', 'nichts', 'davon', 'zeigen.']
['▁Wir', '▁haben', '▁ein', '▁paar', '▁der', '▁unglaublich', 'sten', '▁Aufnahmen', '▁der', '▁Titan', 'ic', ',', '▁die', '▁man', '▁je', '▁gesehen', '▁hat', ',', ',', '▁und', '▁wir', '▁werden', '▁Ihnen', '▁nichts', '▁davon', '▁zeigen', '.']
['Die', 'Wahrheit', 'ist,', 'dass', 'die', 'Titanic', '–', 'obwohl', 'sie', 'alle', 'Kinokassenrekorde', 'bricht', '–', 'nicht'

(array([ 9, 14, 24, ..., 31, 11,  4]), array([17, 17, 32, ..., 42, 15,  5]))

In [4]:
#convert the arabic file to be w/o diatrics
b_pth = '../.data/iwslt/ar-en/'

removeDiacritics('/train.ar-en.ar', b_pth)

done


In [5]:
#Ar -> En

b_pth = '../.data/iwslt/ar-en/no-diacritics/train.ar-en.{}'
#m_pth = './.data/iwslt/ar-en/train.{}'

# Arabic
b_files = [b_pth.format('ar')]
m_files = [] #no arabic
Arabic = train_BPE_model(b_files, m_files, 'arabic', vocab_size=10000, savedir='../.data/bpe_models/')


b_pth = '../.data/iwslt/ar-en/train.ar-en.{}'
# English
b_files = [b_pth.format('en') ]
m_files = []
ar_English = train_BPE_model(b_files, m_files, 'ar_english', vocab_size=10000, savedir='../.data/bpe_models/')


../.data/iwslt/ar-en/no-diacritics/train.ar-en.ar
starting to train 
finished training, trying to load
../.data/iwslt/ar-en/train.ar-en.en
starting to train 
finished training, trying to load


In [8]:
b_pth = '../.data/iwslt/ar-en/train.ar-en.{}'
m_pth = '../.data/iwslt/ar-en/train.{}'

print('Arabic')
CollectStatistics(b_pth.format('ar'), Arabic)
print('English')
CollectStatistics(b_pth.format('en'), ar_English)

Arabic
['ديفيد', 'جالو:', 'هذا', 'بيل', 'لينج.', 'وأنا', 'ديفيد', 'جالو.']
['▁ديفيد', '▁جال', 'و', ':', '▁هذا', '▁بيل', '▁لين', 'ج', '.', '▁و', 'أ', 'نا', '▁ديفيد', '▁جال', 'و', '.']
['وسنقوم', 'بإخباركم', 'ببعض', 'القصص', 'من', 'البحر', 'هُنا', 'في', 'الفيديو.']
['▁وسن', 'قوم', '▁ب', 'إ', 'خ', 'بار', 'كم', '▁ببعض', '▁القصص', '▁من', '▁البحر', '▁ه', 'ُ', 'نا', '▁في', '▁الفيديو', '.']
['لدينا', 'بعض', 'مقاطع', 'فيديو', 'تيتانيك', 'التي', 'لا', 'تصدق', 'ولم', 'يرها', 'أحد', 'إطلاقاً,', 'ونحن', 'لن', 'نستعرض', 'لكم', 'أي', 'منها']
['▁لدينا', '▁بعض', '▁مقا', 'طع', '▁فيديو', '▁ت', 'يت', 'اني', 'ك', '▁التي', '▁لا', '▁تصدق', '▁ولم', '▁ير', 'ها', '▁', 'أ', 'حد', '▁', 'إ', 'طلا', 'قا', 'ً', ',', '▁ونحن', '▁لن', '▁نست', 'عرض', '▁لكم', '▁', 'أ', 'ي', '▁منها']
['وحقيقة', 'الأمر', 'هو', 'أن', 'تيتانيك', '--', 'رغم', 'انه', 'كسر', 'كل', 'أنواع', 'سجلات', 'شباك', 'التذاكر', '--', 'لكنه', 'ليس', 'أكثر', 'إثارة', 'من', 'قصص', 'البحر.']
['▁وح', 'قيق', 'ة', '▁ال', 'أ', 'مر', '▁هو', '▁', 'أ', 'ن', '▁ت', 'ي

(array([ 9, 14, 24, ...,  9,  2,  1]), array([20, 17, 33, ..., 10,  7,  4]))

In [9]:
# because... torch text doesn't combine validation data nicely you have to do it your self -,-
def write_from_file_to_other(filepth, target_file):
    with open(filepth, 'r', encoding='utf-8') as file:
        for l in file:
            if len(l.strip()) == 0:
                continue
            else:
                target_file.write(l.strip() + '\n')
def merge_iwslt_bitext(pth, src, trg, write_pth):
    #get the files
    entries = [p for p in glob.glob(pth) if '.xml' not in p]
    entries = set([e[:-3] for e in entries])
    
    src_file = open(write_pth + '.' + src, 'w', encoding='utf-8')
    trg_file = open(write_pth + '.' + trg, 'w', encoding='utf-8')
    for e in entries:
        print(e + '.' + src)
        write_from_file_to_other(e + '.' + src, src_file)
        write_from_file_to_other(e + '.' + trg, trg_file)
                
    src_file.close()
    trg_file.close()

In [29]:
merge_iwslt_bitext('../.data/iwslt/de-en/IWSLT16.TED*.dev*.de-en.*', 'de', 'en', '../.data/iwslt/de-en/val.de-en')

../.data/iwslt/de-en/IWSLT16.TEDX.dev2012.de-en.de
../.data/iwslt/de-en/IWSLT16.TED.dev2010.de-en.de


In [30]:
merge_iwslt_bitext('../.data/iwslt/de-en/IWSLT16.TED*.tst*.de-en.*', 'de', 'en', '../.data/iwslt/de-en/test.de-en')

../.data/iwslt/de-en/IWSLT16.TED.tst2014.de-en.de
../.data/iwslt/de-en/IWSLT16.TED.tst2012.de-en.de
../.data/iwslt/de-en/IWSLT16.TED.tst2011.de-en.de
../.data/iwslt/de-en/IWSLT16.TEDX.tst2013.de-en.de
../.data/iwslt/de-en/IWSLT16.TED.tst2010.de-en.de
../.data/iwslt/de-en/IWSLT16.TEDX.tst2014.de-en.de
../.data/iwslt/de-en/IWSLT16.TED.tst2013.de-en.de


In [6]:
merge_iwslt_bitext('../.data/iwslt/en-de/IWSLT16.TED*.dev*.en-de.*', \
                   src='en', trg='de', write_pth='../.data/iwslt/en-de/val.en-de')

../.data/iwslt/en-de/IWSLT16.TED.dev2010.en-de.en


In [7]:
merge_iwslt_bitext('../.data/iwslt/en-de/IWSLT16.TED*.tst*.en-de.*', \
                   src='en', trg='de', write_pth='../.data/iwslt/en-de/test.en-de')

../.data/iwslt/en-de/IWSLT16.TED.tst2014.en-de.en
../.data/iwslt/en-de/IWSLT16.TED.tst2010.en-de.en
../.data/iwslt/en-de/IWSLT16.TED.tst2011.en-de.en
../.data/iwslt/en-de/IWSLT16.TED.tst2012.en-de.en
../.data/iwslt/en-de/IWSLT16.TED.tst2013.en-de.en


In [13]:
merge_iwslt_bitext('../.data/iwslt/en-ar/IWSLT16.TED*.dev*.en-ar.*', \
                   src='en', trg='ar', write_pth='../.data/iwslt/en-ar/val.en-ar')

merge_iwslt_bitext('../.data/iwslt/en-ar/IWSLT16.TED*.tst*.en-ar.*', \
                   src='en', trg='ar', write_pth='../.data/iwslt/en-ar/test.en-ar')

../.data/iwslt/en-ar/IWSLT16.TED.dev2010.en-ar.en
../.data/iwslt/en-ar/IWSLT16.TED.tst2010.en-ar.en
../.data/iwslt/en-ar/IWSLT16.TED.tst2012.en-ar.en
../.data/iwslt/en-ar/IWSLT16.TED.tst2011.en-ar.en
../.data/iwslt/en-ar/IWSLT16.TED.tst2013.en-ar.en
../.data/iwslt/en-ar/IWSLT16.TED.tst2014.en-ar.en


In [14]:
merge_iwslt_bitext('../.data/iwslt/ar-en/IWSLT16.TED*.dev*.ar-en.*', \
                   src='ar', trg='en', write_pth='../.data/iwslt/ar-en/val.ar-en')

merge_iwslt_bitext('../.data/iwslt/ar-en/IWSLT16.TED*.tst*.ar-en.*', \
                   src='ar', trg='en', write_pth='../.data/iwslt/ar-en/test.ar-en')

../.data/iwslt/ar-en/IWSLT16.TED.dev2010.ar-en.ar
../.data/iwslt/ar-en/IWSLT16.TED.tst2012.ar-en.ar
../.data/iwslt/ar-en/IWSLT16.TED.tst2013.ar-en.ar
../.data/iwslt/ar-en/IWSLT16.TED.tst2014.ar-en.ar
../.data/iwslt/ar-en/IWSLT16.TED.tst2010.ar-en.ar
../.data/iwslt/ar-en/IWSLT16.TED.tst2011.ar-en.ar
