In [5]:
import sys
sys.path.append('/scratch/anaconda3/lib/python3.6/site-packages')

In [52]:
import spacy
import torch
from torchtext.data import Field, BucketIterator, TabularDataset
import torchtext.data as data
from gensim.utils import tokenize
import pandas as pd
import googletrans
import glob
from xml.etree.ElementTree import parse

In [11]:
#just some debugging stuff to check tmx
tree = parse('./mono_hi-ne/Nepali_biblecorpus.xml')
root = tree.getroot()[1] #This gets the body of the tmx file


In [29]:
print(len(root))
print(root[0].attrib) # body of the xml file
print(root[0][0].attrib) #Book
print(root[0][0][0].attrib) #chapter
print(root[0][0][0][0].text.strip()) #actual verse

1
{'id': 'Bible', 'lang': 'np'}
{'id': 'b.GEN', 'type': 'book'}
{'id': 'b.GEN.1', 'type': 'chapter'}
आरम्भमा परमेश्वरले आकाश र पृथ्वी सृष्टि गर्नु भयो।


In [82]:

def bibleXMLtoTxt(path):
    tree = parse(path)
    body = tree.getroot()[1][0] #should get you body of da tree
    sentences = []
    for book in body:
        for chapter in book:
            for seg in chapter:
                text = seg.text
                if text is None:
                    continue
                sentences.append(text.strip())
    return sentences

def TMXtoTxt(path, L1, L2):
    tree = parse(path) #you can parse TMX as an xml file
    root = tree.getroot()[1] #should get body of tmx (where pairs are)
    
    data = {L1: [None] * len(root), L2: [None] * len(root)}
    kept = 0
    for i , child in enumerate(root):
        l1 = list(child[0].attrib.keys())[0]
        l1 = child[0].attrib[l1]
        l2 = list(child[1].attrib.keys())[0]
        l2 = child[1].attrib[l2]
        
        assert l1 == L1 and l2 == L2, "seems there is a misalignment of language {} != {} or {} != {}".format(l1, L1, l2, L2)
        if len(child[0][0].text) > 0 and len(child[1][0].text) > 0:
            data[l1][i] = child[0][0].text
            data[l2][i] = child[1][0].text
            kept += 1
    print("number kept {} / {}".format(kept, i))
    
    assert len(data[l1]) == len(data[l2]), "You don't have equal pairs"
    #something like this
    return data    

def checkTSV(path):
    with open(path, encoding='utf-8') as f:
        i = 0
        for l in f:
            pair = l.strip().split('\t')
            if len(pair) > 2 or len(pair) == 1:
                print('\n')
                print(l.strip())
                print(i, pair)
            i += 1
    print(i)
    
def recoverEntry(entries):
    #here are the 2 patterns i've seen europarl (do not use this as a general solution)
    if len(entries) == 3 and len(entries[1]) == 0: #this means we had 2 entries in between each entry
        return [entries[0], entries[-1]]
    elif len(entries) == 3 and (entries[0].strip() in entries[2]): #i have seen this...once, very skeptical if this will work
        return [entries[0] + entries[1], entries[2]]
    elif len(entries) == 3 and (entries[0].strip() not in entries[2]):
        return [entries[1], entries[2]] #again 1 bug that happened in es-pt
    elif len(entries) == 4 and entries[0] == entries[2]: #Weird split from numbers
        return [entries[0] + entries[1], entries[2] + entries[3]]
    else:
        print('failed to recover:')
        print(entries)
        return []
    
def cleanTSV(path, l1, l2, thresh=3):
    results = {l1:[], l2: []}
    with open(path, encoding='utf-8') as f:
        i = 0
        for l in f:
            i += 1
            entries = l.strip().split('\t')
            if len(entries) > 2:
                entries = recoverEntry(entries)
            if len(entries) <= 1:
                continue #there's no pair so just ignore it
            non_empty = len(entries[0]) > 0 and len(entries[1]) > 0
            long_enough = len(entries[0].strip()) > thresh and len(entries[0].strip()) > thresh 
            if len(entries) == 2 and non_empty and long_enough :
                results[l1].append(entries[0])
                results[l2].append(entries[1])
    assert len(results[l1]) == len(results[l2]), "You don't have an equal number of sentences"
    print('Entries kept {} / {}'.format(len(results[l1]), i))
    return results

def merge_all_data(dicts, l1, l2):
    all_data = {l1: [], l2: []}
    for d in dicts:
        all_data[l1] =  all_data[l1] + d[l1].copy()
        all_data[l2] =  all_data[l2] + d[l2].copy()
        assert len(all_data[l1]) == len(all_data[l2]), "ugh oh, unaligned bitext"
    return all_data       

def lookup_words(x, vocab=None):
    if vocab is not None:
        x = [vocab.itos[i] for i in x]
    return [str(t) for t in x]

def buildVocab(sentences, tokenizer,min_freq=1):
    vocab = {}
    total_tokens = 0
    for sent in sentences:
        for t in tokenizer(sent):
            total_tokens += 1
            if t in vocab:
                vocab[t] += 1
            else:
                vocab[t] = 1

    vocab = [k for k in vocab.keys() if vocab[k] > min_freq]
    print('Vocab size {} with min_freq {}'.format(len(vocab), min_freq))
    print('Total tokens counted {}'.format(total_tokens))
    return vocab

def writeSentenceList(sentences, path):
    
    with open(path, encoding='utf-8', mode='w') as f:
        for s in sentences:
            f.write(s + '\n')    

def merge_all(pth, dir, l):
    files = glob.glob(pth)
    out = open(dir + 'all.' + l, mode='w', encoding='utf-8')
    for f in files:
        with open(f, mode='r', encoding='utf-8') as file:
            for l in file:
                out.write(l)
    out.close()

In [51]:
# Hindi - Nepali Cleaning
pth = './mono_hi-ne/'

nepali_bible = bibleXMLtoTxt(pth + 'Nepali_biblecorpus.xml')
print(len(nepali_bible))
writeSentenceList(nepali_bible, pth + 'bible.ne')
hindi_bible = bibleXMLtoTxt( pth + 'Hindi_biblecorpus.xml')
writeSentenceList(hindi_bible, pth + 'bible.hi')
print(len(hindi_bible))
with open(pth + 'hindmonocorp05.plaintext', mode='r', encoding='utf-8') as src, open(pth + 'monocorp05.hi', mode='w', encoding='utf-8') as trg :
    for line in src:
        line = line.split('>')[-1]
        line = line.strip()
        trg.write(line + '\n')


30547
31065


In [87]:
# Hindi - Nepali Merge
pth = './mono_hi-ne/'
merge_all(pth + '*.ne*', pth, 'ne')

In [84]:
#Czech - polish translation data
pth='./mono_cs-pl/'

files = glob.glob(pth + '*.*')
for f in files:
    name = f.split('/')[-1]
    print(name, ext)
    with open(f, mode='r', encoding='utf-8') as f:
        out = open(pth + 'clean-' + name, mode='w', encoding='utf-8')
        for l in f:
            l = l.strip()
            if len(l) > 4:
                out.write(l + '\n')
        out.close()
    
    

europarl-v9.cs cs
news.2018.pl.shuffled.deduped cs
europarl-v9.pl cs
news-commentary-v14.cs cs


In [86]:
#Czech - polish translation data merge monolingual data together
pth='./mono_cs-pl/'
merge_all(pth + 'clean-*cs*', pth, 'cs')
merge_all(pth + 'clean-*pl*', pth, 'pl')

In [92]:
#Spanih Portugese translation
pth = './mono_es-pt/'

files = glob.glob(pth + '*.*')
files = [f for f in files if '_crawl.sh' not in f]
for f in files:
    name = f.split('/')[-1]
    print(name)
    with open(f, mode='r', encoding='utf-8') as f:
        out = open(pth + 'clean-' + name, mode='w', encoding='utf-8')
        for l in f:
            l = l.strip()
            if len(l) > 4:
                out.write(l + '\n')
        out.close()

['./mono_es-pt/news.2012.es.shuffled.deduped', './mono_es-pt/news.2016.pt.shuffled.deduped', './mono_es-pt/news.2010.es.shuffled.deduped', './mono_es-pt/news.2011.pt.shuffled.deduped', './mono_es-pt/news.2018.pt.shuffled.deduped', './mono_es-pt/news-commentary-v14.es', './mono_es-pt/news.2011.es.shuffled.deduped', './mono_es-pt/news.2012.pt.shuffled.deduped', './mono_es-pt/news.2008.es.shuffled.deduped', './mono_es-pt/news.2017.pt.shuffled.deduped', './mono_es-pt/news.2014.pt.shuffled.deduped', './mono_es-pt/news.2009.pt.shuffled.deduped', './mono_es-pt/news.2014.es.shuffled.deduped', './mono_es-pt/news.2015.es.shuffled.deduped', './mono_es-pt/news.2007.es.shuffled.deduped', './mono_es-pt/europarl-v9.es', './mono_es-pt/news.2013.pt.shuffled.deduped', './mono_es-pt/news.2016.es.shuffled.deduped', './mono_es-pt/news.2009.es.shuffled.deduped', './mono_es-pt/news.2008.pt.shuffled.deduped', './mono_es-pt/news-commentary-v14.pt', './mono_es-pt/news.2018.es.shuffled.deduped', './mono_es-pt/eu

In [93]:
#Spanih Portugese merge
pth = './mono_es-pt/'
merge_all(pth + 'clean-*.es*', pth, 'es')
merge_all(pth + 'clean-*.pt*', pth, 'pt')

In [200]:
#Create the Hindi Nepali dataset
#slightly different from all the other ones but not by much 
def getDataFromFile(pth, name, l1, l2):
    l1_l2_dict = {l1:[], l2: []}
    with open(pth + name + '.' + l1, encoding='utf-8') as f:
        l1_l2_dict[l1] = list(f.read().split('\n'))
    with open(pth + name + '.' + l2, encoding='utf-8') as f:
        l1_l2_dict[l2] = list(f.read().split('\n'))
    assert len(l1_l2_dict[l1]) == len(l1_l2_dict[l2]), "ugh oh, unaligned"
    print(len(l1_l2_dict[l1]))
    return l1_l2_dict
pth = './hi-ne/TrainDevSimilar/'

#train set
he_ne_train = getDataFromFile(pth, 'train', 'hi', 'ne')
pd.DataFrame.from_dict(he_ne_train).to_csv(pth + 'clean-train-hi-ne.tsv',sep='\t',index=False)
#dev set 
he_ne_dev = getDataFromFile(pth, 'dev2019', 'hi', 'ne')
pd.DataFrame.from_dict(he_ne_dev).to_csv(pth + 'clean-dev-hi-ne.tsv',sep='\t',index=False)

65506
3001
