In [3]:
import os
from align_reader import *
import re
from tqdm import tqdm
from collections import Counter
import pickle

In [2]:
alignment_reader = AlignReaderBible()

# excluded from the experiment because they did not contain enough verses
small_languages = ['bre', 'kat', 'kor', 'plt']
# silver standard obviously not of sufficient quality
unreliable_languages = [
    'aze', 'bak', 'bul', 'dan', 'est', 'fin', 'heb', 'hun', 'hye', 'kan', 'kaz',
    'lav', 'mkd', 'nld', 'ron', 'tat', 'tel', 'tgk', 'tur', 'urd', 'uzb',
]

# First three characters of the mapped name of every edition whose own name
# does not start with 'eng', minus the two exclusion lists above.
languages = {
    alignment_reader.lang_prf_map[lang][:3]
    for lang in alignment_reader.all_langs
    if lang[:3] != 'eng'
} - set(small_languages) - set(unreliable_languages)

unimorph_path = '../../unimorph/'

In [7]:
# Read the UniMorph file of every experiment language and collect, per base
# form, how often each variant occurs among the entries tagged ADJ or N.
# Result: lang_pairs[lang][base] is a Counter mapping variant -> occurrences.
pos_tags = set(['ADJ', 'N'])
# A lowercase ASCII letter directly followed by an uppercase one.
regex = re.compile(r'[a-z][A-Z]')
# NOTE(review): `numbers` and `lang_freqs` are not used by any cell visible here.
numbers = re.compile(r'[1-9]')
lang_freqs = Counter()
lang_pairs = {}

for lang in tqdm(os.listdir(unimorph_path)):
    lang_dir = unimorph_path + lang + '/'
    if not os.path.isdir(lang_dir) or lang not in languages:
        continue

    lang_pairs[lang] = {}
    # The data file carries the same name as its directory. The context
    # manager closes it even if parsing fails. UniMorph data is UTF-8, so do
    # not depend on the locale's default encoding.
    with open(lang_dir + lang, encoding='utf-8') as unimorph_file:
        for line in unimorph_file:
            # Skip lines with alternatives ('|'), parentheses or a
            # lower-to-upper case transition.
            if '|' in line or '(' in line or ')' in line or regex.search(line):
                continue

            fields = line.strip().split('\t')
            if len(fields) != 3:
                continue

            # Only the last space-separated token of the first two fields is
            # kept; the tag is the first ';'-separated feature of the third.
            base = fields[0].split(' ')[-1]
            var = fields[1].split(' ')[-1]
            tag = fields[2].split(';')[0]

            if len(var) == 0 or len(base) == 0:
                # Report the malformed entry and skip it; indexing an empty
                # string below would otherwise raise IndexError.
                print(lang, fields)
                continue

            # Drop a single trailing hyphen from either form.
            if base[-1] == '-':
                base = base[:-1]

            if var[-1] == '-':
                var = var[:-1]

            if tag in pos_tags:
                if base not in lang_pairs[lang]:
                    lang_pairs[lang][base] = Counter()
                lang_pairs[lang][base][var] += 1


100%|██████████| 154/154 [00:21<00:00,  7.23it/s]


In [8]:
# Keep only the languages for which at least one base form was collected.
lang_pairs = {code: paradigms for code, paradigms in lang_pairs.items() if paradigms}

In [9]:
def common_prefix(all_strings):
    """Return a root candidate for a set of variants, tolerating one outlier.

    Candidates are the prefix shared by all variants and, for every variant
    that was seen exactly once, the prefix shared by all *other* variants
    (leave-one-out). The greatest candidate under string comparison is
    returned; when candidates are prefixes of one another this is the
    longest one.

    The result does not depend on the iteration order of ``all_strings``:
    every candidate is always collected before the choice is made.

    Args:
        all_strings: mapping (e.g. a Counter) from variant string to the
            number of times it was seen.

    Returns:
        The chosen prefix; '' if the mapping is empty or nothing is shared.
        A mapping with a single variant yields that variant.
    """
    variants = list(all_strings.keys())

    # os.path.commonprefix compares character by character, so it works on
    # arbitrary strings, and it returns '' for an empty list.
    candidates = {os.path.commonprefix(variants)}

    for left_out in variants:
        # Only variants seen exactly once are treated as possible outliers.
        if all_strings[left_out] != 1:
            continue
        remaining = [v for v in variants if v != left_out]
        if remaining:
            candidates.add(os.path.commonprefix(remaining))

    return max(candidates)

In [10]:
# For every language, count the endings that remain once the root has been
# stripped from each variant; '$' marks the end of the word.
ngram_counters = {}
lang_sizes = Counter()
for lang, base_sets in tqdm(lang_pairs.items()):
    suffix_counts = Counter()
    ngram_counters[lang] = suffix_counts
    for base, var_set in base_sets.items():
        if len(var_set) == 1:
            # A single variant gives nothing to compare: use the base itself.
            root = base
        else:
            root_candidate = common_prefix(var_set)
            # Take the longer of base and candidate when one extends the
            # other; skip the base form entirely when they disagree.
            if root_candidate.startswith(base):
                root = root_candidate
            elif base.startswith(root_candidate):
                root = base
            else:
                continue

        for var in var_set:
            # Variants that do not start with the root contribute nothing.
            if var.startswith(root):
                suffix_counts[var[len(root):] + '$'] += 1
    

100%|██████████| 19/19 [00:10<00:00,  1.79it/s]


In [11]:
# Silver standard: per language, the set of endings that were counted more
# than once, leaving out the bare end marker '$'. Languages are inserted in
# sorted order. The endings go into a set, so no ordering by count is needed.
finished_silver_standard = {
    lang: {ngram for ngram, count in ngrams.items() if count > 1 and ngram != '$'}
    for lang, ngrams in sorted(ngram_counters.items())
}

# The context manager flushes and closes the file once the dump is done.
with open('silver_standard.p', 'wb') as silver_file:
    pickle.dump(finished_silver_standard, silver_file)

In [4]:
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
# pickle.load can execute arbitrary code, so only load files from a trusted source.
with open('/mounts/work/weissweiler/deepcase/pickle/bible/' + 'final_nps.p', 'rb') as nps_file:
    final_nps = pickle.load(nps_file)

In [9]:
# Spot check of the loaded structure: look up one entry through three nested keys.
# NOTE(review): presumably verse ID -> edition -> language code; confirm against the code that builds final_nps.
final_nps['42015003']['eng1']['deui']

{(2,), (6,)}