In [1]:
import json
import math

In [2]:
# Load the JSON object whose keys are used below as the source strings.
# The context manager closes the file even if json.load raises.
with open('source_target.json') as f:
    data = json.load(f)

In [3]:
# Load the abbreviation table. Each entry is read later as
# abb -> [list of full texts, list of scores].
# The context manager closes the file even if json.load raises.
with open('abb2full.json') as f:
    abb2full = json.load(f)

In [4]:
def isSubSequence(str1, str2):
    """Return True when str1 is a (not necessarily contiguous) subsequence of str2.

    The characters of str1 must appear in str2 in the same order; other
    characters may sit between them. An empty str1 is a subsequence of
    anything.
    """
    # A single shared iterator over str2 means each membership test resumes
    # where the previous one stopped, so matches are forced to be in order.
    remaining = iter(str2)
    return all(ch in remaining for ch in str1)

In [5]:
# The source strings are the keys of the loaded mapping (iterating a dict yields its keys).
source = list(data)

In [6]:
# One independent, initially empty set per n-gram width (1 to 4 characters).
unigram, bigram, trigram, quatergram = set(), set(), set(), set()

In [7]:
# Collect every lower-cased character n-gram (widths 1 to 4) that occurs in any source string.
for text in source:
    # Width 1: the set of distinct lower-cased characters.
    unigram.update(text.lower())
    # Widths 2 to 4: slide a window over the string and lower-case each slice.
    for width, bucket in ((2, bigram), (3, trigram), (4, quatergram)):
        for start in range(len(text) - width + 1):
            bucket.add(text[start:start + width].lower())

In [8]:
# Lookup tables: n-gram -> list of abbreviations that contain it as a subsequence.
unigram2abb = {}
bigram2abb = {}
trigram2abb = {}
quatergram2abb = {}
# All known abbreviations (iterating the dict yields its keys).
abb_list = list(abb2full)

In [9]:
# For every observed character, record the abbreviations whose lower-cased form contains it.
for uni in unigram:
    unigram2abb[uni] = [
        candidate for candidate in abb_list
        if isSubSequence(uni, candidate.lower())
    ]

In [10]:
# For every observed 2-gram, record the abbreviations whose lower-cased form has it as a subsequence.
for bi in bigram:
    bigram2abb[bi] = [
        candidate for candidate in abb_list
        if isSubSequence(bi, candidate.lower())
    ]

In [11]:
# For every observed 3-gram, record the abbreviations whose lower-cased form has it as a subsequence.
for tri in trigram:
    trigram2abb[tri] = [
        candidate for candidate in abb_list
        if isSubSequence(tri, candidate.lower())
    ]

In [12]:
# For every observed 4-gram, record the abbreviations whose lower-cased form has it as a subsequence.
for quater in quatergram:
    quatergram2abb[quater] = [
        candidate for candidate in abb_list
        if isSubSequence(quater, candidate.lower())
    ]

In [13]:
def gram(source, pos, num):
    """Return every length-`num` slice of `source` that covers index `pos`.

    Candidate windows start at pos-num+1 .. pos. Windows that run past
    either end of the string come back shorter than `num` and are dropped.
    """
    windows = (
        source[start:start + num]
        for start in range(pos - num + 1, pos + 1)
    )
    return [window for window in windows if len(window) == num]

def source2gram(source):
    """For each index of `source`, list the 1-, 2-, 3- and 4-grams covering it.

    Returns one entry per character position; each entry is
    [unigrams, bigrams, trigrams, quatergrams] as produced by `gram`.
    """
    return [
        [gram(source, index, width) for width in (1, 2, 3, 4)]
        for index in range(len(source))
    ]

In [14]:
def source2abbs(source):
    """Return, for each character position of `source`, the candidate abbreviations.

    For every position the n-grams covering it are looked up longest-first
    (4-gram, then 3-, 2- and 1-gram). The first width that yields any
    abbreviation wins and shorter widths are not consulted. A position with
    no match at any width contributes an empty list. An abbreviation may
    appear more than once when several windows of the winning width hit it.

    Lookups are case-insensitive: the lookup tables are keyed by lower-cased
    n-grams, so each window is lower-cased before the lookup. Previously a
    window containing an upper-case character could never match.
    """
    # (slot in the per-position gram list, lookup table), longest n-gram first.
    tiers = (
        (3, quatergram2abb),
        (2, trigram2abb),
        (1, bigram2abb),
        (0, unigram2abb),
    )
    abbs = []
    for position_grams in source2gram(source):
        matches = []
        for slot, table in tiers:
            for item in position_grams[slot]:
                # Table keys are lower-cased; normalise the window the same way.
                matches += table.get(item.lower(), [])
            if matches:
                # Prefer the longest context that produced any candidate.
                break
        abbs.append(matches)
    return abbs

In [15]:
def softmax(v_list):
    """Return the softmax of `v_list` as a new list of floats, in the same order.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps math.exp from raising OverflowError
    on large inputs. An empty input yields an empty list.
    """
    values = list(v_list)
    if not values:
        return []
    # Shift so the largest exponent is exp(0) == 1; every other term is <= 1.
    peak = max(values)
    exp_list = [math.exp(v - peak) for v in values]
    sum_exp = sum(exp_list)
    return [e / sum_exp for e in exp_list]

def sortDictByVal(Dict):
    """Return [[key, probability], ...] ordered by descending value.

    Probabilities are the softmax of the dict's values. Softmax preserves
    order, so the i-th largest probability is paired with the key holding
    the i-th largest raw value.
    """
    probabilities = sorted(softmax(list(Dict.values())), reverse=True)
    ranked = sorted(Dict.items(), key=lambda pair: pair[1], reverse=True)
    return [[key, prob] for (key, _), prob in zip(ranked, probabilities)]

def source2full(source):
    """Return, per character position of `source`, ranked full-text candidates.

    Each candidate abbreviation at a position contributes its full texts
    from `abb2full` (entry layout: [full texts, scores]). Every occurrence
    of a full text adds a weight of 1 - score. The accumulated weights are
    then turned into a softmax-ranked [[text, probability], ...] list by
    `sortDictByVal`.
    """
    ranked_per_position = []
    for candidates in source2abbs(source):
        weights = {}
        for abbreviation in candidates:
            entry = abb2full[abbreviation]
            texts, scores = entry[0], entry[1]
            for idx, text in enumerate(texts):
                weights[text] = weights.get(text, 0) + (1 - scores[idx])
        ranked_per_position.append(sortDictByVal(weights))
    return ranked_per_position

In [21]:
from tqdm import tqdm
# Expand every source string into its per-position ranked full-text candidates,
# showing progress with tqdm.
source_full = {s: source2full(s) for s in tqdm(source)}

100%|██████████████████████████████████████████████████████████████████████████████| 860/860 [00:00<00:00, 3253.99it/s]


In [23]:
import json
# Write the results to disk. The context manager flushes and closes the file
# even if json.dump raises, so the output is never left half-open.
with open('source_full.json', 'w') as f:
    json.dump(source_full, f)

In [24]:
# Release the file handle bound to `f`; close() on an already-closed file is a no-op.
f.close()