In [1]:
import numpy as np
import json
import pickle
import os

from sklearn.metrics.pairwise import cosine_similarity

import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

In [25]:
# Load the type hierarchy (a dict keyed by type name) from its JSON file.
type_hierarchy_path = os.path.join('data', 'type_hierarchy_features.json')
with open(type_hierarchy_path, 'r') as hierarchy_file:
    type_hierarchy = json.load(hierarchy_file)

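A child type without word2vec vectors of its own inherits the centroid of its closest ancestor that has vectors.
If a whole branch has no vectors at all, every type in that branch is given the overall average centroid instead.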

In [61]:
def get_child_w2v(t, hierarchy):
    '''
    Recursively sums the word2vec vectors of a type and of all its descendants
    and derives a centroid for every type of that subtree.

    Vectors of a type are read from the pickle file data/T_w2v/<t> when that
    file exists (assumes the pickle holds a sequence of 100-dimensional
    vectors -- TODO confirm against the code that writes these files).

    Parameters:
        t: name of the type at the root of the subtree (a key of `hierarchy`).
        hierarchy: dict mapping a type name to a dict with a 'children' list.

    Returns a tuple (w2v, c, centroids, lacking):
        w2v: element-wise sum of all vectors found in the subtree.
        c: number of vectors summed into `w2v`.
        centroids: dict type name -> centroid for every resolved type of the
            subtree. Types without any vector receive the centroid of the
            closest ancestor that has vectors in its subtree.
        lacking: types of the subtree that are still unresolved; it is
            non-empty only when the whole subtree has no vector (c == 0).
    '''
    cl = hierarchy[t]['children']
    w2v = np.zeros(100)
    c=0
    centroids = {}
    lacking = []
    vector_path = os.path.join('data','T_w2v',t)
    if os.path.isfile(vector_path):
        with open(vector_path,'rb') as f:
            fw2v = pickle.load(f)
        c+=len(fw2v)
        # sum the vectors that were just loaded (the original referenced an
        # undefined name here and raised NameError)
        w2v += np.sum(fw2v,axis=0)
    for chld in cl:
        cw2v, cc, dcent, clack = get_child_w2v(chld,hierarchy)
        c+=cc
        w2v += cw2v
        centroids.update(dcent)
        lacking.extend(clack)
    if c!=0:
        centroids[t] = w2v/c
        # descendants without vectors inherit the centroid of this type
        for lch in lacking:
            centroids[lch] = w2v/c
        lacking = []
    else:
        # nothing found in this subtree: let an ancestor resolve it
        lacking.append(t)
    return w2v, c, centroids, lacking

In [63]:
# The siblings of 'Activity' are taken as the first-level types; a centroid is
# built for every type found below them.
first_level_types = type_hierarchy['Activity']['siblings']
centroids = {}
lacking_first_level = []
for t in first_level_types:
    t_w2v, t_c, t_centr, lacking = get_child_w2v(t, type_hierarchy)
    centroids.update(t_centr)
    if t_c == 0:
        # the complete branch has no vector: remember all of its types
        lacking_first_level.extend(lacking)

# Average of all centroids found so far, assigned to the types of the
# branches that have no vector at all.
avg_cent = sum(centroids.values(), np.zeros(100)) / len(centroids)
centroids.update({lft: avg_cent for lft in lacking_first_level})

In [68]:
# Store the per-type centroids on disk.
centroid_path = os.path.join('data', 't_hier_w2v_centr')
with open(centroid_path, 'wb') as centroid_file:
    pickle.dump(centroids, centroid_file)

In [3]:
def jaccard(set1,set2):
    '''
    Jaccard similarity of two sets: size of the intersection divided by the
    size of the union. Returns 0 when set2 is empty.
    '''
    if len(set2) == 0:
        return 0
    shared = set1.intersection(set2)
    combined = set1.union(set2)
    return len(shared)/len(combined)

In [4]:
def check_token(mod,token):
    '''
    Tells whether a token should be discarded.

    Looks `token` up in `mod.vocab` and returns a truthy value when the entry
    is flagged as a stop word (`is_stop`) or as punctuation (`is_punct`).
    If the lookup or the attribute access fails, the token is treated as
    unusable and True is returned.

    Parameters:
        mod: object exposing a `vocab` mapping (the notebook passes `nlp`).
        token: the token text to look up.
    '''
    try:
        lexeme = mod.vocab[token]
        return lexeme.is_stop or lexeme.is_punct
    except Exception:
        # Best effort: a failed lookup means "skip this token". Only ordinary
        # errors are caught so Ctrl-C / SystemExit still stop the run.
        return True

In [5]:
def split_doc(doc,s_len=800000):
    '''
    Computes split offsets that cut a text into chunks of roughly `s_len`
    characters, each cut placed on the first blank space found at or after
    the requested length.

    Parameters:
        doc: the text (str) to split.
        s_len: minimum chunk length in characters; must be positive.

    Returns:
        List of offsets starting with 0 and ending with len(doc); chunk i is
        doc[splits[i]:splits[i+1]].

    Raises:
        ValueError: if `s_len` is not positive (the search for the next cut
        could otherwise never advance).
    '''
    if s_len <= 0:
        raise ValueError('s_len must be a positive number of characters')
    splits = [0]
    while True:
        # first blank at or after the requested chunk length; -1 when the
        # remaining text is shorter than s_len or holds no further blank
        s = doc.find(' ', splits[-1] + s_len)
        if s == -1:
            splits.append(len(doc))
            return splits
        splits.append(s)

In [82]:
# Load the per-type documents (keyed by type name, each with a 'body' text).
t_docs_path = os.path.join('data', 'document_TC_short.json')
with open(t_docs_path, 'r') as t_docs_file:
    t_docs = json.load(t_docs_file)

In [85]:
# Create the NLP object that the cells below use to tokenize the type
# documents and to run the stop-word / punctuation checks of check_token.
# NOTE(review): the earlier comment also mentioned extracting nouns; no
# visible cell does that with `nlp` -- confirm whether it is still intended.
nlp = English()

In [21]:
def write_ch_unigrams(t, hierarchy):
    '''
    Recursively builds, caches and returns the unigram set of a type.

    The unigram set of a type is the set of lower-cased tokens of its own
    document (read from the global `t_docs`, tokenized with the global `nlp`
    and filtered with `check_token`) merged with the unigram sets of all its
    descendants. The set is pickled to data/T_unigrams/<t>; when that file
    already exists it is loaded as-is instead of being recomputed.

    The children are visited in both cases, so the returned `lengths` and
    `lacking` always describe the whole subtree, whether or not the cache
    file was already present.

    Parameters:
        t: name of the type (a key of `hierarchy`).
        hierarchy: dict mapping a type name to a dict with a 'children' list.

    Returns a tuple (t_unigrams, lengths, lacking):
        t_unigrams: set of unigrams of `t` including its descendants.
        lengths: dict type name -> size of its unigram set, for `t` and all
            of its descendants.
        lacking: list of the types of the subtree whose unigram set is empty.
    '''
    cache_path = os.path.join('data','T_unigrams',t)
    cached = os.path.isfile(cache_path)
    if cached:
        with open(cache_path,'rb') as f:
            t_unigrams = pickle.load(f)
    else:
        t_unigrams = set()
        if t in t_docs:
            body = t_docs[t]['body']
            splits = split_doc(body) #splitting text to accommodate nlp model
            for start, end in zip(splits, splits[1:]):
                nlp_text = nlp(body[start:end].lower())
                t_unigrams.update(
                    token.text for token in nlp_text
                    if not check_token(nlp,token.text)
                )

    lengths = {}
    lacking = []
    for chl in hierarchy[t]['children']:
        c_unigrams, c_lengths, c_lacking = write_ch_unigrams(chl,hierarchy)
        if not cached:
            # a cached set was stored with its descendants already merged in
            t_unigrams.update(c_unigrams)
        lengths.update(c_lengths)
        lacking.extend(c_lacking)

    if not cached:
        with open(cache_path,'wb') as f:
            pickle.dump(t_unigrams,f)

    if len(t_unigrams) == 0:
        lacking.append(t)
    lengths[t] = len(t_unigrams)

    return t_unigrams, lengths, lacking

In [22]:
# Build (or load) the unigram sets of every first-level branch and collect
# the set sizes together with the types that ended up with an empty set.
lacking, lengths = [], {}
for t in first_level_types:
    branch_result = write_ch_unigrams(t, type_hierarchy)
    lacking.extend(branch_result[2])
    lengths.update(branch_result[1])


In [154]:
# Store the list of types without unigrams on disk.
lacking_path = os.path.join('data', 'lacking_J_term_types')
with open(lacking_path, 'wb') as lacking_file:
    pickle.dump(lacking, lacking_file)

In [26]:
# Recompute the unigram-set size of every type from the cached files (run
# after the files have been created) and store the resulting dict.
lengths = {}
unigram_dir = os.path.join('data', 'T_unigrams')
for t in type_hierarchy:
    with open(os.path.join(unigram_dir, t), 'rb') as unigram_file:
        t_unigrams = pickle.load(unigram_file)
    lengths[t] = len(t_unigrams)

with open(os.path.join('data', 't_hier_lengths'), 'wb') as lengths_file:
    pickle.dump(lengths, lengths_file)

In [145]:
def _load_query_pickle(name):
    '''Unpickles the file data/<name> and returns the stored object.'''
    with open(os.path.join('data', name), 'rb') as query_file:
        return pickle.load(query_file)


# Query word2vec centroids, one object per data split.
train_w2v_centr = _load_query_pickle('train_query_w2v_centr')
test_w2v_centr = _load_query_pickle('test_query_w2v_centr')
val_w2v_centr = _load_query_pickle('validation_query_w2v_centr')

# Query unigram collections, one object per data split.
train_unigrams = _load_query_pickle('train_query_unigrams')
test_unigrams = _load_query_pickle('test_query_unigrams')
val_unigrams = _load_query_pickle('validation_query_unigrams')

In [167]:
# For every type, compute its features against the queries of each data
# split and cache them in data/T_features/<type>. Types whose feature file
# already exists are skipped.
Q_T_features = {}
with open(os.path.join('data','t_hier_w2v_centr'),'rb') as centroid_file:
    t_wv = pickle.load(centroid_file)

# split name -> (query centroids, query unigram sets)
query_splits = {
    'train': (train_w2v_centr, train_unigrams),
    'test': (test_w2v_centr, test_unigrams),
    'val': (val_w2v_centr, val_unigrams),
}
feature_dir = os.path.join('data','T_features')

for t in type_hierarchy.keys():
    feature_path = os.path.join(feature_dir, t)
    if os.path.isfile(feature_path):
        continue
    print('Starting type  -{}-'.format(t))
    with open(os.path.join('data','T_unigrams',t),'rb') as unigram_file:
        t_unigrams = pickle.load(unigram_file)

    Q_T_features = {}
    for split_name, (split_centr, split_unigrams) in query_splits.items():
        Q_T_features[split_name] = {
            # cosine similarity of each query centroid to the type centroid
            'SIMAGGR': cosine_similarity(split_centr,[t_wv[t]]),
            # Jaccard similarity of each query's unigrams to the type's
            'JTERMS': [jaccard(q_u,t_unigrams) for q_u in split_unigrams],
        }

    with open(feature_path,'wb') as feature_file:
        pickle.dump(Q_T_features,feature_file)

Starting type -Agent-
Starting type -Mill-
Starting type -Painting-
Starting type -OlympicResult-
Starting type -SportsTeamMember-
Starting type -MilitaryUnit-
Starting type -PublicService-
Starting type -AnimangaCharacter-
Starting type -SoccerTournament-
Starting type -Disease-
Starting type -LiteraryGenre-
Starting type -Grape-
Starting type -BodyOfWater-
Starting type -RaceTrack-
Starting type -HistoricBuilding-
Starting type -Monument-
Starting type -Artist-
Starting type -HorseTrainer-
Starting type -Canoeist-
Starting type -GeopoliticalOrganisation-
Starting type -MouseGene-
Starting type -GrossDomesticProductPerCapita-
Starting type -ElectionDiagram-
Starting type -Quote-
Starting type -Orphan-
Starting type -AmericanFootballTeam-
Starting type -StillImage-
Starting type -GaelicGamesPlayer-
Starting type -Novel-
Starting type -Skater-
Starting type -Curler-
Starting type -GovernmentalAdministrativeRegion-
Starting type -Garden-
Starting type -ClassicalMusicArtist-
Starting type

In [33]:
# Gather the cached per-type feature files into one dict keyed by type name.
full_Q_T_features = {}
feature_dir = os.path.join('data','T_features')
for t in type_hierarchy:
    with open(os.path.join(feature_dir, t),'rb') as feature_file:
        full_Q_T_features[t] = pickle.load(feature_file)

In [62]:
# Calculate, for each data split, the average JTERMS vector over all types
# that have unigrams of their own (i.e. that are not listed in `lacking`).
with open(os.path.join('data','lacking_J_term_types'), 'rb') as f:
    lacking = pickle.load(f)

avg_JTERMS = {}

# set copy for fast membership tests; `lacking` itself stays a list because
# later cells iterate over it
lacking_lookup = set(lacking)

# Each type stores one JTERMS value per query of a split, so any type gives
# the vector length. (The original read `t` here before this cell assigned
# it, relying on a value leaked from an earlier cell.)
any_type_features = next(iter(full_Q_T_features.values()))

for ds in ['train','test','val']:
    sum_JTERMS = np.zeros(len(any_type_features[ds]['JTERMS']))
    c_JTERMS = 0
    for t in full_Q_T_features:
        if t not in lacking_lookup:
            sum_JTERMS += np.array(full_Q_T_features[t][ds]['JTERMS'])
            c_JTERMS+=1
    if c_JTERMS == 0:
        # without this guard the division would silently produce NaN values
        raise ValueError('no type with JTERMS values available for split {}'.format(ds))
    avg_JTERMS[ds]=sum_JTERMS/c_JTERMS

In [91]:
def set_JTERMS(t,ds,features,avg_JTERMS,lack_list):
    '''
    Fallback JTERMS values for a type that has none of its own.

    Averages the JTERMS vectors (data split `ds`) of the siblings of `t`, as
    listed in the global `type_hierarchy`, leaving out the siblings contained
    in `lack_list`. When no sibling is left, the global average
    `avg_JTERMS[ds]` is used instead. The result is returned as a list.
    '''
    sibling_types = type_hierarchy[t]['siblings']
    total = np.zeros(len(features[t][ds]['JTERMS']))
    usable_siblings = [sib for sib in sibling_types if sib not in lack_list]
    for sib in usable_siblings:
        total += np.array(features[sib][ds]['JTERMS'])
    if usable_siblings:
        return list(total/len(usable_siblings))
    return list(avg_JTERMS[ds])

In [92]:
# Give every type listed in `lacking` a fallback JTERMS vector on each split.
for ds in ('train','val','test'):
    for t in lacking:
        fallback_JTERMS = set_JTERMS(t,ds,full_Q_T_features,avg_JTERMS,lacking)
        full_Q_T_features[t][ds]['JTERMS'] = fallback_JTERMS

In [93]:
# Store the complete query-type feature dict on disk.
features_path = os.path.join('data', 'Q_T_features')
with open(features_path, 'wb') as features_file:
    pickle.dump(full_Q_T_features, features_file)