## Imports & logging

In [1]:
import gensim
import gensim.downloader as api
import logging
from rdflib import Graph, Namespace, Literal
import tempfile
import numpy as np
import pandas as pd
import random

from rdflib import URIRef
from rdflib.namespace import RDF
from rdflib.namespace import RDFS
from rdflib.namespace import SKOS
from tqdm import tqdm
from TqdmToLogger import TqdmToLogger

#Root logger prints timestamp, level and message; INFO is the default verbosity
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger()

#Enable DataFrame.progress_apply, which the functions below rely on
tqdm.pandas()
#presumably a file-like adapter that forwards tqdm output to the logger - verify against TqdmToLogger
tqdm_out = TqdmToLogger(logger, level=logging.INFO)

#TODO: use numpy for random stuff?
#https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.random.RandomState.html
#NOTE(review): this seeds only Python's random module; DataFrame.sample presumably draws
#from NumPy's generator and would not be made reproducible by this call - confirm
random.seed(0)

In [None]:
logger.setLevel(logging.INFO)

In [None]:
logger.setLevel(logging.DEBUG)

## Load dictionary

In [None]:
#Load the 'word2vec-google-news-300' vectors through the gensim downloader
gn = api.load('word2vec-google-news-300')

In [2]:
#Load the 'glove-wiki-gigaword-100' vectors (the log below reports a (400000, 100) matrix)
gw1 = api.load("glove-wiki-gigaword-100")

2020-01-23 14:29:03,371 : INFO : loading projection weights from C:\Users\Peter/gensim-data\glove-wiki-gigaword-100\glove-wiki-gigaword-100.gz
2020-01-23 14:30:13,724 : INFO : loaded (400000, 100) matrix from C:\Users\Peter/gensim-data\glove-wiki-gigaword-100\glove-wiki-gigaword-100.gz


In [None]:
#Load the 'glove-wiki-gigaword-300' vectors through the gensim downloader
gw3 = api.load("glove-wiki-gigaword-300")

In [3]:
#Select the dictionary used by all functions below (they read the global wv)
wv = gw1

## Load graph

### KBpedia

In [123]:
#Parse the KBpedia reference concepts (N3) from a machine-specific absolute path
kb = Graph()
kb.parse("C:/Users/Peter/Uni/MA/KBpedia/kbpedia_reference_concepts.n3", format="n3")
logger.info(f"Loaded graph with {len(kb)} triples")

2020-01-23 15:06:31,803 : INFO : Loaded graph with 730673 triples


In [124]:
#Make KBpedia the active graph; g is whichever selection cell was executed last
g = kb

### DBpedia disjointDomain

In [108]:
#Parse the DBpedia disjointDomain triples (Turtle) from a machine-specific absolute path
db = Graph()
db.parse("C:/Users/Peter/Uni/MA/DBpedia/mappingbased-objects_lang=en_disjointDomain.ttl", format="ttl")
logger.info(f"Loaded graph with {len(db)} triples")

2020-01-23 14:57:50,384 : INFO : Loaded graph with 15323 triples


In [110]:
#Make the DBpedia disjointDomain graph the active graph
g=db

### DBpedia disjointRange

In [109]:
#Parse the DBpedia disjointRange triples (Turtle) from a machine-specific absolute path
db2 = Graph()
db2.parse("C:/Users/Peter/Uni/MA/DBpedia/mappingbased-objects_lang=en_disjointRange.ttl", format="ttl")
logger.info(f"Loaded graph with {len(db2)} triples")

2020-01-23 14:57:53,742 : INFO : Loaded graph with 17461 triples


In [9]:
#Make the DBpedia disjointRange graph the active graph
g=db2

### Own KG

In [70]:
#Parse the small demo graph (Turtle) from the working directory
tg = Graph()
tg.parse("KGDemo.ttl", format="ttl")
logger.info(f"Loaded graph with {len(tg)} triples")

2020-01-23 14:53:31,540 : INFO : Loaded graph with 44 triples


In [71]:
#Make the demo graph the active graph
g = tg

## Currently unused

## Check dictionary quality

In [None]:
#Compare all dictionaries: derive the capital relation from tokyo/japan and
#check how well it transfers to paris/france in each dictionary.
#Requires gn, gw1 and gw3 to be loaded.
dicts = {"gn": gn, "gw1": gw1, "gw3": gw3}

for i in dicts:
    dic = dicts[i]
    
    #Minus means arrow from R to L
    hasCapital = dic["tokyo"] - dic["japan"]
    isCapitalOf = dic["japan"] - dic["tokyo"]
    
    #Calculate results of relation: apply the learned offsets to the other pair
    est_france = dic["paris"] + isCapitalOf
    est_paris = dic["france"] + hasCapital
    
    #Print cosine similarities; the last line is the baseline without any relation applied
    print(f"Dictionary: {i}")
    print(f"est_France to France: {dic.cosine_similarities(est_france, [dic['france']])}")
    print(f"est_Paris to Paris: {dic.cosine_similarities(est_paris, [dic['paris']])}")
    print(f"est_France to Paris: {dic.cosine_similarities(est_france, [dic['paris']])}")
    print(f"est_Paris to France: {dic.cosine_similarities(est_paris, [dic['france']])}")
    print(f"Paris to France: {dic.cosine_similarities(dic['paris'], [dic['france']])}")
    print(f"\n")


## Inversion test

In [None]:
#Cosine similarity between the vectors of 'white' and 'black'
print(f"Similarity: {wv.cosine_similarities(wv['white'], [wv['black']])[0]}\n")

#List the entries most similar to the negated 'white' vector; d is a (word, score) pair
print("Inverted node 'white':")
for d in wv.similar_by_vector(vector = -1*wv['white']):
    print(f"Distance of '{d[0]}' to !white: {d[1]}")

## Helper functions

In [94]:
def getPreferredTitle(n, lang="en"):
    """Return a lower-cased, human-readable title for graph node ``n``.

    The preferred label of ``n`` for language ``lang`` is looked up in the
    module-level graph ``g``. If no label is returned, the title is derived
    from the text after the last '/' of ``n``: underscores become spaces and
    commas are removed.
    """
    labels = g.preferredLabel(n, lang=lang)

    #Without any label fall back to the last path segment of the identifier
    if labels == []:
        localName = n.rsplit('/', 1)[-1]
        return localName.replace('_', ' ').replace(',', '').lower() #TODO: replace "()"?

    #Entries are indexed as [0][1] to reach the label of the first hit
    firstLabel = labels[0][1]
    return firstLabel.value.lower()

#Returns [vec, isMultipart, multipart-matched-%]
def toVector(n):
    """Look up the word vector for node ``n`` in the global dictionary ``wv``.

    The title from getPreferredTitle(n) is used as the key. A title without
    spaces yields ``[vector, False, False]``, or ``[None, False, False]`` when
    the word is unknown. A title with spaces is split on single spaces and the
    mean of all known word vectors is returned as
    ``[mean, True, known_words/all_words]``; if no word is known the result
    is ``[None, True, 0]``.
    """
    title = getPreferredTitle(n)

    #Single word: one direct lookup
    if " " not in title:
        try:
            return [wv[title], False, False]
        except KeyError:
            return [None, False, False]

    #Multiple words: collect the vectors of every word the dictionary knows
    words = title.split(" ")
    found = []
    for word in words:
        try:
            found.append(wv[word])
        except KeyError:
            continue

    if not found:
        return [None, True, 0]

    return [sum(found)/len(found), True, len(found)/len(words)]

#Methods to ambiguify nodes and relations
def select(inp, obj=None):
    """Draw a random sample of rows from ``inp``, with replacement.

    Parameters:
        inp: pandas object providing ``sample`` (the node or relation table).
        obj: dict with exactly one entry, either ``{'perc': fraction}`` or
             ``{'num': count}``. As before, any other single key is treated
             like ``'num'``.

    Returns:
        The result of ``inp.sample(..., replace=True)``; rows may repeat.

    Raises:
        TypeError: if ``obj`` is missing, empty or holds more than one entry.
    """
    #An empty dict used to fall through the loop and silently return None
    if obj is None or len(obj) != 1:
        raise TypeError("Please give exactly one of percentage or number")

    (mode, amount), = obj.items()
    if mode == 'perc':
        return inp.sample(frac=amount, replace=True)
    return inp.sample(n=amount, replace=True)

def ambiguify(config, nodeVectors, relationVectors):
    """Create replacement candidates for nodes/relations as configured.

    Parameters:
        config: dict keyed by target ("nodes" or "relations"), then by method
                name (a key of the global ``methods[target]``), holding a list
                of instances. Each instance has an 'amount' dict that is passed
                to select() and an optional 'param' dict that is passed to the
                method.
        nodeVectors: table sampled for the "nodes" target.
        relationVectors: table sampled for every other target.

    Returns:
        DataFrame with the columns 'method', 'config', 'source_type', 'source'
        and 'target', one row per sampled element. It is empty and has no
        columns when nothing was configured.
    """
    frames = []
    for target in config: #can be "nodes" or "relations"
        inp = nodeVectors if target == 'nodes' else relationVectors
        sourceColumn = target[0:-1] #strip the plural "s": "nodes" -> "node"

        for method in config[target]: #matches the name of the method
            for instance in config[target][method]: #once for every instance of the method config
                #Runs once per entry of 'amount' (select() accepts exactly one),
                #so an empty 'amount' is skipped as before
                for _ in instance['amount']:
                    logger.info(f"Ambiguifying {target} with {method} (parameters: {instance})")

                    selres = select(inp, instance['amount'])
                    conf = instance.get('param', None)
                    rep = pd.DataFrame()

                    #Each method gets copies so it may add helper columns freely
                    rep[['method', 'config', 'source_type', 'source', 'target']] = selres.progress_apply(lambda sel: pd.Series([
                        method,
                        str(instance),
                        sourceColumn,
                        sel[sourceColumn],
                        methods[target][method](sel.copy(), inp.copy(), conf)[sourceColumn].iloc[0]
                    ]), axis=1)

                    frames.append(rep)

    #DataFrame.append was removed in pandas 2.0; concatenate once at the end instead
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

#Modify triple and save as new
def modTriple(row, g2, useObject = False, retry=False):
    """Copy one random triple containing ``row['source']`` with it replaced by ``row['target']``.

    Candidate triples are taken from the global ``fullVectors``. For
    ``row['source_type'] == 'relation'`` the source is matched against the
    predicate column; otherwise against the object column when ``useObject``
    is true, else the subject column. If nothing matches, the call is repeated
    once with ``useObject`` flipped.

    The new triple is added to ``g2`` and returned as an (s, p, o) tuple.
    If no triple is found after the retry an error is logged and None is
    returned.
    """
    isRelation = row['source_type'] == 'relation'

    #Decide which position of the triple is being replaced
    if isRelation:
        logger.debug("Replacing r")
        column = 'p'
    elif useObject:
        logger.debug("Replacing o")
        column = 'o'
    else:
        logger.debug("Replacing s")
        column = 's'

    candidates = fullVectors[fullVectors[column] == row['source']]

    if len(candidates) == 0:
        if retry:
            logger.error("Could not find original triple!")
            return None
        return modTriple(row, g2, useObject= not useObject, retry=True)

    #Take one matching triple and swap in the target at the chosen position
    picked = candidates.sample(n=1)
    s = picked['s'].iloc[0]
    p = picked['p'].iloc[0]
    o = picked['o'].iloc[0]
    if isRelation:
        p = row['target']
    elif useObject:
        o = row['target']
    else:
        s = row['target']

    #add row to graph with changed content
    triple = (s, p, o)
    g2.add(triple)
    logger.debug(f"{triple}")
    return triple
        
def populateAdditions(res, g2):
    """Create one modified triple per row of ``res`` and add it to ``g2``.

    Parameters:
        res: table as produced by ambiguify(); every row is handed to
             modTriple(), with the subject/object choice made by a coin flip.
        g2: graph that receives the new triples.

    Returns:
        DataFrame with the columns 's', 'p', 'o' holding the triples that were
        created. Rows for which modTriple() returned None are left out, so the
        result never contains empty rows. It is empty, with columns, when
        ``res`` is empty.
    """
    logger.info(f"Populatig graph")

    #Explicit loop instead of progress_apply: a None result used to become an
    #all-NaN row that breaks the later vectorisation, and an empty res could
    #not be assigned to three columns.
    #NOTE(review): older pandas documented that apply may call the function
    #twice for the first row, which would add an unrecorded triple to g2 - the
    #loop also rules that out
    triples = []
    for _, row in tqdm(res.iterrows(), total=len(res)):
        triple = modTriple(row, g2, useObject=(random.random() >= 0.5))
        if triple is not None:
            triples.append(triple)

    return pd.DataFrame(triples, columns=['s', 'p', 'o'])

#Returns percent of ambiguity in the graph
def calculateAmbiguity(fullVectors, nodeVectors, relationVectors):
    """Condense node and relation quality into one ambiguity score.

    Parameters:
        fullVectors: not used by the calculation.
        nodeVectors: table with the columns 'est_dist' and 'total'.
        relationVectors: table with the columns 'mean_dist', 'total', 'lost'
                         and 'zero_vector'.

    Returns:
        ``1-((1+t)/2)`` where ``t`` is the weighted mean of the relation score
        (weight 2) and the node score (weight 1).
    """
    #Relations: weight each mean_dist by the number of usable triples
    usable = relationVectors['total'] - relationVectors['lost'] - relationVectors['zero_vector']
    relationScore = np.average(relationVectors['mean_dist'], weights=usable)

    #Nodes: weight each est_dist by how often the node occurs
    nodeScore = np.average(nodeVectors['est_dist'], weights=nodeVectors['total'])

    #Combine both scores, the relation score counts twice
    combined = np.average([relationScore, nodeScore], weights=[2, 1])

    #transform -1..1 where 1 is the least ambiguous to 0..1 where 1 is the most ambiguous
    return 1-((1+combined)/2)

## Methods

In [7]:
#Setup methods
def rand(inp, source, conf):
    """Pick one random row of ``source``; ``inp`` and ``conf`` are ignored."""
    picked = source.sample(n=1)
    return picked

#Find result with specific distance
def dist(inp, source, conf):
    """Return the row of ``source`` at a given similarity rank relative to ``inp``.

    Parameters:
        inp: row (Series) with a 'vec' entry; its first entry is the identifier.
        source: table with a 'vec' column whose first column is the identifier.
        conf: None or a dict; ``conf['dist']`` is the rank to return (default
              1). The rank is clamped to ``0 .. len(source)-1``.

    Returns:
        One-row DataFrame with the columns of ``source``. Rows are ranked by
        cosine similarity to ``inp['vec']`` in descending order, and on equal
        similarity the row of ``inp`` itself comes first.

    ``source`` is no longer modified; the helper columns live on a copy.
    """
    rank = 1 if conf is None else conf.get('dist', 1)
    rank = min(max(rank, 0), len(source.index)-1)

    inpVec = list(inp['vec'])
    inpKey = inp.iloc[0] #explicitly positional; inp[0] relied on a deprecated fallback

    #'dist' holds the cosine similarity, so a higher value means closer
    sims = [wv.cosine_similarities(inpVec, [list(vec)])[0] for vec in source['vec']]
    isInp = [key == inpKey for key in source.iloc[:, 0]]

    #use pandas to get top-n, if dist is same move inp to the top
    ranked = source.assign(dist=sims, isInp=isInp).sort_values(by=['dist', 'isInp'], ascending=False)

    logger.debug(f"source:\n{ranked}")

    logger.debug(f"dist: {rank}")
    logger.debug(f"choice:\n{ranked.iloc[[rank]]}")

    return ranked.iloc[[rank]].drop(['dist', 'isInp'], axis=1)

#Find result with specific closeness
def closeness(inp, source, conf):
    """Return the row of ``source`` whose similarity to ``inp`` best matches a target.

    Parameters:
        inp: row (Series) with a 'vec' entry.
        source: table with a 'vec' column.
        conf: None or a dict; ``conf['closeness']`` (default 1) is clamped to
              ``0 .. 2``. The row whose cosine similarity is nearest to
              ``1 - closeness`` is chosen.

    Returns:
        One-row DataFrame with the columns of ``source``; the first row wins
        on ties.

    ``source`` is not modified.
    """
    level = 1 if conf is None else conf.get('closeness', 1)
    level = min(max(level, 0), 2) #0 is equal to the input, 2 is its inverse

    inpVec = list(inp['vec'])
    sims = np.array([wv.cosine_similarities(inpVec, [list(vec)])[0] for vec in source['vec']], dtype=float)

    #idxmin() returned an index label that was then used with iloc, which is
    #only correct for a default index; work with the position throughout
    position = int(np.nanargmin(np.abs(sims + (level-1))))

    return source.iloc[[position]]

#Find result closest to inverse input vector
def negative(inp, source, conf):
    """Delegate to closeness() with a fixed closeness of 2; ``conf`` is ignored."""
    inverseConf = {'closeness': 2}
    return closeness(inp, source, inverseConf)

#Dispatch table used by ambiguify(): methods[target][name] -> function.
#Both targets offer the same methods, each in its own dict.
methods = {
    target: {
        'random': rand,
        'dist': dist,
        'closeness': closeness,
        'negative': negative
    }
    for target in ('nodes', 'relations')
}

## Vectorisation Functions

In [8]:
def convertGraph(g):
    """Copy all triples of graph ``g`` into a DataFrame with the columns 's', 'p', 'o'.

    Progress is reported through tqdm into ``tqdm_out``.
    """
    logger.info(f"Converting graph")

    #len(g) gives the number of triples, no counting pass over the graph needed
    triples = tqdm(g.triples((None, None, None)), total=len(g), file=tqdm_out, mininterval=2)
    fullVectors = [[s, p, o] for s, p, o in triples]

    return pd.DataFrame(data=fullVectors, columns=['s', 'p', 'o'])

def vectorifyGraph(fullVectors):
    """Add word vectors for subject, object and relation to every triple.

    ``fullVectors`` needs the columns 's' and 'o'. It is extended in place
    and also returned. Added columns:
        s_vec, s_is_multipart, s_multipart_%   - result of toVector(s)
        o_vec, o_is_multipart, o_multipart_%   - result of toVector(o)
        r_vec                                  - o_vec - s_vec, None if either is None
        is_zero_vector_relation                - True if r_vec consists of zeros only
    """
    logger.info(f"Vecotrifying graph")

    def relationOf(row):
        #Difference of object and subject vector plus the zero-vector flag
        oVec = row['o_vec']
        sVec = row['s_vec']
        if oVec is None or sVec is None:
            return pd.Series([None, False])
        diff = oVec-sVec
        return pd.Series([diff, np.array_equal(diff, [0]*len(oVec))])

    logger.info('Subject vectors')
    subjectInfo = fullVectors.progress_apply(lambda row: pd.Series(toVector(row['s'])), axis=1)
    fullVectors[['s_vec', 's_is_multipart', 's_multipart_%']] = subjectInfo

    logger.info('Object vectors')
    objectInfo = fullVectors.progress_apply(lambda row: pd.Series(toVector(row['o'])), axis=1)
    fullVectors[['o_vec', 'o_is_multipart', 'o_multipart_%']] = objectInfo

    logger.info('Relation vectors')
    relationInfo = fullVectors.progress_apply(relationOf, axis=1)
    fullVectors[['r_vec', 'is_zero_vector_relation']] = relationInfo

    return fullVectors

def calculateNodeEstimates(fullVectors, relationVectors):
    """Estimate every subject/object vector from its counterpart and the relation vector.

    Parameters:
        fullVectors: triple table with the columns 'p', 's_vec' and 'o_vec'.
        relationVectors: table with the columns 'relation' and 'vec'.

    Returns:
        ``fullVectors`` with four additional object-typed columns:
            s_est      = o_vec - relation vector
            s_est_dist = cosine similarity of s_vec and s_est
            o_est      = s_vec + relation vector
            o_est_dist = cosine similarity of o_vec and o_est
        Entries are None wherever an input is missing.
    """
    logger.info(f"Calculating node estimates")

    #Build the relation -> vector lookup once instead of filtering the whole
    #table for every triple; setdefault keeps the first entry per relation
    relationLookup = {}
    for relation, vec in zip(relationVectors['relation'], relationVectors['vec']):
        relationLookup.setdefault(relation, vec)

    def helper(row, op, same, other):
        #Select the relation vector if there is one
        rVec = relationLookup.get(row['p'])

        #Calculate the estimate
        if row[other] is not None and rVec is not None:
            est = op(row[other], rVec)
        else:
            est = None

        #Calculate the distance
        if est is not None and row[same] is not None:
            dist = wv.cosine_similarities(row[same], [est])[0]
        else:
            dist = None

        return pd.Series([est, dist], dtype='object')

    logger.info(f"Subject estimates")
    fullVectors[['s_est', 's_est_dist']] = fullVectors.progress_apply(helper, args=[np.subtract, 's_vec', 'o_vec'], axis=1)

    logger.info(f"Object estimates")
    fullVectors[['o_est', 'o_est_dist']] = fullVectors.progress_apply(helper, args=[np.add, 'o_vec', 's_vec'], axis=1)

    #Workaround for readability as pandas is equaling NaN and None
    fullVectors = fullVectors.astype({'s_est': 'object', 's_est_dist': 'object', 'o_est': 'object', 'o_est_dist': 'object'})
    fullVectors.loc[fullVectors['s_est'].isna(), 's_est'] = None
    fullVectors.loc[fullVectors['s_est_dist'].isna(), 's_est_dist'] = None
    fullVectors.loc[fullVectors['o_est'].isna(), 'o_est'] = None
    fullVectors.loc[fullVectors['o_est_dist'].isna(), 'o_est_dist'] = None

    return fullVectors

def generateNodeVectors(fullVectors):
    """Collapse the subject and object columns of ``fullVectors`` into one row per node.

    Parameters:
        fullVectors: triple table containing the s_*/o_* columns selected below.

    Returns:
        (nodeVectors, lostNodes)
        nodeVectors: rows that survive dropna(), with the columns node, vec,
                     is_multipart, multipart_%, est, est_dist, total,
                     mean_est_dist, min_est_dist and max_est_dist.
        lostNodes:   rows whose 'vec' is null, without the vector and distance
                     columns.
    """
    logger.info(f"Generating nodeVectors")
    
    #Rename and merge: give subject and object columns the same names and stack them
    logger.debug(f"Renaming and merging")
    subjectVectors = fullVectors[['s', 's_vec', 's_is_multipart', 's_multipart_%', 's_est', 's_est_dist']].rename(columns={'s': 'node',
                                                                                                    's_vec': 'vec',
                                                                                                    's_is_multipart': 'is_multipart',
                                                                                                    's_multipart_%': 'multipart_%',
                                                                                                    's_est': 'est',
                                                                                                    's_est_dist': 'est_dist'})
    objectVectors = fullVectors[['o', 'o_vec', 'o_is_multipart', 'o_multipart_%', 'o_est', 'o_est_dist']].rename(columns={'o': 'node',
                                                                                                   'o_vec': 'vec',
                                                                                                   'o_is_multipart': 'is_multipart',
                                                                                                   'o_multipart_%': 'multipart_%',
                                                                                                   'o_est': 'est',
                                                                                                   'o_est_dist': 'est_dist'})
    nodeVectors = pd.concat([subjectVectors, objectVectors], ignore_index=True)
    
    #Remove duplicates
    logger.debug(f"Grouping")
    nodeGroup = nodeVectors.groupby('node')

    logger.debug(f"Using first for vector")
    nodeVectors = nodeGroup.first().reset_index() #TODO: this is really slow
    
    #The per-group results below are attached by position after reset_index(drop=True)
    #NOTE(review): this presumably relies on every groupby result being ordered like nodeGroup.first() - confirm
    logger.debug(f"Calculating totals")
    nodeVectors['total'] = nodeGroup.size().reset_index(drop=True)
    
    #Replace the estimate taken by first() with the mean of all estimates of the node
    logger.debug(f"Calculating estimates")
    nodeVectors['est'] = nodeGroup['est'].apply(np.mean).reset_index(drop=True)
    
    #Workaround as pandas is equaling NaN and None
    nodeVectors = nodeVectors.astype({'est': 'object', 'vec': 'object'})
    nodeVectors.loc[nodeVectors['est'].isna(), 'est'] = None
    nodeVectors.loc[nodeVectors['vec'].isna(), 'vec'] = None
    
    #Cosine similarity between the mean estimate and the node's own vector
    logger.debug(f"Calculating estimate distances")
    nodeVectors['est_dist'] = nodeVectors.apply(lambda row: wv.cosine_similarities(list(row['est']), [list(row['vec'])])[0] if row['est'] is not None and row['vec'] is not None else None, axis=1)
    
    #These statistics are taken over the per-triple est_dist values of each group
    logger.debug(f"Calculating mean/min/max of distances")
    nodeVectors['mean_est_dist'] = nodeGroup['est_dist'].apply(np.mean).reset_index(drop=True)
    nodeVectors['min_est_dist'] = nodeGroup['est_dist'].apply(np.min).reset_index(drop=True)
    nodeVectors['max_est_dist'] = nodeGroup['est_dist'].apply(np.max).reset_index(drop=True)
    
    #Split into nodeVectors and lostNodes
    #NOTE(review): dropna() has no subset, so a node with a vec but a missing value in any
    #other column is dropped too and ends up in neither result - confirm this is intended
    logger.debug(f"Splitting into nodeVectors and lostNodes")
    lostNodes = nodeVectors[nodeVectors['vec'].isnull()].reset_index(drop=True).drop(columns=['vec', 'est_dist', 'mean_est_dist', 'min_est_dist', 'max_est_dist'])
    nodeVectors = nodeVectors.dropna().reset_index(drop=True)
    logger.info("Done")
    
    return nodeVectors, lostNodes

def generateRelationVectors(fullVectors):
    """Aggregate the per-triple relation vectors into one row per predicate.

    Parameters:
        fullVectors: triple table with the columns 'p', 'r_vec' and
                     'is_zero_vector_relation'.

    Returns:
        (relationVectors, lostRelations)
        relationVectors: relations with a mean vector and at least one usable
                         triple, with the columns relation, vec, total, lost,
                         zero_vector, quality, min_dist, max_dist and mean_dist.
        lostRelations:   all other relations, without 'vec' and 'quality'.
    """
    logger.info(f"Generating relationVectors")
    
    #Mean of r_vec per predicate becomes the relation vector
    logger.debug(f"Grouping")
    relationVectors = fullVectors.groupby('p')['r_vec'].apply(np.mean).reset_index().rename(columns={'p': 'relation', 'r_vec': 'vec'})

    #total = triples per relation, lost = triples without r_vec, zero_vector = flagged triples,
    #quality = share of triples that are neither lost nor zero-vector
    #NOTE(review): the counts are attached by position and presumably rely on identical groupby ordering - confirm
    logger.debug(f"Calculating total, lost, zero_vector and quality")
    relationVectors['total'] = fullVectors.groupby('p')['p'].count().reset_index(drop=True)
    relationVectors['lost'] = fullVectors.groupby('p')['r_vec'].apply(lambda x: x.isnull().sum()).reset_index(drop=True)
    relationVectors['zero_vector'] = fullVectors.groupby('p')['is_zero_vector_relation'].sum().astype(int).reset_index(drop=True)
    relationVectors['quality'] = relationVectors.apply(lambda row: 1-(row['lost']+row['zero_vector'])/row['total'], axis=1)

    #TODO: labels are counted as lost nodes
    #Split into relationVectors and lostRelations
    #A relation is lost if it has no mean vector or no triple that is neither lost nor zero-vector
    logger.debug(f"Splitting into relationVectors and lostRelations")
    lostRelations = relationVectors[np.bitwise_or(
        relationVectors['vec'].isnull(),
        relationVectors['total']-relationVectors['lost']-relationVectors['zero_vector'] == 0
    )].reset_index(drop=True).drop(columns=['vec', 'quality'])
    relationVectors = relationVectors[np.bitwise_and(
        relationVectors['vec'].notnull(),
        relationVectors['total']-relationVectors['lost']-relationVectors['zero_vector'] != 0
    )].reset_index(drop=True)
    
    #Min/max/average distance of every full vector of this relation type to mean vector
    #(the values are cosine similarities)
    logger.debug(f"Calculating min, max and average distances")
    def helper_dist(row):
        vectors = fullVectors[fullVectors['p'] == row['relation']]

        #filter out None and zero-vector
        vectors = vectors[vectors['is_zero_vector_relation'] == False]
        vectors = vectors['r_vec'].dropna().reset_index(drop=True)

        sims = wv.cosine_similarities(row['vec'], list(vectors))
        return [np.min(sims), np.max(sims), np.mean(sims)]

    relationVectors[['min_dist', 'max_dist', 'mean_dist']] = relationVectors.apply(lambda row: pd.Series(helper_dist(row)), axis=1)
    logger.info("Done")
    
    return relationVectors, lostRelations

## Vector generation

In [125]:
convertedGraph = convertGraph(g)

2020-01-23 15:06:31,852 : INFO : Converting graph
2020-01-23 15:06:35,453 : INFO : 0%|          | 0/730673 [00:00<?, ?it/s]
2020-01-23 15:06:38,035 : INFO : 41%|####1     | 302040/730673 [00:02<00:03, 117059.91it/s]
2020-01-23 15:06:40,036 : INFO : 93%|#########3| 680935/730673 [00:04<00:00, 132203.81it/s]
2020-01-23 15:06:40,324 : INFO : 100%|##########| 730673/730673 [00:04<00:00, 150074.74it/s]


In [None]:
fullVectors = vectorifyGraph(convertedGraph)

2020-01-23 15:06:40,933 : INFO : Vecotrifying graph
2020-01-23 15:06:40,935 : INFO : Subject vectors
100%|██████████| 730673/730673 [08:40<00:00, 1403.41it/s] 
2020-01-23 15:15:21,674 : INFO : Object vectors
100%|██████████| 730673/730673 [14:01<00:00, 867.85it/s]   
2020-01-23 15:29:23,775 : INFO : Relation vectors
 21%|██▏       | 156999/730673 [02:05<05:52, 1626.85it/s] 

In [113]:
relationVectors, lostRelations = generateRelationVectors(fullVectors)

2020-01-23 14:58:38,125 : INFO : Generating relationVectors
2020-01-23 14:58:38,525 : INFO : Done


In [114]:
fullVectors = calculateNodeEstimates(fullVectors, relationVectors)

2020-01-23 14:58:38,533 : INFO : Calculating node estimates
2020-01-23 14:58:38,535 : INFO : Subject estimates
100%|██████████| 15323/15323 [00:32<00:00, 465.31it/s]
2020-01-23 14:59:11,476 : INFO : Object estimates
100%|██████████| 15323/15323 [00:32<00:00, 474.24it/s]


In [115]:
nodeVectors, lostNodes = generateNodeVectors(fullVectors)

2020-01-23 14:59:43,835 : INFO : Generating nodeVectors
2020-01-23 15:01:16,794 : INFO : Done


## View calculated values

### Full vectors

In [116]:
print(f"Length: {len(fullVectors)}")
fullVectors.head(2)

Length: 15323


Unnamed: 0,s,p,o,s_vec,s_is_multipart,s_multipart_%,o_vec,o_is_multipart,o_multipart_%,r_vec,is_zero_vector_relation,s_est,s_est_dist,o_est,o_est_dist
0,http://dbpedia.org/resource/San_Dieguito_Academy,http://dbpedia.org/ontology/district,http://dbpedia.org/resource/San_Dieguito_Union...,"[0.7092967, 0.06972667, -0.16728997, 0.1431166...",True,1,"[0.40800336, 0.07011834, -0.06817766, 0.084918...",True,1,"[-0.30129334, 0.00039166957, 0.09911231, -0.05...",False,"[0.5581451, 0.22082241, -0.19708529, 0.0857053...",0.765528,"[0.559155, -0.0809774, -0.03838235, 0.14232962...",0.738248
1,http://dbpedia.org/resource/John_Dickinson_Hig...,http://dbpedia.org/ontology/district,http://dbpedia.org/resource/Red_Clay_Consolida...,"[0.20313275, 0.2372875, -0.23126301, -0.378601...",True,1,"[-0.046342008, 0.2666352, -0.1983188, -0.06838...",True,1,"[-0.24947476, 0.029347703, 0.032944217, 0.3102...",False,"[0.10379972, 0.41733927, -0.3272264, -0.067597...",0.852485,"[0.052991018, 0.086583436, -0.10235539, -0.379...",0.832069


In [117]:
#Show rows where the relation vector was lost
fullVectors[fullVectors['r_vec'].isnull()].head(2)

Unnamed: 0,s,p,o,s_vec,s_is_multipart,s_multipart_%,o_vec,o_is_multipart,o_multipart_%,r_vec,is_zero_vector_relation,s_est,s_est_dist,o_est,o_est_dist
12,http://dbpedia.org/resource/American_Internati...,http://dbpedia.org/ontology/district,http://dbpedia.org/resource/Voluntari,"[0.35125, 0.32171202, 0.23668961, 0.1441488, 0...",True,1.0,,False,False,,False,,,"[0.20110826, 0.17100795, 0.36559725, 0.1433617...",
24,http://dbpedia.org/resource/Central_High_Schoo...,http://dbpedia.org/ontology/district,http://www.capetigers.com,"[0.26820898, 0.1108925, 0.091522515, -0.069246...",True,0.666667,,False,False,,False,,,"[0.11806725, -0.039811574, 0.22043014, -0.0700...",


In [118]:
#Show rows where the relation vector is a zero-vector
fullVectors[fullVectors['is_zero_vector_relation'] == True].head(2)

Unnamed: 0,s,p,o,s_vec,s_is_multipart,s_multipart_%,o_vec,o_is_multipart,o_multipart_%,r_vec,is_zero_vector_relation,s_est,s_est_dist,o_est,o_est_dist
1445,http://dbpedia.org/resource/Sacramento_Norther...,http://dbpedia.org/ontology/typeOfElectrification,http://dbpedia.org/resource/Sacramento_Norther...,"[-0.18791135, -0.34605, 0.3797767, -0.08740999...",True,1.0,"[-0.18791135, -0.34605, 0.3797767, -0.08740999...",True,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,"[0.14952804, -0.32358798, 0.047327846, -0.0316...",0.887354,"[-0.52535075, -0.368512, 0.71222556, -0.143201...",0.792689
1797,http://dbpedia.org/resource/Malmö_Borgarskola,http://dbpedia.org/ontology/district,http://dbpedia.org/resource/Malmö,"[-0.44101, -0.69345, -0.10646, 0.20029, 0.3706...",True,0.5,"[-0.44101, -0.69345, -0.10646, 0.20029, 0.3706...",False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,"[-0.29086828, -0.5427459, -0.23536763, 0.20107...",0.953179,"[-0.5911517, -0.84415406, 0.022447623, 0.19950...",0.963126


### Node Vectors

In [119]:
print(f"Length: {len(nodeVectors)}")
nodeVectors.head(2)

Length: 20896


Unnamed: 0,node,vec,is_multipart,multipart_%,est,est_dist,total,mean_est_dist,min_est_dist,max_est_dist
0,http://dbpedia.org/resource/'Asir_Province,"[-0.038862, -0.39208, 0.57892, 0.070221, 0.742...",True,0.5,"[-0.5358629, -0.9879053, 0.49990124, -0.509186...",0.582015,1,0.582015,0.582015,0.582015
1,http://dbpedia.org/resource/13th_Street_Repert...,"[0.5617967, 0.050450005, 0.026729502, 0.368527...",True,1.0,"[0.5291708, 0.11075393, 0.079255015, 0.0315063...",0.797982,1,0.797982,0.797982,0.797982


In [120]:
print(f"Length: {len(lostNodes)}")
lostNodes.head(23)

Length: 151


Unnamed: 0,node,is_multipart,multipart_%,est,total
0,http://cfsd.chipfalls.k12.wi.us/,False,False,"[0.22591428, 0.13673343, 0.4512101, 0.04979495...",1
1,http://corp.ptv.vic.gov.au/projects/buses/smar...,False,False,,1
2,http://dbpedia.org/resource/25kVAC,False,False,"[-0.4510494, 0.11067899, 0.73149884, 0.6687438...",1
3,http://dbpedia.org/resource/3kVDC,False,False,"[-0.4510494, 0.11067899, 0.73149884, 0.6687438...",1
4,http://dbpedia.org/resource/Artistic-Athévains,False,False,"[1.1289872, -0.20345074, -0.6858917, 1.3344864...",1
5,http://dbpedia.org/resource/Aspren,False,False,"[0.18928304, 0.008905873, -0.10518439, 0.10023...",1
6,http://dbpedia.org/resource/Assicus,False,False,"[0.111453, 0.21223108, -0.21583664, 0.08741467...",1
7,http://dbpedia.org/resource/Asti-Leku_Ikastola,True,0,"[0.50045294, -0.70178473, -0.5126345, -0.37087...",1
8,http://dbpedia.org/resource/Athracht,False,False,"[0.079071015, 0.12579788, -0.17904666, 0.14626...",1
9,http://dbpedia.org/resource/Austromoine,False,False,"[0.19645303, 0.07153387, 0.0042396113, 0.08816...",1


### Relation Vectors

In [121]:
print(f"Length: {len(relationVectors)}")
relationVectors.head()

Length: 14


Unnamed: 0,relation,vec,total,lost,zero_vector,quality,min_dist,max_dist,mean_dist
0,http://dbpedia.org/ontology/architect,"[-0.104597, -0.29680526, -0.33349532, -0.35633...",4,0,0,1.0,0.696418,0.852506,0.800646
1,http://dbpedia.org/ontology/architecturalStyle,"[-0.068231784, 0.22455311, -0.084630154, 0.674...",6,0,0,1.0,0.308163,0.854841,0.60283
2,http://dbpedia.org/ontology/created,"[-0.56977, -0.1169855, 0.25871, -0.69166005, 0...",1,0,0,1.0,1.0,1.0,1.0
3,http://dbpedia.org/ontology/district,"[-0.15014173, -0.15070407, 0.12890762, -0.0007...",8909,68,4,0.991918,-0.307307,0.923979,0.609536
4,http://dbpedia.org/ontology/governingBody,"[0.18558833, -0.01517868, 0.30702534, -0.19375...",1,0,0,1.0,1.0,1.0,1.0


In [None]:
relationVectors.sort_values(by=['mean_dist'], ascending=False)

In [None]:
print(f"Length: {len(lostRelations)}")
lostRelations.head()

## Run workflow

In [None]:
#Simulate natural ambiguity
#Layout as read by ambiguify(): config[target][method] is a list of instances.
#'amount' is passed to select() ({'num': count} or {'perc': fraction}),
#the optional 'param' is passed on to the method.
config = {
    'nodes': {
        'random': [{'amount': {'num': 2}}],
        'dist': [{
                    'amount': {'num': 10},
                     'param': {'dist': 1}
                 }, {
                     'amount': {'num': 2},
                     'param': {'dist': 2}
                 }]
    },
    'relations': {
        'random': [{'amount': {'num': 2}}],
        'dist': [{'amount': {'num': 5},
                 'param': {'dist': 1}}]
    }
}

In [None]:
#Random changes for testing (large: 5000 node and 2500 relation replacements)
config = {
    'nodes': {
        'random': [{'amount': {'num': 5000}}]
    },
    'relations': {
        'random': [{'amount': {'num': 2500}}]
    }
}

In [None]:
#Random changes for testing (small: 5 node and 2 relation replacements)
config = {
    'nodes': {
        'random': [{'amount': {'num': 5}}]
    },
    'relations': {
        'random': [{'amount': {'num': 2}}]
    }
}

In [None]:
#Complete negative for testing: 'perc': 1 samples as many elements as the table has rows
config = {
    'nodes': {
        'negative': [{'amount': {'perc': 1}}]
    },
    'relations': {
        'negative': [{'amount': {'perc': 1}}]
    }
}

In [None]:
#Closeness for testing: 5 replacements each with a closeness parameter of 0.2
config = {
    'nodes': {
        'closeness': [{'amount': {'num': 5},
                 'param': {'closeness': 0.2}}]
    },
    'relations': {
        'closeness': [{'amount': {'num': 5},
                 'param': {'closeness': 0.2}}]
    }
}

In [None]:
#Run imports and define functions, configure the desired result
#Requires the globals g (graph), wv (dictionary) and config to be set

#Load graph and dictionary
#Order matters: relation vectors are needed for the node estimates,
#and the node estimates are needed for the node vectors
convertedGraph = convertGraph(g)
fullVectors = vectorifyGraph(convertedGraph)
relationVectors, lostRelations = generateRelationVectors(fullVectors)
fullVectors = calculateNodeEstimates(fullVectors, relationVectors)
nodeVectors, lostNodes = generateNodeVectors(fullVectors)

#Check outputs before continuing

#Calculate ambiguity before
ambiguityBefore = calculateAmbiguity(fullVectors, nodeVectors, relationVectors)
logger.info(f"Ambiguity before: {ambiguityBefore}")

In [None]:
#The ambiguify-function returns vectors according to configured methods
res = ambiguify(config, nodeVectors, relationVectors)

In [None]:
res.head(10)

In [None]:
#Insert new node into graph based on one random triple containing the source
#Requires res (from ambiguify), fullVectors, ambiguityBefore and g from the cells above
g2 = Graph()
additions = populateAdditions(res, g2)

#Calculate ambiguity after:
newFullVectors = fullVectors.copy()

if len(additions) > 0:
    logger.info(f"Adding {len(additions)} additional triples")
    vectorisedAdditions = vectorifyGraph(additions)
    #DataFrame.append was removed in pandas 2.0, pd.concat is the replacement
    newFullVectors = pd.concat([newFullVectors, vectorisedAdditions], ignore_index=True)

#Recompute everything on the extended triple table
newRelationVectors, newLostRelations = generateRelationVectors(newFullVectors)
newFullVectors = calculateNodeEstimates(newFullVectors, newRelationVectors)
newNodeVectors, newLostNodes = generateNodeVectors(newFullVectors)

#Calculate ambiguity after
ambiguityAfter = calculateAmbiguity(newFullVectors, newNodeVectors, newRelationVectors)
logger.info(f"Ambiguity after: {ambiguityAfter}")

logger.info(f"Ambiguity difference: {ambiguityAfter-ambiguityBefore}")



#Save additions from nodes and relations
#Let rdflib write the file itself: no handle can be left open, and it does not
#matter whether serialize() would return bytes or str
logger.info(f"Saving files")
g2.serialize(destination="additions.ttl", format='turtle')

#Save graph with additions
g3 = g+g2
g3.serialize(destination="appendedKG.ttl", format='turtle')
logger.info(f"Done")

# Comments

In [None]:
#TODO: for all appends: 
#https://stackoverflow.com/questions/50501787/python-pandas-user-warning-sorting-because-non-concatenation-axis-is-not-aligne

#TODO: random seeds

#TODO: check for TODOs

### Calculate ambiguity

In [None]:
#Returns percent of ambiguity in the graph
def calculateAmbiguity(fullVectors, nodeVectors, relationVectors):
    """Annotated working copy of calculateAmbiguity with the design notes kept as comments.

    The executable statements are the same as in the definition in the
    "Helper functions" section; running this cell replaces that definition.

    Parameters:
        fullVectors: not used by the active code (only by the commented-out experiments).
        nodeVectors: table with the columns 'est_dist' and 'total'.
        relationVectors: table with the columns 'mean_dist', 'total', 'lost'
                         and 'zero_vector'.

    Returns:
        ``1-((1+t)/2)`` where ``t`` is the average of the relation score
        (weight 2) and the node score (weight 1).
    """
    #cFullVectors = fullVectors.copy()
    #cNodeVectors = nodeVectors.copy()
    #cRelationVectors = relationVectors.copy()
    
    #####for relations#####
    
    #group by relation type
        #cross product group with itself
        #apply(..) to calculate distance between both r_vec
        #relation_distance = mean of all distances
    #ambiguity = average weighted by count(all relation_distances)
    #=> ambiguity values from -1..1 where 1 is the least ambiguous

#     def helper1(grp):
#         #returns the relation_distance for each group

#         group = grp.copy().reset_index(drop=False)
#         logger.info(f"Group ({len(group)}):\n{group['p'].iloc[0]}\nColumns ({len(group.columns)}):\n{group.columns}\n")
#         logger.info(f"{group}")
        
#         out = group.apply(helper2, axis=1)
#         logger.info(f"Out:\n{out}\n\n\n\n\n\n\n\n\n")
        
#         return np.mean(out)
    
#     def helper2(row):
#         #returns the relation_distances for one group
#         logger.info(f"Row: {row['p']}")
        
#         r_vecs = cFullVectors[cFullVectors['p'] == row['p']]['r_vec']
#         logger.info(f"r_vecs ({len(r_vecs)})")
        
#         sims = wv.cosine_similarities(row['r_vec'], r_vecs)
        
#         #returns the relation_distance for one row
#         out = np.mean(sims)
        
#         return out
         
        
    #Remove unwanted entries
    #cFullVectors = cFullVectors[cFullVectors['is_zero_vector_relation'] == False].dropna(subset=['r_vec']).reset_index(drop=True)
    
    #Broken because of https://github.com/pandas-dev/pandas/pull/29131
    #relation_distances = fullVectors.groupby('p').apply(helper1)
    
    #Workaround
    #relation_distances = cRelationVectors.apply(lambda row: helper1(fullVectors[fullVectors['p'] == row['relation']]), axis=1)
    
    #print(relation_distances)
    
    
    ###2nd approach
#     print(f"Relation Vectors:\n{cRelationVectors}")
    
#     row = cRelationVectors.iloc[0]
#     print(f"Row: {row}")
#     print(f"Row.vec:\n{row['vec']}")
#     print(f"r_vecs:\n{list(cFullVectors[cFullVectors['p'] == row['relation']]['r_vec'])}")
#     print(f"Result:\n{wv.cosine_similarities(row['vec'], list(cFullVectors[cFullVectors['p'] == row['relation']]['r_vec']))}")
    
    #cRelationVectors['mean_dist'] = cRelationVectors.apply(lambda row: np.mean(wv.cosine_similarities(row['vec'], list(cFullVectors[cFullVectors['p'] == row['relation']]['r_vec']))), axis=1)
    
    #Active code: weight every relation's mean_dist by its number of usable triples
    weights = relationVectors['total'] - relationVectors['lost'] - relationVectors['zero_vector']
    relAmbig = np.average(relationVectors['mean_dist'], weights=weights)
    
    #####for nodes (improved)#####
    
    #for node in nodeVectors:
        #node_estimate = average weighted by relations dist to mean(all connected nodes + their relation to node)
    #ambiguity = average weighted by count of (distance(node_estimate, node))
    #=> ambiguity values from -1..1 where 1 is the least ambiguous
    
    #Active code: weight every node's est_dist by its number of occurrences
    weights = nodeVectors['total']
    nodeAmbig = np.average(nodeVectors['est_dist'], weights=weights)
    
    #bad quality of all nodes
    #-> all relation vectors will have high average dist to mean
    
    #bad quality of all relations
    #-> all relation vectors will have high average dist to mean
    
    #bad quality of some nodes
    #-> node_estimate slightly wrong but better if many nodes
    
    #bad quality of some relations
    #-> node_estimate very slightly wrong but better if many nodes and high relation dist to mean
    
    #perfect quality
    #-> node_estimate = node and ambiguity = 1
    
    #do strongly connected nodes influence the outcome more? (they should)
    #-> yes, they are included in more node_estimates
    
    #node and relation are included in each others calculations equally (=once) and only their means are used?
    #-> yes
    
    
    ##final composition & transformation
    #mean of ambiguities of nodes and vectors, weight by nodes 2:1 relation
    #NOTE(review): as written the relation score gets weight 2 and the node score weight 1 -
    #confirm this matches the intended "2:1"
    totAmbig = np.average([relAmbig, nodeAmbig], weights=[2, 1])
    
    #transform -1..1 where 1 is the least ambiguous to 0..1 where 1 is the most ambiguous
    return 1-((1+totAmbig)/2)

## Tests

In [None]:
#Small test table: 'tokyo' appears twice, once deliberately paired with the vector of 'berlin'
data = []
data += [['tokyo', wv['berlin'], 1]]
data += [['tokyo', wv['tokyo'], 2]]
data += [['vienna', wv['vienna'], 3]]
df2 = pd.DataFrame(data, columns=['node', 'vec', 'num'])

df2.head()

In [None]:
#Same test table as above, rebuilt so the cell can run on its own
data = []
data += [['tokyo', wv['berlin'], 1]]
data += [['tokyo', wv['tokyo'], 2]]
data += [['vienna', wv['vienna'], 3]]
df2 = pd.DataFrame(data, columns=['node', 'vec', 'num'])


#Assign two columns at once from one apply: a list result (from toVector) and a None
df2[['vec2', 'vec3']] = df2.apply(lambda row: pd.Series([toVector(row['node']), None]), axis=1)

df2.head()

In [None]:
#Test mean: group by node and average, then overwrite 'vec' with the
#mean computed on the 'vec' column alone
df2out = df2.groupby('node').apply(np.mean).reset_index()
df2out['vec'] = df2.groupby('node')['vec'].apply(np.mean).reset_index()['vec']
df2out.head()

## NaN vs None problems

In [None]:
#Compare how Python, NumPy and pandas treat NaN versus None
print(f"Python: {float('NaN') is None}")
print(f"Numpy equal(..): {np.equal(float('NaN'), None)}")
print(f"Numpy isnan(..): {np.isnan(float('NaN'))}")
#pd.isnull is applied to both NaN and None here
print(f"Pandas isnan(..): {pd.isnull(float('NaN'))}, {pd.isnull(None)}") #Replace python checks with this