## Imports & logging

In [None]:
import gensim
import gensim.downloader as api
import logging
from rdflib import Graph, Namespace, Literal
from rdflib.plugins.parsers import trig
import tempfile
import numpy as np
import pandas as pd

from rdflib import URIRef
from rdflib.namespace import RDF
from rdflib.namespace import RDFS
from rdflib.namespace import SKOS
from tqdm import tqdm
from TqdmToLogger import TqdmToLogger
import matplotlib
import matplotlib.pyplot as plt
import json

# Log timestamp, level and message for every record; INFO is the initial verbosity.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Root logger, used by all cells below.
logger = logging.getLogger()

# Enable the progress_apply methods on pandas objects that later cells rely on.
tqdm.pandas()
# Passed as file= to tqdm in convertGraph; presumably forwards progress output to the logger.
tqdm_out = TqdmToLogger(logger, level=logging.INFO)

# Fixed seed for NumPy's global random state.
np.random.seed(0)

In [None]:
logger.setLevel(logging.INFO)

In [None]:
logger.setLevel(logging.DEBUG)

In [None]:
# Show every column and every row when a DataFrame is displayed.
# The fully qualified option names are used because the short patterns
# 'max_columns' / 'max_rows' also match 'styler.render.*' in newer pandas,
# which makes set_option fail with an ambiguity error.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
#pd.reset_option('display.max_columns')
#pd.set_option('display.max_colwidth', None)

## Load dictionary

In [None]:
gn = api.load('word2vec-google-news-300')

In [None]:
gw1 = api.load("glove-wiki-gigaword-100")

In [None]:
wv = gw1

In [None]:
gw3 = api.load("glove-wiki-gigaword-300")

## Load graph

### KBpedia

In [None]:
kb = Graph()
kb.parse("C:/Users/Peter/gensim-data/KBpedia/kbpedia_reference_concepts.n3", format="n3")
logger.info(f"Loaded graph with {len(kb)} triples")

In [None]:
g = kb

### DBpedia disjointDomain

In [None]:
db = Graph()
db.parse("C:/Users/Peter/gensim-data/DBpedia/mappingbased-objects_lang=en_disjointDomain.ttl", format="ttl")
logger.info(f"Loaded graph with {len(db)} triples")

In [None]:
g=db

### DBpedia disjointRange

In [None]:
db2 = Graph()
db2.parse("C:/Users/Peter/gensim-data/DBpedia/mappingbased-objects_lang=en_disjointRange.ttl", format="ttl")
logger.info(f"Loaded graph with {len(db2)} triples")

In [None]:
g=db2

### Own KG

In [None]:
tg = Graph()
tg.parse("KGDemo.ttl", format="ttl")
logger.info(f"Loaded graph with {len(tg)} triples")

In [None]:
g = tg

### Pokemon KG

In [None]:
pg = Graph()
pg.parse("pokemon.nq", format="nquads")
logger.info(f"Loaded graph with {len(pg)} triples")

In [None]:
g = pg

### Webster

In [None]:
# Load the Webster model; no explicit format is passed to parse().
wg = Graph()
wg.parse("WBT_DSR_LC_model_XML.rdf")
# Report the size of the graph loaded in this cell (wg).
logger.info(f"Loaded graph with {len(wg)} triples")

In [None]:
g = wg

### Beatles

In [None]:
bg = Graph()
bg.parse("beatles.ttl", format="ttl")
logger.info(f"Loaded graph with {len(bg)} triples")

In [None]:
g = bg

### Diet

In [None]:
dg = Graph()
dg.parse("diet.ttl", format="ttl")
logger.info(f"Loaded graph with {len(dg)} triples")

In [None]:
g = dg

### Ingredients

In [None]:
ig = Graph()
ig.parse("ingredients.ttl", format="ttl")
logger.info(f"Loaded graph with {len(ig)} triples")

In [None]:
g = ig

### sparql

In [None]:
sg = Graph()
sg.parse("sparql", format="ttl")
logger.info(f"Loaded graph with {len(sg)} triples")

In [None]:
g = sg

## Helper functions

In [None]:
def getPreferredTitle(n, lang="en"):
    """Return a lower-cased, human-readable title for the graph term ``n``.

    The preferred label of ``n`` in the global graph ``g`` (for language
    ``lang``) is used when one exists. Otherwise the title is derived from
    the text after the last '/' of ``n``: underscores become spaces and
    commas are removed.
    """
    labels = g.preferredLabel(n, lang=lang)

    if labels != []:
        # Entries are indexed as [0][1] here: the label of the first match.
        return labels[0][1].value.lower()

    #TODO: replace "()"?
    localName = n.rsplit('/', 1)[-1]
    return localName.replace('_', ' ').replace(',', '').lower()

#Returns [vec, isMultipart, multipart-matched-%]
def toVector(n):
    """Look up an embedding for the title of ``n`` in the global model ``wv``.

    Single-word titles return ``[vector, False, False]``, or
    ``[None, False, False]`` when the word is unknown.

    Titles containing a space are split on single spaces; the result is
    ``[mean of the known word vectors, True, known words / all words]``,
    or ``[None, True, 0]`` when no word is known.
    """
    title = getPreferredTitle(n)

    if " " not in title:
        try:
            return [wv[title], False, False]
        except KeyError:
            return [None, False, False]

    #In case of multiple words in title use mean of individual vectors
    words = title.split(" ")
    known = []
    for word in words:
        try:
            known.append(wv[word])
        except KeyError:
            continue

    if not known:
        return [None, True, 0]
    return [sum(known)/len(known), True, len(known)/len(words)]

#Methods to ambiguify nodes and relations
def select(inp, obj=None):
    """Draw a random sample of rows from ``inp`` (with replacement).

    Parameters
    ----------
    inp : pandas.DataFrame
        Frame to sample from.
    obj : dict
        Must contain exactly one entry. ``{'perc': fraction}`` samples that
        fraction of the rows; any other key (by convention
        ``{'num': count}``) samples that many rows.

    Returns
    -------
    pandas.DataFrame
        The sampled rows.

    Raises
    ------
    TypeError
        If ``obj`` is missing or does not contain exactly one entry.
    """
    if obj is None or len(obj) != 1:
        raise TypeError("Please give exactly one of percentage or number")

    (kind, amount), = obj.items()
    if kind == 'perc':
        return inp.sample(frac=amount, replace=True)
    return inp.sample(n=amount, replace=True)

def ambiguify(config, nodeVectors, relationVectors):
    """Pick replacement candidates for nodes and relations.

    Parameters
    ----------
    config : dict
        ``{target: {method: [instance, ...]}}`` where ``target`` is
        ``'nodes'`` or ``'relations'``, ``method`` is a key of the global
        ``methods`` table and each instance holds an ``'amount'`` dict
        (passed to ``select``) and an optional ``'param'`` dict (passed to
        the method).
    nodeVectors, relationVectors : pandas.DataFrame
        Candidate pools for the two targets.

    Returns
    -------
    pandas.DataFrame
        One row per selected element with the columns ``method``,
        ``config``, ``source_type``, ``source`` and ``target``. Empty when
        nothing was configured.
    """
    frames = []
    for target in config: #can be "nodes" or "relations"
        inp = nodeVectors if target == 'nodes' else relationVectors
        sourceColumn = target[0:-1] #"nodes" -> "node", "relations" -> "relation"

        for method in config[target]: #matches the name of the method
            for instance in config[target][method]: #once for every instance of the method config
                amount = instance['amount'] #the amount of elements to be changed
                if not amount:
                    # Nothing requested for this instance.
                    continue

                logger.info(f"Ambiguifying {target} with {method} (parameters: {instance})")

                selres = select(inp, amount)
                conf = instance.get('param', None)
                rep = pd.DataFrame()

                # The method receives copies because it may add helper columns to its arguments.
                rep[['method', 'config', 'source_type', 'source', 'target']] = selres.progress_apply(lambda sel: pd.Series([
                    method,
                    str(instance),
                    sourceColumn,
                    sel[sourceColumn],
                    methods[target][method](sel.copy(), inp.copy(), conf)[sourceColumn].iloc[0]
                ]), axis=1)

                frames.append(rep)

    # DataFrame.append is gone in pandas 2.0; concatenate once instead.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

#Modify triple and save as new
def modTriple(row, g2, useObject = False, retry=False):
    """Create a modified copy of one existing triple and add it to ``g2``.

    A random triple of the global ``fullVectors`` that contains
    ``row['source']`` is picked: in the predicate position for relations,
    otherwise in the object (``useObject``) or subject position. That
    position is replaced by ``row['target']``.

    If no triple matches, the search is repeated once with ``useObject``
    flipped. If that fails too, an error is logged and ``None`` is
    returned.

    Returns the 6-tuple ``(s, p, o, s_orig, p_orig, o_orig)``.
    """
    isRelation = row['source_type'] == 'relation'

    if isRelation:
        logger.debug("Replacing r")
        position = 'p'
    elif useObject:
        logger.debug("Replacing o")
        position = 'o'
    else:
        logger.debug("Replacing s")
        position = 's'

    candidates = fullVectors[fullVectors[position] == row['source']]

    if len(candidates) == 0:
        if retry:
            logger.error("Could not find original triple!")
            return None
        return modTriple(row, g2, useObject= not useObject, retry=True)

    picked = candidates.sample(n=1)
    origS = picked['s'].iloc[0]
    origP = picked['p'].iloc[0]
    origO = picked['o'].iloc[0]

    #add row to graph with changed content
    if isRelation:
        changed = (origS, row['target'], origO)
    elif useObject:
        changed = (origS, origP, row['target'])
    else:
        changed = (row['target'], origP, origO)

    g2.add(changed)
    logger.debug(f"{changed}")
    return changed + (origS, origP, origO)
        
def populateAdditions(res, g2):
    """Apply ``modTriple`` to every row of ``res`` and collect the new triples.

    Each row randomly replaces either the object or the subject (decided by
    ``np.random.random() >= 0.5``). The new triples are added to ``g2`` by
    ``modTriple``. As a side effect ``res`` gains the columns ``s_orig``,
    ``p_orig`` and ``o_orig``.

    Returns a DataFrame with the columns ``s``, ``p`` and ``o``.
    """
    logger.info(f"Populatig graph")

    def buildRow(row):
        replaceObject = np.random.random() >= 0.5
        return pd.Series(modTriple(row, g2, useObject=replaceObject))

    origColumns = ['s_orig', 'p_orig', 'o_orig']

    out = pd.DataFrame()
    out[['s', 'p', 'o'] + origColumns] = res.progress_apply(buildRow, axis=1)

    # Keep the original triple next to the change that was derived from it.
    res[origColumns] = out[origColumns]

    return out.drop(columns=origColumns)

#Returns percent of ambiguity in the graph
def calculateAmbiguity(fullVectors, nodeVectors, relationVectors):
    """Combine relation and node scores into one ambiguity value.

    ``fullVectors`` is accepted but not used.

    The relation score is the average of ``mean_dist`` weighted by
    ``total - lost - zero_vector``. The node score is the average of
    ``est_dist`` weighted by ``total``. The two are combined with weights
    2 (relations) and 1 (nodes), and the result ``x`` is mapped to
    ``1 - (1 + x) / 2``.
    """
    #for relations
    usableTriples = relationVectors['total'] - relationVectors['lost'] - relationVectors['zero_vector']
    relationScore = np.average(relationVectors['mean_dist'], weights=usableTriples)

    #for nodes
    nodeScore = np.average(nodeVectors['est_dist'], weights=nodeVectors['total'])

    combined = np.average([relationScore, nodeScore], weights=[2, 1])

    #transform -1..1 where 1 is the least ambiguous to 0..1 where 1 is the most ambiguous
    return 1-((1+combined)/2)

## Methods

In [None]:
##### Setup methods
def rand(inp, source, conf):
    """Return one randomly chosen row of ``source``; ``inp`` and ``conf`` are ignored."""
    picked = source.sample(n=1)
    return picked

#Find result with specific distance
def dist(inp, source, conf):
    """Return the row of ``source`` at a given similarity rank relative to ``inp``.

    Rows are ordered by descending cosine similarity between their ``vec``
    and ``inp['vec']``. On equal similarity, the row whose first field
    equals the first field of ``inp`` comes first.

    Parameters
    ----------
    inp : pandas.Series
        Reference row; must provide ``vec``.
    source : pandas.DataFrame
        Candidate rows. The helper columns ``dist`` and ``isInp`` are added
        to this frame in place, so callers pass a copy.
    conf : dict or None
        ``conf['dist']`` is the rank to return (default 1). It is clamped
        to ``[0, len(source) - 1]``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame without the helper columns.
    """
    rank = 1 if conf is None else conf.get('dist', 1)
    rank = min(max(rank, 0), len(source.index)-1)

    #use pandas to get top-n, if dist is same move inp to the top
    # .iloc[0] compares the first field explicitly by position; plain [0] on a
    # label-indexed Series relies on a deprecated positional fallback.
    source[['dist', 'isInp']] = source.apply(lambda row: pd.Series([
        wv.cosine_similarities(list(inp['vec']), [list(row['vec'])])[0],
        inp.iloc[0] == row.iloc[0]
    ]), axis = 1)
    source = source.sort_values(by=['dist', 'isInp'], ascending=False)

    logger.debug(f"source:\n{source}")

    logger.debug(f"dist: {rank}")
    logger.debug(f"choice:\n{source.iloc[[rank]]}")

    return source.iloc[[rank]].drop(['dist', 'isInp'], axis=1)

#Find result with specific closeness
def closeness(inp, source, conf):
    """Return the row of ``source`` whose similarity to ``inp`` best matches a target.

    Parameters
    ----------
    inp : pandas.Series
        Reference row; must provide ``vec``.
    source : pandas.DataFrame
        Candidate rows. The helper columns ``dist`` and ``isInp`` are added
        to this frame in place, so callers pass a copy.
    conf : dict or None
        ``conf['closeness']`` in ``[0, 2]`` (default 1, clamped). The row
        minimising ``|similarity + closeness - 1|`` is chosen, so 0 asks
        for similarity 1 and 2 asks for similarity -1.

    Returns
    -------
    pandas.DataFrame
        The selected row(s) without the helper columns. More than one row
        is returned only if the index label of the best match is not
        unique.
    """
    wanted = 1 if conf is None else conf.get('closeness', 1)
    wanted = min(max(wanted, 0), 2) #0 is equal to the input, 2 is its inverse

    #use pandas to get dists
    # .iloc[0] compares the first field explicitly by position; plain [0] on a
    # label-indexed Series relies on a deprecated positional fallback.
    source[['dist', 'isInp']] = source.apply(lambda row: pd.Series([
        wv.cosine_similarities(list(inp['vec']), [list(row['vec'])])[0],
        inp.iloc[0] == row.iloc[0]
    ]), axis = 1)

    # idxmin() yields an index label, so the row has to be fetched with .loc;
    # .iloc only happened to work for a default 0..n-1 index.
    resIndex = source['dist'].add(wanted-1).abs().idxmin()

    return source.loc[[resIndex]].drop(['dist', 'isInp'], axis=1)

#Find result closest to inverse input vector
def negative(inp, source, conf):
    """Delegate to ``closeness`` with the closeness fixed at 2; ``conf`` is ignored."""
    inverseConf = {'closeness': 2}
    return closeness(inp, source, inverseConf)

# Dispatch table: target kind -> method name -> implementation.
# Both target kinds offer the same methods; each gets its own dict.
methods = {
    targetKind: {
        'random': rand,
        'dist': dist,
        'closeness': closeness,
        'negative': negative,
    }
    for targetKind in ('nodes', 'relations')
}

## Vectorisation Functions

In [None]:
def convertGraph(g):
    """Copy every triple of the graph ``g`` into a DataFrame.

    Returns a DataFrame with the columns ``s``, ``p`` and ``o``, one row
    per triple. Progress is reported through ``tqdm_out``.
    """
    logger.info(f"Converting graph")

    # len(g) is the triple count, so no separate counting pass is needed.
    triples = tqdm(g.triples((None, None, None)), total=len(g), file=tqdm_out, mininterval=2)
    rows = [[s, p, o] for s, p, o in triples]

    return pd.DataFrame(data=rows, columns=['s', 'p', 'o'])

def vectorifyGraph(fullVectors):
    """Add subject, object and relation vectors to a triple frame.

    ``fullVectors`` needs the columns ``s`` and ``o``. It is modified in
    place and also returned. Columns added:

    - ``s_vec``, ``s_is_multipart``, ``s_multipart_%`` and the matching
      ``o_*`` columns, taken from ``toVector``.
    - ``r_vec``: ``o_vec - s_vec``, or None when either vector is missing.
    - ``is_zero_vector_relation``: True when ``r_vec`` is all zeros.
    """
    logger.info(f"Vecotrifying graph")

    for prefix, message in (('s', 'Subject vectors'), ('o', 'Object vectors')):
        logger.info(message)
        targetColumns = [f'{prefix}_vec', f'{prefix}_is_multipart', f'{prefix}_multipart_%']
        fullVectors[targetColumns] = fullVectors.progress_apply(lambda row: pd.Series(toVector(row[prefix])), axis=1)

    def relationOf(row):
        # A relation vector only exists when both end points have a vector.
        if row['o_vec'] is None or row['s_vec'] is None:
            return pd.Series([None, False])
        difference = row['o_vec']-row['s_vec']
        isZero = np.array_equal(difference, [0]*len(row['o_vec']))
        return pd.Series([difference, isZero])

    logger.info('Relation vectors')
    fullVectors[['r_vec', 'is_zero_vector_relation']] = fullVectors.progress_apply(relationOf, axis=1)

    return fullVectors

def calculateNodeEstimates(fullVectors, relationVectors):
    """Estimate each triple's end points from the other end point and the relation vector.

    For every triple the subject estimate is ``o_vec - relation vec`` and
    the object estimate is ``s_vec + relation vec``, where the relation
    vector is the ``vec`` of the row in ``relationVectors`` whose
    ``relation`` equals the triple's ``p``. ``*_est_dist`` holds the cosine
    similarity between the estimate and the actual vector. Any value that
    cannot be computed is None.

    The four new columns are first written to the passed frame; the
    returned frame is the object-typed result of ``astype``.
    """
    logger.info(f"Calculating node estimates")

    def helper(row, op, same, other):
        #Select the relation vector if there is one
        matches = relationVectors[relationVectors['relation'] == row['p']]
        rVec = matches['vec'].iloc[0] if len(matches) > 0 else None

        #Calculate the estimate
        est = None
        if row[other] is not None and rVec is not None:
            est = op(row[other], rVec)

        #Calculate the distance
        dist = None
        if est is not None and row[same] is not None:
            dist = wv.cosine_similarities(row[same], [est])[0]

        return pd.Series([est, dist], dtype='object')

    passes = (
        ("Subject estimates", 's', np.subtract, 's_vec', 'o_vec'),
        ("Object estimates", 'o', np.add, 'o_vec', 's_vec'),
    )
    for message, prefix, op, same, other in passes:
        logger.info(message)
        fullVectors[[f'{prefix}_est', f'{prefix}_est_dist']] = fullVectors.progress_apply(helper, args=[op, same, other], axis=1)

    #Workaround for readability as pandas is equaling NaN and None
    estColumns = ['s_est', 's_est_dist', 'o_est', 'o_est_dist']
    fullVectors = fullVectors.astype({column: 'object' for column in estColumns})
    for column in estColumns:
        fullVectors.loc[fullVectors[column].isna(), column] = None

    return fullVectors

def generateNodeVectors(fullVectors):
    """Collapse the per-triple subject/object data into one row per node.

    Parameters
    ----------
    fullVectors : pandas.DataFrame
        Triple frame with the ``s*`` / ``o*`` columns produced by
        ``vectorifyGraph`` and ``calculateNodeEstimates``.

    Returns
    -------
    (nodeVectors, lostNodes)
        ``nodeVectors`` has one row per node with ``vec``, ``total``
        (number of occurrences as subject or object), ``est`` (mean of the
        per-triple estimates), ``est_dist`` (cosine similarity between
        ``est`` and ``vec``) and the mean/min/max of the per-triple
        ``est_dist`` values. Rows containing any missing value are dropped.
        ``lostNodes`` holds the nodes without a vector.
    """
    logger.info(f"Generating nodeVectors")

    #Rename and merge
    logger.debug(f"Renaming and merging")
    sharedColumns = ['vec', 'is_multipart', 'multipart_%', 'est', 'est_dist']

    def nodeView(prefix):
        # Map 's'/'o' and the prefixed columns onto the common node column names.
        renames = {prefix: 'node'}
        renames.update({f'{prefix}_{column}': column for column in sharedColumns})
        return fullVectors[list(renames)].rename(columns=renames)

    nodeVectors = pd.concat([nodeView('s'), nodeView('o')], ignore_index=True)

    #Remove duplicates
    logger.debug(f"Grouping")
    # sort=False keeps the groups in order of first appearance. That is the
    # order head(1) returns its rows in, so the per-group aggregates below
    # line up with the right node after reset_index(drop=True). With the
    # default key-sorted groups they would be attached to the wrong nodes.
    nodeGroup = nodeVectors.groupby('node', sort=False)

    logger.debug(f"Using first for vector")
    #nodeVectors = nodeGroup.first().reset_index() #this is really slow
    nodeVectors = nodeGroup.head(1).reset_index(drop=True)

    logger.debug(f"Calculating totals")
    nodeVectors['total'] = nodeGroup.size().reset_index(drop=True)

    logger.debug(f"Calculating estimates")
    nodeVectors['est'] = nodeGroup['est'].apply(np.mean).reset_index(drop=True)

    #Workaround as pandas is equaling NaN and None
    nodeVectors = nodeVectors.astype({'est': 'object', 'vec': 'object'})
    nodeVectors.loc[nodeVectors['est'].isna(), 'est'] = None
    nodeVectors.loc[nodeVectors['vec'].isna(), 'vec'] = None

    logger.debug(f"Calculating estimate distances")
    nodeVectors['est_dist'] = nodeVectors.apply(lambda row: wv.cosine_similarities(list(row['est']), [list(row['vec'])])[0] if row['est'] is not None and row['vec'] is not None else None, axis=1)

    logger.debug(f"Calculating mean/min/max of distances")
    nodeVectors['mean_est_dist'] = nodeGroup['est_dist'].apply(np.mean).reset_index(drop=True)
    nodeVectors['min_est_dist'] = nodeGroup['est_dist'].apply(np.min).reset_index(drop=True)
    nodeVectors['max_est_dist'] = nodeGroup['est_dist'].apply(np.max).reset_index(drop=True)

    #Split into nodeVectors and lostNodes
    logger.debug(f"Splitting into nodeVectors and lostNodes")
    lostNodes = nodeVectors[nodeVectors['vec'].isnull()].reset_index(drop=True).drop(columns=['vec', 'est_dist', 'mean_est_dist', 'min_est_dist', 'max_est_dist'])
    nodeVectors = nodeVectors.dropna().reset_index(drop=True)
    logger.info("Done")

    return nodeVectors, lostNodes

def generateRelationVectors(fullVectors):
    """Aggregate the per-triple relation vectors into one row per predicate.

    Returns ``(relationVectors, lostRelations)``.

    ``relationVectors`` has the columns ``relation``, ``vec`` (mean of the
    triples' ``r_vec``), ``total``, ``lost`` (triples without ``r_vec``),
    ``zero_vector``, ``quality`` and the min/max/mean cosine similarity of
    the usable triples' ``r_vec`` to ``vec``.

    ``lostRelations`` holds the predicates that have no mean vector or no
    usable triple, without the ``vec`` and ``quality`` columns.
    """
    logger.info(f"Generating relationVectors")

    logger.debug(f"Grouping")
    byRelation = fullVectors.groupby('p')
    relationVectors = byRelation['r_vec'].apply(np.mean).reset_index().rename(columns={'p': 'relation', 'r_vec': 'vec'})

    logger.debug(f"Calculating total, lost, zero_vector and quality")
    relationVectors['total'] = byRelation['p'].count().reset_index(drop=True)
    relationVectors['lost'] = byRelation['r_vec'].apply(lambda x: x.isnull().sum()).reset_index(drop=True)
    relationVectors['zero_vector'] = byRelation['is_zero_vector_relation'].sum().astype(int).reset_index(drop=True)
    relationVectors['quality'] = relationVectors.apply(lambda row: 1-(row['lost']+row['zero_vector'])/row['total'], axis=1)

    #TODO: labels are counted as lost nodes
    #Split into relationVectors and lostRelations
    logger.debug(f"Splitting into relationVectors and lostRelations")
    # Triples that have a non-zero relation vector.
    usable = relationVectors['total']-relationVectors['lost']-relationVectors['zero_vector']
    isLost = relationVectors['vec'].isnull() | (usable == 0)
    isKept = relationVectors['vec'].notnull() & (usable != 0)
    lostRelations = relationVectors[isLost].reset_index(drop=True).drop(columns=['vec', 'quality'])
    relationVectors = relationVectors[isKept].reset_index(drop=True)

    #Min/max/average distance of every full vector of this relation type to mean vector
    logger.debug(f"Calculating min, max and average distances")
    def helper_dist(row):
        ofRelation = fullVectors[fullVectors['p'] == row['relation']]

        #filter out None and zero-vector
        ofRelation = ofRelation[ofRelation['is_zero_vector_relation'] == False]
        vectors = ofRelation['r_vec'].dropna().reset_index(drop=True)

        sims = wv.cosine_similarities(row['vec'], list(vectors))
        return [np.min(sims), np.max(sims), np.mean(sims)]

    relationVectors[['min_dist', 'max_dist', 'mean_dist']] = relationVectors.apply(lambda row: pd.Series(helper_dist(row)), axis=1)
    logger.info("Done")

    return relationVectors, lostRelations

## Vector generation

In [None]:
convertedGraph = convertGraph(g)

In [None]:
fullVectors = vectorifyGraph(convertedGraph)

In [None]:
relationVectors, lostRelations = generateRelationVectors(fullVectors)

In [None]:
fullVectors = calculateNodeEstimates(fullVectors, relationVectors)

In [None]:
nodeVectors, lostNodes = generateNodeVectors(fullVectors)

## Save & load dataframes

## View calculated values

### Full vectors

In [None]:
print(f"Length: {len(fullVectors)}")
fullVectors.head(5)

In [None]:
#Show rows where the relation vector was lost
fullVectors[fullVectors['r_vec'].isnull()].head(2)

In [None]:
#Show rows where the relation vector is a zero-vector
fullVectors[fullVectors['is_zero_vector_relation'] == True].head(2)

### Node Vectors

In [None]:
print(f"Length: {len(nodeVectors)}")
nodeVectors.head()

In [None]:
print(f"Length: {len(lostNodes)}")
lostNodes.head(2)

### Relation Vectors

In [None]:
print(f"Length: {len(relationVectors)}")
relationVectors.head(37)

In [None]:
print(f"Length: {len(lostRelations)}")
lostRelations.head(3)

## Manual calculations for verification

### Full vectors

In [None]:
#Multipart
print(nodeVectors['is_multipart'].iloc[20])
print(nodeVectors['multipart_%'].iloc[20])

In [None]:
#Zero-Vector
print(fullVectors['is_zero_vector_relation'].iloc[20])

In [None]:
#Lost relation
print(fullVectors['r_vec'].iloc[18])

In [None]:
#Estimate
print(fullVectors['s_est'].iloc[18], fullVectors['s_est_dist'].iloc[18])

#print(fullVectors['s_est'].iloc[0])

#Calculate estimate (minus means arrow from R to L for relationVector calculation)
est = fullVectors['o_vec'].iloc[0]-relationVectors['vec'].iloc[2]
#print(est)

print(wv.cosine_similarities(fullVectors['s_est'].iloc[0], [est])[0])

print(fullVectors['s_est_dist'].iloc[0])
print(wv.cosine_similarities(fullVectors['s_vec'].iloc[0], [est])[0])

### Relations

In [None]:
#Lost relation
print(lostRelations['lost'].iloc[0])
print(lostRelations['zero_vector'].iloc[1])

In [None]:
#Quality
print(relationVectors['quality'].iloc[1])
print((relationVectors['total'].iloc[1]-relationVectors['lost'].iloc[1]-relationVectors['zero_vector'].iloc[1])/relationVectors['total'].iloc[1])

### Nodes

In [None]:
#Unlabeled
np.array_equal(fullVectors['o_vec'].iloc[32], wv['australia'])

In [None]:
#Lost node
lostNodes.iloc[0]

### Methods

In [None]:
#Node with the maximal distance to 'berlin'
nodeVectors.iloc[np.argmin(wv.cosine_similarities(wv['berlin'], list(nodeVectors['vec'])))]['node']

In [None]:
#Dist method

#Same
print(nodeVectors['node'].iloc[0], dist(nodeVectors.iloc[0].copy(), nodeVectors.copy(), {'dist': 0})['node'].iloc[0])

#Inverse
print(nodeVectors['node'].iloc[0], dist(nodeVectors.iloc[0].copy(), nodeVectors.copy(), {'dist': 100})['node'].iloc[0])

In [None]:
#Closeness method

#Same
print(nodeVectors['node'].iloc[0], closeness(nodeVectors.iloc[0].copy(), nodeVectors.copy(), {'closeness': 0})['node'].iloc[0])

#Inverse
print(nodeVectors['node'].iloc[0], closeness(nodeVectors.iloc[0].copy(), nodeVectors.copy(), {'closeness': 2})['node'].iloc[0])

### Ambiguity

In [None]:
#Ambiguity
print("0.1191733359168643")

#for relations
weights = relationVectors['total'] - relationVectors['lost'] - relationVectors['zero_vector']
relAmbig = np.average(relationVectors['mean_dist'], weights=weights)

print(np.array_equal(weights, [6, 5, 7]))
print(relAmbig)
print((relationVectors['mean_dist'].iloc[0]*6+relationVectors['mean_dist'].iloc[1]*5+relationVectors['mean_dist'].iloc[2]*7)/(6+5+7))

#for nodes
weights = nodeVectors['total']
nodeAmbig = np.average(nodeVectors['est_dist'], weights=weights)

#mean of ambiguities of nodes and vectors, weight by nodes 2:1 relation
totAmbig = np.average([relAmbig, nodeAmbig], weights=[2, 1])

#transform -1..1 where 1 is the least ambiguous to 0..1 where 1 is the most ambiguous
print(1-((1+totAmbig)/2))

print(1-((1+(-1))/2))
print(1-((1+(1))/2))

## Run workflow

### Simulation configs

In [None]:
#Simulate small natural ambiguity
config = {
    'nodes': {
        'random': [{'amount': {'num': 2}}],
        'dist': [{
                    'amount': {'num': 10},
                     'param': {'dist': 1}
                 }, {
                     'amount': {'num': 2},
                     'param': {'dist': 2}
                 }]
    },
    'relations': {
        'random': [{'amount': {'num': 2}}],
        'dist': [{'amount': {'num': 5},
                 'param': {'dist': 1}}]
    }
}

In [None]:
#Simulate medium natural ambiguity
config = {
    'nodes': {
        'random': [{'amount': {'num': 2}}],
        'dist': [{
                    'amount': {'num': 10},
                     'param': {'dist': 1}
                 }, {
                     'amount': {'num': 2},
                     'param': {'dist': 2}
                 }]
    },
    'relations': {
        'random': [{'amount': {'num': 2}}],
        'dist': [{'amount': {'num': 5},
                 'param': {'dist': 1}}]
    }
}

In [None]:
#Simulate large natural ambiguity
config = {
    'nodes': {
        'random': [{'amount': {'num': 2}}],
        'dist': [{
                    'amount': {'num': 10},
                     'param': {'dist': 1}
                 }, {
                     'amount': {'num': 2},
                     'param': {'dist': 2}
                 }]
    },
    'relations': {
        'random': [{'amount': {'num': 2}}],
        'dist': [{'amount': {'num': 5},
                 'param': {'dist': 1}}]
    }
}

### Test configs

In [None]:
#Random changes for testing
config = {
    'nodes': {
        'random': [{'amount': {'num': 5000}}]
    },
    'relations': {
        'random': [{'amount': {'num': 2500}}]
    }
}

In [None]:
#Random changes for testing
config = {
    'nodes': {
        'random': [{'amount': {'num': 5}}]
    },
    'relations': {
        'random': [{'amount': {'num': 2}}]
    }
}

In [None]:
#Complete negative for testing
config = {
    'nodes': {
        'negative': [{'amount': {'perc': 1}}]
    },
    'relations': {
        'negative': [{'amount': {'perc': 1}}]
    }
}

In [None]:
#Closeness for testing
config = {
    'nodes': {
        'closeness': [{'amount': {'num': 5},
                 'param': {'closeness': 0.2}}]
    },
    'relations': {
        'closeness': [{'amount': {'num': 5},
                 'param': {'closeness': 0.2}}]
    }
}

### Evaluation config

In [None]:
#Simulate different amounts of ambiguity
config = {
    'nodes': {
        'random': [{'amount': {'num': 5}}],
        'dist': [{
                    'amount': {'num': 5},
                     'param': {'dist': 1}
                 }, {
                     'amount': {'num': 5},
                     'param': {'dist': 2}
                 }, {
                     'amount': {'num': 5},
                     'param': {'dist': 3}
                 }, {
                     'amount': {'num': 5},
                     'param': {'dist': 4}
                 }, {
                     'amount': {'num': 5},
                     'param': {'dist': 5}
                 }]
    },
    'relations': {
        'random': [{'amount': {'num': 5}}],
        'dist': [{
                    'amount': {'num': 5},
                    'param': {'dist': 1}
                 },{
                    'amount': {'num': 5},
                    'param': {'dist': 2}
                },{
                    'amount': {'num': 5},
                    'param': {'dist': 3}
                },{
                    'amount': {'num': 5},
                    'param': {'dist': 4}
                },{
                    'amount': {'num': 5},
                    'param': {'dist': 5}
                }]
    }
}

In [None]:
#Simulate different amounts of ambiguity
config = {
    'nodes': {
        'random': [{'amount': {'num': 5}}],
        'dist': [{
                    'amount': {'num': 5},
                     'param': {'dist': 0}
                 }, {
                     'amount': {'num': 5},
                     'param': {'dist': 1}
                 }, {
                     'amount': {'num': 5},
                     'param': {'dist': 2}
                 }, {
                     'amount': {'num': 3},
                     'param': {'dist': 3}
                 }, {
                     'amount': {'num': 3},
                     'param': {'dist': 4}
                 }, {
                     'amount': {'num': 2},
                     'param': {'dist': 10}
                 }, {
                     'amount': {'num': 2},
                     'param': {'dist': 50}
                 }]
    },
    'relations': {
        'random': [{'amount': {'num': 5}}],
        'dist': [{
                    'amount': {'num': 5},
                     'param': {'dist': 0}
                 }, {
                     'amount': {'num': 5},
                     'param': {'dist': 1}
                 }, {
                     'amount': {'num': 5},
                     'param': {'dist': 2}
                 }, {
                     'amount': {'num': 3},
                     'param': {'dist': 3}
                 }, {
                     'amount': {'num': 3},
                     'param': {'dist': 4}
                 }, {
                     'amount': {'num': 2},
                     'param': {'dist': 10}
                 }, {
                     'amount': {'num': 2},
                     'param': {'dist': 50}
                 }]
    }
}

### Workflow

In [None]:
#Run imports and define functions, configure the desired result

#Load graph and dictionary, you can skip this if you already ran it
convertedGraph = convertGraph(g)
fullVectors = vectorifyGraph(convertedGraph)
relationVectors, lostRelations = generateRelationVectors(fullVectors)
fullVectors = calculateNodeEstimates(fullVectors, relationVectors)
nodeVectors, lostNodes = generateNodeVectors(fullVectors)

#Check outputs before continuing

In [None]:
#The ambiguify-function returns vectors according to configured methods
np.random.seed(0)
changes = ambiguify(config, nodeVectors, relationVectors)

In [None]:
#Insert new node into graph based on one random triple containing the source
g2 = Graph()
np.random.seed(0)
additions = populateAdditions(changes, g2)

In [None]:
changes.head(100)

In [None]:
additions.head()

In [None]:
#Save additions from nodes and relations
logger.info(f"Saving files")

# Let rdflib write the files itself. serialize() without a destination
# returns str or bytes depending on the rdflib version, so writing its result
# to a binary file handle is not portable. This also avoids manually opened
# handles that stay open if serialization fails.
g2.serialize(destination="additions.ttl", format='turtle')

#Save graph with additions
g3 = g+g2
g3.serialize(destination="appendedKG.ttl", format='turtle')

logger.info(f"Done")

In [None]:
#Calculate ambiguity before
ambiguityBefore = calculateAmbiguity(fullVectors, nodeVectors, relationVectors)
logger.info(f"Ambiguity before: {ambiguityBefore}")

#Prepare data to calculate ambiguity after
newFullVectors = fullVectors.copy()

if len(additions) > 0:
    logger.info(f"Adding {len(additions)} additional triples")
    # vectorifyGraph adds its vector columns to `additions` in place.
    vectorisedAdditions = vectorifyGraph(additions)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    newFullVectors = pd.concat([newFullVectors, vectorisedAdditions], ignore_index=True)

# Recompute all derived frames on the extended triple set.
newRelationVectors, newLostRelations = generateRelationVectors(newFullVectors)
newFullVectors = calculateNodeEstimates(newFullVectors, newRelationVectors)
newNodeVectors, newLostNodes = generateNodeVectors(newFullVectors)

#Calculate ambiguity after
ambiguityAfter = calculateAmbiguity(newFullVectors, newNodeVectors, newRelationVectors)
logger.info(f"Ambiguity after: {ambiguityAfter}")

logger.info(f"Ambiguity difference: {ambiguityAfter-ambiguityBefore}")

## Export for AMT

In [None]:
logger.setLevel(logging.INFO)

In [None]:
logger.setLevel(logging.DEBUG)

In [None]:
#Generate different severities for same original triple

def ambiguify4amt(config, nodeVectors, relationVectors):
    """Generate several replacement severities for the same sampled elements.

    Five elements are sampled once per target (with replacement), and every
    configured method instance is applied to that same sample. The values
    in ``instance['amount']`` are therefore not used as sample sizes here;
    an instance with an empty ``amount`` is skipped.

    Parameters
    ----------
    config : dict
        ``{target: {method: [instance, ...]}}`` with ``target`` being
        ``'nodes'`` or ``'relations'`` and ``method`` a key of the global
        ``methods`` table.
    nodeVectors, relationVectors : pandas.DataFrame
        Candidate pools for the two targets.

    Returns
    -------
    pandas.DataFrame
        Rows with the columns ``method``, ``config``, ``source_type``,
        ``source`` and ``target``. Empty when nothing was configured.
    """
    frames = []
    for target in config: #can be "nodes" or "relations"
        inp = nodeVectors if target == 'nodes' else relationVectors
        sourceColumn = target[0:-1] #"nodes" -> "node", "relations" -> "relation"

        selres = inp.sample(n=5, replace=True) # hardcoded for evaluation

        for method in config[target]: #matches the name of the method
            for instance in config[target][method]: #once for every instance of the method config
                logger.debug(f"Selres: {selres}")

                if not instance['amount']:
                    continue

                logger.info(f"Ambiguifying {target} with {method} (parameters: {instance})")

                conf = instance.get('param', None)
                rep = pd.DataFrame()

                # The method receives copies because it may add helper columns to its arguments.
                rep[['method', 'config', 'source_type', 'source', 'target']] = selres.progress_apply(lambda sel: pd.Series([
                    method,
                    str(instance),
                    sourceColumn,
                    sel[sourceColumn],
                    methods[target][method](sel.copy(), inp.copy(), conf)[sourceColumn].iloc[0]
                ]), axis=1)

                frames.append(rep)

    # DataFrame.append is gone in pandas 2.0; concatenate once instead.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [None]:
np.random.seed(0)
changes = ambiguify4amt(config, nodeVectors, relationVectors)

g2 = Graph()
np.random.seed(0)
additions = populateAdditions(changes, g2)

In [None]:
#Change format: put the columns of changes and additions side by side in one table
amt = pd.concat([changes, additions], axis=1)

In [None]:
#Compute names
def getPreferredTitle4amt(n, lang="en"):
    """Return a display name for ``n``.

    The preferred label of ``n`` in language ``lang`` is looked up in the
    module-level graph ``g``. When there is none, the name is derived from the
    text after the last '/' of ``n``: underscores become spaces and commas
    are removed.
    """
    labels = g.preferredLabel(n, lang=lang)

    if labels != []:
        #Each entry is indexed as [1].value, i.e. the label sits in second position
        return labels[0][1].value

    lastSegment = n.rsplit('/', 1)[-1]
    return lastSegment.replace('_', ' ').replace(',', '')


#Add a readable '<column>_name' next to every triple component, first for the
#original triple and then for the altered one
for col in ('s_orig', 'p_orig', 'o_orig', 's', 'p', 'o'):
    amt[col + '_name'] = amt[col].apply(getPreferredTitle4amt)

#Compute verification code
def ver4amt(row):
    """Create the verification challenge for one AMT row.

    One character position is drawn from each of ``row['s_name']``,
    ``row['p_name']`` and ``row['o_name']`` using the global NumPy random
    state. The position is among the first ``min(len(name), 5) - 1``
    characters, or the first character for one-character names.

    Returns
    -------
    tuple of (str, str)
        The 1-based positions joined with '-' (e.g. ``'2-1-4'``), and the
        code made of the three selected characters with spaces replaced by
        underscores.

    Raises
    ------
    ValueError
        If one of the names is empty.
    """
    names = [row['s_name'], row['p_name'], row['o_name']]

    num = []
    for name in names:
        if len(name) == 0:
            raise ValueError("cannot build a verification code from an empty name")
        #Same bound as before for names of two or more characters. It is
        #clamped to 1 so that one-character names no longer make randint fail.
        high = max(min(len(name), 5) - 1, 1)
        num.append(np.random.randint(high))

    code = ''.join(name[pos] for name, pos in zip(names, num)).replace(' ', '_')

    return ('-'.join(str(pos + 1) for pos in num), code)

#Reseed so the drawn positions are reproducible, then store the
#(positions, code) pair of every row in two columns
np.random.seed(0)
amt[['ver_num', 'ver_code']] = amt.apply(lambda row: pd.Series(ver4amt(row)), axis=1)


In [None]:
#Preview up to 100 rows of the export table
amt.head(100)

In [None]:
#Persist the intermediate tables as pickles
#NOTE(review): the extension is '.plk' (possibly meant to be '.pkl'); left as is
#because other code may read these exact file names
changes.to_pickle("changes.plk")
additions.to_pickle("additions.plk")
amt.to_pickle("amt.plk")

In [None]:
#Write the table as CSV without the index column
amt.to_csv("amt.csv", index=False)

## Process AMT results

In [None]:
#Load one AMT batch result file; switch the commented line to pick the other pilot
#amtres = pd.read_csv("Batch_305165_batch_results.csv") #pilot 1
amtres = pd.read_csv("Batch_305480_batch_results.csv") #pilot 2

## Plot graphs

In [None]:
#Set global plot size (width, height in inches) for all following figures
plt.rcParams["figure.figsize"] = [10, 8]

In [None]:
def getDistSeverity(sourceType, source, target):
    """Return the position of ``target`` in the similarity ranking of ``source``.

    All entries of ``nodeVectors`` (``sourceType == 'node'``) or
    ``relationVectors`` (any other value) are ordered by descending cosine
    similarity of their ``'vec'`` to the vector of ``source``. The zero-based
    position of the first row matching ``target`` in that order is returned.

    Raises ``IndexError`` when ``source`` or ``target`` has no matching row.
    """
    table = (nodeVectors if sourceType == 'node' else relationVectors).copy()

    #Vector of the first row matching the source; it is the same for every comparison
    sourceVec = list(table[table[sourceType] == URIRef(source)]['vec'])[0]

    table['dist'] = table.apply(
        lambda row: wv.cosine_similarities(sourceVec, [list(row['vec'])])[0],
        axis = 1)

    #Most similar first; after the reset the index equals the rank
    ranked = table.sort_values(by=['dist'], ascending=False).reset_index(drop=True)
    matches = ranked[ranked[sourceType] == URIRef(target)]

    return matches.index[0] #TODO: calculate actual value

In [None]:
#Inspect the node stored at row position 3031
nodeVectors.iloc[3031]['node']

In [None]:
#Same node as plain string
str(nodeVectors.iloc[3031]['node'])

In [None]:
#Look up a node row by its URI (compared as URIRef)
nodeVectors[nodeVectors['node'] == URIRef('http://dbpedia.org/resource/Robert_Cary_(director)')]

In [None]:
#Extract and format data for plots
import ast #stdlib; imported here so this cell does not depend on the import cell


def _severity4amt(row):
    """Severity of one AMT answer row.

    'random' rows get the similarity rank from getDistSeverity, 'dist' rows
    the ``dist`` parameter stored in their config, every other method -1.
    """
    method = row['Input.method']
    if method == 'random':
        return getDistSeverity(row['Input.source_type'], row['Input.source'], row['Input.target'])
    if method == 'dist':
        #Input.config is the str() of a Python dict, not JSON. literal_eval
        #parses it directly, whereas swapping the quotes for json.loads breaks
        #on apostrophes inside values and on None/True/False.
        return ast.literal_eval(row['Input.config'])['param']['dist']
    return -1


def _mistakeSource4amt(row):
    """Collapse the two mistakeSource flag columns into one label."""
    if row['Answer.mistakeSource.artificial'] == True:
        return 'artificial'
    if row['Answer.mistakeSource.human'] == True:
        return 'human'
    return 'correct'


amtres['severity'] = amtres.apply(_severity4amt, axis = 1)

amtres['Answer.mistakeSource'] = amtres.apply(_mistakeSource4amt, axis = 1)

data = amtres[['Answer.stars', 'WorkTimeInSeconds', 'severity']]#.sort_values(['Answer.stars', 'WorkTimeInSeconds'])

#Input.method
#Input.source_type

In [None]:
#Preview up to 100 rows of the AMT results including the derived columns
amtres.head(100)

In [None]:
#Total answers per rating
ratingCounts = amtres[['Answer.stars', 'WorkerId']].groupby('Answer.stars').count().reset_index()
ratingCounts.rename(columns={'Answer.stars': 'Rating', 'WorkerId': 'count'}).head(50)

In [None]:
#Total answers per rating (line plot of the count per star value)
answersPerRating = amtres[['Answer.stars', 'WorkerId']].groupby('Answer.stars').count()
plt.plot(answersPerRating)
plt.title('Total answers per rating')
plt.xlabel('Rating')
plt.ylabel('Number of answers')

#TODO: one line per worker and avg

In [None]:
#Total answers per method and reported mistake source (only the first five rows are shown)
amtres[['Answer.mistakeSource', 'WorkerId', 'Input.method']].groupby(['Input.method', 'Answer.mistakeSource']).count().reset_index().rename(columns={'WorkerId': 'count', 'Answer.mistakeSource': 'mistakeSource'}).head()

In [None]:
#Answers with wrong code: the entered code differs from the expected verification code
#(only the first five rows are shown)
amtres[amtres['Answer.code'] != amtres['Input.ver_code']].head()

In [None]:
#One table with the number of correct/incorrect verification codes per worker.
#The match is evaluated once and counted per WorkerId, so both counts belong
#to the same worker and 'correct' really means that the codes are equal.
#Every answer row is counted, whether or not it has a star rating.
codeOk = amtres['Answer.code'] == amtres['Input.ver_code']

amtCodeCheck = pd.DataFrame({
    'correct': codeOk.groupby(amtres['WorkerId']).sum(),
    'incorrect': (~codeOk).groupby(amtres['WorkerId']).sum(),
}).reset_index()

amtCodeCheck.head()

In [None]:
#Mean worktime per rating of all wrong codes (rows whose entered code differs from the expected one)
amtres[amtres['Answer.code'] != amtres['Input.ver_code']][['Answer.stars', 'WorkTimeInSeconds']].groupby('Answer.stars').mean().reset_index().head()

In [None]:
#Correlation excluding random: only rows of the 'dist' method are used
#amtres.groupby('Input.method')[['Answer.stars', 'WorkTimeInSeconds', 'severity']].corr()

amtres[amtres['Input.method'] == 'dist'][['Answer.stars', 'WorkTimeInSeconds', 'severity']].corr()


In [None]:
#Correlation for random: only rows of the 'random' method, where severity is the similarity rank
amtres[amtres['Input.method'] == 'random'][['Answer.stars', 'WorkTimeInSeconds', 'severity']].corr()

In [None]:
#Correlation including random: all rows
#NOTE(review): severity mixes ranks ('random'), configured distances ('dist') and -1 (other methods) here
amtres[['Answer.stars', 'WorkTimeInSeconds', 'severity']].corr()

In [None]:
#First five rows of every rating group
amtres[['Answer.stars', 'severity']].groupby('Answer.stars').head()

In [None]:
#Severities per rating: one box per star value, all drawn on a single axes (subplots=False)
amtres[['severity', 'Answer.stars']].groupby('Answer.stars').boxplot(subplots=False, rot=90, fontsize=12)
#amtres[['severity', 'Answer.stars']].boxplot()
plt.title("Severities per rating");
#plt.legend(['a', 'b','c'],  loc="upper left")
plt.xlabel("Stars")
plt.ylabel("Severity")

In [None]:
#Rating per severity

#One box per distinct severity value, all drawn on a single axes (subplots=False)
amtres[['Answer.stars', 'severity']].groupby('severity').boxplot(subplots=False, rot=90, fontsize=12)
plt.title("Rating per severity");
#plt.legend(['a', 'b','c'],  loc="upper left")
plt.xlabel("Severity")
plt.ylabel("Rating")

In [None]:
#Worktime per severity over all answers
#NOTE(review): no filter on wrong verification codes is applied here
amtres[['WorkTimeInSeconds', 'severity']].groupby('severity').boxplot(subplots=False, rot=90, fontsize=12)
plt.title("Worktime per severity");
#plt.legend(['a', 'b','c'],  loc="upper left")
plt.xlabel("Severity")
plt.ylabel("Worktime [s]")

In [None]:
#Set accept/reject for amt answers and export

#Reject everything with wrong codes


#Reject obviously wrong data
#TODO: how will we define this? Just ignore it? Recognise correct statements

#Export file


## Check dictionary quality

In [None]:
#Compare all dictionaries: learn the capital relation from tokyo/japan and
#check how well it transfers to paris/france in each embedding
dicts = {"gn": gn, "gw1": gw1, "gw3": gw3}

for i, dic in dicts.items():
    #Minus means arrow from R to L
    isCapitalOf = dic["japan"] - dic["tokyo"]
    hasCapital = dic["tokyo"] - dic["japan"]

    #Calculate results of relation
    est_paris = dic["france"] + hasCapital
    est_france = dic["paris"] + isCapitalOf

    print(f"Dictionary: {i}")
    print(f"est_France to France: {dic.cosine_similarities(est_france, [dic['france']])}")
    print(f"est_Paris to Paris: {dic.cosine_similarities(est_paris, [dic['paris']])}")
    print(f"est_France to Paris: {dic.cosine_similarities(est_france, [dic['paris']])}")
    print(f"est_Paris to France: {dic.cosine_similarities(est_paris, [dic['france']])}")
    print(f"Paris to France: {dic.cosine_similarities(dic['paris'], [dic['france']])}")
    print(f"\n")


## Inversion test

In [None]:
#Similarity of 'white' and 'black', then the words closest to the negated 'white' vector
similarity = wv.cosine_similarities(wv['white'], [wv['black']])[0]
print(f"Similarity: {similarity}\n")

print("Inverted node 'white':")
for word, score in wv.similar_by_vector(vector = -1*wv['white']):
    print(f"Distance of '{word}' to !white: {score}")

## Currently unused

# Comments

In [None]:
#TODO: for all appends: 
#https://stackoverflow.com/questions/50501787/python-pandas-user-warning-sorting-because-non-concatenation-axis-is-not-aligne

#TODO: random seeds

#TODO: check for TODOs

In [None]:
#Returns percent of ambiguity in the graph
def calculateAmbiguity(fullVectors, nodeVectors, relationVectors):
    """Return the ambiguity of the graph in 0..1, where 1 is the most ambiguous.

    Two scores in -1..1 (1 = least ambiguous) are combined:

    * relations: the average of ``relationVectors['mean_dist']``, each
      relation weighted by ``total - lost - zero_vector``;
    * nodes: the average of ``nodeVectors['est_dist']``, each node weighted
      by ``total``.

    The relation score enters the final mix with weight 2 and the node score
    with weight 1. ``fullVectors`` is accepted but not read.
    """
    #####for relations#####
    #mean_dist is expected to hold, per relation, the mean similarity of its
    #individual relation vectors to the relation's own vector
    relWeights = relationVectors['total'] - relationVectors['lost'] - relationVectors['zero_vector']
    relScore = np.average(relationVectors['mean_dist'], weights=relWeights)

    #####for nodes (improved)#####
    #est_dist is expected to hold, per node, the similarity between the node
    #and its estimate built from the connected nodes and their relations
    nodeScore = np.average(nodeVectors['est_dist'], weights=nodeVectors['total'])

    #Design notes:
    #bad quality of all nodes or of all relations
    #-> all relation vectors will have high average dist to mean
    #bad quality of some nodes
    #-> node_estimate slightly wrong but better if many nodes
    #bad quality of some relations
    #-> node_estimate very slightly wrong but better if many nodes and high relation dist to mean
    #perfect quality
    #-> node_estimate = node and ambiguity = 1
    #strongly connected nodes influence the outcome more (they are included in more node_estimates)
    #node and relation are included in each others calculations equally (=once), only their means are used

    ##final composition & transformation
    combined = np.average([relScore, nodeScore], weights=[2, 1])

    #transform -1..1 where 1 is the least ambiguous to 0..1 where 1 is the most ambiguous
    return 1-((1+combined)/2)

In [None]:
#Small fixture: the first 'tokyo' row deliberately carries the vector of 'berlin'
data = [
    ['tokyo', wv['berlin'], 1],
    ['tokyo', wv['tokyo'], 2],
    ['vienna', wv['vienna'], 3],
]
df2 = pd.DataFrame(data, columns=['node', 'vec', 'num'])

df2.head()

In [None]:
#Same fixture, used to try assigning two new columns from one apply call
data = [
    ['tokyo', wv['berlin'], 1],
    ['tokyo', wv['tokyo'], 2],
    ['vienna', wv['vienna'], 3],
]
df2 = pd.DataFrame(data, columns=['node', 'vec', 'num'])


#vec2 comes from toVector(node), vec3 is always None
df2[['vec2', 'vec3']] = df2.apply(lambda row: pd.Series([toVector(row['node']), None]), axis=1)

df2.head()

In [None]:
#Test mean: group-wise mean over the whole frame first, then the 'vec'
#column is replaced by the mean computed on that column alone
df2out = df2.groupby('node').apply(np.mean).reset_index()
df2out['vec'] = df2.groupby('node')['vec'].apply(np.mean).reset_index()['vec']
df2out.head()

In [None]:
#NaN vs None problems: compare how Python, NumPy and pandas treat NaN and None
#NOTE: the line labelled "Pandas isnan(..)" actually reports pd.isnull for NaN and for None
print(f"Python: {float('NaN') is None}")
print(f"Numpy equal(..): {np.equal(float('NaN'), None)}")
print(f"Numpy isnan(..): {np.isnan(float('NaN'))}")
print(f"Pandas isnan(..): {pd.isnull(float('NaN'))}, {pd.isnull(None)}") #Replace python checks with this

In [None]:
#NaN vs None problems: compare how Python, NumPy and pandas treat NaN and None
#NOTE: the line labelled "Pandas isnan(..)" actually reports pd.isnull for NaN and for None
print(f"Python: {float('NaN') is None}")
print(f"Numpy equal(..): {np.equal(float('NaN'), None)}")
print(f"Numpy isnan(..): {np.isnan(float('NaN'))}")
print(f"Pandas isnan(..): {pd.isnull(float('NaN'))}, {pd.isnull(None)}") #Replace python checks with this

In [None]:
#Random changes for testing
#Structure: target -> method -> list of instances; 'amount' gives the number of elements ('num')
#Every config cell rebinds the same name, so the cell executed last wins
config = {
    'nodes': {
        'random': [{'amount': {'num': 5}}]
    },
    'relations': {
        'random': [{'amount': {'num': 2}}]
    }
}

In [None]:
#Complete negative for testing
#'perc': 1 presumably selects all elements - TODO confirm against the ambiguify implementation
#Every config cell rebinds the same name, so the cell executed last wins
config = {
    'nodes': {
        'negative': [{'amount': {'perc': 1}}]
    },
    'relations': {
        'negative': [{'amount': {'perc': 1}}]
    }
}

In [None]:
#Closeness for testing
#'param' is handed to the method function as its configuration
#Every config cell rebinds the same name, so the cell executed last wins
config = {
    'nodes': {
        'closeness': [{'amount': {'num': 5},
                 'param': {'closeness': 0.2}}]
    },
    'relations': {
        'closeness': [{'amount': {'num': 5},
                 'param': {'closeness': 0.2}}]
    }
}

In [None]:
#Run imports and define functions, configure the desired result

#Load graph and dictionary
#Order matters: relation vectors are derived from the triple vectors, the node
#estimates need the relation vectors, and the node vectors are built last
convertedGraph = convertGraph(g)
fullVectors = vectorifyGraph(convertedGraph)
relationVectors, lostRelations = generateRelationVectors(fullVectors)
fullVectors = calculateNodeEstimates(fullVectors, relationVectors)
nodeVectors, lostNodes = generateNodeVectors(fullVectors)

#Check outputs before continuing

#Calculate ambiguity before
ambiguityBefore = calculateAmbiguity(fullVectors, nodeVectors, relationVectors)
logger.info(f"Ambiguity before: {ambiguityBefore}")

In [None]:
#The ambiguify-function returns vectors according to configured methods
#Reseed first so the result is reproducible
np.random.seed(0)
res = ambiguify(config, nodeVectors, relationVectors)

In [None]:
#Preview the first ten generated changes
res.head(10)

In [None]:
#Insert new node into graph based on one random triple containing the source
g2 = Graph()
np.random.seed(0)
additions = populateAdditions(res, g2)

#Calculate ambiguity after:
newFullVectors = fullVectors.copy()

if len(additions) > 0:
    logger.info(f"Adding {len(additions)} additional triples")
    vectorisedAdditions = vectorifyGraph(additions)
    #DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    newFullVectors = pd.concat([newFullVectors, vectorisedAdditions], ignore_index=True)

#Rebuild every derived table from the extended triple vectors
newRelationVectors, newLostRelations = generateRelationVectors(newFullVectors)
newFullVectors = calculateNodeEstimates(newFullVectors, newRelationVectors)
newNodeVectors, newLostNodes = generateNodeVectors(newFullVectors)

#Calculate ambiguity after
ambiguityAfter = calculateAmbiguity(newFullVectors, newNodeVectors, newRelationVectors)
logger.info(f"Ambiguity after: {ambiguityAfter}")

logger.info(f"Ambiguity difference: {ambiguityAfter-ambiguityBefore}")



#Save additions from nodes and relations
#rdflib writes the file itself when given a destination. This works whether
#serialize() returns bytes (rdflib 5) or str (rdflib 6+), and no file handle
#is left open if serialisation fails.
logger.info(f"Saving files")
g2.serialize(destination="additions.ttl", format='turtle')

#Save graph with additions
g3 = g+g2
g3.serialize(destination="appendedKG.ttl", format='turtle')
logger.info(f"Done")

# Comments

In [None]:
#TODO: for all appends: 
#https://stackoverflow.com/questions/50501787/python-pandas-user-warning-sorting-because-non-concatenation-axis-is-not-aligne

#TODO: random seeds

#TODO: check for TODOs

In [None]:
#Returns percent of ambiguity in the graph
def calculateAmbiguity(fullVectors, nodeVectors, relationVectors):
    """Return the ambiguity of the graph in 0..1, where 1 is the most ambiguous.

    Two scores in -1..1 (1 = least ambiguous) are combined:

    * relations: the average of ``relationVectors['mean_dist']``, each
      relation weighted by ``total - lost - zero_vector``;
    * nodes: the average of ``nodeVectors['est_dist']``, each node weighted
      by ``total``.

    The relation score enters the final mix with weight 2 and the node score
    with weight 1. ``fullVectors`` is accepted but not read.
    """
    #####for relations#####
    #mean_dist is expected to hold, per relation, the mean similarity of its
    #individual relation vectors to the relation's own vector
    relWeights = relationVectors['total'] - relationVectors['lost'] - relationVectors['zero_vector']
    relScore = np.average(relationVectors['mean_dist'], weights=relWeights)

    #####for nodes (improved)#####
    #est_dist is expected to hold, per node, the similarity between the node
    #and its estimate built from the connected nodes and their relations
    nodeScore = np.average(nodeVectors['est_dist'], weights=nodeVectors['total'])

    #Design notes:
    #bad quality of all nodes or of all relations
    #-> all relation vectors will have high average dist to mean
    #bad quality of some nodes
    #-> node_estimate slightly wrong but better if many nodes
    #bad quality of some relations
    #-> node_estimate very slightly wrong but better if many nodes and high relation dist to mean
    #perfect quality
    #-> node_estimate = node and ambiguity = 1
    #strongly connected nodes influence the outcome more (they are included in more node_estimates)
    #node and relation are included in each others calculations equally (=once), only their means are used

    ##final composition & transformation
    combined = np.average([relScore, nodeScore], weights=[2, 1])

    #transform -1..1 where 1 is the least ambiguous to 0..1 where 1 is the most ambiguous
    return 1-((1+combined)/2)

In [None]:
#Small fixture: the first 'tokyo' row deliberately carries the vector of 'berlin'
data = [
    ['tokyo', wv['berlin'], 1],
    ['tokyo', wv['tokyo'], 2],
    ['vienna', wv['vienna'], 3],
]
df2 = pd.DataFrame(data, columns=['node', 'vec', 'num'])

df2.head()

In [None]:
#Same fixture, used to try assigning two new columns from one apply call
data = [
    ['tokyo', wv['berlin'], 1],
    ['tokyo', wv['tokyo'], 2],
    ['vienna', wv['vienna'], 3],
]
df2 = pd.DataFrame(data, columns=['node', 'vec', 'num'])


#vec2 comes from toVector(node), vec3 is always None
df2[['vec2', 'vec3']] = df2.apply(lambda row: pd.Series([toVector(row['node']), None]), axis=1)

df2.head()

In [None]:
#Test mean: group-wise mean over the whole frame first, then the 'vec'
#column is replaced by the mean computed on that column alone
df2out = df2.groupby('node').apply(np.mean).reset_index()
df2out['vec'] = df2.groupby('node')['vec'].apply(np.mean).reset_index()['vec']
df2out.head()

In [None]:
#NaN vs None problems: compare how Python, NumPy and pandas treat NaN and None
#NOTE: the line labelled "Pandas isnan(..)" actually reports pd.isnull for NaN and for None
print(f"Python: {float('NaN') is None}")
print(f"Numpy equal(..): {np.equal(float('NaN'), None)}")
print(f"Numpy isnan(..): {np.isnan(float('NaN'))}")
print(f"Pandas isnan(..): {pd.isnull(float('NaN'))}, {pd.isnull(None)}") #Replace python checks with this