## Imports & logging

In [None]:
import gensim
import gensim.downloader as api
import logging
from rdflib import Graph, Namespace, Literal
from rdflib.plugins.parsers import trig
import tempfile
import numpy as np
import pandas as pd

from rdflib import URIRef
from rdflib.namespace import RDF
from rdflib.namespace import RDFS
from rdflib.namespace import SKOS
from tqdm import tqdm
from TqdmToLogger import TqdmToLogger
import matplotlib
import matplotlib.pyplot as plt
import json

# Log every record with timestamp and level; INFO is the initial verbosity.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# getLogger() without a name returns the root logger.
logger = logging.getLogger()

# Register tqdm with pandas so that DataFrames offer `progress_apply`.
tqdm.pandas()
# Passed to tqdm as `file=` further down; presumably a file-like adapter that
# forwards progress output to `logger` (TqdmToLogger is a local module - TODO confirm).
tqdm_out = TqdmToLogger(logger, level=logging.INFO)

# Seed NumPy's global random generator.
np.random.seed(0)

In [None]:
logger.setLevel(logging.INFO)
#logger.setLevel(logging.DEBUG)

In [None]:
# Show all columns and rows when a DataFrame is displayed.
# The fully qualified option names are used on purpose: pandas resolves option
# names as patterns, and the bare 'max_columns' / 'max_rows' also match the
# 'styler.render.*' options of newer pandas, which raises an OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
#pd.reset_option('display.max_columns')
#pd.set_option('display.max_colwidth', None)

## Load dictionary

In [None]:
gn = api.load('word2vec-google-news-300')

In [None]:
gw1 = api.load("glove-wiki-gigaword-100")

In [None]:
wv = gw1

In [None]:
gw3 = api.load("glove-wiki-gigaword-300")

## Load graph

### Own KG

In [None]:
tg = Graph()
tg.parse("KGDemo.ttl", format="ttl")
logger.info(f"Loaded graph with {len(tg)} triples")

In [None]:
g = tg

### sparql

In [None]:
sg = Graph()
sg.parse("sparql", format="ttl")
logger.info(f"Loaded graph with {len(sg)} triples")

In [None]:
g = sg

## Helper functions

In [None]:
def getPreferredTitle(n, lang="en"):
    """Return a lower-cased title for the node or relation `n`.

    The preferred label of `n` in the global graph `g` (for language `lang`)
    is used when there is one. Otherwise the title is derived from `n` itself:
    the part after the last '/', with underscores turned into spaces and
    commas removed.
    """
    labels = g.preferredLabel(n, lang=lang)

    if labels != []:
        # Entries are indexed as (property, label); use the first label found
        return labels[0][1].value.lower()

    #TODO: replace "()"?
    localName = n.rsplit('/', 1)[-1]
    return localName.replace('_', ' ').replace(',', '').lower()

#Returns [vec, isMultipart, multipart-matched-%]
def toVector(n):
    """Look up the embedding of the title of `n` in the global model `wv`.

    Returns a three-element list:
      * vec: the vector, or None when no word of the title is in the model
      * isMultipart: True when the title consists of more than one word
      * matched: for multipart titles the fraction of words found in the
        model (0 when none was found); False for single-word titles

    The title is split on runs of whitespace, so repeated, leading or trailing
    spaces no longer produce empty "words" that count as misses and lower the
    matched fraction (or turn a single word into a multipart title).
    """
    title = getPreferredTitle(n)
    words = title.split()

    #In case of multiple words in title use mean of individual vectors
    if len(words) > 1:
        subvecs = []
        for word in words:
            try:
                subvecs.append(wv[word])
            except KeyError:
                # Word is not part of the model vocabulary; it only lowers the matched fraction
                pass

        hit = len(subvecs)
        if hit > 0:
            return [sum(subvecs)/hit, True, hit/len(words)]
        return [None, True, 0]

    # Single word: fall back to the raw title when it held no word at all
    key = words[0] if words else title
    try:
        return [wv[key], False, False]
    except KeyError:
        return [None, False, False]

#Methods to ambiguify nodes and relations
def select(inp, obj=None):
    """Draw a random sample (with replacement) of rows from `inp`.

    Parameters:
        inp: DataFrame to sample from.
        obj: dict with exactly one entry. The key 'perc' samples that
            fraction of the rows; any other key (by convention 'num')
            samples that number of rows.

    Returns:
        The sampled DataFrame.

    Raises:
        TypeError: if `obj` is missing or does not hold exactly one entry.
    """
    # The former default ({'perc': None, 'num': None}) was a shared mutable
    # dict that always failed the length check; None keeps the same outcome.
    if obj is None or len(obj) != 1:
        raise TypeError("Please give exactly one of percentage or number")

    (kind, amount), = obj.items()
    if kind == 'perc':
        return inp.sample(frac=amount, replace=True)
    return inp.sample(n=amount, replace=True)

def ambiguify(config, nodeVectors, relationVectors):
    """Create replacement candidates for nodes and relations as configured.

    Parameters:
        config: nested dict {target: {method: [instance, ...]}} where target
            is "nodes" or "relations", method is a key of the global
            `methods` table and every instance holds an 'amount' dict (see
            `select`) and optionally a 'param' dict handed to the method.
        nodeVectors: DataFrame sampled from when the target is "nodes".
        relationVectors: DataFrame sampled from for every other target.

    Returns:
        DataFrame with the columns 'method', 'config', 'source_type',
        'source' and 'target', one row per sampled element. Empty when the
        configuration yields nothing.
    """
    # Collect the partial results and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    frames = []
    for target in config: #can be "nodes" or "relations"
        inp = nodeVectors if target == 'nodes' else relationVectors
        sourceColumn = target[0:-1] #"nodes" -> "node", "relations" -> "relation"

        for method in config[target]: #matches the name of the method
            for instance in config[target][method]: #once for every instance of the method config
                for _ in instance['amount']: #the amount of elements to be changed
                    logger.info(f"Ambiguifying {target} with {method} (parameters: {instance})")

                    selres = select(inp, instance['amount'])
                    conf = instance.get('param', None)
                    rep = pd.DataFrame()

                    rep[['method', 'config', 'source_type', 'source', 'target']] = selres.progress_apply(lambda sel: pd.Series([
                        method,
                        str(instance),
                        sourceColumn,
                        sel[sourceColumn],
                        methods[target][method](sel.copy(), inp.copy(), conf)[sourceColumn].iloc[0]
                    ]), axis=1)

                    frames.append(rep)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

#Modify triple and save as new
def modTriple(row, g2, useObject = False, retry=False):
    """Copy one random triple that contains row['source'], swapping in row['target'].

    The triple is taken from the global `fullVectors`. For relations the
    predicate is replaced; for nodes the object (useObject=True) or the
    subject. When no triple matches, the lookup is repeated once with
    `useObject` flipped before giving up.

    Returns a 6-tuple (s, p, o, s_orig, p_orig, o_orig) and adds (s, p, o) to
    `g2`; returns None (after logging an error) when no triple was found.
    """
    isRelation = row['source_type'] == 'relation'

    if isRelation:
        logger.debug("Replacing r")
        column = 'p'
    elif useObject:
        logger.debug("Replacing o")
        column = 'o'
    else:
        logger.debug("Replacing s")
        column = 's'

    candidates = fullVectors[fullVectors[column] == row['source']]

    if len(candidates) == 0:
        if retry:
            logger.error("Could not find original triple!")
            return None
        return modTriple(row, g2, useObject= not useObject, retry=True)

    picked = candidates.sample(n=1)
    original = (picked['s'].iloc[0], picked['p'].iloc[0], picked['o'].iloc[0])
    s, p, o = original

    #add row to graph with changed content
    if isRelation:
        changed = (s, row['target'], o)
    elif useObject:
        changed = (s, p, row['target'])
    else:
        changed = (row['target'], p, o)

    g2.add(changed)
    logger.debug(f"{changed}")
    return changed + original
        
def populateAdditions(res, g2):
    """Build one modified triple per row of `res` and add it to graph `g2`.

    For every row a coin flip (np.random.random() >= 0.5) decides whether the
    object or the subject is tried first (see `modTriple`).

    Side effect: the columns 's_orig', 'p_orig' and 'o_orig' are written into
    `res`. Returns a DataFrame with the new triples in columns 's', 'p', 'o'.
    """
    logger.info(f"Populatig graph")

    originalColumns = ['s_orig', 'p_orig', 'o_orig']

    def buildTriple(row):
        return pd.Series(modTriple(row, g2, useObject=(np.random.random() >= 0.5)))

    out = pd.DataFrame()
    out[['s', 'p', 'o'] + originalColumns] = res.progress_apply(buildTriple, axis=1)

    # Keep the untouched triple next to the change that was derived from it
    res[originalColumns] = out[originalColumns]

    return out.drop(columns=originalColumns)

#Returns percent of ambiguity in the graph
def calculateAmbiguity(fullVectors, nodeVectors, relationVectors):
    """Condense the node and relation scores into one ambiguity value in 0..1.

    Parameters:
        fullVectors: not used by the calculation.
        nodeVectors: needs the columns 'est_dist' and 'total'.
        relationVectors: needs 'mean_dist', 'total', 'lost' and 'zero_vector'.

    Returns:
        1 - (1 + score) / 2, where score is the weighted mean (weights 2 and
        1) of the relation score and the node score. A score of 1 maps to 0
        (least ambiguous), a score of -1 maps to 1 (most ambiguous).
    """
    #for relations: weight each relation by the triples that produced a usable vector
    usableTriples = relationVectors['total'] - relationVectors['lost'] - relationVectors['zero_vector']
    relationScore = np.average(relationVectors['mean_dist'], weights=usableTriples)

    #for nodes: weight each node by its number of occurrences
    nodeScore = np.average(nodeVectors['est_dist'], weights=nodeVectors['total'])

    #the relation score enters with weight 2, the node score with weight 1
    combined = np.average([relationScore, nodeScore], weights=[2, 1])

    #transform -1..1 where 1 is the least ambiguous to 0..1 where 1 is the most ambiguous
    return 1 - (1 + combined) / 2

#Calculates the distance-severity of ambiguities
def getDistSeverity(sourceType, source, target):
    """Return the rank of `target` among all entries ordered by similarity to `source`.

    Parameters:
        sourceType: 'node' selects the global `nodeVectors`, anything else
            `relationVectors`; it is also the name of the identifier column.
        source: identifier of the original element.
        target: identifier of the element that replaced it.

    Returns:
        Zero-based position of `target` when all entries are sorted by
        descending cosine similarity to `source`, or None (with a warning)
        when `target` is not among the entries. The sibling
        `getClosenessSeverity` treats a missing target the same way.

    Raises:
        IndexError: if `source` is not among the entries.
    """
    if sourceType == 'node':
        inp = nodeVectors.copy()
    else:
        inp = relationVectors.copy()

    sourceEntry = inp[inp[sourceType] == URIRef(source)]
    # The source vector does not change per row, so extract it only once
    sourceVec = list(sourceEntry['vec'])[0]

    inp['dist'] = [wv.cosine_similarities(sourceVec, [list(vec)])[0] for vec in inp['vec']]
    # After the reset the index label of a row equals its rank
    inp = inp.sort_values(by=['dist'], ascending=False).reset_index(drop=True)

    targetEntry = inp[inp[sourceType] == URIRef(target)]

    if len(targetEntry) == 0:
        logger.warning("Could not find target entry, skipping with severity value None")
        return None

    return targetEntry.index[0]

#Calculates the closeness-severity of ambiguities
def getClosenessSeverity(sourceType, source, target):
    """Return the absolute cosine similarity between `source` and `target`.

    `sourceType` 'node' looks both up in the global `nodeVectors`, any other
    value in `relationVectors`; it is also the name of the identifier column.
    Returns None (with a warning) when `target` is not among the entries.
    """
    entries = (nodeVectors if sourceType == 'node' else relationVectors).copy()

    sourceEntry = entries[entries[sourceType] == URIRef(source)]

    def similarityToSource(row):
        return wv.cosine_similarities(list(sourceEntry['vec'])[0], [list(row['vec'])])[0]

    entries['dist'] = entries.apply(similarityToSource, axis = 1)
    entries = entries.sort_values(by=['dist'], ascending=False).reset_index(drop=True)

    targetEntry = entries[entries[sourceType] == URIRef(target)]

    if len(targetEntry) != 0:
        return np.absolute([targetEntry['dist'].iloc[0]])[0]

    logger.warning("Defect entry due to library bug, skipping with closeness value None")
    return None

## Methods

In [None]:
##### Setup methods
def rand(inp, source, conf):
    """Pick one random row of `source`; `inp` and `conf` are ignored."""
    picked = source.sample(n=1)
    return picked

#Find result with specific distance
def dist(inp, source, conf):
    """Return the entry of `source` at a given similarity rank relative to `inp`.

    Parameters:
        inp: one row (Series) with a 'vec' entry; its first entry is taken as
            its identifier.
        source: DataFrame of candidates with a 'vec' column and the
            identifier in its first column. It is not modified.
        conf: None or a dict; conf['dist'] is the wanted rank (default 1).
            The rank is clamped to 0..len(source)-1.

    Returns:
        One-row DataFrame with the columns of `source`. Candidates are ordered
        by descending cosine similarity to `inp`; on equal similarity the
        input itself comes first, so rank 0 is normally the input.
    """
    rank = 1 if conf is None else conf.get('dist', 1)
    rank = min(max(rank, 0), len(source.index)-1)

    inpVec = list(inp['vec'])
    # Explicit positional access; plain inp[0] depends on a deprecated pandas fallback
    inpKey = inp.iloc[0]

    # Work on a copy so the helper columns never leak into the caller's frame
    ranked = source.copy()
    keys = list(ranked.iloc[:, 0])

    #use pandas to get top-n, if dist is same move inp to the top
    ranked['dist'] = [wv.cosine_similarities(inpVec, [list(vec)])[0] for vec in ranked['vec']]
    ranked['isInp'] = [inpKey == key for key in keys]
    ranked = ranked.sort_values(by=['dist', 'isInp'], ascending=False)

    logger.debug(f"source:\n{ranked}")

    logger.debug(f"dist: {rank}")
    logger.debug(f"choice:\n{ranked.iloc[[rank]]}")

    return ranked.iloc[[rank]].drop(['dist', 'isInp'], axis=1)

#Find result with specific closeness
def closeness(inp, source, conf):
    """Return the entry of `source` whose similarity to `inp` best fits a target.

    Parameters:
        inp: one row (Series) with a 'vec' entry.
        source: DataFrame of candidates with a 'vec' column. It is not
            modified.
        conf: None or a dict; conf['closeness'] is clamped to 0..2
            (default 1). 0 asks for a cosine similarity of 1 (equal to the
            input), 2 for a similarity of -1 (its inverse).

    Returns:
        One-row DataFrame with the columns of `source`.
    """
    wanted = 1 if conf is None else conf.get('closeness', 1)
    wanted = min(max(wanted, 0), 2) #0 is equal to the input, 2 is its inverse

    inpVec = list(inp['vec'])

    # A fresh Series has a 0..n-1 index, so idxmin() yields a position. The
    # former code took the label from the index of `source` and used it with
    # iloc, which only works while that index happens to be 0..n-1.
    sims = pd.Series([wv.cosine_similarities(inpVec, [list(vec)])[0] for vec in source['vec']])
    position = sims.add(wanted-1).abs().idxmin()

    return source.iloc[[position]]

#Find result closest to inverse input vector
def negative(inp, source, conf):
    """Delegate to `closeness` with the target fixed at 2; `conf` is ignored."""
    inverseConf = {'closeness': 2}
    return closeness(inp, source, inverseConf)

# Dispatch table used as methods[target][method]; nodes and relations offer
# the same four methods, each target getting its own dict.
methods = {
    target: {
        'random': rand,
        'dist': dist,
        'closeness': closeness,
        'negative': negative
    }
    for target in ('nodes', 'relations')
}

## Vectorisation Functions

In [None]:
def convertGraph(g):
    """Turn the triples of graph `g` into a DataFrame with columns 's', 'p', 'o'.

    Progress is reported through tqdm using the global `tqdm_out`.
    """
    logger.info(f"Converting graph")

    # len() of the graph is its triple count; no separate counting pass is needed
    length = len(g)

    triples = tqdm(g.triples((None, None, None)), total=length, file=tqdm_out, mininterval=2)
    fullVectors = [[s, p, o] for s, p, o in triples]

    return pd.DataFrame(data=fullVectors, columns=['s', 'p', 'o'])

def vectorifyGraph(fullVectors):
    """Add subject, object and relation vectors to the triples in `fullVectors`.

    Adds the columns 's_vec', 's_is_multipart', 's_multipart_%' (and the same
    for 'o_') from `toVector`, plus 'r_vec' (object vector minus subject
    vector, None when either is missing) and 'is_zero_vector_relation'.
    The frame is modified in place and also returned.
    """
    logger.info(f"Vecotrifying graph")

    def nodeColumns(column):
        return fullVectors.progress_apply(lambda row: pd.Series(toVector(row[column])), axis=1)

    def relationColumns(row):
        subjectVec, objectVec = row['s_vec'], row['o_vec']
        if objectVec is None or subjectVec is None:
            return pd.Series([None, False])
        difference = objectVec - subjectVec
        return pd.Series([difference, np.array_equal(difference, [0]*len(objectVec))])

    logger.info('Subject vectors')
    fullVectors[['s_vec', 's_is_multipart', 's_multipart_%']] = nodeColumns('s')

    logger.info('Object vectors')
    fullVectors[['o_vec', 'o_is_multipart', 'o_multipart_%']] = nodeColumns('o')

    logger.info('Relation vectors')
    fullVectors[['r_vec', 'is_zero_vector_relation']] = fullVectors.progress_apply(relationColumns, axis=1)

    return fullVectors

def generateRelationVectors(fullVectors):
    """Aggregate the per-triple relation vectors into one entry per relation.

    Parameters:
        fullVectors: triples with the columns 'p', 'r_vec' and
            'is_zero_vector_relation'.

    Returns:
        (relationVectors, lostRelations). relationVectors has the columns
        'relation', 'vec', 'total', 'lost', 'zero_vector', 'quality',
        'min_dist', 'max_dist' and 'mean_dist'. lostRelations holds the
        relations without a mean vector or without any usable triple, with
        the columns 'relation', 'total', 'lost' and 'zero_vector'.

    The *_dist columns hold the values returned by wv.cosine_similarities
    (global model `wv`).
    """
    logger.info(f"Generating relationVectors")
    
    # One row per relation; np.mean is applied to each group's 'r_vec' column
    logger.debug(f"Grouping")
    relationVectors = fullVectors.groupby('p')['r_vec'].apply(np.mean).reset_index().rename(columns={'p': 'relation', 'r_vec': 'vec'})

    # total: triples per relation; lost: triples whose r_vec is null;
    # zero_vector: triples flagged as zero vector; quality: share of the remaining triples
    logger.debug(f"Calculating total, lost, zero_vector and quality")
    relationVectors['total'] = fullVectors.groupby('p')['p'].count().reset_index(drop=True)
    relationVectors['lost'] = fullVectors.groupby('p')['r_vec'].apply(lambda x: x.isnull().sum()).reset_index(drop=True)
    relationVectors['zero_vector'] = fullVectors.groupby('p')['is_zero_vector_relation'].sum().astype(int).reset_index(drop=True)
    relationVectors['quality'] = relationVectors.apply(lambda row: 1-(row['lost']+row['zero_vector'])/row['total'], axis=1)

    #TODO: labels are counted as lost nodes
    #Split into relationVectors and lostRelations
    logger.debug(f"Splitting into relationVectors and lostRelations")
    # Lost: no mean vector, or every triple is either lost or a zero vector
    lostRelations = relationVectors[np.bitwise_or(
        relationVectors['vec'].isnull(),
        relationVectors['total']-relationVectors['lost']-relationVectors['zero_vector'] == 0
    )].reset_index(drop=True).drop(columns=['vec', 'quality'])
    # Kept: exactly the complement of the condition above
    relationVectors = relationVectors[np.bitwise_and(
        relationVectors['vec'].notnull(),
        relationVectors['total']-relationVectors['lost']-relationVectors['zero_vector'] != 0
    )].reset_index(drop=True)
    
    #Min/max/average distance of every full vector of this relation type to mean vector
    logger.debug(f"Calculating min, max and average distances")
    def helper_dist(row):
        """Return [min, max, mean] similarity between the relation's mean vector and its usable triple vectors."""
        vectors = fullVectors[fullVectors['p'] == row['relation']]

        #filter out None and zero-vector
        vectors = vectors[vectors['is_zero_vector_relation'] == False]
        vectors = vectors['r_vec'].dropna().reset_index(drop=True)

        sims = wv.cosine_similarities(row['vec'], list(vectors))
        return [np.min(sims), np.max(sims), np.mean(sims)]

    relationVectors[['min_dist', 'max_dist', 'mean_dist']] = relationVectors.apply(lambda row: pd.Series(helper_dist(row)), axis=1)
    logger.info("Done")
    
    return relationVectors, lostRelations

def calculateNodeEstimates(fullVectors, relationVectors):
    """Estimate subject and object vectors of every triple from its relation vector.

    Parameters:
        fullVectors: triples with the columns 'p', 's_vec' and 'o_vec'. The
            estimate columns are written into this frame.
        relationVectors: frame with the columns 'relation' and 'vec'.

    Returns:
        The triples with the added columns 's_est', 's_est_dist', 'o_est' and
        'o_est_dist'. An estimate is None when the other node's vector or the
        relation vector is missing; a distance is None when the estimate or
        the node's own vector is missing. The distance values are computed
        with wv.cosine_similarities (global model `wv`).
    """
    logger.info(f"Calculating node estimates")
    
    def helper(row, op, same, other):
        """Return Series [estimate, similarity] for one triple.

        `op` combines the vector in column `other` with the relation vector;
        the result is compared with the vector in column `same`.
        """
        #Select the relation vector if there is one
        relVecs = relationVectors[relationVectors['relation'] == row['p']]
        if len(relVecs) > 0:
            rVec = relVecs['vec'].iloc[0]
        else:
            rVec = None
        
        #Calculate the estimate
        if row[other] is not None and rVec is not None:
            est = op(row[other], rVec)
        else:
            est = None
        
        #Calculate the distance
        if est is not None and row[same] is not None:
            dist = wv.cosine_similarities(row[same], [est])[0]
        else:
            dist = None
        
        return pd.Series([est, dist], dtype='object')

    # Subject estimate: object vector minus relation vector
    logger.info(f"Subject estimates")
    fullVectors[['s_est', 's_est_dist']] = fullVectors.progress_apply(helper, args=[np.subtract, 's_vec', 'o_vec'], axis=1)
    
    # Object estimate: subject vector plus relation vector
    logger.info(f"Object estimates")
    fullVectors[['o_est', 'o_est_dist']] = fullVectors.progress_apply(helper, args=[np.add, 'o_vec', 's_vec'], axis=1)
    
    #Workaround for readability: pandas treats NaN and None alike, so store None for every missing value
    fullVectors = fullVectors.astype({'s_est': 'object', 's_est_dist': 'object', 'o_est': 'object', 'o_est_dist': 'object'})
    fullVectors.loc[fullVectors['s_est'].isna(), 's_est'] = None
    fullVectors.loc[fullVectors['s_est_dist'].isna(), 's_est_dist'] = None
    fullVectors.loc[fullVectors['o_est'].isna(), 'o_est'] = None
    fullVectors.loc[fullVectors['o_est_dist'].isna(), 'o_est_dist'] = None
    
    return fullVectors

def generateNodeVectors(fullVectors):
    """Collapse the subject and object columns of the triples into one entry per node.

    Parameters:
        fullVectors: triples with the columns 's', 's_vec', 's_is_multipart',
            's_multipart_%', 's_est', 's_est_dist' and the same set with the
            prefix 'o'.

    Returns:
        (nodeVectors, lostNodes). nodeVectors has one row per node with the
        columns 'node', 'vec', 'is_multipart', 'multipart_%', 'est',
        'est_dist', 'total', 'mean_est_dist', 'min_est_dist' and
        'max_est_dist'; rows containing any missing value are dropped.
        lostNodes holds the nodes without a vector.

    The distance values are computed with wv.cosine_similarities (global
    model `wv`).
    """
    logger.info(f"Generating nodeVectors")

    def sideColumns(prefix):
        """Select the columns of one triple side ('s' or 'o') under their common names."""
        mapping = {
            prefix: 'node',
            f'{prefix}_vec': 'vec',
            f'{prefix}_is_multipart': 'is_multipart',
            f'{prefix}_multipart_%': 'multipart_%',
            f'{prefix}_est': 'est',
            f'{prefix}_est_dist': 'est_dist',
        }
        return fullVectors[list(mapping)].rename(columns=mapping)

    #Rename and merge
    logger.debug(f"Renaming and merging")
    nodeVectors = pd.concat([sideColumns('s'), sideColumns('o')], ignore_index=True)

    #Remove duplicates
    logger.debug(f"Grouping")
    # sort=False keeps the groups in order of first appearance. head(1) below
    # returns its rows in that same order, so the per-group results that are
    # attached by position (total, est, distance statistics) end up on the
    # node they belong to. With the default sorted groups they did not.
    nodeGroup = nodeVectors.groupby('node', sort=False)

    logger.debug(f"Using first for vector")
    #nodeVectors = nodeGroup.first().reset_index() #this is really slow
    nodeVectors = nodeGroup.head(1).reset_index(drop=True)

    logger.debug(f"Calculating totals")
    nodeVectors['total'] = nodeGroup.size().reset_index(drop=True)

    logger.debug(f"Calculating estimates")
    nodeVectors['est'] = nodeGroup['est'].apply(np.mean).reset_index(drop=True)

    #Workaround: pandas treats NaN and None alike, so store None for every missing value
    nodeVectors = nodeVectors.astype({'est': 'object', 'vec': 'object'})
    nodeVectors.loc[nodeVectors['est'].isna(), 'est'] = None
    nodeVectors.loc[nodeVectors['vec'].isna(), 'vec'] = None

    logger.debug(f"Calculating estimate distances")
    nodeVectors['est_dist'] = nodeVectors.apply(lambda row: wv.cosine_similarities(list(row['est']), [list(row['vec'])])[0] if row['est'] is not None and row['vec'] is not None else None, axis=1)

    # These statistics use the per-triple distances of the merged frame, not the value above
    logger.debug(f"Calculating mean/min/max of distances")
    nodeVectors['mean_est_dist'] = nodeGroup['est_dist'].apply(np.mean).reset_index(drop=True)
    nodeVectors['min_est_dist'] = nodeGroup['est_dist'].apply(np.min).reset_index(drop=True)
    nodeVectors['max_est_dist'] = nodeGroup['est_dist'].apply(np.max).reset_index(drop=True)

    #Split into nodeVectors and lostNodes
    logger.debug(f"Splitting into nodeVectors and lostNodes")
    lostNodes = nodeVectors[nodeVectors['vec'].isnull()].reset_index(drop=True).drop(columns=['vec', 'est_dist', 'mean_est_dist', 'min_est_dist', 'max_est_dist'])

    # NOTE(review): dropna() removes rows with a missing value in any column,
    # not only in 'vec'; such rows are not part of lostNodes either.
    nodeVectors = nodeVectors.dropna().reset_index(drop=True)
    logger.info("Done")

    return nodeVectors, lostNodes

## Vector generation

In [None]:
convertedGraph = convertGraph(g)

In [None]:
fullVectors = vectorifyGraph(convertedGraph)

In [None]:
relationVectors, lostRelations = generateRelationVectors(fullVectors)

In [None]:
fullVectors = calculateNodeEstimates(fullVectors, relationVectors)

In [None]:
nodeVectors, lostNodes = generateNodeVectors(fullVectors)

## Save & load dataframes

## View calculated values

### Full vectors

In [None]:
print(f"Length: {len(fullVectors)}")
fullVectors.head(20)

In [None]:
#Show rows where the relation vector was lost
fullVectors[fullVectors['r_vec'].isnull()].head(2)

In [None]:
#Show rows where the relation vector is a zero-vector
fullVectors[fullVectors['is_zero_vector_relation'] == True].head(2)

### Node Vectors

In [None]:
print(f"Length: {len(nodeVectors)}")
nodeVectors.head(20)

In [None]:
print(f"Length: {len(lostNodes)}")
lostNodes.head()

### Relation Vectors

In [None]:
print(f"Length: {len(relationVectors)}")
relationVectors.head(37)

In [None]:
print(f"Length: {len(lostRelations)}")
lostRelations.head(3)

## Manual calculations for verification

### Full vectors

In [None]:
#Multipart
print(nodeVectors['is_multipart'].iloc[20])
print(nodeVectors['multipart_%'].iloc[20])

In [None]:
#Zero-Vector
print(fullVectors['is_zero_vector_relation'].iloc[20])

In [None]:
#Lost relation
print(fullVectors['r_vec'].iloc[18])

In [None]:
#Estimate
print(fullVectors['s_est'].iloc[18], fullVectors['s_est_dist'].iloc[18])

#print(fullVectors['s_est'].iloc[0])

#Calculate estimate (minus means arrow from R to L for relationVector calculation)
est = fullVectors['o_vec'].iloc[0]-relationVectors['vec'].iloc[2]
#print(est)

print(wv.cosine_similarities(fullVectors['s_est'].iloc[0], [est])[0])

print(fullVectors['s_est_dist'].iloc[0])
print(wv.cosine_similarities(fullVectors['s_vec'].iloc[0], [est])[0])

### Relations

In [None]:
#Lost relation
print(lostRelations['lost'].iloc[0])
print(lostRelations['zero_vector'].iloc[1])

In [None]:
#Quality
print(relationVectors['quality'].iloc[1])
print((relationVectors['total'].iloc[1]-relationVectors['lost'].iloc[1]-relationVectors['zero_vector'].iloc[1])/relationVectors['total'].iloc[1])

### Nodes

In [None]:
#Unlabeled
np.array_equal(fullVectors['o_vec'].iloc[32], wv['australia'])

In [None]:
#Lost node
lostNodes.iloc[0]

### Methods

In [None]:
#Node with the maximal distance to 'berlin'
nodeVectors.iloc[np.argmin(wv.cosine_similarities(wv['berlin'], list(nodeVectors['vec'])))]['node']

In [None]:
#Dist method

#Same
print(nodeVectors['node'].iloc[0], dist(nodeVectors.iloc[0].copy(), nodeVectors.copy(), {'dist': 0})['node'].iloc[0])

#Inverse
print(nodeVectors['node'].iloc[0], dist(nodeVectors.iloc[0].copy(), nodeVectors.copy(), {'dist': 100})['node'].iloc[0])

In [None]:
#Closeness method

#Same
print(nodeVectors['node'].iloc[0], closeness(nodeVectors.iloc[0].copy(), nodeVectors.copy(), {'closeness': 0})['node'].iloc[0])

#Inverse
print(nodeVectors['node'].iloc[0], closeness(nodeVectors.iloc[0].copy(), nodeVectors.copy(), {'closeness': 2})['node'].iloc[0])

### Ambiguity

In [None]:
#Ambiguity
print("0.1191733359168643")

#for relations
weights = relationVectors['total'] - relationVectors['lost'] - relationVectors['zero_vector']
relAmbig = np.average(relationVectors['mean_dist'], weights=weights)

print(np.array_equal(weights, [6, 5, 7]))
print(relAmbig)
print((relationVectors['mean_dist'].iloc[0]*6+relationVectors['mean_dist'].iloc[1]*5+relationVectors['mean_dist'].iloc[2]*7)/(6+5+7))

#for nodes
weights = nodeVectors['total']
nodeAmbig = np.average(nodeVectors['est_dist'], weights=weights)

#mean of ambiguities of nodes and vectors, weight by nodes 2:1 relation
totAmbig = np.average([relAmbig, nodeAmbig], weights=[2, 1])

#transform -1..1 where 1 is the least ambiguous to 0..1 where 1 is the most ambiguous
print(1-((1+totAmbig)/2))

print(1-((1+(-1))/2))
print(1-((1+(1))/2))

## Run workflow

### Evaluation config

In [None]:
#Simulate different amounts of ambiguity
# Nodes and relations use the same setup: one random instance and five
# 'dist' instances (dist 0 to 4), each changing 5 elements.
config = {
    target: {
        'random': [{'amount': {'num': 5}}],
        'dist': [
            {'amount': {'num': 5}, 'param': {'dist': rank}}
            for rank in range(5)
        ]
    }
    for target in ('nodes', 'relations')
}

### Workflow

In [None]:
#Run imports and define functions, configure the desired result

#Load graph and dictionary, you can skip this if you already ran it
# Triples of the global graph `g` as a DataFrame with columns s, p, o
convertedGraph = convertGraph(g)
# Adds subject, object and relation vectors to every triple
fullVectors = vectorifyGraph(convertedGraph)
# One mean vector per relation; relations without a usable vector are returned separately
relationVectors, lostRelations = generateRelationVectors(fullVectors)
# Adds estimated subject/object vectors and their similarity to the actual vectors
fullVectors = calculateNodeEstimates(fullVectors, relationVectors)
# One entry per node; nodes without a vector are returned separately
nodeVectors, lostNodes = generateNodeVectors(fullVectors)

#Check outputs before continuing

In [None]:
#The ambiguify-function returns vectors according to configured methods
np.random.seed(0)
changes = ambiguify(config, nodeVectors, relationVectors)

In [None]:
#Insert new node into graph based on one random triple containing the source
g2 = Graph()
np.random.seed(0)
additions = populateAdditions(changes, g2)

In [None]:
changes.head(100)

In [None]:
additions.head()

In [None]:
#Save additions from nodes and relations
logger.info(f"Saving files")

# Let rdflib write the file itself. serialize() without a destination returns
# bytes in older rdflib releases but str in newer ones, so writing its result
# into a handle opened with "wb" breaks there; passing `destination` avoids
# that and leaves no file handle open if serialising fails.
g2.serialize(destination="additions.ttl", format='turtle')

#Save graph with additions
g3 = g+g2
g3.serialize(destination="appendedKG.ttl", format='turtle')

logger.info(f"Done")

In [None]:
#Calculate ambiguity values
#Calculate ambiguity before
ambiguityBefore = calculateAmbiguity(fullVectors, nodeVectors, relationVectors)
logger.info(f"Ambiguity before: {ambiguityBefore}")

#Prepare data to calculate ambiguity after
newFullVectors = fullVectors.copy()

if len(additions) > 0:
    logger.info(f"Adding {len(additions)} additional triples")
    # vectorifyGraph adds its columns to `additions` in place
    vectorisedAdditions = vectorifyGraph(additions)
    # pd.concat instead of DataFrame.append, which was removed in pandas 2.0
    newFullVectors = pd.concat([newFullVectors, vectorisedAdditions], ignore_index=True)

# Recompute every derived table for the extended set of triples
newRelationVectors, newLostRelations = generateRelationVectors(newFullVectors)
newFullVectors = calculateNodeEstimates(newFullVectors, newRelationVectors)
newNodeVectors, newLostNodes = generateNodeVectors(newFullVectors)

#Calculate ambiguity after
ambiguityAfter = calculateAmbiguity(newFullVectors, newNodeVectors, newRelationVectors)
logger.info(f"Ambiguity after: {ambiguityAfter}")

logger.info(f"Ambiguity difference: {ambiguityAfter-ambiguityBefore}")

# Evaluation

## Export for AMT

In [None]:
#Generate different severities for same original triple

def ambiguify4amt(config, nodeVectors, relationVectors, sampleSize=5):
    """Variant of `ambiguify` that applies every method to the same sampled elements.

    Per target one sample is drawn (with replacement) and then run through
    every configured method instance, so the results of different methods and
    parameters refer to the same sources.

    Parameters:
        config: nested dict {target: {method: [instance, ...]}} as used by
            `ambiguify`. The 'amount' of an instance only controls how often
            the instance is run (once per key), not the sample size.
        nodeVectors: DataFrame sampled from when the target is "nodes".
        relationVectors: DataFrame sampled from for every other target.
        sampleSize: number of elements drawn per target; the default of 5 is
            the value used for the evaluation.

    Returns:
        DataFrame with the columns 'method', 'config', 'source_type',
        'source' and 'target'. Empty when the configuration yields nothing.
    """
    # Collect the partial results and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    frames = []
    for target in config: #can be "nodes" or "relations"
        inp = nodeVectors if target == 'nodes' else relationVectors
        sourceColumn = target[0:-1] #"nodes" -> "node", "relations" -> "relation"

        selres = inp.sample(n=sampleSize, replace=True)

        for method in config[target]: #matches the name of the method
            for instance in config[target][method]: #once for every instance of the method config

                logger.debug(f"Selres: {selres}")

                for _ in instance['amount']: #the amount of elements to be changed
                    logger.info(f"Ambiguifying {target} with {method} (parameters: {instance})")

                    conf = instance.get('param', None)
                    rep = pd.DataFrame()

                    rep[['method', 'config', 'source_type', 'source', 'target']] = selres.progress_apply(lambda sel: pd.Series([
                        method,
                        str(instance),
                        sourceColumn,
                        sel[sourceColumn],
                        methods[target][method](sel.copy(), inp.copy(), conf)[sourceColumn].iloc[0]
                    ]), axis=1)

                    frames.append(rep)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [None]:
np.random.seed(0)
changes = ambiguify4amt(config, nodeVectors, relationVectors)

g2 = Graph()
np.random.seed(0)
additions = populateAdditions(changes, g2)

In [None]:
#Change format
amt = pd.concat([changes, additions], axis=1)

In [None]:
#Compute names
def getPreferredTitle4amt(n, lang="en"):
    """Return a display title for `n`, keeping the original capitalisation.

    Uses the preferred label of `n` in the global graph `g` (language `lang`)
    when there is one; otherwise the part of `n` after the last '/', with
    underscores turned into spaces and commas removed.
    """
    labels = g.preferredLabel(n, lang=lang)

    if labels != []:
        # Entries are indexed as (property, label); use the first label found
        return labels[0][1].value

    localName = n.rsplit('/', 1)[-1]
    return localName.replace('_', ' ').replace(',', '')


# Add a '<column>_name' title column for the original and the changed triple
for column in ('s_orig', 'p_orig', 'o_orig', 's', 'p', 'o'):
    amt[column + '_name'] = amt[column].apply(getPreferredTitle4amt)

#Compute verification code
def ver4amt(row):
    """Build the verification code of one AMT task from its three names.

    One character is picked at random from each of row['s_name'],
    row['p_name'] and row['o_name'] (in this order, using NumPy's global
    random generator). The position is drawn below min(len(name), 5) - 1,
    i.e. from the first four characters at most.

    Returns:
        (positions, code): the one-based positions joined with '-', and the
        three picked characters with spaces replaced by '_'.

    Raises:
        ValueError: if one of the names is empty.
    """
    names = [row['s_name'], row['p_name'], row['o_name']]

    positions = []
    for name in names:
        if len(name) == 0:
            raise ValueError("Cannot build a verification code from an empty name")
        # A one-character name would give an upper bound of 0, which randint
        # rejects; allow at least position 0. Longer names keep their bound,
        # so codes already generated with a given seed stay the same.
        upper = max(np.min([len(name), 5]) - 1, 1)
        positions.append(np.random.randint(upper))

    num = np.array(positions)

    code = ''.join(name[pos] for name, pos in zip(names, num)).replace(' ', '_')

    return ('-'.join(str(x) for x in (num+1)), code)

np.random.seed(0)
amt[['ver_num', 'ver_code']] = amt.apply(lambda row: pd.Series(ver4amt(row)), axis=1)


In [None]:
amt.head(100)

In [None]:
changes.to_pickle("changes.plk")
additions.to_pickle("additions.plk")
amt.to_pickle("amt.plk")

In [None]:
amt.to_csv("amt.csv", index=False)

## Process AMT results

In [None]:
#amtres = pd.read_csv("Batch_305165_batch_results.csv") #pilot 1
#amtres = pd.read_csv("Batch_305480_batch_results.csv") #pilot 2
amtres = pd.read_csv("Batch_4310201_batch_results.csv") #paid AMT questionnaire

In [None]:
#Extract and format data for plots

# severity: for method 'random' the rank computed by getDistSeverity, for
# method 'dist' the configured 'dist' parameter, -1 for every other method.
# 'Input.config' holds the str() of a config dict; the single quotes are
# swapped for double quotes so that json.loads can parse it.
amtres['severity'] = amtres.apply(lambda row: getDistSeverity(row['Input.source_type'], row['Input.source'], row['Input.target'])
                                  if row['Input.method'] == 'random'
                                  else (json.loads(row['Input.config'].replace("'", '"'))['param']['dist']
                                        if row['Input.method'] == 'dist'
                                        else -1)
                                  , axis = 1)

#Note: this data may not be available in the pilots
# Collapse the two boolean answer columns into one label; 'artificial' wins
# over 'human', and 'correct' is used when neither is set.
amtres['Answer.mistakeSource'] = amtres.apply(lambda row: 'artificial'
                                  if row['Answer.mistakeSource.artificial'] == True
                                  else ( 'human'
                                        if row['Answer.mistakeSource.human'] == True
                                        else 'correct')
                                  , axis = 1)

# closeness: value returned by getClosenessSeverity for source and target (None if the target is unknown)
amtres['closeness'] = amtres.apply(lambda row: getClosenessSeverity(row['Input.source_type'], row['Input.source'], row['Input.target']), axis = 1)

#data = amtres[['Answer.stars', 'WorkTimeInSeconds', 'severity']]#.sort_values(['Answer.stars', 'WorkTimeInSeconds'])

In [None]:
#Set accept/reject for amt answers and export

#Reject tasktime outliers
# Restrict the quantiles to numeric columns: pandas 2.x no longer skips text
# columns by default and fails on them. The comparison is limited to the same
# columns so that no text is compared with a number.
Q1 = amtres.quantile(0.25, numeric_only=True)
Q3 = amtres.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
numericAnswers = amtres[Q1.index]
quant = (numericAnswers < (Q1 - 3 * IQR)) | (numericAnswers > (Q3 + 3 * IQR))

# Store the reasons as objects so that text can be written into a column that
# may have been read as all-NaN. The masks below are aligned by index label;
# the former row-wise version looked rows up by position using their label.
# Later rules overwrite earlier ones, as before.
amtres['Reject'] = amtres['Reject'].astype(object)
amtres.loc[quant['WorkTimeInSeconds'], 'Reject'] = 'Irregular worktime'

#Reject everything with wrong codes
amtres.loc[amtres['Answer.code'] != amtres['Input.ver_code'], 'Reject'] = 'Wrong code'

#Reject obviously wrong data
#Severity 0 but not 5 stars or source not 'correct'
wrongAnswer = (amtres['severity'] == 0) & ((amtres['Answer.stars'] != 5) | (amtres['Answer.mistakeSource'] != 'correct'))
amtres.loc[wrongAnswer, 'Reject'] = 'Wrong answer(s)'


#Accept everything that wasn't rejected
amtres['Approve'] = amtres['Reject'].isnull()


#Export file
amtres.to_csv("amtres.csv", index=False)

In [None]:
#Show the number of result rows and preview the first 100 of them
print(f"Length: {amtres.shape[0]}")
amtres.head(n=100)

## Plot graphs

In [None]:
#Set global plot size (applies to every figure created afterwards)
plt.rcParams.update({"figure.figsize": [10, 8]})

In [None]:
#Total answers per rating (approved answers only)
(
    amtres.loc[amtres['Approve'], ['Answer.stars', 'WorkerId']]
    .groupby('Answer.stars')
    .count()
    .reset_index()
    .rename(columns={'Answer.stars': 'rating', 'WorkerId': 'count'})
    .head(10)
)

In [None]:
#Answers per rating
#Plots, for the approved answers, the number of votes each worker gave per
#rating (dashed lines) plus the mean over all workers (thick line).
data = amtres[amtres['Approve']].copy()
_, ax = plt.subplots()

#Votes per (rating, worker), then averaged over the workers of each rating
votesPerWorker = data[['Answer.stars', 'WorkerId', 'severity']].groupby(['Answer.stars', 'WorkerId']).count()
plt.plot(votesPerWorker.groupby(['Answer.stars']).mean(),
        label='mean',
        linewidth=3.0)

#The loop variables must not be called 'g': that name holds the RDF graph
#used by the helper functions and would be overwritten here.
#No explicit sort is needed, groupby already orders the ratings.
for workerId, workerAnswers in data.groupby('WorkerId'):
    plt.plot(workerAnswers[['Answer.stars', 'WorkerId']].groupby('Answer.stars').count(),
             marker='x',
             label=workerId,
             linestyle='dashed'
    )

plt.title('Answers per rating')
plt.xlabel('Rating')
plt.ylabel('Number of votes')
ax.legend()

plt.savefig('figures/answersPerRating.png')

In [None]:
#Scatterplot of severity per rating
#One point series per (method, source type) combination of the approved answers.
data = amtres[amtres['Approve']].copy()
_, ax = plt.subplots()

data['severity'] = data['severity'] + 1 #prevent missing data due to log(0)

#The loop variables must not be called 'g': that name holds the RDF graph
#used by the helper functions and would be overwritten here.
for (groupMethod, groupType), groupRows in data.groupby(['Input.method', 'Input.source_type']):
    isRandom = groupMethod == 'random'
    #Everything that is not 'random' is labelled 'dist', everything that is
    #not a 'node' is labelled 'relation'.
    methodLabel = 'random' if isRandom else 'dist'
    typeLabel = 'node' if groupType == 'node' else 'relation'

    plt.scatter(groupRows['Answer.stars'], groupRows['severity'],
                marker='x' if isRandom else 'o',
                label=f"{methodLabel} {typeLabel}")

plt.title('Scatterplot of severity per rating')
plt.xlabel('Rating')
plt.ylabel('Severity')
plt.yscale('log')
ax.legend()

plt.savefig('figures/scatterSeverityPerRating.png')

In [None]:
#Unfiltered count per method and source (all answers, approved or not)
(
    amtres[['Answer.mistakeSource', 'WorkerId', 'Input.method']]
    .groupby(['Input.method', 'Answer.mistakeSource'])
    .count()
    .reset_index()
    .rename(columns={'WorkerId': 'count', 'Answer.mistakeSource': 'mistakeSource', 'Input.method': 'method'})
    .head(30)
)

In [None]:
#Count per method and source (approved answers only)
(
    amtres.loc[amtres['Approve'], ['Answer.mistakeSource', 'WorkerId', 'Input.method']]
    .groupby(['Input.method', 'Answer.mistakeSource'])
    .count()
    .reset_index()
    .rename(columns={'WorkerId': 'count', 'Answer.mistakeSource': 'mistakeSource', 'Input.method': 'method'})
    .head(30)
)

In [None]:
#Count per rating and source (approved answers only)
(
    amtres.loc[amtres['Approve'], ['Answer.stars', 'Answer.mistakeSource', 'WorkerId']]
    .groupby(['Answer.stars', 'Answer.mistakeSource'])
    .count()
    .reset_index()
    .rename(columns={'Answer.stars': 'rating', 'WorkerId': 'count', 'Answer.mistakeSource': 'mistakeSource'})
    .head(30)
)

In [None]:
#Source distribution by rating
#Horizontal stacked bars with the number of approved answers per rating,
#split by mistake source; each segment is annotated with its share in %.
data = amtres[amtres['Approve']].copy()

#Always include the three known sources, so that a source without any
#approved answer becomes a zero column instead of a KeyError further down.
sources = sorted({'artificial', 'correct', 'human'} | set(data['Answer.mistakeSource'].dropna()))

#One row per rating, one count column per source
data = (
    data.groupby(['Answer.stars', 'Answer.mistakeSource'])['WorkerId'].count()
    .unstack(fill_value=0)
    .reindex(columns=sources, fill_value=0)
    .rename_axis(None, axis='columns') #drop the columns name so it does not show up in the legend
    .reset_index()
)

data.plot(
  x = 'Answer.stars',
  kind = 'barh',
  stacked = True,
  title = 'Percentage Stacked Bar Graph',
  mark_right = True)

df_total = data[sources].sum(axis=1)
df_rel = data[sources].div(df_total, axis=0) * 100

#Right edge of every stacked segment; the label is centred in its segment
segmentEnds = data[sources].cumsum(axis=1)

for source in sources:
    for i, (cs, ab, pc) in enumerate(zip(segmentEnds[source], data[source], df_rel[source])):
        plt.text(cs - ab / 2, i, str(np.round(pc, 1)) + '%', va = 'center', ha = 'center', rotation = 70, fontsize = 8)

plt.savefig('figures/hbarSourceDistributionByRating.png')

In [None]:
#Count per severity and source (approved answers only)
(
    amtres.loc[amtres['Approve'], ['severity', 'Answer.mistakeSource', 'WorkerId']]
    .groupby(['severity', 'Answer.mistakeSource'])
    .count()
    .reset_index()
    .rename(columns={'Answer.stars': 'rating', 'WorkerId': 'count', 'Answer.mistakeSource': 'mistakeSource'})
    .head(50)
)

In [None]:
#Count of wrong/correct per worker
#One row per worker with the number of answers whose code did not match
#('incorrect') and did match ('correct') the verification code.
#Both counts are built in a single grouped pass so they are aligned on the
#worker id; assigning two separately grouped frames would pair the rows by
#position and mix up workers.
codeMatches = amtres['Answer.code'] == amtres['Input.ver_code']

amtCodeCheck = (
    amtres.assign(codeStatus=np.where(codeMatches, 'correct', 'incorrect'))
    .groupby(['WorkerId', 'codeStatus'])['Answer.stars'].count()
    .unstack(fill_value=0)
    .reindex(columns=['incorrect', 'correct'], fill_value=0) #keep both columns even if one never occurs
    .rename_axis(None, axis='columns')
    .reset_index()
)

amtCodeCheck.head(10) #.apply(pd.to_numeric, downcast='integer', errors='ignore', axis=1)

In [None]:
#Unfiltered correlation overview (all answers, approved or not)
(
    amtres[['Answer.stars', 'WorkTimeInSeconds', 'severity', 'closeness']]
    .rename(columns={'Answer.stars': 'rating', 'WorkerId': 'count', 'Answer.mistakeSource': 'mistakeSource'})
    .corr()
)

In [None]:
#Correlation overview (approved answers only)
(
    amtres.loc[amtres['Approve'], ['Answer.stars', 'WorkTimeInSeconds', 'severity', 'closeness']]
    .rename(columns={'Answer.stars': 'rating', 'WorkerId': 'count', 'Answer.mistakeSource': 'mistakeSource'})
    .corr()
)

In [None]:
#Correlation per method per type (approved answers only)
(
    amtres.loc[
        amtres['Approve'],
        ['Answer.stars', 'WorkTimeInSeconds', 'severity', 'closeness', 'Input.method', 'Input.source_type'],
    ]
    .rename(columns={'Answer.stars': 'rating', 'Input.method': 'method', 'Input.source_type': 'elementType'})
    .groupby(['method', 'elementType'])
    .corr()
)

In [None]:
#Boxplot of severity per rating
#One box per rating, showing the severity values of the approved answers.
data = amtres[amtres['Approve']].copy()
_, ax = plt.subplots()

#The loop variables must not be called 'g': that name holds the RDF graph
#used by the helper functions and would be overwritten here.
dat = []
lab = []
for ratingValue, ratingAnswers in data.groupby('Answer.stars'):
    dat.append(ratingAnswers['severity'])
    lab.append(ratingValue)

plt.boxplot(dat, labels=lab)

plt.title('Boxplot of severity per rating')
plt.xlabel('Rating')
plt.ylabel('Severity')
#NOTE(review): unlike the scatterplot, severity is not shifted by 1 here, so
#values <= 0 cannot be drawn on the log axis - confirm this is intended
plt.yscale('log')
#ax.legend()

plt.savefig('figures/boxplotSeverityPerRating.png')

# Other stuff

## Check dictionary quality

In [None]:
#Compare all dictionaries
#For every loaded embedding the capital relation is derived from
#tokyo/japan and applied to paris/france; the cosine similarities of the
#estimates to the real vectors are printed.
dicts = {"gn": gn, "gw1": gw1, "gw3": gw3}

for i, dic in dicts.items():
    vecFrance = dic["france"]
    vecParis = dic["paris"]

    #Minus means arrow from R to L
    hasCapital = dic["tokyo"] - dic["japan"]
    isCapitalOf = dic["japan"] - dic["tokyo"]

    #Calculate results of relation
    est_france = vecParis + isCapitalOf
    est_paris = vecFrance + hasCapital

    print(f"Dictionary: {i}")
    print(f"est_France to France: {dic.cosine_similarities(est_france, [vecFrance])}")
    print(f"est_Paris to Paris: {dic.cosine_similarities(est_paris, [vecParis])}")
    print(f"est_France to Paris: {dic.cosine_similarities(est_france, [vecParis])}")
    print(f"est_Paris to France: {dic.cosine_similarities(est_paris, [vecFrance])}")
    print(f"Paris to France: {dic.cosine_similarities(vecParis, [vecFrance])}")
    print("\n")


## Inversion test

In [None]:
#Similarity of the two words themselves
whiteBlackSimilarity = wv.cosine_similarities(wv['white'], [wv['black']])[0]
print(f"Similarity: {whiteBlackSimilarity}\n")

#Words closest to the negated 'white' vector
print("Inverted node 'white':")
invertedWhite = -1*wv['white']
for nearWord, nearScore in wv.similar_by_vector(vector = invertedWhite):
    print(f"Distance of '{nearWord}' to !white: {nearScore}")