## Link Prediction

First, the ontology data is loaded as a graph. Then the Jaccard coefficient is used to predict a link score for every combination of celebrities. The most interesting predictions are kept and passed on to the T5 model.

In [4]:
# Load the CSV export of the ontology query and build a graph from it
import sys
#!{sys.executable} -m pip install rdflib

from rdflib import Graph, ConjunctiveGraph, Literal, BNode, Namespace, RDF, URIRef
from rdflib.namespace import DC, FOAF

import kglab
import networkx as nx
import pandas as pd
import os

# Work from the project folder; every file below is resolved against it.
os.chdir('/Users/jellewas/Documents/Master_Artificial_Intelligence/KROnTheWeb') # change directory accordingly

# Read the query result; its "name" and "occupation" columns become the
# two endpoints of each edge.
csv_path = os.getcwd() + '/query-result.csv'
data_points = pd.read_csv(csv_path)

# A MultiGraph keeps one edge per CSV row, and edge_attr=True stores the
# remaining columns of that row as attributes on the edge.
G = nx.from_pandas_edgelist(
    data_points,
    source="name",
    target="occupation",
    edge_attr=True,
    create_using=nx.MultiGraph(),
)


<kglab.kglab.KnowledgeGraph at 0x7fbf3cc305e0>

### Jaccard Coefficient
For our Knowledge Graph, we want to compute the Jaccard Coefficient for every possible link.

In [7]:
import operator

# The multigraph is collapsed into a simple Graph before scoring; the result
# is materialised as a list of (node, node, score) tuples.
prediction_jaccard = list(nx.jaccard_coefficient(nx.Graph(G)))

# Substring that marks a node as a DBpedia resource
person_string = 'http://dbpedia.org/resource/'

# Keep only the pairs in which BOTH nodes contain the DBpedia prefix:
# {(node_a, node_b): jaccard score}
persons_dict = {
    (node_a, node_b): jaccard
    for (node_a, node_b, jaccard) in prediction_jaccard
    if person_string in str(node_a) and person_string in str(node_b)
}

# NOTE(review): sorted ascending, so the lowest-scoring pairs come first; the
# matching cell below consumes this list in order - confirm that low-to-high
# is the intended priority.
sorted_person = sorted(persons_dict.items(), key=operator.itemgetter(1))

### Gender Type
To make reliable connections, we also want the two celebrities in a pair to have opposite genders. Therefore, we use a library that guesses a person's gender from their first name.

In [None]:
pip install gender-guesser

In [8]:
import gender_guesser.detector as gender

d = gender.Detector()

# Detector labels accepted as male / female; a pair is only kept when one
# person falls in each group, so any other label never produces a match.
male_subset = {'male', 'mostly_male'}
female_subset = {'female', 'mostly_female'}

linked_1_list = []
linked_2_list = []
# Everyone already placed in either list, kept as a set so the
# "is this person taken?" check is a single constant-time lookup.
already_linked = set()

for ((person_1, person_2), score) in sorted_person:
    # Keep only the last URI segment, i.e. the resource name of the person
    person_1 = person_1.split('/')[-1]
    person_2 = person_2.split('/')[-1]

    # A person may take part in one pair only: the first pair (in the order
    # of sorted_person) that passes the gender test wins.
    if person_1 in already_linked or person_2 in already_linked:
        continue

    # The first name is the text before the first underscore of the resource name
    person_1_gender = d.get_gender(person_1.split('_')[0])
    person_2_gender = d.get_gender(person_2.split('_')[0])

    opposite_gender = (
        (person_1_gender in male_subset and person_2_gender in female_subset)
        or (person_1_gender in female_subset and person_2_gender in male_subset)
    )
    if opposite_gender:
        linked_1_list.append(person_1)
        linked_2_list.append(person_2)
        already_linked.update((person_1, person_2))

### Pandas DataFrame
Build a Pandas DataFrame that holds, for every celebrity matched by the link prediction, all of that celebrity's triples. This DataFrame is then passed on to our T5 model.

In [13]:
import re

# Constants used while collecting the triples further down in this cell
person_string = 'http://dbpedia.org/resource/'
ex_partner = ['Carlos Leon', 'Dany Garcia', 'Jennifer Syme', 'Irina Shayk']

# One row per matched pair. The two 'Triple Entity' columns stay empty here
# and are filled at the end of this cell.
pandas_dataframe = pd.DataFrame(
    columns=['entity 1', 'entity 2', 'connected', 'Triple Entity 1', 'Triple Entity 2']
)
pandas_dataframe['entity 1'] = linked_1_list
pandas_dataframe['entity 2'] = linked_2_list
pandas_dataframe['connected'] = True

# Despite its name this holds ROW labels: the pair with index label 1 is removed.
# NOTE(review): the reason for dropping exactly this row is not visible here - confirm.
dropped_columns = [1]
pandas_dataframe = pandas_dataframe.drop(index=dropped_columns)

# Parse the RDF export that sits next to the CSV in the working directory
rdf_path = os.getcwd() + '/query-result.rdf'
rdf_G = Graph()
rdf_G.parse(rdf_path)

# Re-read the names from the frame so the lists no longer contain the dropped pair
linked_1_list = pandas_dataframe['entity 1'].tolist()
linked_2_list = pandas_dataframe['entity 2'].tolist()

# Accumulators: person name -> list of 'subject | predicate | object' strings
triples_name_1 = {}
triples_name_2 = {}

def has_numbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit, else False."""
    for symbol in inputString:
        if symbol.isdigit():
            return True
    return False

def check_partner_year(p, o):
    """Tell whether a triple is a 'hasPartner' statement whose object contains a digit.

    Args:
        p: predicate name; only the exact value 'hasPartner' can yield True.
        o: object text that is scanned for digits with ``has_numbers``.

    Returns:
        True when ``p`` is 'hasPartner' and ``o`` contains a digit, otherwise False.
    """
    # Any other predicate is never a partner-year statement
    if p != 'hasPartner':
        return False
    return has_numbers(o)
    
def return_correct_p(string):
    """Turn a camelCase predicate name into lower-case, space-separated words.

    The name is split in front of every capital letter and ALL resulting words
    are kept, e.g. 'hasPartner' -> 'has partner' and
    'hasBirthPlace' -> 'has birth place'. A name without any capital letter is
    returned lower-cased instead of raising an IndexError.

    Args:
        string: predicate name to convert.

    Returns:
        The lower-cased words joined by single spaces.
    """
    # Zero-width split in front of each capital; a leading capital produces an
    # empty first piece, which is skipped so the result has no leading space.
    words = [word for word in re.split('(?=[A-Z])', string) if word]
    return ' '.join(words).lower()
    
# Predicates that are never turned into text
delete_list = ['amountWikiLinks', 'lengthPage']


def format_triple(s, p, o):
    """Render one RDF triple as 'subject | predicate | object', or None if it is filtered out.

    Subject and object are reduced to the text after their last '/', with
    underscores replaced by spaces. The predicate is reduced to the text after
    its last '/' and last '.', with the substring 'owl' removed.

    A triple is filtered out (None is returned) when
      * the predicate is in ``delete_list`` or contains a digit,
      * the object text contains 'myonto', or
      * ``check_partner_year`` reports a 'hasPartner' object that contains a digit.

    A 'has partner' statement whose object is listed in ``ex_partner`` is
    written as 'had partner'.
    """
    s_split = s.split('/')[-1].replace('_', ' ')

    p_split = p.split('/')[-1]
    p_split = p_split.split('.')[-1]
    p_split = p_split.replace('owl', '')

    o_split = o.split('/')[-1].replace('_', ' ')

    if p_split in delete_list or has_numbers(p_split):
        return None
    if 'myonto' in o_split:
        return None
    if check_partner_year(p_split, o_split):
        return None

    p_split = return_correct_p(p_split)
    # The ex-partner list holds person names, so it has to be compared with the
    # OBJECT. Comparing it with the lower-cased predicate could never match.
    if p_split == 'has partner' and o_split in ex_partner:
        p_split = 'had partner'
    return s_split + ' | ' + p_split + ' | ' + o_split


for (s, p, o) in rdf_G.triples((None, None, None)):
    subject_uri = str(s)
    for (name_1, name_2) in zip(linked_1_list, linked_2_list):
        # Decide which accumulator, if any, this subject belongs to
        if (person_string + name_1) == subject_uri:
            collected, person = triples_name_1, name_1
        elif (person_string + name_2) == subject_uri:
            collected, person = triples_name_2, name_2
        else:
            continue

        # Format only triples of matched persons; a person gets a dictionary
        # entry only once at least one of their triples survives the filters.
        final_string = format_triple(s, p, o)
        if final_string is not None:
            collected.setdefault(person, []).append(final_string)
                            
# Attach the collected triple lists to each pair. A person without an entry
# in the corresponding dictionary raises a KeyError here.
pair_names = zip(pandas_dataframe['entity 1'], pandas_dataframe['entity 2'])
for row_label, (first_name, second_name) in zip(pandas_dataframe.index, pair_names):
    pandas_dataframe.at[row_label, 'Triple Entity 1'] = triples_name_1[first_name]
    pandas_dataframe.at[row_label, 'Triple Entity 2'] = triples_name_2[second_name]

# Write the result as a tab-separated file into the working directory
output_path = os.getcwd() + '/MatchesLink.tsv'
pandas_dataframe.to_csv(output_path, sep='\t')
