## Populate an RDF database

This notebook reports the main steps to download CSV files, process them and create an RDF dataset from them according to an ontology.

To measure execution time in Jupyter notebooks: <code>pip install ipython-autotime</code>

In [1]:
# required libraries
import pandas as pd
import numpy as np
import os
from pathlib import Path

In [2]:
# Input and output locations.
# Everything is resolved against the parent of the current working directory,
# so the notebook is expected to be started from a sub-folder of the project.
path = str(Path.cwd().parent)

# CSV inputs (local files, despite the "Url" suffix in the names)
stats1920Url = f'{path}/inDepthSoccerStats/2019-2020.csv'
stats1819Url = f'{path}/inDepthSoccerStats/2018-2019.csv'
playersUrl = f'{path}/inDepthSoccerStats/players.csv'
appUrl = f'{path}/inDepthSoccerStats/appearances.csv'

# country codes (currently disabled)
# countriesURL = path + '/data/countryCodes/wikipedia-iso-country-codes.csv'

# folder where the serialized RDF files are written
savePath = f'{path}/rdf/'

## Soccer Stats

In [3]:
# Read the four comma-separated inputs into DataFrames.
# The two season-statistics tables are indexed by player name; the players and
# appearances tables are indexed by their own id column.
stats1920 = pd.read_csv(stats1920Url, index_col='player_name')
stats1819 = pd.read_csv(stats1819Url, index_col='player_name')
players = pd.read_csv(playersUrl, index_col='player_id')
app = pd.read_csv(appUrl, index_col='appearance_id')

In [4]:
#load the country codes
# we need to convert NaN values to something else otherwise NA strings are converted to NaN -> problem with Namibia
#countries = pd.read_csv(countriesURL, sep=',', index_col='Name', keep_default_na=False, na_values=['_'])

In [5]:
# Print a summary (index, columns, non-null counts, dtypes) of the 2019/20
# statistics table and of the players table as a quick sanity check of the load.
stats1920.info()
players.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2661 entries, Jamie Vardy to Jean Onana
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        2661 non-null   int64  
 1   teams_played_for  2661 non-null   object 
 2   league            2661 non-null   object 
 3   games             2661 non-null   int64  
 4   minutes_played    2661 non-null   int64  
 5   goals             2661 non-null   int64  
 6   npg               2661 non-null   int64  
 7   assists           2661 non-null   int64  
 8   xG                2661 non-null   float64
 9   xA                2661 non-null   float64
 10  npxG              2661 non-null   float64
 11  xG90              2661 non-null   float64
 12  xA90              2661 non-null   float64
 13  npxG90            2661 non-null   float64
 14  position          2661 non-null   object 
 15  shots             2661 non-null   int64  
 16  key_passes        2661 non-null

We need to install <code>RDFLib</code>

<code>pip3 install rdflib </code> [Documentation](https://rdflib.readthedocs.io/en/stable/gettingstarted.html)

In [7]:
# Load the required libraries
#!pip install rdflib
from rdflib import Graph, Literal, RDF, URIRef, Namespace, term
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD



In [8]:
# Namespace of the project ontology (prefix "dcsso"); rdflib has no built-in
# constant for it, unlike FOAF and XSD.
#CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
DCSSO = Namespace("http://www.dei.unipd.it/db2/dcsso#")

# Graph that collects every triple generated below
g = Graph()

# Register a short prefix for each namespace so the serialized output is readable
for prefix, namespace in (("foaf", FOAF), ("xsd", XSD), ("dcsso", DCSSO)):
    g.bind(prefix, namespace)
#g.bind("countries", CNS)

# Disabled: custom binding that would lexicalize xsd:double values in fixed-point notation
#term.bind(
#    XSD.double,
#    float,
#   constructor=float,
#    lexicalizer=lambda val: f"{val:f}",
#    datatype_specific=True
#)

In [11]:
import datetime
#!pip install unidecode
from unidecode import unidecode
from itertools import permutations

def nameToRef(name):
    """Return `name` with its spaces removed, transliterated by `unidecode`."""
    withoutSpaces = name.replace(" ", "")
    return unidecode(withoutSpaces)

def hyphenize(s):
    """Turn a display name into a lower-case, hyphen-separated code.

    The string is lower-cased, spaces become "-", the HTML entity "&#039;" is
    turned back into an apostrophe, and the result is transliterated by
    `unidecode`.
    """
    lowered = s.lower()
    hyphenated = lowered.replace(" ", "-")
    unescaped = hyphenated.replace("&#039;", "'")
    return unidecode(unescaped)

def cleanChars(item):
    """Normalise a few special sequences in a pandas string Series.

    Replaces "ć" with "c", "ğ" with "g", "İ" with "i" and the literal text
    "-scaron-" with "s", in that order, and returns the resulting Series.
    """
    replacements = (
        ("ć", "c"),
        ("ğ", "g"),
        ("İ", "i"),
        ("-scaron-", "s"),
    )
    cleaned = item
    for old, new in replacements:
        cleaned = cleaned.str.replace(old, new)
    return cleaned



def getAppsByID(ID):
    """Count the rows of the module-level `app1819` frame whose `player_id` equals `ID`."""
    belongsToPlayer = app1819['player_id'] == ID
    return int(belongsToPlayer.sum())

def resolveHomonyms(statsName, games=None):
    """Pick the one player named `statsName` whose appearance count matches `games`.

    Every player in the module-level `players` frame whose `player_code` equals
    `statsName` is a candidate. A candidate is confirmed when `getAppsByID`
    returns exactly `games` for it.

    Parameters:
        statsName: player code to look up in `players['player_code']`.
        games: number of games to match. When omitted, the value is read from
            the module-level loop variable `row` (`row['games']`), which is how
            the function behaved before this parameter existed. Prefer passing
            it explicitly so the function does not depend on notebook state.

    Returns:
        The player id when exactly one candidate is confirmed, otherwise "".
    """
    # NOTE(review): candidates are looked up on the raw `player_code`, whereas
    # the matching loop that calls this function compares against the codes
    # returned by `cleanChars` — confirm the two should not use the same column.
    if games is None:
        # Backward-compatible fallback: rely on the `row` of the loop in progress.
        games = row['games']

    candidateIDs = players[players['player_code'] == statsName].index
    confirmedIDs = [
        candidateID
        for candidateID in candidateIDs
        if getAppsByID(candidateID) == games
    ]
    # Zero or several confirmed candidates means the ambiguity is unresolved.
    if len(confirmedIDs) != 1:
        return ""
    return confirmedIDs[0]

In [13]:
%%time

# Match every row of the 2018/19 statistics to a player id and report how many
# rows could be matched.

# Dates are split on "/": the second field is used as month, the third as year.
appMonth = app['date'].str.split("/").str[1]
appYear = app['date'].str.split("/").str[2]
# Compare months and years as numbers rather than strings, so that an unpadded
# month such as "8" is treated exactly like "08". Unparsable values become NaN
# and therefore never satisfy the season filter.
monthNum = pd.to_numeric(appMonth, errors='coerce')
yearNum = pd.to_numeric(appYear, errors='coerce')
# Appearances of the 18/19 season: August 2018 through June 2019
is1819 = ((yearNum == 2018) & (monthNum >= 8)) | ((yearNum == 2019) & (monthNum <= 6))
app1819 = app[is1819]
playerCodes = cleanChars(players['player_code'])

# Counters for the final report
exact_matches = no_matches = mult_both = only_hom = resolved_hom = resolved_mult = 0
playerID = ""
for index, row in stats1819.iterrows():
    # Reset for every row: otherwise an unmatched row would silently keep the
    # id found for the previous row.
    playerID = ""
    statsName = hyphenize(index).replace("'", "")
    matchedPlayersIDs = players[playerCodes == statsName].index
    matches = len(matchedPlayersIDs)

    if matches == 1:
        # Several stats rows mapped to a single player are fine
        # (the player changed team during the season).
        exact_matches += 1
        playerID = matchedPlayersIDs[0]

    elif matches == 0:
        # No player has this exact code: try every ordering of the name parts,
        # and for each ordering every prefix made of 2..n parts.
        splitStatsName = statsName.split("-")
        n = len(splitStatsName)
        if n >= 2:
            for perm in permutations(splitStatsName):
                if matches == 1:
                    # A unique match was found by the previous ordering
                    break
                for i in range(2, n + 1):
                    newName = "-".join(perm[:i])
                    matchedPlayersIDs = players[playerCodes == newName].index
                    matches = len(matchedPlayersIDs)
                    if matches == 1:
                        exact_matches += 1
                        #print("Success: " + newName)
                        playerID = matchedPlayersIDs[0]
                        break

        # Reached with matches != 1 iff no candidate name produced a unique match
        if matches != 1:
            no_matches += 1
            #print("No match: "+statsName)

    elif np.size(stats1819[stats1819.index == index], 0) > 1:
        # Double ambiguity: the name is repeated in the stats AND in the players
        mult_both += 1
        playerID = resolveHomonyms(statsName)
        if playerID != "":
            resolved_mult += 1
            #print("Resolved mult.: " + statsName + " -- games: " + str(row['games']))
        #else:
            #print("Unresolved hom. (mult): "+statsName+" -- stats1819 games: "+str(row['games']))

    else:
        # Ambiguity only on the players side (homonyms)
        only_hom += 1
        playerID = resolveHomonyms(statsName)
        if playerID != "":
            resolved_hom += 1
            #print("Resolved hom.: " + statsName + " -- games: " + str(row['games']))
        #else:
            #print("Unresolved hom.: "+statsName+" -- stats1819 games: "+str(row['games']))

# Report: every count is also shown as a percentage of the stats rows
statsRows = len(stats1819)
print("--- STATISTICS ---")
tot_matches = exact_matches + resolved_hom + resolved_mult
print(f"Total matches: {tot_matches} -- percentage: {tot_matches*100/statsRows:.2f}%")
print("---  details:  ---")
for label, count in (
    ("Exact matches", exact_matches),
    ("No matches", no_matches),
    ("Homonyms in players", only_hom),
    ("    -----> resolved", resolved_hom),
    ("Double ambiguity", mult_both),
    ("    -----> resolved", resolved_mult),
):
    print(f"{label}: {count} -- percentage: {count*100/statsRows:.2f}%")

--- STATISTICS ---
Total matches: 2437 -- percentage: 93.91%
---  details:  ---
Exact matches: 2368 -- percentage: 91.25%
No matches: 118 -- percentage: 4.55%
Homonyms in players: 79 -- percentage: 3.04%
    -----> resolved: 57 -- percentage: 2.20%
Double ambiguity: 30 -- percentage: 1.16%
    -----> resolved: 12 -- percentage: 0.46%
CPU times: total: 8.03 s
Wall time: 16 s


In [None]:
%%time 
# measure execution time

# Turn every row of the 2019/20 statistics into triples: a Footballer, the
# League, the Team(s), and a membership node carrying the season statistics.

# Clean display name and base reference (URI local name) of every row, computed
# up front so that homonyms are detected on the value that has to be unique:
# the reference itself. Comparing the raw index against the transliterated name
# would miss every name containing accents or the "&#039;" entity.
cleanNames = [unidecode(rawName.replace("&#039;", "'")) for rawName in stats1920.index]
baseRefs = [cleanName.replace(" ", "") for cleanName in cleanNames]
refIsShared = pd.Series(baseRefs).duplicated(keep=False).tolist()

# Integer statistics as (ontology property, CSV column) pairs
intStats = (
    ('games', 'games'),
    ('minutes', 'minutes_played'),
    ('goals', 'goals'),
    ('npg', 'npg'),
    ('assists', 'assists'),
    ('keyPasses', 'key_passes'),
    ('shots', 'shots'),
    ('yellowCards', 'yellow_cards'),
    ('redCards', 'red_cards'),
)
# Floating-point statistics: the property name equals the CSV column name
doubleStats = ('xG', 'xA', 'npxG', 'xG90', 'xA90', 'npxG90', 'xGBuildup', 'xGChain')

# Running counter appended to the reference of footballers sharing a name
a = 0
for pos, (index, row) in enumerate(stats1920.iterrows()):
    # The footballer node has the namespace + the (space-free) name as URI
    name = cleanNames[pos]
    ref = baseRefs[pos]
    if refIsShared[pos]:
        # Another row yields the same reference: make this one unique
        a = a + 1
        ref = ref + str(a)

    Footballer = URIRef(DCSSO[ref])
    g.add((Footballer, RDF.type, DCSSO.Footballer))
    g.add((Footballer, DCSSO['name'], Literal(name, datatype=XSD.string)))

    leagueName = unidecode(row['league'])
    League = URIRef(DCSSO[leagueName.replace(" ", "")])
    g.add((League, DCSSO['name'], Literal(row['league'], datatype=XSD.string)))

    # `teams_played_for` is a comma-separated list of team names
    teams = unidecode(row['teams_played_for'].replace("&#039;", "'").strip())

    # One membership node per footballer and season, named after all the teams
    membRef = ref + "_1920_" + teams.replace(",", "_").replace(" ", "")
    Memb = URIRef(DCSSO[membRef])
    g.add((Footballer, DCSSO['hasMembership'], Memb))

    for teamName in teams.split(','):
        # Drop the blanks that may surround the separator ("A, B")
        teamName = teamName.strip()
        teamRef = teamName.replace(" ", "")
        Team = URIRef(DCSSO[teamRef])
        g.add((Team, DCSSO['name'], Literal(teamName, datatype=XSD.string)))
        g.add((Team, DCSSO['participatesIn'], League))
        g.add((Memb, DCSSO['forTeam'], Team))

    # Season statistics are attached to the membership node
    for prop, column in intStats:
        g.add((Memb, DCSSO[prop], Literal(row[column], datatype=XSD.int)))
    for prop in doubleStats:
        g.add((Memb, DCSSO[prop], Literal(row[prop], datatype=XSD.double)))

    # There can be more than one role per footballer; split() without an
    # argument ignores repeated blanks, so no empty role is ever emitted.
    for rN in row['position'].split():
        g.add((Footballer, DCSSO['role'], Literal(rN, datatype=XSD.string)))

In [None]:
%%time
# Serialize the graph as RDF/XML and write it to <savePath>/stats1920.rdf
print("--- saving serialization ---")
# Make sure the output folder exists before opening the file
os.makedirs(savePath, exist_ok=True)
serialized = g.serialize(format='xml')
# Depending on the rdflib version serialize() returns bytes or str
if isinstance(serialized, bytes):
    serialized = serialized.decode("utf-8")
# Write UTF-8 explicitly instead of relying on the platform default encoding
with open(savePath + 'stats1920.rdf', 'w', encoding='utf-8') as file:
    file.write(serialized)


# Referential integrity
Note that in RDF we are in an open world situation. We cannot guarantee the referential integrity between the entities. 

## Person

Let us generate the RDF data for the people who worked on the movies.

In [None]:
# Load the people CSV into memory, indexed by the `imdb_name_id` column.
# keep_default_na=False with na_values=['_'] keeps empty cells as '' and turns only "_" into NaN.
# NOTE(review): `namesUrl` is not defined in any cell visible in this notebook — define it before running this cell.
people = pd.read_csv(namesUrl, sep=',', index_col='imdb_name_id', keep_default_na=False, na_values=['_'])

In [None]:
# Print a summary (index, columns, non-null counts, dtypes) of the people table
people.info()

People are modeled with the FOAF ontology. 
Refer to [FOAF Documentation](http://xmlns.com/foaf/spec/)

In [None]:
# Create a new, empty graph for this section.
# NOTE: this rebinds `g`; the graph built earlier is no longer reachable through this name.
g = Graph()

In [None]:
%%time 
#measure execution time

def _xsdDateOrNone(value):
    """Build an xsd:date Literal from a 'YYYY-MM-DD' or a 4-character year value.

    A full date is used as is; a 4-character value is read as a year and
    completed with "-01-01". Any other value yields None.
    """
    try:
        datetime.datetime.strptime(str(value), '%Y-%m-%d')
        return Literal(value, datatype=XSD.date)
    except ValueError:
        # probably the year alone: accept it only if it is 4 characters long
        if len(value) == 4:
            return Literal(value + "-01-01", datatype=XSD.date)
    return None


# NOTE(review): `MO` and `CNS` are not defined in any cell visible in this
# notebook — they must exist before this cell can run.
# Optional attributes as (ontology property, CSV column, is the value a date?)
optionalAttributes = (
    ('birthday', 'date_of_birth', True),
    ('birthplace', 'place_of_birth', False),
    ('deathDay', 'date_of_death', True),
)

for index, row in people.iterrows():
    # The node has the namespace + the person id as URI
    Person = URIRef(MO[index])
    g.add((Person, RDF.type, FOAF.Person))
    g.add((Person, FOAF['name'], Literal(row['name'], datatype=XSD.string)))

    for prop, column, isDate in optionalAttributes:
        value = row[column]
        # An empty cell means the information is not available
        # (for the death date: no death is recorded for this person)
        if value == '':
            continue
        if isDate:
            literal = _xsdDateOrNone(value)
        else:
            literal = Literal(value, datatype=XSD.string)
        if literal is not None:
            g.add((Person, MO[prop], literal))

# Bind the namespaces to a prefix for more readable output
for prefix, namespace in (("foaf", FOAF), ("xsd", XSD), ("countries", CNS), ("mo", MO)):
    g.bind(prefix, namespace)

In [None]:
%%time
# Serialize the graph in the Turtle format and write it to <savePath>/names.ttl
print("--- saving serialization ---")
serialized = g.serialize(format='turtle')
# Depending on the rdflib version serialize() returns bytes or str; calling
# .decode() unconditionally fails when a str is returned.
if isinstance(serialized, bytes):
    serialized = serialized.decode("utf-8")
# Write UTF-8 explicitly instead of relying on the platform default encoding
with open(savePath + 'names.ttl', 'w', encoding='utf-8') as file:
    file.write(serialized)

## Person-Movie Join

In [None]:
# Load the person-movie join table into memory, indexed by the `imdb_title_id` column.
# keep_default_na=False with na_values=['_'] keeps empty cells as '' and turns only "_" into NaN.
# NOTE(review): `joinTableUrl` is not defined in any cell visible in this notebook — define it before running this cell.
join = pd.read_csv(joinTableUrl, sep=',', index_col='imdb_title_id', keep_default_na=False, na_values=['_'])

In [None]:
# Create a new, empty graph for this section.
# NOTE: this rebinds `g`; the graph built earlier is no longer reachable through this name.
g = Graph()

In [None]:
# Regular expression recognising the acting roles.
# Used with .match(), i.e. anchored at the start of the role: it accepts values
# starting with "actor" or "actress". The previous pattern 'act*' meant
# "ac" followed by any number of "t" and therefore accepted every role starting
# with "ac".
import re
actor = re.compile(r'act(?:or|ress)')

In [None]:
%%time 
#measure execution time

# Link every person to the movies they took part in.
# NOTE(review): `MO` and `CNS` are not defined in any cell visible in this
# notebook — they must exist before this cell can run.
for index, row in join.iterrows():
    # Movie and person nodes are only referenced here; their own triples are
    # not added to this graph (they were created before).
    Movie = URIRef(MO[index])
    Person = URIRef(MO[row['imdb_name_id']])

    # The role of the person decides the property linking person and movie
    role = row['category']
    if actor.match(role):
        # an actor or actress
        relation = 'acted'
    elif role == 'director':
        relation = 'directed'
    else:
        # with the defined ontology the specific role of this person in the
        # movie cannot be characterized. Why?
        relation = 'worked'
    g.add((Person, MO[relation], Movie))

# Bind the namespaces to a prefix for more readable output
for prefix, namespace in (("foaf", FOAF), ("xsd", XSD), ("countries", CNS), ("mo", MO)):
    g.bind(prefix, namespace)

In [None]:
%%time
# Serialize the graph in the Turtle format and write it to <savePath>/name_movie_join.ttl
print("--- saving serialization ---")
serialized = g.serialize(format='turtle')
# Depending on the rdflib version serialize() returns bytes or str; calling
# .decode() unconditionally fails when a str is returned.
if isinstance(serialized, bytes):
    serialized = serialized.decode("utf-8")
# Write UTF-8 explicitly instead of relying on the platform default encoding
with open(savePath + 'name_movie_join.ttl', 'w', encoding='utf-8') as file:
    file.write(serialized)

## Awards - Oscars data
Note that if we do not check the referential integrity, we could produce ghost triples (movie-nominee-oscar) where the movie is not in the RDF graph.

On the other hand, we can check whether an actor or a movie exists by using the DataFrame in Python. Note that this is an external check and not a constraint enforced by the RDF DB.



In [None]:
# Load the Oscars CSV into memory with the default integer index.
# keep_default_na=False with na_values=['_'] keeps empty cells as '' and turns only "_" into NaN.
# NOTE(review): `oscarsUrl` is not defined in any cell visible in this notebook — define it before running this cell.
oscars = pd.read_csv(oscarsUrl, sep=',', keep_default_na=False, na_values=['_'])

In [None]:
from num2words import num2words
import string
import re
# Create a new, empty graph for this section.
# NOTE: this rebinds `g`; the graph built earlier is no longer reachable through this name.
g = Graph()

In [None]:
%%time
# Create one node per Oscar (category + ceremony) and link it to the matching
# person and movie, when they exist in the `people` / `movies` DataFrames.
# NOTE(review): `MO` and `movies` are not defined in any cell visible in this
# notebook — they must exist before this cell can run.
for index, row in oscars.iterrows():
    # Custom id: "oscar_" + category without punctuation/spaces, lower-cased,
    # + "_" + the ceremony number spelled as an ordinal
    cat = re.sub(r'[^\w\s]', '', row['category'])
    ceremonyOrdinal = str(num2words(row['ceremony'], to='ordinal'))
    Oscar = URIRef(MO['oscar_' + cat.replace(" ", "").lower() + '_' + ceremonyOrdinal])

    # Describe the oscar only once: skip if the graph already has a triple about it
    if (Oscar, None, None) not in g:
        g.add((Oscar, RDF.type, MO.Oscar))
        g.add((Oscar, MO['category'], Literal(row['category'].lower(), datatype=XSD.string)))
        g.add((Oscar, MO['year'], Literal(row['year_ceremony'], datatype=XSD.gYear)))

    # People with this exact name; the first one is linked to the oscar.
    # The person node is only referenced here (its triples were created before).
    peopleIDs = people[people["name"] == row["name"]].index
    if len(peopleIDs) > 0:
        Person = URIRef(MO[peopleIDs[0]])
        g.add((Person, MO['winner' if row['winner'] else 'nominated'], Oscar))

    # An oscar for a person also counts as an oscar for the movie:
    # link the first movie with this exact original title, if there is one
    movieIDs = movies[movies["original_title"] == row["film"]].index
    if len(movieIDs) > 0:
        Movie = URIRef(MO[movieIDs[0]])
        g.add((Movie, MO['winner' if row['winner'] else 'nominated'], Oscar))

# Bind the namespaces to a prefix for more readable output
for prefix, namespace in (("foaf", FOAF), ("xsd", XSD), ("mo", MO)):
    g.bind(prefix, namespace)

In [None]:
%%time
# Serialize the graph in the Turtle format and write it to <savePath>/oscars.ttl
print("--- saving serialization ---")
serialized = g.serialize(format='turtle')
# Depending on the rdflib version serialize() returns bytes or str; calling
# .decode() unconditionally fails when a str is returned.
if isinstance(serialized, bytes):
    serialized = serialized.decode("utf-8")
# Write UTF-8 explicitly instead of relying on the platform default encoding
with open(savePath + 'oscars.ttl', 'w', encoding='utf-8') as file:
    file.write(serialized)