## Populate our RDF database

### Imports and data loading

In [None]:
# required libraries
import pandas as pd
import numpy as np
import os
from pathlib import Path

# project root: the parent of the current working directory
path = str(Path(os.path.abspath(os.getcwd())).parent.absolute())

# every input CSV lives in the same data folder
dataDir = path + '/inDepthSoccerStats/'

# in-depth statistics, one file per season
stats1920Url = dataDir + '2019-2020.csv'
stats1819Url = dataDir + '2018-2019.csv'
stats1819FBrefUrl = dataDir + 'transfermarkt_fbref_201819.csv'

# players, clubs, appearances and games tables
playersUrl = dataDir + 'players.csv'
teamsUrl = dataDir + 'clubs.csv'
appUrl = dataDir + 'appearances.csv'
gamesUrl = dataDir + 'games.csv'

# country codes
countriesURL = dataDir + 'wikipedia-iso-country-codes.csv'

# folder where the serialized graphs are written
savePath = path + '/rdf/'

In [None]:
# Read every CSV into a DataFrame (comma is the default separator of read_csv)

# in-depth statistics, one frame per season
stats1920 = pd.read_csv(stats1920Url, index_col='indCol')
stats1819 = pd.read_csv(stats1819Url, index_col='indCol')

# these dataframes store data from Transfermarkt
players = pd.read_csv(playersUrl, index_col='player_id')
teams = pd.read_csv(teamsUrl, index_col='club_id')
app = pd.read_csv(appUrl, index_col='appearance_id')
games = pd.read_csv(gamesUrl, index_col='game_id')

# FBref file (semicolon separated) used for completing some missing data
stats1819FBref = pd.read_csv(
    stats1819FBrefUrl,
    sep=';',
    index_col='Column1',
    dtype={"Attendance": "string"},
)

# Country codes: the default NA markers are disabled, otherwise the string
# "NA" would be parsed as NaN (problem with Namibia); only "_" means missing.
countries = pd.read_csv(
    countriesURL,
    index_col='English short name lower case',
    keep_default_na=False,
    na_values=['_'],
)

#stats1920.info()
#players.info()

In [None]:
# Load the required libraries
#!pip install rdflib
from rdflib import Graph, Literal, RDF, URIRef, Namespace, term
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD

### Namespace and prefixes

In [None]:
# Namespace of the soccer ontology, which RDFlib does not know about
#CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
DCSSO = Namespace("http://www.dei.unipd.it/db2/dcsso#")

# graph that collects every triple generated by this notebook
g = Graph()

# Bind each namespace to a prefix for more readable serialized output
for prefix, namespace in (("foaf", FOAF), ("xsd", XSD)):
    g.bind(prefix, namespace)
#g.bind("countries", CNS)
g.bind("dcsso", DCSSO)

#term.bind(
#    XSD.double,
#    float,
#   constructor=float,
#    lexicalizer=lambda val: f"{val:f}",
#    datatype_specific=True
#)

### Parsing and matching utilities

In [None]:
import datetime
#!pip install unidecode
#!pip install googlesearch-python
from unidecode import unidecode
from itertools import permutations
from difflib import SequenceMatcher
from googlesearch import search

#string parsing

def nameToRef(name):
    """Return `name` with every space removed and transliterated by unidecode."""
    withoutSpaces = "".join(name.split(" "))
    return unidecode(withoutSpaces)

def hyphenize(s):
    """Lower-case `s`, turn spaces into hyphens, decode the HTML apostrophe
    entity and transliterate the result with unidecode."""
    lowered = s.lower()
    hyphenated = lowered.replace(" ", "-")
    unescaped = hyphenated.replace("&#039;", "'")
    return unidecode(unescaped)

def cleanChars(item):
    """Replace a few special sequences in a pandas Series of strings.

    Parameters
    ----------
    item : pandas.Series
        Series of strings (e.g. player codes).

    Returns
    -------
    pandas.Series
        A new Series where "ć" -> "c", "ğ" -> "g", "İ" -> "i" and
        "-scaron-" -> "s"; missing values are left untouched.
    """
    replacements = (("ć", "c"), ("ğ", "g"), ("İ", "i"), ("-scaron-", "s"))
    cleaned = item
    for old, new in replacements:
        # regex=False: the patterns are literal text; being explicit keeps the
        # result independent of the pandas version's default for `regex`
        cleaned = cleaned.str.replace(old, new, regex=False)
    return cleaned

def cleanString(s):
    """Map "ć", "ğ" and "İ" to plain letters, then transliterate with unidecode."""
    specialChars = str.maketrans({"ć": "c", "ğ": "g", "İ": "i"})
    return unidecode(s.translate(specialChars))
    

#permutations strategy

def genSeqByLength(perm, length):
    """Join the first `length` items of `perm` with hyphens.

    Parameters
    ----------
    perm : sequence of str
        Name tokens, typically one permutation of a split name.
    length : int
        Number of leading tokens to use; must satisfy 1 <= length <= len(perm).

    Returns
    -------
    str
        The hyphen-separated prefix, e.g. ("a", "b", "c"), 2 -> "a-b".

    Raises
    ------
    IndexError
        If `length` is outside 1..len(perm). A length of 0 (or a negative one)
        used to return an unrelated token silently.
    """
    if not 1 <= length <= len(perm):
        raise IndexError("length must be between 1 and len(perm)")
    return "-".join(perm[:length])

def match_seq(splitS1, lis):
    """Find the codes in `lis` that are a hyphen-joined reordering of name tokens.

    Parameters
    ----------
    splitS1 : sequence of str
        Tokens of a name (the name split on "-").
    lis : iterable of str
        Candidate codes; may be a list or a pandas Series.

    Returns
    -------
    list of str
        * If `lis` holds exactly one code: ``[code]`` when some ordered
          selection of tokens, with as many tokens as the code has, joins to
          that code; otherwise ``[]``.
        * Otherwise: every code that equals an ordered selection of 2..n
          tokens joined by "-". Each code is reported once.
    """
    # list() gives positional access, so a pandas Series with an arbitrary
    # index behaves like a plain list here
    candidates = list(lis)

    if len(candidates) == 1:
        uniqueItem = candidates[0]
        length = len(uniqueItem.split("-"))
        # permutations(tokens, length) yields exactly the length-prefixes of all
        # full permutations; it yields nothing when length > len(tokens)
        for perm in permutations(splitS1, length):
            if "-".join(perm) == uniqueItem:
                return [uniqueItem]
        return []

    # set lookup instead of scanning every candidate for every permutation
    wanted = set(candidates)
    resultList = []
    found = set()
    # consider all lengths from 2 to n
    for size in range(2, len(splitS1) + 1):
        for perm in permutations(splitS1, size):
            newName = "-".join(perm)
            if newName in wanted and newName not in found:
                found.add(newName)
                resultList.append(newName)
    return resultList

#multiple matches resolution

def getAppsByID(ID, apps):
    """Return how many rows of `apps` have 'player_id' equal to `ID`."""
    playerRows = apps.loc[apps['player_id'] == ID]
    return len(playerRows)

def solve_with_apps_approx(statsGames, somePlayers, appsCol, tol=5):
    """Pick, among candidate players, the one whose appearance count is closest
    to the number of games reported in the statistics.

    Parameters
    ----------
    statsGames : int
        Number of games reported for the player in the statistics row.
    somePlayers : pandas.DataFrame
        Candidate players, indexed by player ID.
    appsCol : pandas.DataFrame
        Appearances, one row per appearance, with a 'player_id' column.
    tol : int, default 5
        Largest accepted difference between `statsGames` and the number of
        appearances of the selected candidate.

    Returns
    -------
    (pandas.Series, id)
        The row of the selected candidate and its index label, or an empty
        Series and -1 when no candidate is within `tol`.
    """
    # count the appearances of every player once, instead of filtering the
    # whole appearance table twice per candidate
    appCounts = appsCol['player_id'].value_counts()

    minDiff = float("inf")
    minInd = -1
    for ind in somePlayers.index:
        diff = abs(statsGames - int(appCounts.get(ind, 0)))
        # strict comparison: on ties the first candidate wins
        if diff < minDiff:
            minDiff = diff
            minInd = ind

    if minDiff <= tol:
        player = somePlayers[somePlayers.index == minInd].iloc[0]
        return player, minInd
    # explicit dtype: an empty Series without one has no well-defined dtype
    return pd.Series([], dtype=object), -1

## Matching

### Matching teams from different sources

In [None]:
%%time

# collect every distinct team name appearing in the statistics file
statsTeamsSet = set()
for teamsField in stats1819['teams_played_for']:
    statsTeamsSet.update(teamsField.split(","))

statsTeams = list(statsTeamsSet)
# maps a team name from the statistics file to the index label of `teams`
teamIDDict = dict()
i = 0
for statsTeam in statsTeams:
    i += 1
    # best fuzzy match between the statistics name and the names in `teams`
    maxS = 0
    maxId = 0
    for tind, teamName in teams['name'].items():
        sim = SequenceMatcher(None, statsTeam, teamName).ratio()
        if(sim > maxS):
            maxS = sim
            maxId = tind

    # similarity too low: fall back to a web search and read the ID from the URL
    if(maxS < 0.8):
        # an empty string stands for "no search result"
        URL = next(search(statsTeam+" transfermarkt startseite verein", num_results=1), "")
        #some teams contain numbers in their name, so we need to take only the suffix of the URL
        trID = URL.split("/")[-1]
        # a non-numeric suffix is reported instead of crashing in int()
        if(trID.isdigit()):
            if(len(teams[teams.index == int(trID)]) == 1):
                maxId = int(trID)
                print("{:3d}".format(i)+" out of "+"{:3d}".format(len(statsTeams))+" GOOGLE: "+statsTeam+" --> "+teams.at[maxId, 'name'])
            else:
                print("Invalid ID extracted from "+URL)
        else:
            print("No ID in URL "+URL)

    # when the search does not help, the best fuzzy match found above is kept
    teamIDDict[statsTeam] = maxId

### Matching players from different sources

In [None]:
%%time

#select only months and years from appearance dates
# NOTE(review): the string comparisons below assume dates laid out as
# day/month/year with zero-padded months — confirm against appearances.csv
appMonth = app['date'].str.split("/").str[1]
appYear = app['date'].str.split("/").str[2]
#select appearances from 18/19 season (August 2018 to June 2019)
is1819 = ((appYear == "2018") & (appMonth >= "08")) | ((appYear == "2019") & (appMonth <= "06"))
app1819 = app[is1819]
playerCodes = cleanChars(players['player_code'])

#new column to store transfermarkt ID (0 means "not matched")
stats1819['trID'] = [0] * len(stats1819)

#iterate on stats file
statsRows = np.size(stats1819, 0)
exact_matches = no_matches = resolved_google = resolved_permS = resolved_permP = resolved_max_sim = resolved_pres = i = 0
for index, row in stats1819.iterrows():
    i += 1
    mode = "NONE"
    player = pd.Series([], dtype=object)
    statsName = hyphenize(row['player_name']).replace("'","")
    splitStatsName = statsName.split("-")

    # strategy 1: the cleaned player code equals the name from the stats file
    matchedPlayers = players[playerCodes == statsName]

    #multiple rows with same name in stats mapped to a single player are ok (he has changed team during the season)
    if(np.size(matchedPlayers, 0) > 0):
        mode = "MATCH"

    # strategy 2: reorder the tokens of the stats name (permutations strategy)
    if(mode == "NONE" and len(splitStatsName) >= 2):
        matchedCodes = match_seq(splitStatsName, playerCodes)
        if(len(matchedCodes) > 0):
            matchedPlayers = players[playerCodes.isin(matchedCodes)]
            mode = "PERM1"

    # strategy 3: string similarity, plus permutations of the player code
    if(mode == "NONE"):
        maxSim = 0
        maxC = ""
        # each distinct code is examined once; players sharing it are added together
        for c in playerCodes.unique():
            sm = SequenceMatcher(None, statsName, c)
            #do not proceed if the upper bound is too small
            if(sm.real_quick_ratio() >= 0.5):
                #remember: similarity is not commutative
                simm = sm.ratio()
                #if sim is big enough, try permutation strategy with name from players file
                if(simm >= 0.6):
                    splitC = c.split("-")
                    if(len(splitC) >= len(splitStatsName) and len(splitStatsName) >= 2):
                        matchedCodes = match_seq(splitC, [statsName])
                        if(len(matchedCodes) > 0):
                            # `c` is a cleaned code, so filter with the cleaned
                            # series like every other lookup in this loop
                            newMatchedPlayers = players[playerCodes == c]
                            matchedPlayers = pd.concat([matchedPlayers, newMatchedPlayers])
                if(simm > maxSim):
                    maxSim = simm
                    maxC = c

        if(maxSim >= 0.95):
            matchedPlayers = players[playerCodes == maxC]
            mode = "MAXSIM"
        elif(np.size(matchedPlayers, 0) > 0):
            mode = "PERM2"

    #managing results of any method
    matches = np.size(matchedPlayers, 0)
    if(matches == 1):
        player = matchedPlayers.iloc[0]
        trID = matchedPlayers.index[0]
    if(matches > 1):
        # several candidates: choose by number of appearances in the season
        player, trID = solve_with_apps_approx(row['games'], matchedPlayers, app1819, 3)
        if(trID == -1):
            mode = "NONE"
        else:
            mode = "PRES"

    # last resort: web search, reading the ID from the end of the first URL
    if(mode == "NONE"):
        # an empty string stands for "no search result"
        firstURL = next(search(row['player_name']+" "+row['teams_played_for']+" transfermarkt profil spieler", num_results=1), "")
        splitURL = firstURL.split("/")
        trID = splitURL[len(splitURL) - 1]
        # a non-numeric suffix is reported instead of crashing in int()
        if(trID.isdigit()):
            urlPlayers = players[players.index == int(trID)]
            if(len(urlPlayers) != 0):
                mode = "GOOGLE"
                player = urlPlayers.iloc[0]
            else:
                print("Invalid ID "+trID+" extracted from "+str(splitURL))
                mode = "NONE"
        else:
            print("No ID in URL "+str(splitURL))
            mode = "NONE"

    # update the counter of the strategy that resolved this row
    if(mode == "NONE"):
        no_matches += 1
    elif(mode == "PERM1"):
        resolved_permS += 1
    elif(mode == "PERM2"):
        resolved_permP += 1
    elif(mode == "GOOGLE"):
        resolved_google += 1
    elif(mode == "MATCH"):
        exact_matches += 1
    elif(mode == "MAXSIM"):
        resolved_max_sim += 1
    elif(mode == "PRES"):
        resolved_pres += 1

    # exact matches are not logged
    if(mode == "NONE"):
        print("{:4d}".format(i)+" out of "+str(statsRows)+" NONE  : "+statsName+", matches: "+str(matches))
    elif(mode != "MATCH"):
        print("{:4d}".format(i)+" out of "+str(statsRows)+" "+mode.ljust(6)+": "+statsName+" --> "+player['player_code'])

    if(mode != "NONE"):
        stats1819.at[index, 'trID'] = int(trID)

In [None]:
#print statistics
print("   --- STATISTICS ---")
tot_matches = exact_matches + resolved_google + resolved_permS + resolved_permP + resolved_max_sim + resolved_pres

def statLine(label, count):
    """Format one report line: label, count and its share of the stats rows."""
    return label + "{:5d}".format(count) + " -- percentage: " + "{:.2f}%".format(count*100/statsRows)

# one (label, counter) pair per printed line, in output order
statLines = (
    ("Total matches:                            ", tot_matches),
    ("  ---> exact matches:                     ", exact_matches),
    ("  ---> resolved permutating statsName:    ", resolved_permS),
    ("  ---> resolved permutating player code:  ", resolved_permP),
    ("  ---> resolved with max sim.:            ", resolved_max_sim),
    ("  ---> resolved with apps:                ", resolved_pres),
    ("  ---> resolved with google:              ", resolved_google),
    ("No matches:                               ", no_matches),
    ("  ---> zero matches found:                ", no_matches),
)
for label, count in statLines:
    print(statLine(label, count))

#### Completing and correcting statistics in corner cases
We need to handle the fact that some rows contain season-total statistics for a player who switched teams within the same league during the season. <br>
We can use the information in the FBref file to complete our data. We have observed that, in this situation, only the row with the lower index of the two contains correct information; some statistics in the second row can therefore be corrected by subtracting the values of the first row from the season totals. <br>
In these cases, we add two columns to our main dataframe, to specify:
* the Transfermarkt IDs of the 2 teams;
* the indexes of the rows in the FBref file corresponding to the memberships of the player in the 2 teams.

In [None]:
#statsCopy = stats1819
# one independent list per row (`[list()] * n` would share a single list object)
stats1819['teamIDs'] = [list() for _ in range(len(stats1819))]
stats1819['fbref_indexes'] = [list() for _ in range(len(stats1819))]

for ind, row in stats1819.iterrows():
    # IDs of the teams the player appeared for, in the order of the stats file
    teamIDs = [int(teamIDDict[team]) for team in row['teams_played_for'].split(",")]
    stats1819.at[ind, 'teamIDs'] = teamIDs
    if(len(teamIDs) != 2):
        continue
    # rows whose player was not matched (trID still 0) have no entry in `players`
    if(row['trID'] not in players.index):
        continue

    player = players.loc[row['trID']]
    fbmatch = stats1819FBref[stats1819FBref['player'] == player['name']]
    if(len(fbmatch) != 2):
        continue

    team0 = teams.loc[teamIDs[0]]['name']

    # the row with the lower index is the correct one, the other is corrected
    correctInd = min(fbmatch.index[0], fbmatch.index[1])
    toCorrectInd = max(fbmatch.index[0], fbmatch.index[1])

    # second membership = season total - first membership.
    # Reading from the correct row also makes re-running this cell harmless.
    stats1819FBref.at[toCorrectInd, 'goals'] = row['goals'] - stats1819FBref.at[correctInd, 'goals']
    stats1819FBref.at[toCorrectInd, 'minutes'] = row['minutes_played'] - stats1819FBref.at[correctInd, 'minutes']
    # total penalties scored = goals - non-penalty goals
    stats1819FBref.at[toCorrectInd, 'pens_made'] = row['goals'] - row['npg'] - stats1819FBref.at[correctInd, 'pens_made']
    stats1819FBref.at[toCorrectInd, 'assists'] = row['assists'] - stats1819FBref.at[correctInd, 'assists']

    sm0 = SequenceMatcher(None, team0, fbmatch.iloc[0]['squad'])
    sm1 = SequenceMatcher(None, team0, fbmatch.iloc[1]['squad'])

    #if the team corresponding to first ID matches with fbref row with index fbmatch.index[1], swap team IDs
    if(sm0.ratio() < sm1.ratio()):
        stats1819.at[ind, 'teamIDs'] = [teamIDs[1], teamIDs[0]]

    stats1819.at[ind, 'fbref_indexes'] = [fbmatch.index[0], fbmatch.index[1]]

# report the appearances per team for every player that was split
for ind, row in stats1819.iterrows():
    fbrefin = row['fbref_indexes']
    if(len(fbrefin) == 2):
        print(row['player_name']+" had "+str(stats1819FBref.loc[row['fbref_indexes'][0]]['games'])+" apps for "+teams.loc[row['teamIDs'][0]]['name'])
        print(row['player_name']+" had "+str(stats1819FBref.loc[row['fbref_indexes'][1]]['games'])+" apps for "+teams.loc[row['teamIDs'][1]]['name'])

#result: teamIDs = ID of team0, ID of team1; fbref_indexes = index of fbref row describing membership to team0, // to team1.

### Graph population

#### Countries

In [None]:
%%time

# one Country node per row: the URI is built from the Alpha-2 code, the name
# comes from the index of the frame (the English short name)
for countryName, alpha2 in countries['Alpha-2 code'].items():
    country = URIRef(DCSSO[alpha2])
    nameLiteral = Literal(cleanString(countryName), datatype=XSD.string)
    g.add((country, RDF.type, DCSSO.Country))
    g.add((country, FOAF.name, nameLiteral))

#### Leagues
Leagues are added manually because we only need to store five of them, and only very limited information about each.

In [None]:
%%time

def addLeague(code, name, countryCode):
    """Add a League node with its name and country to the graph.

    `code` is the league identifier used in its URI, `name` the league name,
    `countryCode` the code of the Country node it is linked to.
    Returns the league node.
    """
    league = URIRef(DCSSO[code])
    g.add((league, RDF.type, DCSSO.League))
    g.add((league, FOAF['name'], Literal(name, datatype=XSD.string)))
    g.add((league, DCSSO['hasCountry'], URIRef(DCSSO[countryCode])))
    return league

SerieA = addLeague("IT1", "Serie A", "IT")
Ligue1 = addLeague("FR1", "Ligue 1", "FR")
# the Spanish country is attached to LaLiga itself (it used to be added to SerieA)
LaLiga = addLeague("ES1", "LaLiga", "ES")
Premier = addLeague("GB1", "Premier League", "GB")
Bundesliga = addLeague("L1", "Bundesliga", "DE")

#### Teams

In [None]:
%%time

# seasons (identified by their starting year) that are checked for every team
seasons = range(2015, 2020)

# home clubs seen in each season, computed once instead of filtering the whole
# games table for every (team, season) pair
homeClubsBySeason = {
    season: set(games.loc[games['season'] == season, 'home_club_id'])
    for season in seasons
}

for ind, row in teams.iterrows():
    Team = URIRef(DCSSO["team"+str(ind)])
    g.add((Team, RDF.type, DCSSO.Team))
    g.add((Team, FOAF.name, Literal(cleanString(row['name']), datatype=XSD.string)))
    g.add((Team, DCSSO['participatesIn'], URIRef(DCSSO[row['domestic_competition_id']])))
    for y in seasons:
        #check if there are any home games for this team in season starting in year y
        if(ind in homeClubsBySeason[y]):
            #if there are any, y represents a season in which this team has played in its domestic top league
            Participation = URIRef(DCSSO["part"+str(ind)+"s"+str(y)])
            g.add((Participation, RDF.type, DCSSO.SeasonalParticipation))
            g.add((Team, DCSSO['hasParticipation'], Participation))
            g.add((Participation, DCSSO['season'], Literal(y, datatype=XSD.int)))

#### Players

In [None]:
%%time

# missing names become empty strings so they can be compared with "" below
players['first_name'] = players['first_name'].fillna("")
players['last_name'] = players['last_name'].fillna("")

# starting year of the season described by stats1819; defined here so the cell
# does not depend on a loop variable left over from another cell
season = 2018

# (ontology property, column of the stats file) for integer statistics
intStats = (
    ('games', 'games'),
    ('minutes', 'minutes_played'),
    ('goals', 'goals'),
    ('npg', 'npg'),
    ('assists', 'assists'),
    ('keyPasses', 'key_passes'),
    ('shots', 'shots'),
    ('yellowCards', 'yellow_cards'),
    ('redCards', 'red_cards'),
)
# statistics stored as doubles: property and column share the same name
doubleStats = ('xG', 'xA', 'npxG', 'xG90', 'xA90', 'npxG90', 'xGBuildup', 'xGChain')

# iterate over the statistics dataframe
for index, row in stats1819.iterrows():
    # rows whose player was not matched (trID still 0) have no entry in `players`
    if(row['trID'] not in players.index):
        continue

    tmId = str(row['trID'])
    player = players.loc[row['trID']]
    # the node has the namespace + the transfermarkt ID as URI
    ref = "player"+tmId
    Footballer = URIRef(DCSSO[ref])
    g.add((Footballer, RDF.type, DCSSO.Footballer))
    if(player['first_name'] != ""):
        g.add((Footballer, FOAF['firstName'], Literal(cleanString(player['first_name']), datatype=XSD.string)))
    if(player['last_name'] != ""):
        g.add((Footballer, FOAF['familyName'], Literal(cleanString(player['last_name']), datatype=XSD.string)))

    # memberships are only created for rows with two matched FBref rows
    if(len(row['fbref_indexes']) == 2):
        for tid in row['teamIDs']:
            teamId = "team"+str(tid)

            Memb = URIRef(DCSSO["memb"+tmId+"s"+str(season)+teamId])
            g.add((Memb, RDF.type, DCSSO.SeasonalMembership))
            g.add((Footballer, DCSSO['hasMembership'], Memb))
            g.add((Memb, DCSSO['season'], Literal(str(season), datatype=XSD.int)))
            g.add((Memb, DCSSO['forTeam'], URIRef(DCSSO[teamId])))

            #statistics
            # NOTE(review): the values come from the stats row, so both
            # memberships receive the same figures — confirm this is intended
            for prop, column in intStats:
                g.add((Memb, DCSSO[prop], Literal(row[column], datatype=XSD.int)))
            for prop in doubleStats:
                g.add((Memb, DCSSO[prop], Literal(row[prop], datatype=XSD.double)))

    if(player['position'] != "Missing"):
        # NOTE(review): assumes sub_position is a string whenever position is known — confirm
        subPosition = player['sub_position'].replace(" ", "").replace("-", "")
        if(player['position'] == "Goalkeeper" or player['position'] == "Defender"):
            g.add((Footballer, DCSSO['position'], DCSSO[player['position']]))
        elif(player['position'] == "Midfield"):
            g.add((Footballer, DCSSO['position'], DCSSO["Midfielder"]))
            subPosition += "er"
        else:
            g.add((Footballer, DCSSO['position'], DCSSO["Forward"]))

        g.add((Footballer, DCSSO['subPosition'], DCSSO[subPosition]))

In [None]:
%%time
# save all the data in the RDF/XML format
print("--- saving serialization ---")
serialized = g.serialize(format='xml')
# rdflib < 6 returns bytes, rdflib >= 6 returns str: handle both
if isinstance(serialized, bytes):
    serialized = serialized.decode("utf-8")
# explicit encoding so the file does not depend on the platform default
with open(savePath + 'stats1819.rdf', 'w', encoding='utf-8') as file:
    file.write(serialized)


In [None]:
# !pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

import sys
from SPARQLWrapper import SPARQLWrapper, JSON

# URL of the SPARQL endpoint the query below is sent to
endpoint_url = "https://query.wikidata.org/sparql"

# Selects the labels of the literal skos:altLabel values attached to the
# entity wd:Q192923; the label service is asked for the automatic language
# with English as fallback.
query = """SELECT ?oLabel
WHERE
{
wd:Q192923 skos:altLabel ?o.
FILTER(isLiteral(?o))
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
}"""


def get_results(endpoint_url, query):
    """Send `query` to the SPARQL endpoint at `endpoint_url` and return the
    converted JSON response."""
    major, minor = sys.version_info[0], sys.version_info[1]
    user_agent = "WDQS-example Python/%s.%s" % (major, minor)
    # TODO adjust user agent; see https://w.wiki/CX6
    client = SPARQLWrapper(endpoint_url, agent=user_agent)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()


# run the query and print the value of ?oLabel for every returned binding
results = get_results(endpoint_url, query)

bindings = results["results"]["bindings"]
for binding in bindings:
    print(binding["oLabel"]["value"])

# Referential integrity
Note that in RDF we are in an open world situation. We cannot guarantee the referential integrity between the entities. 

## Person

Let us generate the RDF data relative to the movie workers.

In [None]:
# Load the people CSV in memory, indexed by 'imdb_name_id'.
# Default NA markers are disabled; only "_" is read as a missing value.
# NOTE(review): `namesUrl` is not defined in any visible cell of this notebook —
# confirm where it is supposed to come from.
people = pd.read_csv(namesUrl, sep=',', index_col='imdb_name_id', keep_default_na=False, na_values=['_'])

In [None]:
# print a summary of the people frame
people.info()

People are modeled with the FOAF ontology. 
Refer to [FOAF Documentation](http://xmlns.com/foaf/spec/)

In [None]:
#create a new graph
# NOTE: this rebinds `g`, so the triples collected in the previous graph are
# no longer reachable through this name
g = Graph()

In [None]:
%%time 
#measure execution time

def addDateTriple(subject, predicate, rawDate):
    """Add `rawDate` as an xsd:date literal when it is a full YYYY-MM-DD date;
    a 4-character value is treated as a year and stored as January 1st of it.
    Any other value is ignored."""
    try:
        datetime.datetime.strptime(str(rawDate), '%Y-%m-%d')
        g.add((subject, predicate, Literal(rawDate, datatype=XSD.date)))
    except ValueError:
        # probably it's the year alone: check length
        if (len(rawDate)==4):
            #it is a year
            g.add((subject, predicate, Literal(rawDate+"-01-01", datatype=XSD.date)))

# NOTE(review): `MO` and `CNS` are not defined in any visible cell of this
# notebook — confirm where they are supposed to come from.
#iterate over the person dataframe
for index, row in people.iterrows():
    # the node has the namespace + the person id as URI
    Person = URIRef(MO[index])
    g.add((Person, RDF.type, FOAF.Person))
    g.add((Person, FOAF['name'], Literal(row['name'], datatype=XSD.string)))

    birthDate = row['date_of_birth']
    if birthDate != '':
        addDateTriple(Person, MO['birthday'], birthDate)

    birthPlace = row['place_of_birth']
    if birthPlace != '':
        g.add((Person, MO['birthplace'], Literal(birthPlace, datatype=XSD.string)))

    # an empty death date means that no date of death is recorded
    deathDate = row['date_of_death']
    if deathDate != '':
        addDateTriple(Person, MO['deathDay'], deathDate)

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("mo", MO)

In [None]:
%%time
# save all the data in the Turtle format
print("--- saving serialization ---")
serialized = g.serialize(format='turtle')
# rdflib < 6 returns bytes, rdflib >= 6 returns str: handle both
if isinstance(serialized, bytes):
    serialized = serialized.decode("utf-8")
# explicit encoding so the file does not depend on the platform default
with open(savePath + 'names.ttl', 'w', encoding='utf-8') as file:
    file.write(serialized)

## Person-Movie Join

In [None]:
# Load the person-movie join CSV in memory, indexed by 'imdb_title_id'.
# Default NA markers are disabled; only "_" is read as a missing value.
# NOTE(review): `joinTableUrl` is not defined in any visible cell of this
# notebook — confirm where it is supposed to come from.
join = pd.read_csv(joinTableUrl, sep=',', index_col='imdb_title_id', keep_default_na=False, na_values=['_'])

In [None]:
#create a new graph
# NOTE: this rebinds `g`, so the triples collected in the previous graph are
# no longer reachable through this name
g = Graph()

In [None]:
#regular expressions
import re
# Matches (with .match, i.e. at the start of the string) any role beginning
# with "act", such as "actor" and "actress". The former pattern 'act*' was a
# glob written as a regex and matched every string starting with "ac".
actor = re.compile('act')

In [None]:
%%time 
#measure execution time

# NOTE(review): `MO` and `CNS` are not defined in any visible cell of this
# notebook — confirm where they are supposed to come from.
#iterate over the join table dataframe
for index, row in join.iterrows():
    # Nodes of the movie and of the person: only their URIs are built here,
    # the resources themselves are not added to this graph
    Movie = URIRef(MO[index])
    Person = URIRef(MO[row['imdb_name_id']])

    # choose the property from the role of the person in the movie
    role = row['category']
    if actor.match(role):
        # we have an actor or actress
        relation = MO['acted']
    elif role == 'director':
        relation = MO['directed']
    else:
        # note that, with the defined ontology, we cannot characterize the
        # specific role of this person in the movie. why?
        relation = MO['worked']
    g.add((Person, relation, Movie))

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("mo", MO)

In [None]:
%%time
# save all the data in the Turtle format
print("--- saving serialization ---")
serialized = g.serialize(format='turtle')
# rdflib < 6 returns bytes, rdflib >= 6 returns str: handle both
if isinstance(serialized, bytes):
    serialized = serialized.decode("utf-8")
# explicit encoding so the file does not depend on the platform default
with open(savePath + 'name_movie_join.ttl', 'w', encoding='utf-8') as file:
    file.write(serialized)

## Awards - Oscars data
Note that if we do not check referential integrity, we could produce ghost movie-nominee-oscar triples where the movie is not in the RDF graph.

On the other hand, we can check whether an actor or a movie exists by using the DataFrame in Python. Note that this is an external check and not a constraint enforced by the RDF DB.



In [None]:
# Load the oscars CSV in memory (default integer index).
# Default NA markers are disabled; only "_" is read as a missing value.
# NOTE(review): `oscarsUrl` is not defined in any visible cell of this
# notebook — confirm where it is supposed to come from.
oscars = pd.read_csv(oscarsUrl, sep=',', keep_default_na=False, na_values=['_'])

In [None]:
from num2words import num2words
import string
import re
#create a new graph
# NOTE: this rebinds `g`, so the triples collected in the previous graph are
# no longer reachable through this name
g = Graph()

In [None]:
%%time
# NOTE(review): `MO` and `movies` are not defined in any visible cell of this
# notebook — confirm where they are supposed to come from.
#iterate over the oscars dataframe
for index, row in oscars.iterrows():
    #create the oscar with a custom id: category without punctuation and
    #spaces, lower-cased, followed by the ordinal of the ceremony in words
    cat = re.sub(r'[^\w\s]','',row['category'])
    oscarId = 'oscar_' + cat.replace(" ", "").lower() + '_' + str(num2words(row['ceremony'], to='ordinal'))
    Oscar = URIRef(MO[oscarId])

    # describe the oscar only if no triple about it is in the graph yet
    if (Oscar, None, None) not in g:
        g.add((Oscar, RDF.type, MO.Oscar))
        g.add((Oscar, MO['category'], Literal(row['category'].lower(), datatype=XSD.string)))
        g.add((Oscar, MO['year'], Literal(row['year_ceremony'], datatype=XSD.gYear)))

    # if a person with this name exists, link the first one to the oscar
    # note that we do not add the person resource to the database (created before)
    samePerson = people["name"] == row["name"]
    if samePerson.any():
        Person = URIRef(MO[people[samePerson].index[0]])
        # NOTE(review): relies on 'winner' being parsed as a boolean — confirm
        personRelation = MO['winner'] if row['winner'] else MO['nominated']
        g.add((Person, personRelation, Oscar))

    # an oscar for a person is also to be considered an oscar for the movie:
    # if a movie with this title is in our DB, link the first one to the oscar
    sameMovie = movies["original_title"] == row["film"]
    if sameMovie.any():
        Movie = URIRef(MO[movies[sameMovie].index[0]])
        movieRelation = MO['winner'] if row['winner'] else MO['nominated']
        g.add((Movie, movieRelation, Oscar))

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("mo", MO)

In [None]:
%%time
# save all the data in the Turtle format
print("--- saving serialization ---")
serialized = g.serialize(format='turtle')
# rdflib < 6 returns bytes, rdflib >= 6 returns str: handle both
if isinstance(serialized, bytes):
    serialized = serialized.decode("utf-8")
# explicit encoding so the file does not depend on the platform default
with open(savePath + 'oscars.ttl', 'w', encoding='utf-8') as file:
    file.write(serialized)