In [1]:
# required libraries
import os
import pandas as pd
import numpy as np
from pathlib import Path
# # Load the required libraries 
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# # rdflib knows about some namespaces, like FOAF 
from rdflib.namespace import FOAF, XSD 
# CHECK DATE 
import datetime

In [2]:
# parameters and URLs
# Parent of the current working directory; it is only printed for reference
# and is not used to build any of the file paths below.
parentDir = str(Path(os.path.abspath(os.getcwd())).parent.absolute())
print("Here is the path "+ parentDir)
# Folder that holds the CSV files. It can be overridden without editing the
# notebook by setting the SEMPROJECT_CSV_DIR environment variable; the default
# is the original author's local folder.
path = os.environ.get('SEMPROJECT_CSV_DIR', 'C:/Users/39344/Desktop/semproject/csv')
appearancesUrl = path + '/appearances.csv'
clubGamesUrl = path + '/club_games.csv'
clubUrl = path + '/clubs.csv'
competitionsUrl = path + '/competitions.csv'
gameEventsUrl = path + '/game_events.csv'
gameLineupsUrl = path + '/game_lineups.csv'
gamesUrl = path + '/games.csv'
playerValuationsUrl = path + '/player_valuations.csv'
playersUrl = path + '/players.csv'

# country code
countriesURL = path + '/wikipedia-iso-country-codes.csv'

# NOTE: the paths are only assembled here; nothing has checked that the files exist.
print("File paths are working perfectly.")
# saving folder
savePath =  path + '/data/'
print("executed all lines")

Here is the path C:\Users\39344\Desktop
File paths are working perfectly.
executed all lines


In [3]:
# Read the competitions table into memory, keyed by its competition_id column.

print("load csv files")
comp = pd.read_csv(
    competitionsUrl,
    sep=',',
    index_col='competition_id',
)
print("there is a problem")
# Preview of the dtypes with country_id cast to int32. The cast result is only
# displayed, not assigned, so comp itself keeps its original dtypes.
comp.astype({'country_id': 'int32'}).dtypes

load csv files
there is a problem


competition_code        object
name                    object
sub_type                object
type                    object
country_id               int32
country                 object
domestic_league_code    object
confederation           object
url                     object
dtype: object

In [4]:
# Load the country-code table, indexed by country name.
# pandas' default NA markers are switched off: otherwise the string "NA" would
# be read as NaN (a problem for Namibia). Only "_" is treated as missing.
countries = pd.read_csv(
    countriesURL,
    sep=',',
    index_col='Name',
    keep_default_na=False,
    na_values=['_'],
)

In [5]:
# Summarise the competitions dataframe: column dtypes and non-null counts.
comp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43 entries, CIT to FRCH
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   competition_code      43 non-null     object
 1   name                  43 non-null     object
 2   sub_type              43 non-null     object
 3   type                  43 non-null     object
 4   country_id            43 non-null     int64 
 5   country               36 non-null     object
 6   domestic_league_code  36 non-null     object
 7   confederation         43 non-null     object
 8   url                   43 non-null     object
dtypes: int64(1), object(8)
memory usage: 3.4+ KB


In [7]:
# Construct the namespaces that RDFlib does not know out of the box:
# the countries vocabulary and the project's competition, game and players ontologies.
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
CO = Namespace("http://www.dei.unipd.it/database2/competitionOntology#")
GM = Namespace("http://www.dei.unipd.it/database2/gameOntology#")
PO = Namespace("http://www.dei.unipd.it/database2/playersOntology#")

In [8]:
# Start from an empty graph.
g = Graph()
# Register a short prefix for every namespace so the serialized output is more readable.
for _prefix, _namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("co", CO),
    ("gm", GM),
    ("po", PO),
):
    g.bind(_prefix, _namespace)

In [9]:
# CHECK DATE
import datetime

In [10]:
%%time
#iterate over the movies dataframe
for index, row in comp.iterrows():
    #Create the node to add to the Graph
    #the node has the namespace + the movie id as URI
    Competition = URIRef(CO[index])
    # Add triples using store's add() method.
    g.add((Competition, RDF.type, CO.Competition))
    g.add((Competition, CO['competitionName'], Literal(row['name'], datatype=XSD.string)))
    g.add((Competition, CO['competitionType'], Literal(row['type'], datatype=XSD.string)))
    g.add((Competition, CO['subType'], Literal(row['sub_type'], datatype=XSD.string)))
    g.add((Competition, CO['domesticLeagueCode'], Literal(row['domestic_league_code'], datatype=XSD.string)))                                                
    g.add((Competition, CO['country_id'], Literal(row['country_id'], datatype=XSD.integer)))
    
    ## handle country
    #there can be more than one country per competition
    for c in str(row['country']).split(','):
        # check if the country exists
        # country.index == x returns an array of booleans, thus we need to use the any() method
        cName = c.strip()
        if((countries.index == cName).any() == True):
            #get the country code, convert to string and get the lower case to match the country codes in the ontology
            code = str(countries[countries.index == cName]['Alpha-2 code'][0]).lower()
            # create the RDF node
            Country = URIRef(CNS[code])
             # add the edge connecting the Competition and the Country
            g.add((Competition, CO['hasCountry'], Country))

CPU times: total: 31.2 ms
Wall time: 32.3 ms


In [11]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'competitions.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 15.6 ms
Wall time: 12.2 ms


In [12]:
# Read the players table into memory, keyed by its player_id column.

print("load csv files")
players = pd.read_csv(
    playersUrl,
    sep=',',
    index_col='player_id',
)
print("there is a problem")
# Preview of the dtypes with last_season cast to int32. The cast result is only
# displayed, not assigned, so players itself keeps its original dtypes.
players.astype({'last_season': 'int32'}).dtypes

load csv files
there is a problem


first_name                               object
last_name                                object
name                                     object
last_season                               int32
current_club_id                           int64
player_code                              object
country_of_birth                         object
city_of_birth                            object
country_of_citizenship                   object
date_of_birth                            object
sub_position                             object
position                                 object
foot                                     object
height_in_cm                            float64
market_value_in_eur                     float64
highest_market_value_in_eur             float64
contract_expiration_date                 object
agent_name                               object
image_url                                object
url                                      object
current_club_domestic_competition_id    

In [13]:
# Summarise the players dataframe: column dtypes and non-null counts.
players.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30302 entries, 598 to 925584
Data columns (total 22 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   first_name                            28337 non-null  object 
 1   last_name                             30302 non-null  object 
 2   name                                  30302 non-null  object 
 3   last_season                           30302 non-null  int64  
 4   current_club_id                       30302 non-null  int64  
 5   player_code                           30302 non-null  object 
 6   country_of_birth                      27613 non-null  object 
 7   city_of_birth                         28099 non-null  object 
 8   country_of_citizenship                29759 non-null  object 
 9   date_of_birth                         30255 non-null  object 
 10  sub_position                          30130 non-null  object 
 11  position         

In [None]:
%%time
#iterate over the players dataframe
for index, row in players.iterrows():
    #Create the node to add to the Graph
    #the node has the namespace + the player id as URI
    Player = URIRef(PO[index])
    # Add triples using store's add() method.
    g.add((Player, RDF.type, PO.Player))
    g.add((Player, PO['playerCode'], Literal(row['player_code'], datatype=XSD.string)))
    g.add((Player, PO['fullName'], Literal(row['name'], datatype=XSD.string)))
    g.add((Player, PO['playerPosition'], Literal(row['position'], datatype=XSD.string)))
    g.add((Player, PO['lastSeason'], Literal(row['last_season'], datatype=XSD.gYear)))
    try:
        datetime.datetime.strptime(str(row['date_of_birth']), '%d-%m-%y')
        g.add((Player, PO['DOB'], Literal(row['date_of_birth'], datatype=XSD.date)))
    except ValueError:
        # probably it's the year alone
        # check length
        if (len(row['date_of_birth'])==4):
            #it is a year
            g.add((Player, PO['DOB'], Literal(row['date_of_birth']+ "-01-01", datatype=XSD.date)))
                                                     
                                                     
    g.add((Player, PO['current_club'], Literal(row['current_club_id'], datatype=XSD.integer)))
    
     ## handle country
    #there can be more than one country per player
    for c in str(row['country_of_citizenship']).split(','):
        # check if the country exists
        # country.index == x returns an array of booleans, thus we need to use the any() method
        cName = c.strip()
        if((countries.index == cName).any() == True):
            #get the country code, convert to string and get the lower case to match the country codes in the ontology
            code = str(countries[countries.index == cName]['Alpha-2 code'][0]).lower()
            # create the RDF node
            Country = URIRef(CNS[code])
             # add the edge connecting the Movie and the Country
            g.add((Player, PO['hasCountry'], Country))

In [None]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'players.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))