In [1]:
# required libraries
import os
import pandas as pd
import numpy as np
from pathlib import Path
# # Load the required libraries 
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# # rdflib knows about some namespaces, like FOAF 
from rdflib.namespace import FOAF, XSD
# CHECK DATE 
import datetime

In [2]:
# parameters and URLs"
path = str(Path(os.path.abspath(os.getcwd())).parent.absolute())
print("Here is the path "+ path)
path = 'C:/Users/39344/Desktop/semproject/csv'
appearancesUrl = path + '/appearances.csv'
clubGamesUrl = path + '/club_games.csv'
clubUrl = path + '/clubs.csv'
competitionsUrl = path + '/competitions.csv'
gameEventsUrl = path + '/game_events.csv'
gameLineupsUrl = path + '/game_lineups.csv'
gamesUrl = path + '/games.csv'
playerValuationsUrl = path + '/player_valuations.csv'
playersUrl = path + '/players.csv'

# country code
countriesURL = path + '/wikipedia-iso-country-codes.csv'

print("File paths are working perfectly.")
# saving folder
savePath =  path + '/data/'
print("executed all lines")

Here is the path C:\Users\39344\Desktop
File paths are working perfectly.
executed all lines


In [3]:
# Load the CSV files in memory

print("load csv files")
comp = pd.read_csv(competitionsUrl, sep = ',', index_col = 'competition_id')
print("there is a problem")
comp.astype({'country_id': 'int32'}).dtypes

load csv files
there is a problem


competition_code        object
name                    object
sub_type                object
type                    object
country_id               int32
country                 object
domestic_league_code    object
confederation           object
url                     object
dtype: object

In [4]:
#load the country codes\
# we need to convert NaN values to something else otherwise NA strings are converted to NaN -> problem with Namibia
countries = pd.read_csv(countriesURL, sep=',', index_col='Name', keep_default_na=False, na_values=['_'])

In [5]:
comp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43 entries, CIT to FRCH
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   competition_code      43 non-null     object
 1   name                  43 non-null     object
 2   sub_type              43 non-null     object
 3   type                  43 non-null     object
 4   country_id            43 non-null     int64 
 5   country               36 non-null     object
 6   domestic_league_code  36 non-null     object
 7   confederation         43 non-null     object
 8   url                   43 non-null     object
dtypes: int64(1), object(8)
memory usage: 3.4+ KB


In [6]:
# Construct the country and the movie ontology namespaces not known by RDFlib\n",
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
CO = Namespace("http://www.dei.unipd.it/database2/competitionOntology#")
CB = Namespace("http://www.dei.unipd.it/database2/clubOntology#")
PO = Namespace("http://www.dei.unipd.it/database2/playersOntology#")

In [7]:
#create the graph\
g = Graph()
# Bind the namespaces to a prefix for more readable output\n",
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("co", CO)
g.bind("cb", CB)
g.bind("po", PO)

In [8]:
# CHECK DATE
import datetime

In [9]:
%%time
#iterate over the movies dataframe
for index, row in comp.iterrows():
    #Create the node to add to the Graph
    #the node has the namespace + the movie id as URI
    Competition = URIRef(CO[index])
    # Add triples using store's add() method.
    g.add((Competition, RDF.type, CO.Competition))
    g.add((Competition, CO['competitionName'], Literal(row['name'], datatype=XSD.string)))
    g.add((Competition, CO['competitionType'], Literal(row['type'], datatype=XSD.string)))
    g.add((Competition, CO['subType'], Literal(row['sub_type'], datatype=XSD.string)))
    g.add((Competition, CO['domesticLeagueCode'], Literal(row['domestic_league_code'], datatype=XSD.string)))                                                
    g.add((Competition, CO['country_id'], Literal(row['country_id'], datatype=XSD.integer)))
    
    ## handle country
    #there can be more than one country per competition
    for c in str(row['country']).split(','):
        # check if the country exists
        # country.index == x returns an array of booleans, thus we need to use the any() method
        cName = c.strip()
        if((countries.index == cName).any() == True):
            #get the country code, convert to string and get the lower case to match the country codes in the ontology
            code = str(countries[countries.index == cName]['Alpha-2 code'][0]).lower()
            # create the RDF node
            Country = URIRef(CNS[code])
             # add the edge connecting the Competition and the Country
            g.add((Competition, CO['hasCountry'], Country))

CPU times: total: 31.2 ms
Wall time: 37.8 ms


In [10]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'competitions.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 31.2 ms
Wall time: 15.3 ms


In [13]:
# Load the CSV files in memory

print("load csv files")
club = pd.read_csv(clubUrl, sep = ',', index_col = 'club_id')
print("there is a problem")
club.astype({'national_team_players': 'int32'}).dtypes

load csv files
there is a problem


club_code                   object
name                        object
domestic_competition_id     object
total_market_value         float64
squad_size                   int64
average_age                float64
foreigners_number            int64
foreigners_percentage      float64
national_team_players        int32
stadium_name                object
stadium_seats                int64
net_transfer_record         object
coach_name                 float64
last_season                  int64
url                         object
dtype: object

In [24]:
# Load the CSV files in memory\n",
club = pd.read_csv(clubUrl, sep=',', index_col='club_id', keep_default_na=False, na_values=['_'])

In [14]:
club.info()

<class 'pandas.core.frame.DataFrame'>
Index: 426 entries, 105 to 984
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   club_code                426 non-null    object 
 1   name                     426 non-null    object 
 2   domestic_competition_id  426 non-null    object 
 3   total_market_value       0 non-null      float64
 4   squad_size               426 non-null    int64  
 5   average_age              388 non-null    float64
 6   foreigners_number        426 non-null    int64  
 7   foreigners_percentage    379 non-null    float64
 8   national_team_players    426 non-null    int64  
 9   stadium_name             426 non-null    object 
 10  stadium_seats            426 non-null    int64  
 11  net_transfer_record      426 non-null    object 
 12  coach_name               0 non-null      float64
 13  last_season              426 non-null    int64  
 14  url                      426 

In [15]:
#create a new graph
g = Graph()

In [19]:
%%time

# Iterate over the DataFrame rows
for index, row in club.iterrows():
    # Create a URI for the club
    Clubs = URIRef(CB[index])
    
    # Add triples for each row
    g.add((Clubs, RDF.type, CB.Clubs))
    g.add((Clubs, CB['fullname'], Literal(row['name'], datatype=XSD.string)))
    g.add((Clubs, CB['squadSize'], Literal(row['squad_size'], datatype=XSD.integer)))
    g.add((Clubs, CB['foreigners'], Literal(row['foreigners_number'], datatype=XSD.integer)))
# Bind namespaces for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("co", CO)
g.bind("cb", CB)


CPU times: total: 78.1 ms
Wall time: 77.4 ms


In [20]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'club.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 46.9 ms
Wall time: 65.4 ms


In [11]:
# Load the CSV files in memory\n",
people = pd.read_csv(playersUrl, sep=',', index_col='player_id', keep_default_na=False, na_values=['_'])

In [12]:
people.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30302 entries, 598 to 925584
Data columns (total 22 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   first_name                            30302 non-null  object
 1   last_name                             30302 non-null  object
 2   name                                  30302 non-null  object
 3   last_season                           30302 non-null  int64 
 4   current_club_id                       30302 non-null  int64 
 5   player_code                           30302 non-null  object
 6   country_of_birth                      30302 non-null  object
 7   city_of_birth                         30302 non-null  object
 8   country_of_citizenship                30302 non-null  object
 9   date_of_birth                         30302 non-null  object
 10  sub_position                          30302 non-null  object
 11  position                      

In [13]:
#create a new graph
g = Graph()

In [14]:
%%time
#iterate over the movies dataframe
for index, row in people.iterrows():
    #Create the node to add to the Graph
    #the node has the namespace + the movie id as URI
    Person = URIRef(PO[index])
    # Add triples using store's add() method.
    g.add((Person, RDF.type, PO.Person))
    g.add((Person, PO['fullName'], Literal(row['name'], datatype=XSD.string)))
    g.add((Person, PO['playerPosition'], Literal(row['position'], datatype=XSD.string)))
    g.add((Person, PO['playerCode'], Literal(row['player_code'], datatype=XSD.string)))
    g.add((Person, PO['clubID'], Literal(row['current_club_id'], datatype=XSD.integer)))                                                
    g.add((Person, PO['lastSeason'], Literal(row['last_season'], datatype=XSD.gYear)))
    
    ## handle country
    #there can be more than one country per competition
    for c in str(row['country_of_citizenship']).split(','):
        # check if the country exists
        # country.index == x returns an array of booleans, thus we need to use the any() method
        cName = c.strip()
        if((countries.index == cName).any() == True):
            #get the country code, convert to string and get the lower case to match the country codes in the ontology
            code = str(countries[countries.index == cName]['Alpha-2 code'][0]).lower()
            # create the RDF node
            Country = URIRef(CNS[code])
             # add the edge connecting the Competition and the Country
            g.add((Person, PO['hasCountry'], Country))

CPU times: total: 17 s
Wall time: 17.1 s


In [16]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'player.ttl', 'w', newline='', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 13.2 s
Wall time: 13.2 s
