## Basket Data Processing

In [1]:
# required libraries
import pandas as pd
import os
import datetime
from pathlib import Path
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD

In [2]:
#parameters and URLs
path = str(Path(os.path.abspath(os.getcwd())).parent.absolute())

gamesUrl=path+'/data/games.csv'
teamsUrl=path+'/data/teams.csv'
rankingUrl=path+'/data/ranking.csv'
playerUrl=path+'/data/players.csv'
gameDetailUrl=path+'/data/games_details.csv'

#savePath
savePath= path + '/data/rdf/'


In [3]:
SO=Namespace("http://www.semanticweb.org/avata/ontologies/2021/10/basket")

### Games

In [4]:
#load the cvs file in memory ATTENZIONE sostituire nome e indexcol
games=pd.read_csv(gamesUrl,sep=',',index_col='GAME_ID')

#create the graph
g = Graph()
#bind the namespaces to a prefix for a more readable output
g.bind("foaf", FOAF)
g.bind("xsd",XSD)
g.bind("so",SO)

In [5]:
%%time
#iterate over the dataframe
for index,row in games.iterrows():
    #create the node to add to the graph
        
    idU= 'game'+str(index) 
    game=URIRef(SO[idU])
    #addtriples using store's add method
    g.add((game, RDF.type, SO.game))
    datetime.datetime.strptime(str(row['GAME_DATE_EST']),'%Y-%m-%d')
    g.add((game,SO['gameDate'],Literal(row['GAME_DATE_EST'],datatype=XSD.date)))
    g.add((game,SO['homeTeamWins'],Literal(row['HOME_TEAM_WINS'],datatype=XSD.boolean)))
    #add the edge connecting teams and games
    #home team
    idL="team"+str(row['HOME_TEAM_ID'])
    homeTeam=URIRef(SO[idL])
    g.add((homeTeam,SO['homeClub'],game))

    #visitor team
    idL="team"+str(row['VISITOR_TEAM_ID'])
    visitorTeam=URIRef(SO[idL]) 
    g.add((visitorTeam,SO['awayClub'],game))

Wall time: 16.2 s


In [6]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'game.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
Wall time: 18.1 s


### Teams

In [7]:
#read input
teams=pd.read_csv(teamsUrl,sep=',',index_col='TEAM_ID')

#create the graph
g=Graph()

#bind the namespaces to a prefix for a more readable output
g.bind("foaf", FOAF)
g.bind("xsd",XSD)
g.bind("so",SO)

In [8]:
%%time
#iterate over dataframe
for index,row in teams.iterrows():
    #attention here decide how to identify teams here i use only the ID
    idU="team"+str(index)
    team=URIRef(SO[idU])
    
    #add triple team 
    g.add((team,RDF.type,SO.team))
    
    g.add((team,SO['nickname'],Literal(row['NICKNAME'],datatype=XSD.string)))
    g.add((team,SO['abbreviation'],Literal(row['ABBREVIATION'],datatype=XSD.string)))
    g.add((team,SO['yearFounded'],Literal(row['YEARFOUNDED'],datatype=XSD.int)))
   
    #insert city and arena
    #I replace empty spaces with - so to have valid uri
    cityName=row['CITY'].replace(" ","-")
    
    idL="city"+str(cityName)
    city=URIRef(SO[idL])
    
    g.add((city,RDF.type,SO.city))
    g.add((team,SO['hasHomeCity'],URIRef(SO[idL])))
    g.add((city,SO['cityName'],Literal(row['CITY'],datatype=XSD.string)))
    
    
    arenaName=row['ARENA'].replace(" ","-")
    
    
    idL="arena"+str(arenaName)
    
    arena=URIRef(SO[idL])
    
    g.add((arena,RDF.type,SO.arena))
    g.add((team,SO['hasHomeArena'],URIRef(SO[idL])))
    g.add((arena,SO['arenaName'],Literal(row['ARENA'],datatype=XSD.string)))
    
    g.add((arena,SO['arenaCapacity'],Literal(row['ARENACAPACITY'],datatype=XSD.float)))
    
    #relationship between arena and city
    g.add((arena,SO['isInCity'],city))

Wall time: 107 ms


In [9]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'teams.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
Wall time: 121 ms


### Rankings

In [10]:
#read input
ranking=pd.read_csv(rankingUrl,sep=',',index_col='TEAM_ID')

#create the graph
g=Graph()

#bind the namespaces to a prefix for a more readable output
g.bind("foaf", FOAF)
g.bind("xsd",XSD)
g.bind("so",SO)

In [11]:
%%time
#iterate over the dataframe
for index,row in ranking.iterrows():
    
    idU="team"+str(index)+"-"+str(row['STANDINGSDATE'])
    
    ranking=URIRef(SO[idU])
    
    g.add((ranking,RDF.type,SO.ranking))
    
    #add datetime and rank
    datetime.datetime.strptime(str(row['STANDINGSDATE']),'%Y-%m-%d')
    g.add((ranking,SO['date'],Literal(row['STANDINGSDATE'],datatype=XSD.date)))
    
    g.add((ranking,SO['rank'],Literal(row['W_PCT'],datatype=XSD.double)))
   
    #relationship between ranking and team
    idL="team"+str(index)
    
    team=URIRef(SO[idL])
    #relationship between ranking and teams
    g.add((team,SO['hasRanking'],ranking))

Wall time: 1min 31s


In [12]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'ranking.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
Wall time: 1min 24s


### Players

In [13]:
#maybe insert keep default na
person=pd.read_csv(playerUrl,sep=',',index_col='PLAYER_ID')
#create the graph
g=Graph()

#bind the namespaces to a prefix for a more readable output
g.bind("foaf", FOAF)
g.bind("xsd",XSD)
g.bind("so",SO)

In [14]:
%%time
for index,row in person.iterrows():
    #create the node to add to the graph
    idU="person"+str(index)
    Person=URIRef(SO[idU])    
    g.add((Person,RDF.type,SO.Person))
    
    #process player name
    name=row['PLAYER_NAME'].split(' ')
    #0 per il nome print(Literal(name[0],datatype=XSD.string))
    
    #-1 per il cognome print(Literal(name[-1],datatype=XSD.string))   
    if(len('name')>1):
        g.add((Person,SO['firstName'],Literal(name[0],datatype=XSD.string)))
       
        g.add((Person,SO['lastName'],Literal(name[-1],datatype=XSD.string))) 
        
    else:
        g.add((Person,SO['lastName'],Literal(name[0],datatype=XSD.string)))
    
    
    
    
    #processing Partecipation
    #uri of partecipation composed by playerid and by season    
    
    idH="Partecipation"+str(index)+str(row['SEASON'])
    Partecipation=URIRef(SO[idH])
    g.add((Partecipation,RDF.type,SO.Partecipation))
    #relation between Partecipation and Person
    g.add((Person,SO['playedIn'],Partecipation))
    
    #relation between partecipation and Team
    idTeam="team"+str(row['TEAM_ID'])
    Team=URIRef(SO[idTeam])
    g.add((Partecipation,SO['playedFor'],Team))
    
    #insert year for partecipation ATT: IT'S AN INT?
    g.add((Partecipation,SO['year'],Literal(row['SEASON'],datatype=XSD.int)))


Wall time: 3.53 s


In [15]:
%%time
#processing play in  
gamedetails=pd.read_csv(gameDetailUrl,sep=',',index_col='GAME_ID')
for index,row in gamedetails.iterrows():         
    idU="person"+str(row['PLAYER_ID'])
    Person=URIRef(SO[idU]) 
    idL= "game"+str(index) 
    Game=URIRef(SO[idL])
   #add relationship between person and game
    g.add((Person,SO['playInGame'],Game)) 

Wall time: 1min 15s


In [16]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'person.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
Wall time: 45.6 s
