# Populate Knowledge Graph

In [1]:
import pandas as pd
import numpy as np
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, OWL, XSD

## Load Data

In [2]:
# Full Dataset

# player_df = pd.read_parquet( r"C:\mahmoud uni\TU\SS2024\KGs\Portfolio\dataset\players_clean.parquet")
# team_df = pd.read_parquet( r"C:\mahmoud uni\TU\SS2024\KGs\Portfolio\dataset\teams_clean.parquet")
# league_df = pd.read_parquet( r"C:\mahmoud uni\TU\SS2024\KGs\Portfolio\dataset\league_clean.parquet")

In [3]:
# sunsampled data
player_df = pd.read_parquet( r"C:\mahmoud uni\TU\SS2024\KGs\Portfolio\dataset\players_small.parquet")
team_df = pd.read_parquet( r"C:\mahmoud uni\TU\SS2024\KGs\Portfolio\dataset\teams_small.parquet")
league_df = pd.read_parquet( r"C:\mahmoud uni\TU\SS2024\KGs\Portfolio\dataset\league_small.parquet")

In [4]:
# Load ontology
g = Graph()
ontology_file = "C:\mahmoud uni\TU\SS2024\KGs\Portfolio\EA_FC_ontology.ttl"
g.parse(ontology_file, format="ttl")

# namespaces
BASE = Namespace("http://www.semanticweb.org/mabsa/ontologies/2024/10/ea-fc-ontology/")
g.bind("base", BASE)
OWL = Namespace("http://www.w3.org/2002/07/owl#")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")

## Populate the Knowledge Graph

In [5]:
#player_df.info()

In [6]:
# Players
count = 0

for _, row in player_df.iterrows():
    # URIs for Player and Club
    player_uri = BASE[f"Player_{row['key']}"]
    club_uri = BASE[f"Club_{row['team_key']}"]
    league_uri = BASE[f"League_{row['league_key']}"]
    
    g.add((player_uri, RDF.type, BASE.Player))
    
    # Data properties
    g.add((player_uri, BASE.name, Literal(row['long_name'], datatype=XSD.string)))
    g.add((player_uri, BASE.age, Literal(row['age'], datatype=XSD.integer)))
    g.add((player_uri, BASE.club_contract_valid_until_year, Literal(row['club_contract_valid_until_year'], datatype=XSD.integer)))
    g.add((player_uri, BASE.club_joined_date, Literal(row['club_joined_date'], datatype=XSD.date)))
    g.add((player_uri, BASE.club_position, Literal(row['club_position'], datatype=XSD.integer)))
    g.add((player_uri, BASE.defending, Literal(row['defending'], datatype=XSD.integer)))
    g.add((player_uri, BASE.dribbling, Literal(row['dribbling'], datatype=XSD.integer)))
    g.add((player_uri, BASE.fifa_version, Literal(row['fifa_version'], datatype=XSD.integer)))    
    g.add((player_uri, BASE.height_cm, Literal(row['height_cm'], datatype=XSD.float)))
    g.add((player_uri, BASE.on_loan, Literal(row['on_loan'], datatype=XSD.boolean)))
    g.add((player_uri, BASE.overall, Literal(row['overall'], datatype=XSD.integer)))
    g.add((player_uri, BASE.pace, Literal(row['pace'], datatype=XSD.integer)))
    g.add((player_uri, BASE.passing, Literal(row['passing'], datatype=XSD.integer)))
    g.add((player_uri, BASE.physic, Literal(row['physic'], datatype=XSD.integer)))
    g.add((player_uri, BASE.player_id, Literal(row['key'], datatype=XSD.string)))
    g.add((player_uri, BASE.position_category, Literal(row['position_category'], datatype=XSD.string)))
    g.add((player_uri, BASE.potential, Literal(row['potential'], datatype=XSD.integer)))
    g.add((player_uri, BASE.shooting, Literal(row['shooting'], datatype=XSD.integer)))
    g.add((player_uri, BASE.skill_moves, Literal(row['skill_moves'], datatype=XSD.integer)))
    g.add((player_uri, BASE.value_eur, Literal(row['value_eur'], datatype=XSD.float)))
    g.add((player_uri, BASE.wage_eur, Literal(row['wage_eur'], datatype=XSD.float)))
    g.add((player_uri, BASE.weak_foot, Literal(row['weak_foot'], datatype=XSD.integer)))
    g.add((player_uri, BASE.weight_kg, Literal(row['weight_kg'], datatype=XSD.float)))
    g.add((player_uri, BASE.preferred_foot, Literal(row['preferred_foot'], datatype=XSD.boolean)))

    g.add((player_uri, BASE.work_rate, Literal(row['work_rate'], datatype=XSD.string)))
    g.add((player_uri, BASE.body_type, Literal(row['body_type'], datatype=XSD.string)))

    g.add((player_uri, BASE.movement_acceleration, Literal(row['movement_acceleration'], datatype=XSD.integer)))
    g.add((player_uri, BASE.movement_sprint_speed, Literal(row['movement_sprint_speed'], datatype=XSD.integer)))
    g.add((player_uri, BASE.movement_agility, Literal(row['movement_agility'], datatype=XSD.integer)))
    g.add((player_uri, BASE.movement_reactions, Literal(row['movement_reactions'], datatype=XSD.integer)))
    g.add((player_uri, BASE.movement_balance, Literal(row['movement_balance'], datatype=XSD.integer)))

    g.add((player_uri, BASE.power_shot_power, Literal(row['power_shot_power'], datatype=XSD.integer)))
    g.add((player_uri, BASE.power_jumping, Literal(row['power_jumping'], datatype=XSD.integer)))
    g.add((player_uri, BASE.power_stamina, Literal(row['power_stamina'], datatype=XSD.integer)))
    g.add((player_uri, BASE.power_strength, Literal(row['power_strength'], datatype=XSD.integer)))
    g.add((player_uri, BASE.power_long_shots, Literal(row['power_long_shots'], datatype=XSD.integer)))

    g.add((player_uri, BASE.mentality_aggression, Literal(row['mentality_aggression'], datatype=XSD.integer)))
    g.add((player_uri, BASE.mentality_interceptions, Literal(row['mentality_interceptions'], datatype=XSD.integer)))
    g.add((player_uri, BASE.mentality_positioning, Literal(row['mentality_positioning'], datatype=XSD.integer)))
    g.add((player_uri, BASE.mentality_vision, Literal(row['mentality_vision'], datatype=XSD.integer)))
    g.add((player_uri, BASE.mentality_penalties, Literal(row['mentality_penalties'], datatype=XSD.integer)))
    g.add((player_uri, BASE.mentality_composure, Literal(row['mentality_composure'], datatype=XSD.integer)))

    g.add((player_uri, BASE.defending_marking_awareness, Literal(row['defending_marking_awareness'], datatype=XSD.integer)))
    g.add((player_uri, BASE.defending_standing_tackle, Literal(row['defending_standing_tackle'], datatype=XSD.integer)))
    g.add((player_uri, BASE.defending_sliding_tackle, Literal(row['defending_sliding_tackle'], datatype=XSD.integer)))

    g.add((player_uri, BASE.goalkeeping_diving, Literal(row['goalkeeping_diving'], datatype=XSD.integer)))
    g.add((player_uri, BASE.goalkeeping_handling, Literal(row['goalkeeping_handling'], datatype=XSD.integer)))
    g.add((player_uri, BASE.goalkeeping_kicking, Literal(row['goalkeeping_kicking'], datatype=XSD.integer)))
    g.add((player_uri, BASE.goalkeeping_positioning, Literal(row['goalkeeping_positioning'], datatype=XSD.integer)))
    g.add((player_uri, BASE.goalkeeping_reflexes, Literal(row['goalkeeping_reflexes'], datatype=XSD.integer)))
    g.add((player_uri, BASE.goalkeeping_speed, Literal(row['goalkeeping_speed'], datatype=XSD.integer)))

    # Object Properties
    g.add((player_uri, BASE.plays_for, club_uri))  # Relationship to Club
    g.add((player_uri, BASE.competes_in, league_uri))  # Relationship to League


    count += 1
    print(f"\r{count}/{player_df.shape[0]}", end="")


8672/8672

In [7]:

# # Create teammates relationships
# from itertools import combinations

# print("\nAdding teammates relationships...")

# # Group by team_key and fifa_version to get players in the same team at the same time
# team_groups = player_df.groupby(['team_key', 'fifa_version'])['key'].apply(list)

# # Iterate over each group (team in a specific FIFA version)
# for (team_key, fifa_version), player_keys in team_groups.items():
#     # Get all possible pairs of players within the same team (without redundancy)
#     for player_a, player_b in combinations(player_keys, 2):
#         player_a_uri = BASE[f"Player_{player_a}"]
#         player_b_uri = BASE[f"Player_{player_b}"]
        
#         # Add teammates relationship (symmetric)
#         g.add((player_a_uri, BASE.teammates, player_b_uri))
#         g.add((player_b_uri, BASE.teammates, player_a_uri))  # Explicit symmetry (optional)

# print("Teammates relationships added.")

In [8]:
# from multiprocessing import Pool
# from itertools import combinations

# def process_team(team_players):
#     player_uris = [BASE[f"Player_{player['key']}"] for _, player in team_players.iterrows()]
#     for player_a, player_b in combinations(player_uris, 2):
#         g.add((player_a, BASE.plays_with, player_b))
#         g.add((player_b, BASE.plays_with, player_a))

# teams = [group for _, group in player_df.groupby('team_key')]
# with Pool() as pool:
#     pool.map(process_team, teams)

'''This part of the code will not be used and the object property 
will be removed as the run time is too extensive for the scope of the project'''

'This part of the code will not be used and the object property \nwill be removed as the run time is too extensive for the scope of the project'

In [9]:
# Clubs
count = 0

for _, row in team_df.iterrows():
    club_uri = BASE[f"Club_{row['team_key']}"]
    league_uri = BASE[f"League_{row['league_key']}"]
    rival_uri = BASE[f"Club_{row['rival_key']}"]
    
    # Club type
    g.add((club_uri, RDF.type, BASE.Club))
    
    # Data Properties
    g.add((club_uri, BASE.name, Literal(row['team_name'], datatype=XSD.string)))
    g.add((club_uri, BASE.overall, Literal(row['overall'], datatype=XSD.integer)))
    g.add((club_uri, BASE.attack, Literal(row['attack'], datatype=XSD.integer)))
    g.add((club_uri, BASE.midfield, Literal(row['midfield'], datatype=XSD.integer)))
    g.add((club_uri, BASE.defence, Literal(row['defence'], datatype=XSD.integer)))
    g.add((club_uri, BASE.transfer_budget_eur, Literal(row['transfer_budget_eur'], datatype=XSD.float)))
    g.add((club_uri, BASE.club_worth_eur, Literal(row['club_worth_eur'], datatype=XSD.float)))
    g.add((club_uri, BASE.starting_xi_average_age, Literal(row['starting_xi_average_age'], datatype=XSD.float)))
    g.add((club_uri, BASE.whole_team_average_age, Literal(row['whole_team_average_age'], datatype=XSD.float)))
    g.add((club_uri, BASE.domestic_prestige, Literal(row['domestic_prestige'], datatype=XSD.integer)))
    g.add((club_uri, BASE.international_prestige, Literal(row['international_prestige'], datatype=XSD.integer)))
    
    # Object Properties
    g.add((club_uri, BASE.part_of_league, league_uri))  # Relationship to League
    g.add((club_uri, BASE.rival_with, rival_uri))

    # Add Rivals Relation (if rival_key exists)
    if not pd.isna(row['rival_key']) and row['rival_key'] in team_df['team_key'].values:
        rival_uri = BASE[f"Club_{row['rival_key']}"]
        # Add symmetric rivalry relationship
        g.add((club_uri, BASE.rivals_with, rival_uri))
        g.add((rival_uri, BASE.rivals_with, club_uri))  # Ensuring symmetry
        
    count += 1
    print(f"\r{count}/{team_df.shape[0]}", end="")




2163/2163

In [10]:
# Leagues
count = 0

for _, row in league_df.iterrows():
    league_uri = BASE[f"League_{row['league_key']}"]
    
    g.add((league_uri, RDF.type, BASE.League))
    
    # Data Properties
    g.add((league_uri, BASE.name, Literal(row['league_name'], datatype=XSD.string)))
    g.add((league_uri, BASE.league_level, Literal(row['league_level'], datatype=XSD.integer)))
    g.add((league_uri, BASE.overall, Literal(row['overall'], datatype=XSD.integer)))
    g.add((league_uri, BASE.attack, Literal(row['attack'], datatype=XSD.integer)))
    g.add((league_uri, BASE.midfield, Literal(row['midfield'], datatype=XSD.integer)))
    g.add((league_uri, BASE.defence, Literal(row['defence'], datatype=XSD.integer)))
    g.add((league_uri, BASE.transfer_budget_eur, Literal(row['transfer_budget_eur'], datatype=XSD.float)))
    g.add((league_uri, BASE.club_worth_eur, Literal(row['club_worth_eur'], datatype=XSD.float)))
    g.add((league_uri, BASE.league_nationality_name, Literal(row['league_nationality_name'], datatype=XSD.string)))

    count += 1
    print(f"\r{count}/{league_df.shape[0]}", end="")


175/175

## Save populated Graph

In [11]:
# Save the graph to a Turtle file
output_file = r"C:\mahmoud uni\TU\SS2024\KGs\Portfolio\dataset\EA_FC_knowledge_graph.ttl"
g.serialize(destination=output_file, format="ttl")

<Graph identifier=Nc36b03dd7d3944918639d31d74e40f4a (<class 'rdflib.graph.Graph'>)>

## Validation

In [12]:
# Count all triples
print(f"Total triples: {len(g)}")

# class membership
player_count = len(list(g.subjects(RDF.type, BASE.Player)))
print(f"Total Players: {player_count}")

team_count = len(list(g.subjects(RDF.type, BASE.Club)))
print(f"Total Teams: {team_count}")

league_count = len(list(g.subjects(RDF.type, BASE.League)))
print(f"Total Leagues: {league_count}")

Total triples: 503731
Total Players: 8672
Total Teams: 2163
Total Leagues: 175


In [13]:
# players missing plays_for relationship
players = g.subjects(RDF.type, BASE.Player)
missing_plays_for = [
    player for player in players 
    if not (player, BASE.plays_for, None) in g
]

print(f"Players without 'plays_for' relationships: {len(missing_plays_for)}")

Players without 'plays_for' relationships: 0


In [14]:
invalid_plays_for = [
    (player, club) for player, club in g.subject_objects(BASE.plays_for)
    if not (club, RDF.type, BASE.Club) in g
]

if invalid_plays_for:
    print("Invalid 'plays_for' relationships found:")
    for player, club in invalid_plays_for:
        print(f"Player {player} points to non-club {club}")
else:
    print("All 'plays_for' relationships are valid.")

All 'plays_for' relationships are valid.


In [15]:
clubs = g.subjects(RDF.type, BASE.Club)
missing_league = [
    club for club in clubs 
    if not (club, BASE.part_of_league, None) in g
]

print(f"Clubs missing 'part_of_league' relationships: {len(missing_league)}")


Clubs missing 'part_of_league' relationships: 0


In [16]:
# isolated nodes (not appearing as subject or object in any triple)
nodes = set(g.subjects()).union(set(g.objects()))
isolated_nodes= [
    node for node in nodes 
    if not (node, None, None) in g and not (None, None, node) in g
]

print(f"Isolated nodes: {len(isolated_nodes)}")


Isolated nodes: 0


In [17]:
## if needed

# dtype_player = {
#     "key": "str",                           # Unique player key
#     "player_id": "int64",                   # Player ID as integer
#     "player_url": "str",                    # Player URL
#     "fifa_version": "int64",                # FIFA version
#     "long_name": "str",                     # Player full name
#     "player_positions": "str",              # Positions (e.g., "ST, LW")
#     "overall": "int64",                     # Overall rating
#     "position_category": "str",             # Position category (e.g., ATT, MID)
#     "potential": "int64",                   # Player's potential rating
#     "value_eur": "float64",                 # Player's value in euros
#     "wage_eur": "float64",                  # Wage in euros (converted from string)
#     "age": "int64",                         # Age
#     "height_cm": "int64",                   # Height in cm
#     "weight_kg": "int64",                   # Weight in kg
#     "club_team_id": "int64",                # Club team ID
#     "team_key": "str",                      # Team key
#     "league_name": "str",                   # League name
#     "league_nationality_name": "str",       # League's nationality
#     "league_id": "int64",                   # League ID
#     "league_key": "str",                    # League key
#     "club_name": "str",                     # Club name
#     "league_level": "int64",                # League level
#     "club_position": "int64",             # Club position (float due to NaN or partial values)
#     "club_joined_date": "str",              # Date when player joined the club
#     "club_contract_valid_until_year": "int64",  # Contract valid year
#     "nationality_id": "int64",              # Nationality ID
#     "nationality_name": "str",              # Nationality name
#     "preferred_foot": "int64",            # Preferred foot (0 or 1, float to handle NaN)
#     "weak_foot": "int64",                   # Weak foot rating
#     "skill_moves": "int64",                 # Skill moves rating
#     "international_reputation": "int64",   # International reputation
#     "work_rate": "str",                     # Work rate (e.g., High/Low)
#     "body_type": "str",                     # Body type (e.g., Unique, Normal)
#     "pace": "int64",                      # Pace rating
#     "shooting": "int64",                  # Shooting rating
#     "passing": "int64",                   # Passing rating
#     "dribbling": "int64",                 # Dribbling rating
#     "defending": "int64",                 # Defending rating
#     "physic": "int64",                    # Physical rating
#     "attacking_crossing": "int64",          # Attacking crossing rating
#     "attacking_finishing": "int64",         # Attacking finishing rating
#     "attacking_heading_accuracy": "int64", # Heading accuracy
#     "attacking_short_passing": "int64",     # Short passing
#     "attacking_volleys": "int64",           # Volleys
#     "skill_dribbling": "int64",             # Dribbling skill
#     "skill_curve": "int64",                 # Curve skill
#     "skill_fk_accuracy": "int64",           # Free-kick accuracy
#     "skill_long_passing": "int64",          # Long passing skill
#     "skill_ball_control": "int64",          # Ball control skill
#     "movement_acceleration": "int64",       # Acceleration
#     "movement_sprint_speed": "int64",       # Sprint speed
#     "movement_agility": "int64",            # Agility
#     "movement_reactions": "int64",          # Reactions
#     "movement_balance": "int64",            # Balance
#     "power_shot_power": "int64",            # Shot power
#     "power_jumping": "int64",               # Jumping
#     "power_stamina": "int64",               # Stamina
#     "power_strength": "int64",              # Strength
#     "power_long_shots": "int64",            # Long shots
#     "mentality_aggression": "int64",        # Aggression
#     "mentality_interceptions": "int64",     # Interceptions
#     "mentality_positioning": "int64",       # Positioning
#     "mentality_vision": "int64",            # Vision
#     "mentality_penalties": "int64",         # Penalties
#     "mentality_composure": "int64",       # Composure (float due to NaN or partial values)
#     "defending_marking_awareness": "int64", # Marking awareness
#     "defending_standing_tackle": "int64",   # Standing tackle
#     "defending_sliding_tackle": "int64",    # Sliding tackle
#     "goalkeeping_diving": "int64",          # Goalkeeping diving
#     "goalkeeping_handling": "int64",        # Goalkeeping handling
#     "goalkeeping_kicking": "int64",         # Goalkeeping kicking
#     "goalkeeping_positioning": "int64",     # Goalkeeping positioning
#     "goalkeeping_reflexes": "int64",        # Goalkeeping reflexes
#     "goalkeeping_speed": "int64",         # Goalkeeping speed
#     "on_loan": "int64",                   # On loan status (0 or 1, float to handle NaN)
#     "age_group": "str",                     # Age group
#     "overall_range": "str",                 # Overall rating range
# }
