In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import networkx as nx
import folium
from folium.plugins import HeatMap, MarkerCluster
from matplotlib.animation import FuncAnimation, PillowWriter 
import geopandas as gpd
import matplotlib.pyplot as plt
from datetime import datetime 
from folium.plugins import TimestampedGeoJson 
import os
import pathpy as pp
import numpy as np
from collections import defaultdict, deque
from pandas import Timestamp
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict, deque


# Web Scraping

Define Helper Functions

In [18]:
def retrieve_tables(tables,club_name):
    # create dictionary to store data from 1 club
    dct = {col:[] for col in ["In/Out","Name","Age","Nationality","Position","Market value","Left","Joined","Fee"]}

    # for in & out table in tables
    for i,table in enumerate(tables):
        # in or out table
        in_out = table.find('thead').find('th').text

        # find all rows of the table
        rows = table.find('tbody').findAll('tr')

        if ("No new arrivals" in rows[0].text)|("No departures" in rows[0].text):
            return dct 

        # for each row in table
        for j,row in enumerate(rows):
            # retrieve info from that row and store it in dct
            dct = retrieve_row(row,dct,in_out,club_name)
    
    return dct
    

def retrieve_row(row,dct,in_out,club_name):
    
    row.find('')

    # get a list of all row_items
    row_items=row.findAll('td')

    # retrieve name information
    name_info = row_items[0].find('a')
    # print(row_items[0])
    name = (name_info.text,name_info.get('href'))

    # retrieve age
    age = row_items[1].text

    # retrieve Nationality
    nationalities = [(country.get("alt"),country.get("src")) for country in row_items[2].findAll('img')]

    # retrieve position
    position = (row_items[3].text,row_items[4].text)

    # retrieve market value
    market_value = row_items[5].text

    # left
    # if "Without Club" in row_items[7].text:
    #     other_club = ("Without Club","Without Club")
    # elif "Retired" in row_items[7].text:
    #     other_club = ("Retired","Retired")
    # elif "Unknown" in row_items[7].text:
    #     other_club = ("Unknown","Unknown")
    # elif "Career break" in row_items[7].text:
    #     other_club = ("Career break","Career break")
    # else:
    # the try  
    try:
        other_club = (row_items[6].find('a').get('title'),row_items[7].find('img').get('title')) # (club_name,country_of_club)
    except:
        txt = "Other: "+row_items[7].text
        other_club = (txt,txt)

    # fee
    fee = row_items[8].text

    dct["In/Out"].append(in_out)
    dct["Name"].append(name)
    dct["Age"].append(age)
    dct["Nationality"].append(nationalities)
    dct["Position"].append(position)
    dct["Market value"].append(market_value)
    dct["Fee"].append(fee)
    if in_out == "In":
        dct["Left"].append(other_club)
        dct["Joined"].append(club_name)
    elif in_out == "Out":
        dct["Left"].append(club_name)
        dct["Joined"].append(other_club)

    return dct

Open URL & Load Soup Object

In [19]:
URL = "https://www.transfermarkt.com/premier-league/transfers/wettbewerb/GB1?saison_id=1992"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
response = requests.get(URL, headers=headers)

soup = BeautifulSoup(response.text)

Create Data

In [20]:
# # define main container with all tables (In & Out) of all clubs
# main_container = soup.find('div',class_='large-8 columns')

# # select all div's with a box in the main_container
# boxes = main_container.findAll('div',class_='box')


# # Create data for every club
# transfer_data = {}
                                            
# # Loop over all boxes:                     #only want boxes with a h2 object that has the class named "content-box-headline content-box-headline--inverted content-box-headline--logo", because the first box is the header where data can be selected and the second box is the logo list and the third box is the sub box of the second box class
# for h,box in enumerate(boxes):
#     h2_box = box.find('h2',class_="content-box-headline content-box-headline--inverted content-box-headline--logo")

#     # ONLY PERFORM ACTIONS WHEN IT IS A VALID BOX
#     if h2_box != None: 
#         # retrieve club name
#         club_name = h2_box.find('a').get('title')
#         print(str(club_name))
#         # retrieve all tables in box
#         tables = box.findAll('table')
#         club_data = retrieve_tables(tables,club_name)

#         # append club_data to main data dictionary
#         transfer_data[club_name] = club_data

Middlesbrough FC
Wimbledon FC (- 2004)
Norwich City
Oldham Athletic
Queens Park Rangers
Sheffield Wednesday
Coventry City
Manchester United
Crystal Palace
Nottingham Forest
Ipswich Town
Arsenal FC
Chelsea FC
Aston Villa
Leeds United
Sheffield United
Manchester City
Southampton FC
Blackburn Rovers
Tottenham Hotspur
Liverpool FC
Everton FC


Retrieve per Year

In [21]:
def retrieve_page(transfer_data,year,headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}):
    URL = f"https://www.transfermarkt.com/premier-league/transfers/wettbewerb/GB1/plus/?saison_id={year}"

    response = requests.get(URL, headers=headers)

    soup = BeautifulSoup(response.text)

    # Define main container with all tabels (In & Out) of all clubs
    main_container = soup.find('div',class_='large-8 columns')

    # select all div's with a box in the main_container
    boxes = main_container.findAll('div',class_='box')

    # Loop over all boxes:                     # only want boxes with a h2 object that has the class named "content-box-headline content-box-headline--inverted content-box-headline--logo", because the first box is the header where data can be selected and the second box is the logo list and the third box is the sub box of the second box class
    for i,box in enumerate(boxes):
        h2_box = box.find('h2',class_="content-box-headline content-box-headline--inverted content-box-headline--logo")

        # ONLY PERFORM ACTIONS WHEN IT IS A VALID BOX
        if h2_box != None: 
            # retrieve club name
            club_name = h2_box.find('a').get('title')
            # retrieve all tables in box
            tables = box.findAll('table')
            club_data = retrieve_tables(tables,club_name)   

            # append club_data to main data dictionary
            transfer_data[f"{club_name}_{year}"] = club_data
    
    return transfer_data

In [26]:
# Creëer data for every club
transfer_data = {}

for year in tqdm(range(1992,2025)):
    # print(year)
    transfer_data = retrieve_page(transfer_data,year)

100%|██████████| 33/33 [06:23<00:00, 11.63s/it]


# Data Preprocessing

In [23]:
data_frames=[]
for k in transfer_data.keys():
    # Extract club and year from the key
    club, year = k.split("_")
    
    # Create new data with additional columns for year and club
    num_rows = len(transfer_data[k]["In/Out"])
    new_data = {
        **transfer_data[k],
        "Year": [year] * num_rows,
        "Club table": [club] * num_rows
    }
    
    # Convert the new data to a DataFrame and append it to the list
    df = pd.DataFrame(new_data)
    data_frames.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
final_df = pd.concat(data_frames, ignore_index=True)

final_df=pd.read_pickle("\Data\transfers.pkl")

final_df['Name'] = final_df['Name'].apply(lambda x: x[0] if isinstance(x,tuple) else x)   # removed hyperlink
final_df['Nationality'] = final_df['Nationality'].apply(lambda x: [item[0] for item in x] if isinstance(x, list) else x)    # removed hyperlink
final_df['Left'] = [i if type(i)==tuple else (i,"England") for i in final_df["Left"]]
final_df['Joined'] = [i if type(i)==tuple else (i,"England")for i in final_df['Joined']]
final_df = final_df[['Name', 'Age', 'Nationality', 'Position', 'Market value', 'Left', 'Joined', 'Fee', 'Year']]

In [None]:
# Transfer to Pickle file after preprocessing 
final_df.to_pickle("\Data\transfer.pkl")

# Summary Statistics

In [None]:
# Replace '-' with NaN
final_df.replace('-', np.nan, inplace=True)

# Compute summary statistics
summary_stats = final_df.describe(include='all')

# Calculate frequency of most common values for categorical columns
freq_stats = {}
for column in final_df.columns:
    if final_df[column].dtype == 'object':
        most_common = final_df[column].value_counts().idxmax()
        freq_stats[column] = (most_common, final_df[column].value_counts().max())
    else:
        freq_stats[column] = (None, None)

# Print summary statistics
print("Summary Statistics")
print(summary_stats)

# Print most common value and frequency for categorical columns
print("\nMost Common Values and Frequencies")
for col, (val, freq) in freq_stats.items():
    if val is not None:
        print(f"{col}: Most common value = {val}, Frequency = {freq}")


Summary Statistics
                Name    Age Nationality              Position Market value  \
count          24950  24920       24950                 24950        11169   
unique          6207     35         658                    17          194   
top     David Button     21   [England]  (Centre-Forward, CF)       €1.00m   
freq              30   2564        8189                  4940          549   

                         Left                 Joined            Fee   Year  
count                   24950                  24950          22793  24950  
unique                   1142                   1037           2462     33  
top     (Chelsea FC, England)  (Chelsea FC, England)  loan transfer   2014  
freq                      831                    820           5917   1023  

Most Common Values and Frequencies
Name: Most common value = David Button, Frequency = 30
Age: Most common value = 21, Frequency = 2564
Nationality: Most common value = ['England'], Frequency = 8189
Posit

# Temporal Network 

In [None]:
import networkx as nx
import pandas as pd
from pyvis.network import Network
from datetime import datetime

final_df = pd.read_pickle("\Data\transfer.pkl")
# Convert 'Year' to datetime
final_df['Year'] = pd.to_datetime(final_df['Year'], format='%Y')

# Create a directed multigraph to capture all transfers
G = nx.MultiDiGraph()

# Add nodes and edges
for index, row in final_df.iterrows():
    left_club, left_country = row['Left']
    joined_club, joined_country = row['Joined']
    timestamp = row['Year'].strftime('%Y-%m-%d')
    
    # Add nodes with attributes
    if not G.has_node(left_club):
        G.add_node(left_club, country=left_country)
    if not G.has_node(joined_club):
        G.add_node(joined_club, country=joined_country)
    
    # Add edge with attributes ensuring unique key
    G.add_edge(
        left_club,
        joined_club,
        key=(timestamp, row['Name']),  # Use timestamp and player to ensure uniqueness
        name=row['Name'],
        age=row['Age'],
        nationality=row['Nationality'],
        position=row['Position'],
        market_value=row['Market value'],
        fee=row['Fee'],
        timestamp=timestamp
    )

# Display the number of nodes and edges
print(f'Nodes: {G.number_of_nodes()}')
print(f'Edges: {G.number_of_edges()}')


Nodes: 1395
Edges: 22818


In [None]:
import networkx as nx
import numpy as np
import pandas as pd
from datetime import datetime

# Assuming G is your MultiDiGraph created from the previous code
# G = nx.MultiDiGraph() # Already created

# Compute the number of nodes
num_nodes = G.number_of_nodes()

# Compute the number of edges
num_edges = G.number_of_edges()

# Compute the average number of links per node
links_per_node = num_edges / num_nodes if num_nodes > 0 else 0

# Extract all timestamps from the edges
timestamps = []
for _, _, edge_data in G.edges(data=True):
    if 'timestamp' in edge_data:
        timestamps.append(datetime.strptime(edge_data['timestamp'], '%Y-%m-%d'))

# Convert timestamps to numeric values (using ordinal dates for simplicity)
timestamps = np.array([ts.toordinal() for ts in timestamps])

# Calculate observation period
if len(timestamps) > 0:
    observation_period = (timestamps.min(), timestamps.max())
else:
    observation_period = (0, 0)

# Calculate observation length
observation_length = observation_period[1] - observation_period[0]

# Count unique time stamps
unique_time_stamps = len(np.unique(timestamps))

# Calculate inter-event times
sorted_timestamps = np.sort(timestamps)
inter_event_times = np.diff(sorted_timestamps)

# Calculate average, min, and max inter-event times
avg_inter_event_dt = np.mean(inter_event_times) if len(inter_event_times) > 0 else 0
min_inter_event_dt = np.min(inter_event_times) if len(inter_event_times) > 0 else 0
max_inter_event_dt = np.max(inter_event_times) if len(inter_event_times) > 0 else 0

# Print results
print("Attribute Value")
print(f"Nodes: {num_nodes}")
print(f"Time-stamped links: {num_edges}")
print(f"Links/Nodes: {links_per_node:.2f}")
print(f"Observation period: ({observation_period[0]}, {observation_period[1]})")
print(f"Observation length: {observation_length} days")
print(f"Unique time stamps: {unique_time_stamps}")
print(f"Avg. inter-event dt: {avg_inter_event_dt:.2f} years")
print(f"Min/Max inter-event dt: {min_inter_event_dt} / {max_inter_event_dt} days")


Attribute Value
Nodes: 1395
Time-stamped links: 22818
Links/Nodes: 16.36
Observation period: (727198, 738886)
Observation length: 11688 days
Unique time stamps: 33
Avg. inter-event dt: 0.51 years
Min/Max inter-event dt: 0 / 366 days


In [None]:
# Define a function to check if a club is in the Premier League
def is_premier_league(club_country):
    return club_country[1] == 'England'

# Extract club names and years for 'Joined' and 'Left'
joined_transfers = final_df[['Joined', 'Year']].rename(columns={'Joined': 'Club'})
left_transfers = final_df[['Left', 'Year']].rename(columns={'Left': 'Club'})

# Concatenate both DataFrames to get all transfers involving all clubs
all_transfers = pd.concat([joined_transfers, left_transfers])

# Extract club names (first element of the tuple)
all_transfers['Club'] = all_transfers['Club'].apply(lambda x: x[0])

# Calculate total average number of transfers per year per club for all clubs
transfers_per_club_per_year_all = all_transfers.groupby(['Club', 'Year']).size().reset_index(name='Transfers')
average_transfers_per_club_per_year_all = transfers_per_club_per_year_all.groupby('Club')['Transfers'].mean().reset_index(name='Average Transfers per Year')
overall_average_transfers_per_year_all = average_transfers_per_club_per_year_all['Average Transfers per Year'].mean()

# Filter for Premier League clubs in 'Joined' or 'Left'
premier_league_transfers = final_df[
    final_df['Joined'].apply(is_premier_league) | final_df['Left'].apply(is_premier_league)
]

# Extract club names and years for Premier League clubs
joined_transfers_prem = premier_league_transfers[['Joined', 'Year']].rename(columns={'Joined': 'Club'})
left_transfers_prem = premier_league_transfers[['Left', 'Year']].rename(columns={'Left': 'Club'})

# Concatenate both DataFrames for Premier League clubs
all_transfers_prem = pd.concat([joined_transfers_prem, left_transfers_prem])

# Extract Premier League club names (first element of the tuple)
all_transfers_prem['Club'] = all_transfers_prem['Club'].apply(lambda x: x[0])

# Calculate total average number of transfers per year per club for Premier League clubs
transfers_per_club_per_year_prem = all_transfers_prem.groupby(['Club', 'Year']).size().reset_index(name='Transfers')
average_transfers_per_club_per_year_prem = transfers_per_club_per_year_prem.groupby('Club')['Transfers'].mean().reset_index(name='Average Transfers per Year')
overall_average_transfers_per_year_prem = average_transfers_per_club_per_year_prem['Average Transfers per Year'].mean()

# Print the results
print(f"Overall average number of transfers per year per club (All Clubs): {overall_average_transfers_per_year_all:.2f}")
print(f"Overall average number of transfers per year per club (Premier League): {overall_average_transfers_per_year_prem:.2f}")


Overall average number of transfers per year per club (All Clubs): 2.40
Overall average number of transfers per year per club (Premier League): 4.46


# Metrics

In [12]:
# Open pickle file, change year to datetime, and filter for year 2021 and 2022
final_df = pd.read_pickle("transfer.pkl")
final_df['Year'] = pd.to_datetime(final_df['Year'], format='%Y')
filtered_df = final_df[final_df['Year'].dt.year.isin([2021, 2022])]

# Create graph
G = nx.MultiDiGraph()
for index, row in filtered_df.iterrows():
    left_club = (row['Left'][0], row['Left'][1])
    joined_club = (row['Joined'][0], row['Joined'][1])
    timestamp = row['Year'].timestamp()
    
    if not G.has_node(left_club):
        G.add_node(left_club, country=row['Left'][1])
    if not G.has_node(joined_club):
        G.add_node(joined_club, country=row['Joined'][1])
    
    G.add_edge(
        left_club,
        joined_club,
        key=(timestamp, row['Name']),
        name=row['Name'],
        age=row['Age'],
        nationality=row['Nationality'],
        position=row['Position'],
        market_value=row['Market value'],
        fee=row['Fee'],
        Year=timestamp
    )

# Display the number of nodes and edges
print(f'Nodes: {G.number_of_nodes()}')
print(f'Edges: {G.number_of_edges()}')

Nodes: 375
Edges: 1372


## Temporal Degree

In [3]:
# Temporal Degree
def temporal_degree(G):
    temporal_degree_dict = defaultdict(int)
    for u, v, data in G.edges(data=True):
        temporal_degree_dict[u] += 1
        if not G.is_directed():
            temporal_degree_dict[v] += 1
    return temporal_degree_dict

In [4]:
# Calculate the temporal degree for all nodes
degree_dict = temporal_degree(G)

# Filter nodes for Premier League clubs (country is 'England')
premier_league_degrees = {node: degree for node, degree in degree_dict.items() if node[1] == 'England'}

# Calculate the average temporal degree for Premier League clubs
if premier_league_degrees:  # Check if there are any Premier League nodes
    avg_premier_league_degree = sum(premier_league_degrees.values()) / len(premier_league_degrees)
else:
    avg_premier_league_degree = 0  # Handle case with no 'England' clubs

# Get top 10 nodes overall
top_10 = sorted(degree_dict.items(), key=lambda item: item[1], reverse=True)[:10]

# Calculate the average degree overall
average_degree = sum(degree_dict.values()) / len(degree_dict) if degree_dict else 0

# Output the results
print("Top 10 nodes with the highest temporal degrees:")
for node, degree in top_10:
    print(f"Node: {node}, Temporal Degree: {degree}")

print(f"Average Temporal Degree: {average_degree}")
print(f"Average Temporal Degree for Premier League clubs: {avg_premier_league_degree}")


Top 10 nodes with the highest temporal degrees:
Node: ('Brighton & Hove Albion', 'England'), Temporal Degree: 50
Node: ('Wolverhampton Wanderers', 'England'), Temporal Degree: 50
Node: ('Chelsea FC', 'England'), Temporal Degree: 48
Node: ('Manchester City', 'England'), Temporal Degree: 47
Node: ('Aston Villa', 'England'), Temporal Degree: 40
Node: ('Arsenal FC', 'England'), Temporal Degree: 38
Node: ('Brentford FC', 'England'), Temporal Degree: 35
Node: ('Norwich City', 'England'), Temporal Degree: 33
Node: ('Everton FC', 'England'), Temporal Degree: 33
Node: ('Manchester United', 'England'), Temporal Degree: 33
Average Temporal Degree: 4.619528619528619
Average Temporal Degree for Premier League clubs: 9.504950495049505


## Temporal Betweenness and Shortest Paths

In [15]:
# Define find_temporal_shortest_paths function
def find_temporal_shortest_paths(G, source, target, time_attr):
    shortest_paths = []
    min_length = float('inf')
    queue = deque([(source, [source], -float('inf'))])
    visited = set()
    
    while queue:
        current, path, last_time = queue.popleft()
        
        if current == target:
            if len(path) < min_length:
                shortest_paths = [path]
                min_length = len(path)
            elif len(path) == min_length:
                shortest_paths.append(path)
            continue
        
        if (current, last_time) in visited:
            continue
        visited.add((current, last_time))
        
        neighbors = G.successors(current) if G.is_directed() else G.neighbors(current)
        
        for neighbor in neighbors:
            for key in G[current][neighbor]:
                edge_data = G[current][neighbor][key]
                edge_time = edge_data.get(time_attr)
                
                if edge_time is None:
                    continue
                
                if isinstance(edge_time, datetime):
                    edge_time = edge_time.timestamp()  # Convert to timestamp
                
                if edge_time > last_time:
                    queue.append((neighbor, path + [neighbor], edge_time))
    
    return shortest_paths

In [14]:
node_names = filtered_df['Joined'].unique()
end_node_names = filtered_df['Left'].unique()

for node_name in node_names:
    for node_name2 in end_node_names:
        if node_name != node_name2:
            print(f"Shortest paths from {node_name} to {node_name2}:")
            paths = find_temporal_shortest_paths(G, tuple(node_name), tuple(node_name2), 'timestamp')
            print(paths)

Shortest paths from ('Arsenal FC', 'England') to ('Brighton & Hove Albion', 'England'):
[]
Shortest paths from ('Arsenal FC', 'England') to ('Real Madrid', 'Spain'):
[]
Shortest paths from ('Arsenal FC', 'England') to ('Sheffield United', 'England'):
[]
Shortest paths from ('Arsenal FC', 'England') to ('Bologna FC 1909', 'Italy'):
[]
Shortest paths from ('Arsenal FC', 'England') to ('RSC Anderlecht', 'Belgium'):
[]
Shortest paths from ('Arsenal FC', 'England') to ('SL Benfica', 'Portugal'):
[]
Shortest paths from ('Arsenal FC', 'England') to ('Colorado Rapids', 'United States'):
[]
Shortest paths from ('Arsenal FC', 'England') to ('Arsenal FC U23', 'England'):
[]
Shortest paths from ('Arsenal FC', 'England') to ('OGC Nice', 'France'):
[]
Shortest paths from ('Arsenal FC', 'England') to ('FC Schalke 04', 'Germany'):
[]
Shortest paths from ('Arsenal FC', 'England') to ('Hertha BSC', 'Germany'):
[]
Shortest paths from ('Arsenal FC', 'England') to ('Atlético de Madrid', 'Spain'):
[]
Shorte

In [7]:
def temporal_betweenness(G, time_attr='time'):
    nodes = list(G.nodes())
    N = len(nodes)
    betweenness = {node: 0 for node in nodes}
    
    def process_pair(s, t):
        if s == t:
            return
        
        paths = find_temporal_shortest_paths(G, s, t, time_attr)
        if not paths:
            return
        
        sigma_st = len(paths)
        path_count = defaultdict(int)
        
        for path in paths:
            for i in range(1, len(path) - 1):
                path_count[path[i]] += 1
        
        for node in path_count:
            betweenness[node] += path_count[node] / sigma_st
    
    with ThreadPoolExecutor() as executor:
        for s in nodes:
            for t in nodes:
                executor.submit(process_pair, s, t)
    
    scale = 1 / ((N - 1) * (N - 2))
    for node in betweenness:
        betweenness[node] *= scale
    
    return betweenness

In [8]:
# Assuming temporal_betweenness function is defined and works correctly
betweenness_dict = temporal_betweenness(G, time_attr='Year')

# Filter nodes for Premier League clubs (country is 'England')
premier_league_betweenness = {node: betweenness for node, betweenness in betweenness_dict.items() if node[1] == 'England'}

# Calculate the average betweenness for Premier League clubs
if premier_league_betweenness:  # Check if there are any Premier League nodes
    avg_premier_league_betweenness = sum(premier_league_betweenness.values()) / len(premier_league_betweenness)
else:
    avg_premier_league_betweenness = 0  # Handle case with no 'England' clubs

# Get top 10 nodes overall
top_10_betweenness = sorted(betweenness_dict.items(), key=lambda item: item[1], reverse=True)[:10]

# Calculate the average betweenness overall
average_betweenness = sum(betweenness_dict.values()) / len(betweenness_dict) if betweenness_dict else 0

# Output the results
print("Top 10 nodes with the highest temporal betweenness:")
for node, betweenness in top_10_betweenness:
    print(f"Node: {node}, Betweenness: {betweenness}")

print(f"Average Temporal Betweenness: {average_betweenness}")
print(f"Average Temporal Betweenness for Premier League clubs: {avg_premier_league_betweenness}")

# Top 10 as a dictionary
top_10_betweenness_dict = dict(top_10_betweenness)


Top 10 nodes with the highest temporal betweenness:
Node: ('Wolverhampton Wanderers', 'England'), Betweenness: 0.003412137460394833
Node: ('Brighton & Hove Albion', 'England'), Betweenness: 0.0029736730178300907
Node: ('Chelsea FC', 'England'), Betweenness: 0.0028303058976454337
Node: ('Brentford FC', 'England'), Betweenness: 0.0025029509732238005
Node: ('Manchester City', 'England'), Betweenness: 0.0024509803921568623
Node: ('Aston Villa', 'England'), Betweenness: 0.0022293587188714136
Node: ('Everton FC', 'England'), Betweenness: 0.0014954385361261246
Node: ('Arsenal FC', 'England'), Betweenness: 0.0013846874357834773
Node: ('Manchester United', 'England'), Betweenness: 0.0013757269907719361
Node: ('Leeds United', 'England'), Betweenness: 0.0011308081604564809
Average Temporal Betweenness: 8.25221143782885e-05
Average Temporal Betweenness for Premier League clubs: 0.00028434813806530585


## Temporal Closeness

In [9]:
# Good
# Define temporal_closeness function
def temporal_closeness(G, time_attr='Year'):
    closeness = {}
    nodes = list(G.nodes())
    N = len(nodes)
    
    for s in nodes:
        distance = {node: float('inf') for node in nodes}
        distance[s] = 0
        Q = deque([(s, 0)])  # (node, current_time)
        
        while Q:
            current_node, current_time = Q.popleft()
            neighbors = G.successors(current_node) if G.is_directed() else G.neighbors(current_node)
            
            for neighbor in neighbors:
                for key in G[current_node][neighbor]:
                    edge_data = G[current_node][neighbor][key]
                    edge_time = edge_data.get(time_attr)
                    
                    if edge_time is None:
                        print(f"Missing '{time_attr}' for edge ({current_node}, {neighbor}, {key})")
                        continue
                    
                    if isinstance(edge_time, datetime):
                        edge_time = edge_time.timestamp()  # Convert Timestamp to seconds
                    
                    if current_time <= edge_time < distance[neighbor]: 
                        distance[neighbor] = edge_time
                        Q.append((neighbor, edge_time))
        
        total_reciprocal_distance = sum([1/d for d in distance.values() if d != float('inf') and d != 0])
        reachable_nodes = len([d for d in distance.values() if d != float('inf') and d != 0])
        
        if reachable_nodes > 0:
            closeness[s] = total_reciprocal_distance / (N - 1)
        else:
            closeness[s] = 0
    
    return closeness

In [11]:
closeness_dict = temporal_closeness(G, time_attr='Year')

# Filter nodes for Premier League clubs (nodes located in England)
premier_league_closeness = {node: closeness for node, closeness in closeness_dict.items() if node[1] == 'England'}

# Calculate the average closeness for Premier League clubs
if premier_league_closeness:
    avg_premier_league_closeness = sum(premier_league_closeness.values()) / len(premier_league_closeness)
else:
    avg_premier_league_closeness = 0

# Get the top 10 Premier League nodes by closeness
top_10_premier_league_closeness = sorted(premier_league_closeness.items(), key=lambda item: item[1], reverse=True)[:10]

# Output the results for Premier League clubs
print("Top 10 Premier League nodes with the highest temporal closeness:")
for node, closeness in top_10_premier_league_closeness:
    print(f"Node: {node}, Closeness: {closeness:.12e}")

print(f"Average Temporal Closeness for Premier League clubs: {avg_premier_league_closeness:.12e}")

# Store top 10 Premier League nodes as a dictionary
top_10_premier_league_closeness_dict = dict(top_10_premier_league_closeness)

# Calculate the average closeness for all clubs
if closeness_dict:
    avg_all_clubs_closeness = sum(closeness_dict.values()) / len(closeness_dict)
else:
    avg_all_clubs_closeness = 0

# Output the average closeness for all clubs
print(f"Average Temporal Closeness for all clubs: {avg_all_clubs_closeness:.12e}")


Top 10 Premier League nodes with the highest temporal closeness:
Node: ('Arsenal FC U23', 'England'), Closeness: 4.606935764829e-10
Node: ('Aston Villa U23', 'England'), Closeness: 4.606935764829e-10
Node: ('Brentford FC B', 'England'), Closeness: 4.606935764829e-10
Node: ('Brighton & Hove Albion U23', 'England'), Closeness: 4.606935764829e-10
Node: ('Burnley FC U21', 'England'), Closeness: 4.606935764829e-10
Node: ('Fleetwood Town', 'England'), Closeness: 4.606935764829e-10
Node: ('Leeds United U21', 'England'), Closeness: 4.606935764829e-10
Node: ('Leicester City U21', 'England'), Closeness: 4.606935764829e-10
Node: ('Liverpool FC U23', 'England'), Closeness: 4.606935764829e-10
Node: ('Manchester City U23', 'England'), Closeness: 4.606935764829e-10
Average Temporal Closeness for Premier League clubs: 3.964377446515e-10
Average Temporal Closeness for all clubs: 3.258230106914e-10
