# Paris 2024 Volleyball Predictions

## Environment setup

In [166]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import random

## Data setup

In [115]:
# Teams that have already secured their place in the tournament
men_qualified = [
    'France',
    'Germany',
    'Brazil',
    'United States of America',
    'Japan',
    'Poland',
    'Canada',
]
women_qualified = [
    'France',
    'Dominican Republic',
    'Serbia',
    'Turkey',
    'Brazil',
    'United States of America',
    'Poland',
]

# Teams that have not qualified yet, listed in world-ranking order
men_not_qualified = ['Italy', 'Argentina', 'Slovenia', 'Serbia', 'Cuba']
women_not_qualified = ['China', 'Italy', 'Japan', 'Netherlands', 'Canada']

# Lookup table: three-letter country code -> country name
country_codes = dict([
    ("FRA", "France"),
    ("GER", "Germany"),
    ("BRA", "Brazil"),
    ("USA", "United States of America"),
    ("JPN", "Japan"),
    ("POL", "Poland"),
    ("CAN", "Canada"),
    ("SRB", "Serbia"),
    ("TUR", "Turkey"),
    ("ITA", "Italy"),
    ("ARG", "Argentina"),
    ("SLO", "Slovenia"),
    ("CUB", "Cuba"),
    ("CHN", "China"),
    ("NED", "Netherlands"),
    ("DOM", "Dominican Republic"),
])

# Reverse lookup table: country name -> three-letter country code
# (the exact inverse of country_codes, in the same order)
country_names = {name: code for code, name in country_codes.items()}

# Full field per gender: qualified teams first, then the not-yet-qualified ones
men_teams = [*men_qualified, *men_not_qualified]
women_teams = [*women_qualified, *women_not_qualified]

### Match Outcome Probabilities

In [129]:
# Build the head-to-head win-probability tables (men_probs / women_probs)
# from the historical match results stored in the two CSV files.


def _add_mirrored_matches(matches):
    """Return ``matches`` followed by a mirrored copy of every row.

    The mirrored copy swaps Team1/Team2 and Points_Team1/Points_Team2 so
    that a match (A, B) can also be looked up as (B, A). The result is
    re-indexed 0..2n-1.

    Args:
        matches: DataFrame with the columns Team1, Team2, Points_Team1 and
            Points_Team2 (other columns are carried over unchanged).

    Returns:
        A new DataFrame; the input is not modified.
    """
    mirrored = matches.rename(columns={
        'Team1': 'Team2',
        'Team2': 'Team1',
        'Points_Team1': 'Points_Team2',
        'Points_Team2': 'Points_Team1',
    })
    # Re-select in the original column order so both halves line up.
    return pd.concat([matches, mirrored[matches.columns]], ignore_index=True)


def _match_weight(points_team1):
    """Return the weight of one match from its Points_Team1 value.

    3 -> weight 3, 2 -> weight 2, anything else -> weight 1.
    """
    if points_team1 == 3:
        return 3
    if points_team1 == 2:
        return 2
    return 1


def _win_probabilities(matches, codes):
    """Build the table of win probabilities for every ordered pair of teams.

    For a pair (A, B) with at least one recorded match:
        win_rate = (points_A + 1) / (points_A + points_B + 1)
        prob     = 0.5 * sum(win_rate * weight) / (n_matches + 1), capped at 1
    Pairs without any recorded match get 0.5 and the diagonal is 0.

    NOTE(review): the entries for (A, B) and (B, A) are computed
    independently and are not forced to sum to 1.

    Args:
        matches: DataFrame of (mirrored) matches whose Team1/Team2 columns
            hold country codes.
        codes: Country codes of the participating teams.

    Returns:
        A DataFrame indexed by country name on both axes.
    """
    names = [country_codes[code] for code in codes]
    probs = pd.DataFrame(0.0, index=names, columns=names)

    for code_a, name_a in zip(codes, names):
        for code_b, name_b in zip(codes, names):
            # The diagonal keeps its initial value of 0.
            if code_a == code_b:
                continue

            head_to_head = matches[(matches['Team1'] == code_a) & (matches['Team2'] == code_b)]
            played = head_to_head.shape[0]

            # No history between the two teams: treat it as a coin flip.
            if played == 0:
                probs.loc[name_a, name_b] = 0.5
                continue

            points_a = head_to_head['Points_Team1'].sum()
            points_b = head_to_head['Points_Team2'].sum()
            win_rate = (points_a + 1) / (points_a + points_b + 1)

            # Same accumulation order as a plain running sum over the matches.
            weighted_avg = sum(
                win_rate * _match_weight(points)
                for points in head_to_head['Points_Team1']
            )
            prob = weighted_avg / (played + 1) * 0.5

            probs.loc[name_a, name_b] = min(prob, 1)

    return probs


# Read the CSV files and duplicate the matches for symmetric data
# (match (A, B) = match (B, A))
men_matches = _add_mirrored_matches(pd.read_csv('matches_men.csv'))
women_matches = _add_mirrored_matches(pd.read_csv('matches_women.csv'))

# Get the country codes of all the participating teams
men_codes = [country_names[country] for country in men_teams]
women_codes = [country_names[country] for country in women_teams]

# Calculate the win probabilities for each matchup
men_probs = _win_probabilities(men_matches, men_codes)
women_probs = _win_probabilities(women_matches, women_codes)

In [130]:
men_probs

Unnamed: 0,Poland,Slovenia,Italy,Serbia,Argentina,Germany,Brazil,Canada,Cuba,United States of America,France,Japan
Poland,0.0,0.611111,0.733333,0.625,0.875,0.5,0.917647,0.9375,0.5,0.541667,0.777778,0.875
Slovenia,0.311111,0.0,0.1,1.0,0.777778,0.875,0.115385,0.75,0.6,0.083333,0.6,0.068182
Italy,0.2,1.0,0.0,1.0,0.50625,0.777143,0.5,0.777778,0.538462,0.047619,0.373333,0.466667
Serbia,0.30625,0.068182,0.057143,0.0,0.240385,0.642857,0.0625,0.5,0.5,0.09375,0.111111,0.083333
Argentina,0.083333,0.111111,0.4,0.692308,0.0,0.875,0.416667,0.538462,0.5,0.642857,0.510417,0.2
Germany,0.25,0.083333,0.257143,0.267857,0.083333,0.0,0.225,0.875,0.875,0.1,0.1,0.1
Brazil,0.147059,0.865385,0.333333,0.75,0.545455,0.6875,0.0,0.5625,0.6,0.37037,0.333333,0.75
Canada,0.125,0.0625,0.166667,0.25,0.288462,0.083333,0.35,0.0,0.555556,0.22,0.047619,0.111111
Cuba,0.5,0.1,0.288462,0.25,0.5,0.083333,0.35,0.4,0.0,0.28,0.0625,0.083333
United States of America,0.392857,0.875,1.0,0.9375,0.321429,0.6,0.37037,0.81,0.666667,0.0,0.694737,0.636364


In [165]:
women_probs

Unnamed: 0,United States of America,Canada,Japan,Serbia,Brazil,Italy,China,Poland,Dominican Republic,Netherlands,France,Turkey
United States of America,0.0,1.0,0.5,0.240385,0.75,0.471154,0.25,0.272222,0.795455,0.5,0.5,0.647059
Canada,0.047619,0.0,0.5,0.296296,0.333333,0.25,0.296296,0.30303,0.293478,0.466667,0.5,0.047619
Japan,0.333333,0.5,0.0,0.25,0.378947,0.111111,0.125,0.5,0.6,0.416667,0.5,0.466667
Serbia,0.692308,0.555556,0.5,0.0,0.403846,0.5,0.37037,0.625,0.329412,0.875,0.875,0.403846
Brazil,0.0625,0.5,0.578947,0.471154,0.0,0.555556,0.375,0.5,0.6,1.0,0.5,0.047619
Italy,0.403846,0.5,0.777778,0.5,0.4,0.0,0.865385,0.09375,0.5,0.471154,0.75,0.125
China,0.5,0.555556,0.9375,0.37037,0.5,0.153846,0.0,0.380952,0.2,0.6,0.5,0.1
Poland,0.733333,0.636364,0.5,0.30625,0.5,0.9375,0.380952,0.0,0.37037,0.0625,0.5,0.375
Dominican Republic,0.242424,0.76087,0.1,0.588235,0.1,0.25,0.7,0.37037,0.0,0.636364,0.5,0.1
Netherlands,0.5,0.466667,0.333333,0.083333,0.047619,0.403846,0.35,0.75,0.30303,0.0,0.6,0.0625


## Simulations

### Preliminary Phase

In [5]:
def simulate_preliminary_phase_pool(teams, probs):
  """Simulates the preliminary phase of a single pool of the tournament.

  Every pair of teams plays exactly one match. The match between two teams
  is decided by a single random draw in which the team listed first in
  `teams` wins with probability `probs.loc[first, second]`.

  Args:
    teams: The list of teams in the pool (4 teams).
    probs: The probabilities of each country winning against another;
      `probs.loc[a, b]` is the probability that `a` beats `b`.

  Returns:
    A dataframe indexed by team on both axes where entry (a, b) is 1 if
    `a` beat `b` and 0 otherwise. The diagonal is 0.
  """

  teams = list(teams)

  # Start from an all-zero table so the diagonal needs no special case.
  results = pd.DataFrame(0, index=teams, columns=teams)

  # Visit each unordered pair once; iterating over ordered pairs would draw
  # every match twice and let the second draw overwrite the first.
  for position, team1 in enumerate(teams):
    for team2 in teams[position + 1:]:
      p_team1_wins = probs.loc[team1, team2]
      team1_won = np.random.choice([0, 1], p=[1 - p_team1_wins, p_team1_wins])
      results.loc[team1, team2] = team1_won
      results.loc[team2, team1] = 1 - team1_won

  return results

In [6]:
def form_pools(teams):
  """Partitions a list of 12 teams into 3 lists (pools) of 4 random teams each.

  The caller's list is left untouched: the shuffle is done on a copy.
  More generally, the teams are split into consecutive groups of 4 (the
  last group is shorter when the number of teams is not a multiple of 4).

  Args:
    teams: A list of 12 teams.

  Returns:
    A list of 3 lists (pools) of 4 random teams each.
  """

  # Shuffle a copy so the input (e.g. a module-level team list) keeps its order.
  shuffled = list(teams)
  random.shuffle(shuffled)

  # Partition the shuffled teams into consecutive groups of 4.
  return [shuffled[start:start + 4] for start in range(0, len(shuffled), 4)]

In [7]:
def get_pool_ranking(pool_result):
  """
  Returns the ranking of the teams of a given pool.

  Teams are ranked by their number of victories (the sum of their row in
  `pool_result`), most victories first. Teams with the same number of
  victories keep the order they have in `pool_result.columns`.

  Args:
    pool_result: A dataframe that contains the results of the pool, with
      the same teams as index and columns; entry (a, b) is 1 if `a` beat `b`.

  Returns:
    A list of team names, best-ranked team first.
  """

  teams = list(pool_result.columns)

  # Number of victories per team = sum of that team's row.
  victories = {team: pool_result.loc[team, :].sum() for team in teams}

  # sorted() is stable, also with reverse=True, so ties keep column order.
  return sorted(teams, key=lambda team: victories[team], reverse=True)

In [8]:
def get_final_phase_bracket(pool_results):
  """
  Builds the seeding order of the Final Phase from the pool results.

  Teams are ordered first by their position inside their pool (all pool
  winners, then all runners-up, ...) and, within the same pool position,
  by number of victories (most victories first). Remaining ties keep the
  order in which the pools were given.

  Args:
    pool_results: A list of pandas dataframes, where each dataframe contains the results of one pool.

  Returns:
    A list of team names in seeding order, best seed first.
  """

  # Rank every pool up front, in the order the pools were passed in.
  rankings = [get_pool_ranking(pool_result) for pool_result in pool_results]

  # Record each team's pool position, keeping pool-by-pool encounter order.
  seeding_pool = []
  stats = {}
  for ranking in rankings:
    for position in range(4):
      team = ranking[position]
      seeding_pool.append(team)
      stats[team] = {'Pool Position': position}

  # Attach the number of victories (row sum of the pool result) to each team.
  for pool_result in pool_results:
    for team in pool_result.columns:
      stats[team]['# of Victories'] = pool_result.loc[team, :].sum()

  def seed_key(team):
    # Lower pool position first; negate victories so more wins sort earlier.
    return (stats[team]['Pool Position'], -stats[team]['# of Victories'])

  # sorted() is stable, so fully tied teams keep their encounter order.
  return sorted(seeding_pool, key=seed_key)

In [9]:
def simulate_preliminary_phase(teams, probs):
  """Simulates the preliminary phase of the tournament.

  The teams are drawn into pools, every pool is simulated, and the pool
  results are turned into the Final Phase bracket.

  Args:
    teams: The list of teams (12).
    probs: The probabilities of each country winning against another.

  Returns:
    The Final Phase bracket as returned by get_final_phase_bracket
    (a list of teams in seeding order).
  """

  # Draw the pools, then play each of them in draw order.
  pool_results = [
    simulate_preliminary_phase_pool(pool, probs)
    for pool in form_pools(teams)
  ]

  # Seed the Final Phase from the pool results.
  return get_final_phase_bracket(pool_results)

### Final Phase

In [58]:
def simulate_final_phase(preliminary_phase_results, probs):
  """Simulates the final phase of the tournament.

  The first 8 teams of the preliminary ranking play a knockout bracket
  (seed 1 vs 8, 2 vs 7, 3 vs 6, 4 vs 5), followed by the semifinals, the
  gold medal match and the bronze medal match.

  Args:
    preliminary_phase_results: The results of the preliminary phase
      (teams in seeding order).
    probs: The probabilities of each country winning against another.

  Returns:
    A ranking of all the teams: gold, silver, bronze, fourth place, the
    four quarterfinal losers (in match order), then the teams ranked 9-12
    in the preliminary phase.
  """

  def play(first, second):
    # One random draw per match: `first` wins with probs.loc[first, second].
    if np.random.rand() < probs.loc[first, second]:
      return first, second
    return second, first

  seeds = preliminary_phase_results[:8]
  eliminated_in_pools = preliminary_phase_results[8:12]

  # Quarterfinals: best remaining seed against worst remaining seed.
  qf_winners, qf_losers = [], []
  for slot in range(4):
    winner, loser = play(seeds[slot], seeds[7 - slot])
    qf_winners.append(winner)
    qf_losers.append(loser)

  # Semifinals: winner of QF1 vs winner of QF4, winner of QF2 vs winner of QF3.
  sf_winners, sf_losers = [], []
  for slot in range(2):
    winner, loser = play(qf_winners[slot], qf_winners[3 - slot])
    sf_winners.append(winner)
    sf_losers.append(loser)

  # Gold medal match is drawn before the bronze medal match.
  gold, silver = play(sf_winners[0], sf_winners[1])
  bronze, fourth = play(sf_losers[0], sf_losers[1])

  podium_and_fourth = [gold, silver, bronze, fourth]
  return podium_and_fourth + qf_losers + eliminated_in_pools

### Tournament

In [40]:
def simulate_tournament(num_simulations, gender):
  """Simulates a specified number of tournaments using the given probabilities.

  Args:
    num_simulations: The number of simulations to run.
    gender: The gender of the tournament ('men' or 'women').

  Returns:
    A dataframe with one row per simulation and a single 'Ranking' column
    holding the final ranking (a list of teams) of that simulation.

  Raises:
    ValueError: If `gender` is neither 'men' nor 'women'. The check is done
      once, before any simulation runs.
  """

  # Resolve the team list and probability table once, up front, so an
  # invalid gender is reported even when no simulation is requested.
  if gender == 'men':
    teams, probs = men_teams, men_probs
  elif gender == 'women':
    teams, probs = women_teams, women_probs
  else:
    raise ValueError("Invalid gender: {!r}".format(gender))

  rankings = []
  for _ in range(num_simulations):
    # Hand each simulation its own copy so any in-place reordering during
    # the pool draw cannot leak into the module-level team list.
    preliminary_phase_results = simulate_preliminary_phase(list(teams), probs)

    # The final phase turns the seeding into the full final ranking.
    rankings.append(simulate_final_phase(preliminary_phase_results, probs))

  # One list-valued cell per simulation; dtype=object keeps each ranking
  # as a single list instead of being expanded into columns.
  return pd.DataFrame({'Ranking': pd.Series(rankings, dtype=object)})

### Simulations

In [None]:
# Number of tournaments to simulate for each gender
num_simulations = 1000

# Run the men's simulations first, then the women's
men_results = simulate_tournament(gender='men', num_simulations=num_simulations)
women_results = simulate_tournament(gender='women', num_simulations=num_simulations)

In [168]:
# Aggregate the simulated rankings into one overall ranking per gender.


def _total_positions(results, teams):
  """Sum, for every team, its 0-based position over all simulated rankings.

  Args:
    results: DataFrame whose 'Ranking' column holds one ranking (a list of
      teams) per simulation.
    teams: The teams to accumulate positions for.

  Returns:
    A dict team -> summed position, ordered from the lowest (best) sum to
    the highest. Teams with equal sums keep their order in `teams`.
  """
  totals = {team: 0 for team in teams}
  for ranking in results["Ranking"]:
    for position, team in enumerate(ranking):
      totals[team] += position
  # sorted() is stable, so ties keep the insertion order of `teams`.
  return dict(sorted(totals.items(), key=lambda item: item[1]))


def _print_ranking(title, ranking):
  """Print `title` followed by one '<place>: <team>' line per team."""
  print(title)
  for place, team in enumerate(ranking, start=1):
    print(str(place) + ": " + team)


# A lower summed position means the team finished higher on average.
men_team_sum = _total_positions(men_results, men_teams)
women_team_sum = _total_positions(women_results, women_teams)

men_ranking = list(men_team_sum)
women_ranking = list(women_team_sum)

_print_ranking("Men ranking:", men_ranking)
print()

_print_ranking("Women ranking:", women_ranking)

Men ranking:
1: Poland
2: United States of America
3: France
4: Italy
5: Brazil
6: Japan
7: Slovenia
8: Argentina
9: Germany
10: Cuba
11: Serbia
12: Canada

Women ranking:
1: Turkey
2: Serbia
3: Poland
4: United States of America
5: Italy
6: China
7: Brazil
8: Dominican Republic
9: Japan
10: France
11: Netherlands
12: Canada
