In [160]:
import pandas as pd

# Load the match log and keep only the rows that carry a "Game Scores" value;
# the rest of the notebook works on this filtered frame.
df = pd.read_csv(r"concatenated_data.csv").dropna(subset=["Game Scores"])

# Plain-text dump of the filtered frame.
print(df)

      Match Date  Event Type  \
0     2019-03-31  Club Match   
1     2019-03-29  Club Match   
2     2019-03-28      League   
3     2019-03-28      League   
4     2019-03-28      League   
...          ...         ...   
1622  2023-06-22  Tournament   
1623  2023-06-22  Tournament   
1624  2022-11-23  Tournament   
1625  2023-06-23  Tournament   
1626  2023-03-08  Tournament   

                                             Event Name  \
0                               Vancouver Racquets Club   
1                               Vancouver Racquets Club   
2                       Vancouver Ladies Doubles League   
3                       Vancouver Ladies Doubles League   
4                       Vancouver Ladies Doubles League   
...                                                 ...   
1622         Evergreen Summer Doubles Squash Tournament   
1623         Evergreen Summer Doubles Squash Tournament   
1624      Western Canadian Doubles Squash Championships   
1625         Evergreen Su

In [161]:
# Convert "Match Date" to datetimes and put the matches in date order, which is
# the order the rating loops further down iterate in.
df = df.assign(**{"Match Date": pd.to_datetime(df["Match Date"])}).sort_values(
    by="Match Date"
)

In [162]:
# Show the column labels of the loaded frame before narrowing it down.
print(df.columns)

Index(['Match Date', 'Event Type', 'Event Name', 'Game Scores', 'Player A1',
       'Player A1 Gender', 'Player A2', 'Player A2 Gender', 'Player B1',
       'Player B1 Gender', 'Player B2', 'Player B2 Gender'],
      dtype='object')


In [163]:
# Keep only the date, event name, score text and the four player-name columns.
df = df.loc[
    :,
    [
        "Match Date",
        "Event Name",
        "Game Scores",
        "Player A1",
        "Player A2",
        "Player B1",
        "Player B2",
    ],
]

In [164]:
def expected_win_probability(player_elo, opponent_elo):
    """Return the Elo expected score of ``player_elo`` against ``opponent_elo``.

    Base-10 logistic curve with a 400-point scale; equal ratings give 0.5.
    """
    rating_gap = opponent_elo - player_elo
    return 1 / (1 + 10 ** (rating_gap / 400))

In [165]:
import re


def extract_first_scores(game_score):
    """Return the first "n-n" score in ``game_score`` as a pair of ints.

    If the text contains no such pair, the text is printed and
    ``(None, None)`` is returned.
    """
    first_hit = re.search(r"(\d+)-(\d+)", game_score)
    if first_hit is None:
        # Nothing that looks like a score: echo the raw text for inspection.
        print(game_score)
        return None, None

    left, right = first_hit.groups()
    return int(left), int(right)


def extract_all_scores(game_score):
    """Return every "n-n" score in ``game_score`` as a list of ``(a, b)`` ints.

    Returns ``None`` (not an empty list) when the text contains no score.
    """
    pairs = [
        (int(hit.group(1)), int(hit.group(2)))
        for hit in re.finditer(r"(\d+)-(\d+)", game_score)
    ]
    return pairs if pairs else None

In [166]:
def elo_sys_with_scorefactor_and_dynamic_k(elo_ratings):
    """Run a doubles Elo pass with a rating-dependent K and a margin multiplier.

    Reads the module-level ``df`` in its current row order. Every "n-n" pair
    parsed from a row's "Game Scores" text is rated as one result between
    team A (Player A1/A2) and team B (Player B1/B2), using the average of the
    two partners' ratings as the team rating. Rows whose score text contains
    no "n-n" pair are skipped for rating purposes instead of raising.

    Args:
        elo_ratings: dict mapping player name to rating. Updated in place;
            players are added at 1200 the first time they appear in a row.

    Returns:
        The same ``elo_ratings`` dict after all rows have been processed.
    """
    starting_elo = 1200  # Rating assigned the first time a player is seen

    def calculate_k_factor(elo_rating):
        """Return the K-factor for a team whose average rating is ``elo_rating``."""
        if elo_rating < 2000:
            return 30
        if elo_rating < 2400:
            return 20
        return 10

    for _, row in df.iterrows():
        # The line-up is the same for every game in the row: look it up once.
        team_a = (row["Player A1"], row["Player A2"])
        team_b = (row["Player B1"], row["Player B2"])
        for player in team_a + team_b:
            elo_ratings.setdefault(player, starting_elo)

        # extract_all_scores returns None when nothing parses; treat that as
        # "no games to rate" rather than iterating over None.
        scores = extract_all_scores(str(row["Game Scores"])) or []

        for score_a, score_b in scores:
            if score_a == score_b:
                continue  # Level score: ratings are left unchanged

            # Team ratings are re-read per game because earlier games in the
            # same row have already moved the players' ratings.
            team_a_elo = (elo_ratings[team_a[0]] + elo_ratings[team_a[1]]) / 2
            team_b_elo = (elo_ratings[team_b[0]] + elo_ratings[team_b[1]]) / 2
            expected_win_a = expected_win_probability(team_a_elo, team_b_elo)
            expected_win_b = 1 - expected_win_a

            actual_a = 1 if score_a > score_b else 0
            actual_b = 1 - actual_a

            # Larger winning margins move ratings further.
            score_factor = 1 + (abs(score_a - score_b) / 3)

            change_a = calculate_k_factor(team_a_elo) * score_factor * (actual_a - expected_win_a)
            change_b = calculate_k_factor(team_b_elo) * score_factor * (actual_b - expected_win_b)

            for player in team_a:
                elo_ratings[player] += change_a
            for player in team_b:
                elo_ratings[player] += change_b
    return elo_ratings

In [195]:
def elo_sys_baseline(elo_ratings):
    """Run a plain doubles Elo pass with a fixed K-factor of 32.

    Reads the module-level ``df`` in its current row order. Every "n-n" pair
    parsed from a row's "Game Scores" text is rated as one win/loss between
    team A (Player A1/A2) and team B (Player B1/B2); the winning margin is
    not used. Rows whose score text contains no "n-n" pair are skipped for
    rating purposes instead of raising.

    Args:
        elo_ratings: dict mapping player name to rating. Updated in place;
            players are added at 1200 the first time they appear in a row.

    Returns:
        The same ``elo_ratings`` dict after all rows have been processed.
    """
    starting_elo = 1200  # Rating assigned the first time a player is seen
    k_factor = 32  # Same K for both teams in this baseline

    for _, row in df.iterrows():
        # The line-up is the same for every game in the row: look it up once.
        team_a = (row["Player A1"], row["Player A2"])
        team_b = (row["Player B1"], row["Player B2"])
        for player in team_a + team_b:
            elo_ratings.setdefault(player, starting_elo)

        # extract_all_scores returns None when nothing parses; treat that as
        # "no games to rate" rather than iterating over None.
        scores = extract_all_scores(str(row["Game Scores"])) or []

        for score_a, score_b in scores:
            if score_a == score_b:
                continue  # Level score: ratings are left unchanged

            # Team ratings are re-read per game because earlier games in the
            # same row have already moved the players' ratings.
            team_a_elo = (elo_ratings[team_a[0]] + elo_ratings[team_a[1]]) / 2
            team_b_elo = (elo_ratings[team_b[0]] + elo_ratings[team_b[1]]) / 2
            expected_win_a = expected_win_probability(team_a_elo, team_b_elo)
            expected_win_b = 1 - expected_win_a

            actual_a = 1 if score_a > score_b else 0
            actual_b = 1 - actual_a

            change_a = k_factor * (actual_a - expected_win_a)
            change_b = k_factor * (actual_b - expected_win_b)

            for player in team_a:
                elo_ratings[player] += change_a
            for player in team_b:
                elo_ratings[player] += change_b
    return elo_ratings

In [168]:
def expected_win_probability_norm(player_elo, opponent_elo):
    """Return the expected score of ``player_elo`` against ``opponent_elo``.

    Same base-10 logistic curve as the standard Elo formula, but with a
    scale of 1.2 rating points instead of 400; equal ratings give 0.5.
    """
    rating_gap = opponent_elo - player_elo
    return 1 / (1 + 10 ** (rating_gap / 1.2))

In [169]:
def elo_sys_with_scorefactor_and_dynamic_k_normalized(elo_ratings):
    """Run the margin-weighted doubles Elo pass on the small "normalized" scale.

    Reads the module-level ``df`` in its current row order. Every "n-n" pair
    parsed from a row's "Game Scores" text is rated as one result between
    team A (Player A1/A2) and team B (Player B1/B2), using
    ``expected_win_probability_norm`` on the teams' average ratings. Rows
    whose score text contains no "n-n" pair are skipped for rating purposes
    instead of raising.

    Args:
        elo_ratings: dict mapping player name to rating. Updated in place;
            players are added at 1.5 the first time they appear in a row.

    Returns:
        The same ``elo_ratings`` dict after all rows have been processed.
    """
    starting_elo = 1.5  # Rating assigned the first time a player is seen

    def calculate_k_factor(elo_rating):
        """Return the K-factor for a team whose average rating is ``elo_rating``."""
        # NOTE(review): ratings on this scale start at 1.5, so the 2000 / 2400
        # cut-offs look like leftovers from the 1200-based variant and K is
        # effectively always 0.5 -- confirm the intended thresholds.
        if elo_rating < 2000:
            return 0.5
        if elo_rating < 2400:
            return 0.2
        return 0.1

    for _, row in df.iterrows():
        # The line-up is the same for every game in the row: look it up once.
        team_a = (row["Player A1"], row["Player A2"])
        team_b = (row["Player B1"], row["Player B2"])
        for player in team_a + team_b:
            elo_ratings.setdefault(player, starting_elo)

        # extract_all_scores returns None when nothing parses; treat that as
        # "no games to rate" rather than iterating over None.
        scores = extract_all_scores(str(row["Game Scores"])) or []

        for score_a, score_b in scores:
            if score_a == score_b:
                continue  # Level score: ratings are left unchanged

            # Team ratings are re-read per game because earlier games in the
            # same row have already moved the players' ratings.
            team_a_elo = (elo_ratings[team_a[0]] + elo_ratings[team_a[1]]) / 2
            team_b_elo = (elo_ratings[team_b[0]] + elo_ratings[team_b[1]]) / 2
            expected_win_a = expected_win_probability_norm(team_a_elo, team_b_elo)
            expected_win_b = 1 - expected_win_a

            actual_a = 1 if score_a > score_b else 0
            actual_b = 1 - actual_a

            # Larger winning margins move ratings further.
            score_factor = 1 + (abs(score_a - score_b) / 3)

            change_a = calculate_k_factor(team_a_elo) * score_factor * (actual_a - expected_win_a)
            change_b = calculate_k_factor(team_b_elo) * score_factor * (actual_b - expected_win_b)

            for player in team_a:
                elo_ratings[player] += change_a
            for player in team_b:
                elo_ratings[player] += change_b
    return elo_ratings

In [170]:
def check_res(name):
    """Tally a player's game-level win/loss record over the module-level ``df``.

    Every "n-n" pair parsed from a row's "Game Scores" text counts as one
    result for the player if ``name`` equals one of the row's four player
    columns. Results are also bucketed by event name: names containing
    "League" count as league results; otherwise names containing
    "Championships" count as championship results.

    Level scores (``a == b``) are neither wins nor losses and are skipped,
    matching how the rating functions treat them. Rows whose score text
    contains no "n-n" pair contribute nothing.

    Args:
        name: player name to look up.

    Returns:
        Tuple ``(wins, losses, league_wins, league_losses,
        championship_wins, championship_losses)``.
    """
    wins = losses = 0
    league_wins = league_losses = 0
    championship_wins = championship_losses = 0

    for _, row in df.iterrows():
        on_team_a = name in (row["Player A1"], row["Player A2"])
        on_team_b = name in (row["Player B1"], row["Player B2"])
        if not (on_team_a or on_team_b):
            continue  # Player did not take part in this row

        # extract_all_scores returns None when nothing parses.
        scores = extract_all_scores(str(row["Game Scores"])) or []

        # str() keeps the substring checks safe if the event name is missing.
        event_name = str(row["Event Name"])
        is_league = "League" in event_name
        is_championship = not is_league and "Championships" in event_name

        for score_a, score_b in scores:
            if score_a == score_b:
                continue  # Level score: not a win and not a loss

            won = (on_team_a and score_a > score_b) or (on_team_b and score_b > score_a)
            if won:
                wins += 1
                if is_league:
                    league_wins += 1
                elif is_championship:
                    championship_wins += 1
            else:
                losses += 1
                if is_league:
                    league_losses += 1
                elif is_championship:
                    championship_losses += 1

    return (
        wins,
        losses,
        league_wins,
        league_losses,
        championship_wins,
        championship_losses,
    )

In [196]:
def elo_sys_with_event_normalized(elo_ratings):
    """Run the normalized doubles Elo pass with extra weight for some events.

    Reads the module-level ``df`` in its current row order. Every "n-n" pair
    parsed from a row's "Game Scores" text is rated as one result between
    team A (Player A1/A2) and team B (Player B1/B2), using
    ``expected_win_probability_norm`` on the teams' average ratings. The
    margin multiplier is scaled by 1.5 when the event name contains "League",
    otherwise by 6 when it contains "Championships".

    Side effect: after each row, the four players' current ratings are
    written into ``df`` under "Player A1 Elo Rating", "Player A2 Elo Rating",
    "Player B1 Elo Rating" and "Player B2 Elo Rating".

    Rows whose score text contains no "n-n" pair leave the ratings unchanged
    (and still get the rating columns filled) instead of raising.

    Args:
        elo_ratings: dict mapping player name to rating. Updated in place;
            players are added at 1.5 the first time they appear in a row.

    Returns:
        The same ``elo_ratings`` dict after all rows have been processed.
    """
    starting_elo = 1.5  # Rating assigned the first time a player is seen

    def calculate_k_factor(elo_rating):
        """Return the K-factor for a team whose average rating is ``elo_rating``."""
        # NOTE(review): ratings on this scale start at 1.5, so the 2000 / 2400
        # cut-offs look like leftovers from the 1200-based variant and K is
        # effectively always 0.5 -- confirm the intended thresholds.
        if elo_rating < 2000:
            return 0.5
        if elo_rating < 2400:
            return 0.2
        return 0.1

    for index, row in df.iterrows():
        # Resolve the line-up before the game loop so the rating columns
        # written at the end of the row never depend on variables from a
        # previous row.
        player_a1 = row["Player A1"]
        player_a2 = row["Player A2"]
        player_b1 = row["Player B1"]
        player_b2 = row["Player B2"]
        for player in (player_a1, player_a2, player_b1, player_b2):
            elo_ratings.setdefault(player, starting_elo)

        # Event weighting is a property of the row, not of a single game.
        event_name = row["Event Name"]
        if "League" in event_name:
            event_weight = 1.5
        elif "Championships" in event_name:
            event_weight = 6
        else:
            event_weight = None  # No extra weighting

        # extract_all_scores returns None when nothing parses; treat that as
        # "no games to rate" rather than iterating over None.
        scores = extract_all_scores(str(row["Game Scores"])) or []

        for score_a, score_b in scores:
            if score_a == score_b:
                continue  # Level score: ratings are left unchanged

            # Team ratings are re-read per game because earlier games in the
            # same row have already moved the players' ratings.
            team_a_elo = (elo_ratings[player_a1] + elo_ratings[player_a2]) / 2
            team_b_elo = (elo_ratings[player_b1] + elo_ratings[player_b2]) / 2
            expected_win_a = expected_win_probability_norm(team_a_elo, team_b_elo)
            expected_win_b = 1 - expected_win_a

            actual_a = 1 if score_a > score_b else 0
            actual_b = 1 - actual_a

            # Larger winning margins move ratings further; weighted events
            # scale that multiplier again.
            score_factor = 1 + (abs(score_a - score_b) / 3)
            if event_weight is not None:
                score_factor *= event_weight

            change_a = calculate_k_factor(team_a_elo) * score_factor * (actual_a - expected_win_a)
            change_b = calculate_k_factor(team_b_elo) * score_factor * (actual_b - expected_win_b)

            elo_ratings[player_a1] += change_a
            elo_ratings[player_a2] += change_a
            elo_ratings[player_b1] += change_b
            elo_ratings[player_b2] += change_b

        # NOTE(review): these are the ratings *after* this row's games were
        # applied, so they already reflect the row's result. Using them as
        # features to predict that same row's outcome leaks the label.
        df.at[index, "Player A1 Elo Rating"] = elo_ratings[player_a1]
        df.at[index, "Player A2 Elo Rating"] = elo_ratings[player_a2]
        df.at[index, "Player B1 Elo Rating"] = elo_ratings[player_b1]
        df.at[index, "Player B2 Elo Rating"] = elo_ratings[player_b2]
    return elo_ratings

In [197]:
# Run each rating system from an empty rating table. The event-weighted system
# runs last; it is the one that also writes rating columns into `df`.
elo_ratings = elo_sys_with_scorefactor_and_dynamic_k({})
elo_ratings_base = elo_sys_baseline({})
elo_ratings_norm = elo_sys_with_scorefactor_and_dynamic_k_normalized({})
elo_ratings_event = elo_sys_with_event_normalized({})

# Rank the players of each system from highest to lowest rating.
sorted_elo_ratings = sorted(
    elo_ratings.items(), key=lambda entry: entry[1], reverse=True
)
sorted_elo_ratings_base = sorted(
    elo_ratings_base.items(), key=lambda entry: entry[1], reverse=True
)
sorted_elo_ratings_norm = sorted(
    elo_ratings_norm.items(), key=lambda entry: entry[1], reverse=True
)
sorted_elo_ratings_event = sorted(
    elo_ratings_event.items(), key=lambda entry: entry[1], reverse=True
)

In [190]:
# Print the five highest-rated players under each system. Every heading after
# the first starts with a newline so the sections are separated by a blank line.
for heading, ranking in [
    ("Top 5 Players - System: elo_sys_with_scorefactor_and_dynamic_k", sorted_elo_ratings),
    ("\nTop 5 Players - System: elo_sys_baseline", sorted_elo_ratings_base),
    (
        "\nTop 5 Players - System: elo_sys_with_scorefactor_and_dynamic_k_normalized",
        sorted_elo_ratings_norm,
    ),
    ("\nTop 5 Players - System: elo_sys_with_eventfactor_normalized", sorted_elo_ratings_event),
]:
    print(heading)
    for player, elo in ranking[:5]:
        print(f"Player: {player}, Elo: {elo}")

Top 5 Players - System: elo_sys_with_scorefactor_and_dynamic_k
Player: Emma Parke, Elo: 1635.697660922337
Player: Farzin Habibpour, Elo: 1622.0613495355708
Player: Thomas Brinkman, Elo: 1613.900745843629
Player: Fungpiew Lim, Elo: 1587.6739179356891
Player: Byron Kidd, Elo: 1566.8512562115618

Top 5 Players - System: elo_sys_baseline
Player: Andrew Smart, Elo: 1473.9084941988979
Player: Brian Covernton, Elo: 1428.4560386082621
Player: Anastasiya Spivak, Elo: 1343.6619576163125
Player: Jeff Boag, Elo: 1342.8480049555312
Player: Tessa Breukels, Elo: 1342.749456387798

Top 5 Players - System: elo_sys_with_scorefactor_and_dynamic_k_normalized
Player: Andrew Smart, Elo: 5.732077548608849
Player: Justin Todd, Elo: 5.281914255754639
Player: Ingus Silgailis, Elo: 5.14038536173418
Player: Grant Bergman, Elo: 5.027828000814217
Player: Emma Parke, Elo: 4.974744598855068

Top 5 Players - System: elo_sys_with_eventfactor_normalized
Player: Grant Bergman, Elo: 14.255513673362906
Player: Matthew Toth

In [191]:
from tabulate import tabulate

# Rank every system's ratings from highest to lowest, keyed by a column title.
combined_ratings = {
    label: sorted(table.items(), key=lambda entry: entry[1], reverse=True)
    for label, table in [
        ("System: elo_sys_with_scorefactor_and_dynamic_k", elo_ratings),
        ("System: elo_sys_baseline", elo_ratings_base),
        ("System: elo_sys_with_scorefactor_and_dynamic_k_normalized", elo_ratings_norm),
        ("System: elo_sys_with_event", elo_ratings_event),
    ]
}

# One list per system: the title followed by its ten best (player, rating) pairs.
table_data = []
for system, ratings in combined_ratings.items():
    top_10_players = list(ratings[:10])
    table_data.append([system, *top_10_players])

# Flip rows and columns so each system becomes one column of the printed grid.
transposed_table_data = list(zip(*table_data))
print(tabulate(transposed_table_data, tablefmt="grid"))

+------------------------------------------------+-------------------------------------------+-----------------------------------------------------------+---------------------------------------------+
| System: elo_sys_with_scorefactor_and_dynamic_k | System: elo_sys_baseline                  | System: elo_sys_with_scorefactor_and_dynamic_k_normalized | System: elo_sys_with_event                  |
+------------------------------------------------+-------------------------------------------+-----------------------------------------------------------+---------------------------------------------+
| ('Emma Parke', 1635.697660922337)              | ('Andrew Smart', 1473.9084941988979)      | ('Andrew Smart', 5.732077548608849)                       | ('Grant Bergman', 14.255513673362906)       |
+------------------------------------------------+-------------------------------------------+-----------------------------------------------------------+------------------------------------------

In [175]:
import pandas as pd

# Re-read the CSV into its own frame; no rows are dropped here.
data = pd.read_csv("concatenated_data.csv")

# Distinct values of the "Event Name" column.
unique_event_names = data.loc[:, "Event Name"].unique()

# One event name per line under a heading.
print("Unique Event Names:")
for event_name in unique_event_names:
    print(event_name)

Unique Event Names:
Vancouver Racquets Club
Vancouver Ladies Doubles League
Vancouver Doubles Squash League (VDSL)
Evergreen Squash Club
B.C. Open Doubles Championships March 5-10, 2019
Vancouver Lawn Tennis Club
Hollyburn Country Club
2018 Western Canadian Doubles
2018 Early Bird Singles & Doubles
2018 Evergreen Summer Doubles
2018 Hollyburn 'Doubles' Championships - Finals
2018 Hollyburn 'Doubles' Championships - Round Robins
2018 Evergreen Club Championships
Vancouver Open Doubles League
Random League
Evergreen Summer Doubles Squash Tournament
2023 Western Canadian Doubles Squash Championships
Hillside Wealth Management 2023 BC Doubles Championships
Western Canadian Doubles Squash Championships


In [176]:
# Rank three of the systems' ratings from highest to lowest, keyed by a column
# title (the event-weighted system is not included in this table).
combined_ratings = {
    label: sorted(table.items(), key=lambda entry: entry[1], reverse=True)
    for label, table in [
        ("System: elo_sys_with_scorefactor_and_dynamic_k", elo_ratings),
        ("System: elo_sys_baseline", elo_ratings_base),
        ("System: elo_sys_with_scorefactor_and_dynamic_k_normalized", elo_ratings_norm),
    ]
}

# One list per system: the title followed by its ten lowest-rated
# (player, rating) pairs, still in descending order.
table_data = []
for system, ratings in combined_ratings.items():
    worst_10_players = list(ratings[-10:])
    table_data.append([system, *worst_10_players])

# Flip rows and columns so each system becomes one column of the printed grid.
transposed_table_data = list(zip(*table_data))
print(tabulate(transposed_table_data, tablefmt="grid"))

+------------------------------------------------+------------------------------------------+-----------------------------------------------------------+
| System: elo_sys_with_scorefactor_and_dynamic_k | System: elo_sys_baseline                 | System: elo_sys_with_scorefactor_and_dynamic_k_normalized |
+------------------------------------------------+------------------------------------------+-----------------------------------------------------------+
| ('Margaret Colbourne', 896.155468712473)       | ('Annika Lakhani', 1075.7343322958775)   | ('Richard Dustan', -1.0197029948919047)                   |
+------------------------------------------------+------------------------------------------+-----------------------------------------------------------+
| ('Stacy Hall', 893.5534364860983)              | ('Gary Bombay', 1066.826703300336)       | ('Paul Brebner', -1.223996698234998)                      |
+------------------------------------------------+--------------------------

In [194]:
# Players whose win/loss records are printed for a sanity check.
names = [
    "Andrew Smart", "Brian Covernton", "Justin Todd", "Annette Johanson",
    "Jeff Ward", "Anastasiya Spivak", "Melissa Troll",
]

for name in names:
    (
        wins,
        losses,
        league_wins,
        league_losses,
        championship_wins,
        championship_losses,
    ) = check_res(name)
    # Three report lines per player, emitted with a single print call.
    print(
        f"{name}'s Win-Loss Record: {wins}-{losses}",
        f"  League Wins: {league_wins}, League Losses: {league_losses}",
        f"  Championship Wins: {championship_wins}, Championship Losses: {championship_losses}",
        sep="\n",
    )

Andrew Smart's Win-Loss Record: 23-7
  League Wins: 11, League Losses: 5
  Championship Wins: 6, Championship Losses: 1
Brian Covernton's Win-Loss Record: 60-35
  League Wins: 9, League Losses: 7
  Championship Wins: 6, Championship Losses: 5
Justin Todd's Win-Loss Record: 32-22
  League Wins: 7, League Losses: 4
  Championship Wins: 19, Championship Losses: 14
Annette Johanson's Win-Loss Record: 45-65
  League Wins: 25, League Losses: 31
  Championship Wins: 2, Championship Losses: 3
Jeff Ward's Win-Loss Record: 18-43
  League Wins: 5, League Losses: 8
  Championship Wins: 2, Championship Losses: 3
Anastasiya Spivak's Win-Loss Record: 28-18
  League Wins: 8, League Losses: 6
  Championship Wins: 20, Championship Losses: 12
Melissa Troll's Win-Loss Record: 77-65
  League Wins: 45, League Losses: 38
  Championship Wins: 19, Championship Losses: 13


In [203]:
def generate_match_outcomes(df):
    """Add a "Match Outcome" column saying which side won more games per row.

    For each row, the "n-n" pairs parsed from "Game Scores" are counted as
    games won by side A (first number larger) or side B (second number
    larger). The outcome is 1 when A won more games, 0 when B won more, and
    None when the counts are equal (including when nothing could be parsed).

    The column is added to the frame that was passed in, and that same frame
    is returned.
    """

    def outcome_for(raw_scores):
        # Falls back to an empty sequence when no score could be parsed.
        parsed = extract_all_scores(str(raw_scores)) or ()
        games_a = sum(1 for points_a, points_b in parsed if points_a > points_b)
        games_b = sum(1 for points_a, points_b in parsed if points_a < points_b)
        if games_a > games_b:
            return 1  # Side A wins
        if games_a < games_b:
            return 0  # Side B wins
        return None  # Level

    match_outcomes = [outcome_for(row["Game Scores"]) for _, row in df.iterrows()]
    df["Match Outcome"] = match_outcomes
    return df


# Label the matches in `df`. generate_match_outcomes adds the "Match Outcome"
# column to the frame it is given and returns that same frame, so
# `data_with_match_outcomes` and `df` refer to one object.
data_with_match_outcomes = generate_match_outcomes(df)

In [205]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Rating columns used as model inputs.
# NOTE(review): elo_sys_with_event_normalized writes these columns after each
# row's games have been applied, so they already encode the result being
# predicted. The accuracy reported below is therefore optimistic.
feature_columns = [
    "Player A1 Elo Rating",
    "Player A2 Elo Rating",
    "Player B1 Elo Rating",
    "Player B2 Elo Rating",
]

# generate_match_outcomes stores None for level matches, and the estimator
# rejects missing values, so drop incomplete rows up front. This changes
# nothing when every row is complete.
model_rows = data_with_match_outcomes.dropna(subset=feature_columns + ["Match Outcome"])
features = model_rows[feature_columns]
target = model_rows["Match Outcome"]

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model on the held-out rows
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# `new_data_features` (a frame with the four rating columns) is not defined
# anywhere in this notebook, so an unconditional call raises NameError on a
# fresh kernel. Only predict when the user has supplied it.
if "new_data_features" in globals():
    predictions = model.predict(new_data_features)

Accuracy: 0.9230769230769231


In [212]:
# Draw one row (fixed seed) from the labelled frame and score the model on it.
# NOTE(review): the row comes from the same frame the model was fitted on, so
# it may have been part of the training split and is not an independent test.
subset_of_matches = data_with_match_outcomes.sample(n=1, random_state=42)

print(subset_of_matches)

# Model inputs and true label for the sampled row.
X_test_subset = subset_of_matches.loc[
    :,
    [
        "Player A1 Elo Rating",
        "Player A2 Elo Rating",
        "Player B1 Elo Rating",
        "Player B2 Elo Rating",
    ],
]
y_test_subset = subset_of_matches.loc[:, "Match Outcome"]

# Predict, then compare the prediction with the true label.
predictions = model.predict(X_test_subset)
accuracy = accuracy_score(y_test_subset, predictions)
print("Accuracy on the test set:", accuracy)

     Match Date             Event Name Game Scores      Player A1  \
1097 2018-05-22  Evergreen Squash Club     ['3-0']  Malcolm Moore   

            Player A2       Player B1     Player B2  Player A1 Elo Rating  \
1097  Snehal Lakhani   Donna  Pakaluk  Mike Lavigne              7.929481   

      Player A2 Elo Rating  Player B1 Elo Rating  Player B2 Elo Rating  \
1097             -4.396873              1.248911              0.140296   

      Match Outcome  
1097              1  
Accuracy on the test set: 1.0
