In [1]:
import pandas as pd

In [2]:
# Load the match records from matches.csv (path is relative to this notebook's
# working directory) into a DataFrame.
matches = pd.read_csv('../../preprocessing/data/matches.csv')

In [3]:
# List the distinct values of the 'Comment' column; calculate_injury_score
# below keys on the 'Retired' value.
matches['Comment'].unique()

array(['Completed', 'Retired', 'Walkover', 'Awarded', 'Sched',
       'Disqualified'], dtype=object)

In [4]:
def calculate_injury_score(df, discount_factor=0, max_matches_since_retirement=0):
    """Attach per-match injury scores for the winner and the loser.

    Matches are processed in ascending ``Date`` order. Whenever a match has
    ``Comment == 'Retired'`` the *loser* of that match is treated as the
    player who retired. For each subsequent match of that player, let ``n`` be
    the number of matches the player has played since the retirement
    (``n == 0`` for the very next match). While
    ``n <= max_matches_since_retirement`` the player's score for that match is
    ``discount_factor ** n``; otherwise it is ``0.0``. Players who have never
    retired always score ``0.0``.

    With the defaults only the first match after a retirement is flagged,
    with a score of ``1.0`` (``x ** 0 == 1`` for every ``x``, including 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Date``, ``winner_id``, ``loser_id`` and
        ``Comment``. The frame is not modified.
    discount_factor : float, default 0
        Base of the geometric decay applied per match played since the
        retirement.
    max_matches_since_retirement : int, default 0
        Largest ``n`` that still yields a non-zero score. The default keeps
        the historical behaviour of scoring only the immediately following
        match. Negative values disable scoring entirely.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` sorted by ``Date`` with two float columns added (or
        overwritten): ``winner_injury_score`` and ``loser_injury_score``.
    """
    df = df.sort_values(by=['Date'])

    # player id -> matches played since that player's most recent retirement.
    # Players absent from the mapping have never retired, so no sentinel value
    # is needed and arbitrarily large windows stay correct.
    matches_since_last_retirement = {}

    def injury_score(player_id):
        played = matches_since_last_retirement.get(player_id)
        if played is None or played > max_matches_since_retirement:
            return 0.0
        return float(discount_factor ** played)

    winner_scores = []
    loser_scores = []
    loser_retired_flags = (df['Comment'] == 'Retired').tolist()

    for winner_id, loser_id, loser_retired in zip(
        df['winner_id'].tolist(), df['loser_id'].tolist(), loser_retired_flags
    ):
        # Score both players from the state *before* this match is counted.
        winner_scores.append(injury_score(winner_id))
        loser_scores.append(injury_score(loser_id))

        # Update the loser first, then the winner (same order as before).
        if loser_retired:
            matches_since_last_retirement[loser_id] = 0
        elif loser_id in matches_since_last_retirement:
            matches_since_last_retirement[loser_id] += 1
        if winner_id in matches_since_last_retirement:
            matches_since_last_retirement[winner_id] += 1

    # Assign positionally (plain arrays, no index alignment) so that frames
    # with duplicate index labels still get exactly one score per row.
    df['winner_injury_score'] = pd.Series(winner_scores, dtype='float64').to_numpy()
    df['loser_injury_score'] = pd.Series(loser_scores, dtype='float64').to_numpy()
    return df

In [5]:
# Add the winner/loser injury-score columns using the default arguments.
# The returned frame is sorted by 'Date' and replaces `matches`.
matches = calculate_injury_score(matches)

In [6]:
# Inspect the two new score columns next to the player and tournament names.
matches[['winner_name', 'loser_name', 'winner_injury_score', 'loser_injury_score', 'tournament_name']]

Unnamed: 0,winner_name,loser_name,winner_injury_score,loser_injury_score,tournament_name
0,Alexandr Dolgopolov,Diego Schwartzman,0.0,0.0,Brisbane International
1,Alex De Minaur,Steve Johnson,0.0,0.0,Brisbane International
30,Nikoloz Basilashvili,Thomas Fabbiano,0.0,0.0,Qatar Exxon Mobil Open
29,Dominic Thiem,Evgeny Donskoy,0.0,0.0,Qatar Exxon Mobil Open
28,Fernando Verdasco,Dudi Sela,0.0,0.0,Qatar Exxon Mobil Open
...,...,...,...,...,...
14081,Alexander Shevchenko,Pierre Hugues Herbert,0.0,0.0,Open de Moselle
14108,Jack Draper,Jan Lennard Struff,0.0,0.0,Sofia Open
14109,Adrian Mannarino,Pavel Kotov,0.0,0.0,Sofia Open
14083,Ugo Humbert,Alexander Shevchenko,0.0,0.0,Open de Moselle


In [7]:
def calculate_percentage_lower_injury_wins(df):
    """Percentage of post-injury matches won by the less-injured player.

    A match is considered when at least one of ``winner_injury_score`` /
    ``loser_injury_score`` is positive and ``loser_rank - winner_rank < 10``.
    The rank filter is one-sided: every match in which the winner had the
    numerically larger rank passes it.
    NOTE(review): presumably meant to exclude clear favourites - confirm
    whether an absolute difference was intended.

    Among those matches, a "lower injury win" is one where the winner's score
    is less than or equal to the loser's, so ties count as wins for the
    lower-injury side.

    The number of considered matches is printed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``winner_injury_score``, ``loser_injury_score``,
        ``winner_rank`` and ``loser_rank``.

    Returns
    -------
    float
        The percentage in ``[0, 100]``, or ``nan`` when no match passes the
        filter (previously this raised ``ZeroDivisionError``).
    """
    has_injured_player = (df['winner_injury_score'] > 0) | (df['loser_injury_score'] > 0)
    within_rank_gap = (df['loser_rank'] - df['winner_rank']) < 10
    after_injury_matches = df[has_injured_player & within_rank_gap]

    total_matches = after_injury_matches.shape[0]
    print(total_matches)

    # Nothing to compare: the ratio is undefined, so report NaN, not a crash.
    if total_matches == 0:
        return float('nan')

    lower_injury_wins = int(
        (after_injury_matches['winner_injury_score']
         <= after_injury_matches['loser_injury_score']).sum()
    )
    return (lower_injury_wins / total_matches) * 100

In [8]:
# The function first prints how many matches passed its filter, then this
# print shows the resulting percentage.
print(calculate_percentage_lower_injury_wins(matches))

155
59.354838709677416


In [9]:
# Write only the match key and the two injury-score columns to injury.csv,
# omitting the DataFrame index.
matches[['match_id', 'winner_injury_score', 'loser_injury_score']].to_csv("../../data/created_features_separate/injury.csv", index=False)