### Read transformed data

In [76]:
import pandas as pd

# Load the transformed ATP dataset; the path is relative to the notebook's working directory.
# NOTE(review): the saved output echoes this read_csv line, which is how Python reports a
# warning raised on it -- possibly a mixed-dtype warning; confirm and consider passing `dtype=`.
TRANSFORMED_CSV = "atp_transformed/2000-2024 with covid.csv"
df = pd.read_csv(TRANSFORMED_CSV)

# Summarise columns, non-null counts and dtypes of the loaded frame.
df.info()

  df = pd.read_csv("atp_transformed/2000-2024 with covid.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1289980 entries, 0 to 1289979
Data columns (total 37 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   tourney_id               1289980 non-null  object 
 1   tourney_name             1289980 non-null  object 
 2   tourney_type             1289980 non-null  object 
 3   surface                  1289874 non-null  object 
 4   draw_size                1289980 non-null  int64  
 5   tourney_level            1289980 non-null  object 
 6   tourney_date             1289980 non-null  object 
 7   match_num                1289980 non-null  int64  
 8   score                    1289880 non-null  object 
 9   round_1                  1289810 non-null  object 
 10  round_2                  1284520 non-null  object 
 11  round_3                  433102 non-null   object 
 12  round_4                  25000 non-null    object 
 13  round_5                  6832 non-null    

In [77]:
# Count the distinct player ids present in the frame (nunique skips missing values by default).
player_ids = df['player_id']
unique_players = player_ids.nunique()
print(f"Number of unique players: {unique_players}")

Number of unique players: 19814


In [78]:
# Parse the tournament date once and reuse the parsed Series for both derived columns.
tourney_dates = pd.to_datetime(df['tourney_date'])

# Parsed timestamp of the tournament date.
df['tourney_date_datetime'] = tourney_dates

# Calendar year of the tournament date; used as the grouping key further down.
df['year'] = tourney_dates.dt.year

In [79]:
# Mean of `player_rank` over all rows of a player within one calendar year.
yearly_mean_rank = df.groupby(['player_id', 'year'])['player_rank'].mean()

# One row per (player_id, year); the mean is rounded to 2 decimals and exposed as `avg_rank_year`.
player_year_rank = (
    yearly_mean_rank
    .round(2)
    .rename('avg_rank_year')
    .reset_index()
)

In [80]:
# Change in a player's yearly average rank, looking FORWARD in time.
#
# Rows are ordered by (player_id, year) and, within each player, diff(periods=-1)
# subtracts the NEXT row from the current one:
#
#     rank_change(year) = avg_rank_year(year) - avg_rank_year(next recorded year)
#
# Sign convention (a smaller rank number is a better ranking):
#   * negative -> the following year's rank number is larger, i.e. the ranking gets worse
#   * positive -> the following year's rank number is smaller, i.e. the ranking improves
# Use diff(periods=1) instead for the backward-looking "current minus previous year" change,
# where negative means improvement.
#
# The last recorded year of every player has no successor, so its rank_change is NaN.
# NOTE(review): "next row" is the next year that has data for the player; if a calendar year
# is missing, the difference spans that gap -- verify this is the intended behaviour.
ordered_by_player_year = player_year_rank.sort_values(['player_id', 'year'])

# The diff keeps the original row labels, so the assignment aligns on the index.
player_year_rank['rank_change'] = (
    ordered_by_player_year
    .groupby('player_id')['avg_rank_year']
    .diff(periods=-1)
)


In [81]:
# Attach the per-player yearly statistics to every row of `df`.
merge_keys = ['player_id', 'year']

# Columns contributed by player_year_rank. Dropping copies left by an earlier run keeps this
# cell re-runnable: otherwise a second execution would create suffixed duplicates
# (e.g. rank_change_x / rank_change_y) and later cells reading df['rank_change'] would fail.
yearly_cols = [col for col in player_year_rank.columns if col not in merge_keys]

df = (
    df.drop(columns=yearly_cols, errors='ignore')
      # validate guards against duplicated (player_id, year) keys on the right-hand side,
      # which would silently multiply rows of `df`.
      .merge(player_year_rank, on=merge_keys, how='left', validate='many_to_one')
)

In [82]:
# Which years contain at least one row without a rank_change value?
missing_change = df['rank_change'].isna()
years_with_nan = df.loc[missing_change, 'year'].unique()
print("Years with rank_change NaN:", years_with_nan)


Years with rank_change NaN: [2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024]


In [83]:
# For every year, flag whether rank_change is missing on all of its rows.
all_nan_by_year = df['rank_change'].isna().groupby(df['year']).all()

# Keep just the years whose flag is True, as a plain list.
years_only_nan = all_nan_by_year[all_nan_by_year].index.tolist()

print("Years where all rank_change values are NaN:", years_only_nan)


Years where all rank_change values are NaN: [2024]


# Rafael Nadal example:

In [None]:
# All rows belonging to Rafael Nadal.
is_nadal = df["player_name"] == "Rafael Nadal"
nadal = df.loc[is_nadal]

# Rows that have a rank_change value, most recent year first.
nadal_per_year = (
    nadal.loc[nadal['rank_change'].notna()]
         .sort_values('year', ascending=False)
)

# Keep a single row per year.
nadal_sorted = nadal_per_year.drop_duplicates(subset='year')

summary_cols = ['player_id', 'player_name', 'avg_rank_year', 'rank_change', 'year']
nadal_sorted[summary_cols]

Unnamed: 0,player_id,player_name,avg_rank_year,rank_change,year
1162501,104745,Rafael Nadal,2.0,-398.9,2023
1104700,104745,Rafael Nadal,4.29,2.29,2022
1061006,104745,Rafael Nadal,2.72,-1.57,2021
1041904,104745,Rafael Nadal,1.68,-1.04,2020
986170,104745,Rafael Nadal,1.88,0.2,2019
927004,104745,Rafael Nadal,1.1,-0.78,2018
865156,104745,Rafael Nadal,4.27,3.17,2017
800072,104745,Rafael Nadal,4.95,0.68,2016
737836,104745,Rafael Nadal,6.21,1.26,2015
677600,104745,Rafael Nadal,1.18,-5.03,2014


# 2023 example

In [None]:
# Rows with a rank_change value, ordered by most recent year, then best (lowest) player_rank,
# then player_id as a tie-breaker.
has_change = df['rank_change'].notna()
df_sorted = (
    df.loc[has_change]
      .sort_values(['year', 'player_rank', 'player_id'], ascending=[False, True, True])
      # first row per player after the sort = the player's most recent year with a value
      .drop_duplicates(subset='player_id')
)

summary_cols = ['player_id', 'player_name', 'avg_rank_year', 'rank_change', 'year']
df_sorted[summary_cols].head(20)


Unnamed: 0,player_id,player_name,avg_rank_year,rank_change,year
1163193,104925,Novak Djokovic,2.29,0.52,2023
1163499,207989,Carlos Alcaraz,1.58,-1.02,2023
1162113,104745,Rafael Nadal,2.0,-398.9,2023
1164775,106421,Daniil Medvedev,4.72,0.33,2023
1162911,126774,Stefanos Tsitsipas,4.92,-5.24,2023
1162129,134770,Casper Ruud,4.92,-3.68,2023
1166849,206173,Jannik Sinner,9.02,7.13,2023
1166439,208029,Holger Rune,7.34,-4.49,2023
1162957,126094,Andrey Rublev,6.4,-0.16,2023
1163131,126203,Taylor Fritz,8.56,-2.37,2023
