In [1]:
import pandas as pd

# Location of the ATP match CSV; edit this if the file lives somewhere else.
# NOTE(review): this is an absolute, machine-specific path — the cell will not
# run unchanged on another computer.
atp_csv_path = r"C:\Users\Joel'PC\Desktop\atp_matches\atp_matches_2024.csv"
df = pd.read_csv(atp_csv_path)

# Preview the first rows to confirm the load worked.
df.head()


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2024-0339,Brisbane,Hard,32,A,20240101,300,105777,2.0,,...,58.0,44.0,16.0,11.0,8.0,9.0,14.0,2570.0,8.0,3660.0
1,2024-0339,Brisbane,Hard,32,A,20240101,299,208029,1.0,,...,35.0,31.0,10.0,11.0,5.0,7.0,8.0,3660.0,39.0,1122.0
2,2024-0339,Brisbane,Hard,32,A,20240101,298,105777,2.0,,...,39.0,24.0,14.0,10.0,5.0,7.0,14.0,2570.0,55.0,902.0
3,2024-0339,Brisbane,Hard,32,A,20240101,297,208029,1.0,,...,51.0,31.0,16.0,10.0,3.0,5.0,8.0,3660.0,116.0,573.0
4,2024-0339,Brisbane,Hard,32,A,20240101,296,126128,,,...,37.0,27.0,16.0,10.0,5.0,8.0,39.0,1122.0,44.0,1021.0


In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# --- Data Cleaning ---
# A match is only kept when rank, age, playing hand and height are present for
# BOTH players; a gap in any one of these columns removes the whole row.
required_cols = [
    'winner_rank', 'loser_rank',
    'winner_age', 'loser_age',
    'winner_hand', 'loser_hand',
    'winner_ht', 'loser_ht',
]
has_required_values = df[required_cols].notna().all(axis=1)

# .copy() makes df_clean an independent frame, so columns added to it later
# do not touch the raw df.
df_clean = df.loc[has_required_values].copy()

# Report how many matches survived the filter.
print(f"Original match count: {len(df)}")
print(f"Cleaned match count: {len(df_clean)}")

Original match count: 3076
Cleaned match count: 3000


In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# --- Data Cleaning ---
# NOTE(review): this cell repeats the cleaning step of the preceding cell
# verbatim; re-running it yields the same df_clean, so one of the two cells
# can be deleted.
#
# A match is only kept when rank, age, playing hand and height are present for
# BOTH players; a gap in any one of these columns removes the whole row.
required_cols = [
    'winner_rank', 'loser_rank',
    'winner_age', 'loser_age',
    'winner_hand', 'loser_hand',
    'winner_ht', 'loser_ht',
]
has_required_values = df[required_cols].notna().all(axis=1)

# .copy() makes df_clean an independent frame, so columns added to it later
# do not touch the raw df.
df_clean = df.loc[has_required_values].copy()

# Report how many matches survived the filter.
print(f"Original match count: {len(df)}")
print(f"Cleaned match count: {len(df_clean)}")

Original match count: 3076
Cleaned match count: 3000


In [6]:
# --- 1. Calculate Win Percentage for ALL Surfaces ---

def calculate_surface_stats(df, surface_name):
    """Return each player's win percentage on one surface.

    Args:
        df: Match-level DataFrame with 'surface', 'winner_name' and
            'loser_name' columns (one row per match).
        surface_name: Value of the 'surface' column to filter on,
            e.g. 'Hard'.

    Returns:
        A Series named 'win_pct', indexed by player name, holding
        wins / (wins + losses) over the matches on that surface.
        Only players with at least one match on the surface appear in
        the index; the Series is empty when no row matches.
    """
    surface_matches = df[df['surface'] == surface_name]
    wins = surface_matches['winner_name'].value_counts()
    losses = surface_matches['loser_name'].value_counts()

    # Align wins and losses on the union of player names; a player who only
    # ever won (or only ever lost) gets 0 for the missing side.
    stats = pd.DataFrame({'wins': wins, 'losses': losses}).fillna(0)
    stats['total'] = stats['wins'] + stats['losses']

    # Every player in the index appears in at least one match, so total >= 1
    # and the division is always defined. No epsilon is added: it would skew
    # every non-zero percentage (an undefeated player would get 0.999999...
    # instead of exactly 1.0).
    stats['win_pct'] = stats['wins'] / stats['total']
    return stats['win_pct']

# Build one win-percentage Series per surface, each from the cleaned matches
hard_pct, clay_pct, grass_pct = (
    calculate_surface_stats(df_clean, surface)
    for surface in ('Hard', 'Clay', 'Grass')
)

# Helper function: Look up a player's name and return their Win %
def get_surface_pct(name, surface_series):
    """Look up a player's win percentage in a per-surface Series.

    Args:
        name: Player name to look up.
        surface_series: Series of win percentages indexed by player name.

    Returns:
        The value stored under `name`, or 0.0 when `name` is not in the
        Series index (i.e. no matches recorded on that surface).
    """
    if name not in surface_series.index:
        return 0.0
    return surface_series.loc[name]

# Map percentages to the cleaned dataframe
# Each entry: (new column, column holding the player name, per-surface Series).
# NOTE(review): the percentages were computed from these same df_clean rows,
# so every match's own result is already counted in the value attached to it
# (target leakage) — consider computing them from earlier/training matches only.
pct_columns = [
    ('w_hard_pct', 'winner_name', hard_pct),
    ('l_hard_pct', 'loser_name', hard_pct),
    ('w_clay_pct', 'winner_name', clay_pct),
    ('l_clay_pct', 'loser_name', clay_pct),
    ('w_grass_pct', 'winner_name', grass_pct),
    ('l_grass_pct', 'loser_name', grass_pct),
]
for new_col, name_col, pct_series in pct_columns:
    # Players missing from a surface's Series get the 0.0 default.
    df_clean[new_col] = df_clean[name_col].apply(
        lambda player: get_surface_pct(player, pct_series)
    )

print("Feature Engineering Complete: All 3 Surface Percentages ready.")

Feature Engineering Complete: All 3 Surface Percentages ready.


In [7]:
# --- Restructure: Create P1 vs P2 ---
# The source data always lists the winner first. A random subset of rows is
# flipped so that the winner is sometimes P1 and sometimes P2; the fixed seed
# makes the assignment reproducible.
np.random.seed(42)
swap_mask = np.random.rand(len(df_clean)) > 0.5

# (model feature suffix, winner-side column, loser-side column).
# The order here fixes the column order of df_model.
player_columns = [
    # Static features
    ('rank', 'winner_rank', 'loser_rank'),
    ('age', 'winner_age', 'loser_age'),
    ('hand', 'winner_hand', 'loser_hand'),
    ('ht', 'winner_ht', 'loser_ht'),
    # Surface Percentage features (ALL 3)
    ('hard_pct', 'w_hard_pct', 'l_hard_pct'),
    ('clay_pct', 'w_clay_pct', 'l_clay_pct'),
    ('grass_pct', 'w_grass_pct', 'l_grass_pct'),
    # Names
    ('name', 'winner_name', 'loser_name'),
]

# Vectorised selection instead of a row-by-row iterrows() loop:
# where swap_mask is True the loser becomes P1, otherwise the winner stays P1.
model_columns = {'surface': df_clean['surface'].values}
for feature, winner_col, loser_col in player_columns:
    winner_vals = df_clean[winner_col].to_numpy()
    loser_vals = df_clean[loser_col].to_numpy()
    model_columns[f'p1_{feature}'] = np.where(swap_mask, loser_vals, winner_vals)
    model_columns[f'p2_{feature}'] = np.where(swap_mask, winner_vals, loser_vals)

# Target: 1 when P1 is the actual winner (row kept), 0 when the row was swapped.
target = np.where(swap_mask, 0, 1).tolist()
model_columns['target'] = target

# Build the DataFrame for the Machine Learning Model
df_model = pd.DataFrame(model_columns)

# --- Final Calculations (The "Differences") ---
# rank_diff is taken as P2 - P1, so it is positive when P1's rank number is
# the lower of the two; every other difference below is P1 - P2.
df_model['rank_diff'] = df_model['p2_rank'] - df_model['p1_rank']

# (new column, P1 column, P2 column)
p1_minus_p2 = [
    ('age_diff', 'p1_age', 'p2_age'),
    ('ht_diff', 'p1_ht', 'p2_ht'),
    # NEW Surface Difference Features
    ('hard_diff', 'p1_hard_pct', 'p2_hard_pct'),
    ('clay_diff', 'p1_clay_pct', 'p2_clay_pct'),
    ('grass_diff', 'p1_grass_pct', 'p2_grass_pct'),
]
for diff_col, p1_col, p2_col in p1_minus_p2:
    df_model[diff_col] = df_model[p1_col] - df_model[p2_col]

# Encoding
# Handedness becomes a numeric flag: 'L' -> 1, 'R' and 'U' -> 0. Any value
# missing from the map turns into NaN and is then filled with 0 as well.
hand_map = {'R': 0, 'L': 1, 'U': 0}
hand_columns = (('p1_hand', 'p1_hand_code'), ('p2_hand', 'p2_hand_code'))
for hand_col, code_col in hand_columns:
    df_model[code_col] = df_model[hand_col].map(hand_map).fillna(0)

# One-hot encode the match surface; the 'surface' column is replaced by one
# 'surface_<value>' column per distinct surface present in the data.
df_model = pd.get_dummies(df_model, columns=['surface'], prefix='surface')

print("Data successfully restructured and ready for AI training!")

Data successfully restructured and ready for AI training!


In [8]:
# --- 1. Select Features ---
# The order of this list is the column order of the feature matrix X.
features = [
    # rankings
    'p1_rank',
    'p2_rank',
    'rank_diff',
    # age
    'age_diff',
    # handedness
    'p1_hand_code',
    'p2_hand_code',
    # height
    'ht_diff',
    # ALL surface differences added
    'hard_diff',
    'clay_diff',
    'grass_diff',
    # one-hot surface flags
    'surface_Hard',
    'surface_Clay',
    'surface_Grass',
]

X = df_model.loc[:, features]
y = df_model.loc[:, 'target']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- 2. Train the Model ---
# NOTE(review): the hard/clay/grass difference features derive from win
# percentages computed over every cleaned match before the train/test split,
# so held-out results feed into the features — treat the accuracy printed
# below as optimistic until that is confirmed or fixed.
forest_params = {'n_estimators': 200, 'max_depth': 7, 'random_state': 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

# Check Accuracy
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Final Model Accuracy: {test_accuracy:.2f}")

# --- 3. Save for Power BI ---
# Start from the held-out feature rows and attach the true label, the
# predicted label and a hit/miss flag (assign returns a new frame, so X_test
# itself is left untouched).
output = X_test.assign(
    Actual_Winner_Is_P1=y_test,
    Predicted_Winner_Is_P1=y_pred,
    Correct_Prediction=(y_test == y_pred),
)

# Player names were not model features; pull them back in by row index.
output = output.join(df_model[['p1_name', 'p2_name']], how='left')

# Translate the 0/1 labels into player names.
predicted_is_p1 = output['Predicted_Winner_Is_P1'] == 1
actual_is_p1 = output['Actual_Winner_Is_P1'] == 1
output['Predicted_Winner_Name'] = np.where(predicted_is_p1, output['p1_name'], output['p2_name'])
output['Actual_Winner_Name'] = np.where(actual_is_p1, output['p1_name'], output['p2_name'])

# Save with a new filename
output.to_csv('tennis_final_predictions_v4.csv', index=False)
print("SUCCESS! File 'tennis_final_predictions_v4.csv' saved. Time to visualize!")

Final Model Accuracy: 0.74
SUCCESS! File 'tennis_final_predictions_v4.csv' saved. Time to visualize!
