In [None]:
# import all libraries
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [None]:
# Read the per-player match table from disk (path is relative to the working directory).
DATA_PATH = 'atp_transformed/2000-2024 players_2.csv'
full_df = pd.read_csv(DATA_PATH)

In [None]:
# Columns fed to the model, assembled from thematic groups. The final list keeps
# the original column order, and the last entry ('binned_rank') is the target.

# Match / player context.
# Excluded: 'tourney_date' (not numeric); 'elo_pre_match' and
# 'opponent_elo_pre_match' (dropped because of leakage).
context_features = [
    'surface',
    'tourney_level',
    'match_num',
    'player_seed',
    'player_height',
    'player_country',
    'player_age',
    'opponent_rank',
]

# mean / median / total aggregates of the *_numb, *_diff, *_tb_numb, *_tb_diff columns.
score_stat_features = [
    'mean_numb', 'median_numb', 'total_numb',
    'mean_diff', 'median_diff', 'total_diff',
    'mean_tb_numb', 'median_tb_numb', 'total_tb_numb',
    'mean_tb_diff', 'median_tb_diff', 'total_tb_diff',
]

career_features = [
    'days_of_experience',
    'career_year',
    'rest_days',
    'set_dominance',
    'tb_dominance',
    'highest_finish_position',
]

rolling_median_features = [
    'minutes_rolling_med_10',
    'draw_size_rolling_med_10',
    'highest_finish_position_rolling_med_10',
]

serve_rolling_features = [
    'ace_rolling_mean_10',
    'double_faults_rolling_mean_10',
    'points_on_serve_rolling_mean_10',
    'first_serve_in_rolling_mean_10',
    '1stWon_rolling_mean_10',
    '2ndWon_rolling_mean_10',
    'service_games_rolling_mean_10',
    'break_points_saved_rolling_mean_10',
    'break_points_faced_rolling_mean_10',
]

# Excluded: 'elo_pre_match_rolling_mean_10' and
# 'opponent_elo_pre_match_rolling_mean_10' (dropped because of leakage).
# NOTE(review): 'player_rank_rolling_mean_10' is derived from the same column the
# target is binned from - confirm this is not leakage as well.
form_rolling_features = [
    'set_dominance_rolling_mean_10',
    'tb_dominance_rolling_mean_10',
    'player_rank_rolling_mean_10',
]

# Rolling means of the score aggregates, in the same order as score_stat_features.
score_stat_rolling_features = [f"{name}_rolling_mean_10" for name in score_stat_features]

# NOTE(review): 'elo_next_match' presumably reflects the rating after the match -
# confirm it is legitimately known at prediction time.
selected_features = (
    context_features
    + score_stat_features
    + career_features
    + rolling_median_features
    + serve_rolling_features
    + form_rolling_features
    + score_stat_rolling_features
    + ['elo_next_match', 'binned_rank']  # 'binned_rank' is the target
)

In [None]:
# Number of quantile bins the rank is split into.
num_bins = 20

# Drop rows without a rank, then bin the rank into `num_bins` quantile buckets.
#
# Assigning `full_df['player_rank'].dropna()` back to the column does nothing:
# pandas re-aligns the shorter Series on the frame's index and the missing rows
# become NaN again. Dropping the rows from the frame is what actually removes
# them. With no NaN left, `qcut(..., labels=False)` yields integer bin codes
# (0 .. num_bins-1) instead of floats.
full_df = (
    full_df
    .dropna(subset=['player_rank'])
    .assign(binned_rank=lambda frame: pd.qcut(frame['player_rank'], q=num_bins, labels=False))
)

In [None]:
# Keep only the modelling columns and discard every row with a missing value in
# any of them (KNN cannot handle empty values).
df_subset = full_df[selected_features].dropna()

In [None]:
# One-hot encode the non-numeric columns.
# To list the non-numeric columns: df_subset.select_dtypes(exclude=[np.number]).columns
categorical_cols = ['surface', 'player_country', 'tourney_level']

# drop='first' removes one level per column to avoid multicollinearity.
encoder = OneHotEncoder(sparse_output=False, drop='first')

# Despite the historical "surface_" prefix these hold the encoding of all
# categorical columns, not just 'surface'.
surface_encoded = encoder.fit_transform(df_subset[categorical_cols])
surface_df = pd.DataFrame(
    surface_encoded,
    index=df_subset.index,  # keep row alignment with df_subset for the concat below
    columns=encoder.get_feature_names_out(categorical_cols),
)

# Replace the raw categorical columns with their one-hot counterparts.
df_encoded = pd.concat([df_subset.drop(columns=categorical_cols), surface_df], axis=1)


In [None]:
# Separate the target column from the feature matrix.
y = df_encoded['binned_rank']               # target: quantile bin of the rank
X = df_encoded.drop(columns='binned_rank')  # features: everything else

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): this is a purely random row split. If rows of the same player /
# period end up on both sides, near-duplicate neighbours can inflate KNN scores -
# consider a time- or player-based split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Min-max scale the features. The scaler learns its ranges from the training
# split only and the same ranges are then applied to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit the KNN classifier on the scaled training data.
# k=2 was chosen by trial and error; it gave the best score seen, which is still poor.
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train_scaled, y_train)
knn

In [None]:
# Cross-validated search for the number of neighbours.
#
# The pipeline uses MinMaxScaler - the same scaling as the model trained and
# evaluated in this notebook - so the selected k applies to that model. Because
# the scaler is a pipeline step, it is re-fitted on the training part of every
# CV fold. GridSearchCV, Pipeline, MinMaxScaler and KNeighborsClassifier come
# from the import cell at the top of the notebook.
#
# Author's note from earlier runs: the search reported k=1 as the best k, but
# k=1 is not going to work on other models.
pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('knn', KNeighborsClassifier())
])

# Candidate values: k = 1 .. 49.
param_grid = {
    'knn__n_neighbors': list(range(1, 50))
}

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)

# Report the outcome explicitly rather than relying on the estimator repr.
print(f"Best params: {grid.best_params_}")
print(f"Best CV accuracy: {grid.best_score_:.4f}")


In [None]:
# Predict the rank bin for every row of the scaled test split.
y_pred = knn.predict(X_test_scaled)

In [None]:
# Classification metrics on the test split. Precision / recall / F1 are averaged
# over the bins weighted by support; zero_division=0 scores a bin with no
# predictions as 0 instead of emitting a warning.
accuracy = accuracy_score(y_test, y_pred)
weighted = dict(average='weighted', zero_division=0)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Check if there's overfitting: compare accuracy on the training and test splits.
train_score = knn.score(X_train_scaled, y_train)
# A classifier's score() is the accuracy of predict(); y_pred already holds the
# test predictions, so reuse the value instead of running KNN inference again.
test_score = accuracy
print(f"Train accuracy: {train_score:.4f}")
print(f"Test accuracy: {test_score:.4f}")

In [None]:
# Recover the rank interval behind each bin code (same qcut as used for binning,
# but keeping the Interval categories instead of integer labels).
bin_edges = pd.qcut(full_df['player_rank'], q=num_bins).cat.categories

# Readable "left-right" label per bin, indexed by bin code.
bin_labels = [f"{round(interval.left,2)}-{round(interval.right,2)}" for interval in bin_edges]

# Map numeric bin codes to the readable labels. The codes can be floats (qcut
# returns float codes when the ranked column contained NaN), and a float cannot
# index a list, so cast each code to int first.
y_pred_str = [bin_labels[int(code)] for code in y_pred]
y_test_str = [bin_labels[int(code)] for code in y_test]

In [None]:
# Per-row comparison table: test features plus true / predicted bin (as code and
# as readable label), a correctness flag and the signed bin distance.
# Both X_test and y_test are re-indexed 0..n-1 so they line up with y_pred.
results = X_test.reset_index(drop=True).assign(
    true_rank_coded=y_test.reset_index(drop=True),
    pred_rank_coded=y_pred,
    true_rank=y_test_str,
    pred_rank=y_pred_str,
    correct=lambda frame: frame['true_rank'] == frame['pred_rank'],
    # signed: positive when the true bin code is larger than the predicted one
    number_of_bins_off=lambda frame: frame['true_rank_coded'] - frame['pred_rank_coded'],
)

In [None]:
# Rows where the predicted bin differs from the true bin.
wrong_preds = results.loc[~results['correct']]

In [None]:
# How far off (in bins, signed) the wrong predictions are, ordered by distance.
counts = wrong_preds['number_of_bins_off'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 5))
# The distances are turned into strings so each one gets its own evenly spaced bar.
ax.bar(counts.index.astype(str), counts.values)
ax.set_xlabel("Number of Bins Off")
ax.set_ylabel("Count of Samples")
# Take bins / k from the live objects so the title cannot drift from the model.
ax.set_title(
    f"Distribution of Prediction Errors (using bins={num_bins} and k={knn.n_neighbors})"
)
fig.tight_layout()
plt.show()

In [None]:
# Confusion matrix of true vs. predicted bin codes, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# Display the misclassified test rows for manual inspection.
wrong_preds