In [1]:
import pandas as pd
import glob
import os

# Path to your data directory
data_dir = '../data/TML-Database/'

# Get all year CSVs (exclude README and other non-year files)
csv_files = sorted([
    f for f in glob.glob(os.path.join(data_dir, '*.csv'))
    if os.path.basename(f)[:4].isdigit()
])

# Read and concatenate all CSVs
df_list = [pd.read_csv(f) for f in csv_files]
df = pd.concat(df_list, ignore_index=True)

print(f"Loaded {len(df)} matches from {len(csv_files)} files.")
df.head()

Loaded 193723 matches from 58 files.


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,1968-9343,Bloemfontein,Hard,64,A,19680108.0,,V125,,,...,,,,,,,,,,
1,1968-9343,Bloemfontein,Hard,64,A,19680108.0,,U011,,,...,,,,,,,,,,
2,1968-9343,Bloemfontein,Hard,64,A,19680108.0,,SS02,,,...,,,,,,,,,,
3,1968-9343,Bloemfontein,Hard,64,A,19680108.0,,S250,,,...,,,,,,,,,,
4,1968-9343,Bloemfontein,Hard,64,A,19680108.0,,R085,,,...,,,,,,,,,,


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# 1. Define target: 1 if winner, 0 if loser (already in your data as winner/loser columns)
# For simplicity, let's predict if the winner is the higher-ranked player
df['winner_higher_rank'] = (df['winner_rank'] < df['loser_rank']).astype(int)
target = 'winner_higher_rank'

# 2. Select features (example: you can add/remove as needed)
features = [
    'surface', 'round', 'winner_seed', 'winner_rank', 'winner_age', 'winner_ht',
    'loser_seed', 'loser_rank', 'loser_age', 'loser_ht'
]

# Drop rows with missing values in selected columns
df_model = df[features + [target]].dropna()

# One-hot encode categorical features
cat_features = ['surface', 'round']
enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_cat = enc.fit_transform(df_model[cat_features])
X_num = df_model.drop(columns=cat_features + [target]).values
X = np.hstack([X_num, X_cat])
y = df_model[target].values

# 3. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# 5. Feature importances
importances = rf.feature_importances_
feature_names = list(df_model.drop(columns=cat_features + [target]).columns) + list(enc.get_feature_names_out(cat_features))
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df = importance_df.sort_values('importance', ascending=False)

print(importance_df)

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'