In [11]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
atp = pd.read_csv('atp_tennis.csv')

df = pd.DataFrame(atp)

df['player1_win'] = (df['Winner'] == df['Player_1']).astype(int)


encoded_df = pd.get_dummies(df[['Tournament','Series','Surface','Round','Player_1','Player_2']], dtype='float')

df['Winner'].value_counts()

df['Court'] = df['Court'].map({'Indoor': 0, 'Outdoor': 1})

numeric_df = df[['Court','Best of','Rank_1','Rank_2','Pts_1','Pts_2','Odd_1','Odd_2']]


df['Date'] = pd.to_datetime(df['Date'])

df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['day'] = df['Date'].dt.day

date_df=df[['year','month','day']]






In [9]:

def get_score_features(score):
    if not score or pd.isna(score):
        return {'won': 0, 'lost': 0, 'sets': 0, 'diff': 0}

    parts = score.split()
    won = 0
    lost = 0
    for p in parts:
        try:
            w, l = map(int, p.split('-'))
            won += w
            lost += l
        except:
            pass
    sets = len(parts)
    diff = abs(won - lost)
    return {'won': won, 'lost': lost, 'sets': sets, 'diff': diff}

features_df = df['Score'].apply(get_score_features).apply(pd.Series)
X=pd.concat([numeric_df,encoded_df,date_df,features_df],axis=1)


In [10]:

y=df['player1_win']
X_train,X_test,y_train,y_test= train_test_split(X , y , test_size=0.2)

rfmodel = RandomForestClassifier(n_estimators=100,max_depth=12, random_state=42)

rfmodel.fit(X_train , y_train)
prediction=rfmodel.predict(X_test)

accuracy=accuracy_score(y_test,prediction)
print(f"The accuracy %  = {accuracy}")

The accuracy %  = 0.8650371147934318


In [13]:
df

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,...,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score,player1_win,year,month,day
0,Australian Hardcourt Championships,2000-01-03,International,1,Hard,1st Round,3,Dosedel S.,Ljubicic I.,Dosedel S.,...,77,-1,-1,-1.00,-1.00,6-4 6-2,1,2000,1,3
1,Australian Hardcourt Championships,2000-01-03,International,1,Hard,1st Round,3,Clement A.,Enqvist T.,Enqvist T.,...,5,-1,-1,-1.00,-1.00,3-6 3-6,0,2000,1,3
2,Australian Hardcourt Championships,2000-01-03,International,1,Hard,1st Round,3,Escude N.,Baccanello P.,Escude N.,...,655,-1,-1,-1.00,-1.00,6-7 7-5 6-3,1,2000,1,3
3,Australian Hardcourt Championships,2000-01-03,International,1,Hard,1st Round,3,Knippschild J.,Federer R.,Federer R.,...,65,-1,-1,-1.00,-1.00,1-6 4-6,0,2000,1,3
4,Australian Hardcourt Championships,2000-01-03,International,1,Hard,1st Round,3,Fromberg R.,Woodbridge T.,Fromberg R.,...,198,-1,-1,-1.00,-1.00,7-6 5-7 6-4,1,2000,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66676,Masters Cup,2025-11-14,Masters Cup,0,Hard,Round Robin,3,Sinner J.,Shelton B.,Sinner J.,...,5,10000,3970,1.05,10.00,6-3 7-6,1,2025,11,14
66677,Masters Cup,2025-11-14,Masters Cup,0,Hard,Round Robin,3,Zverev A.,Auger-Aliassime F.,Auger-Aliassime F.,...,8,4960,3845,4.50,1.20,4-6 6-7,0,2025,11,14
66678,Masters Cup,2025-11-15,Masters Cup,0,Hard,Semifinals,3,Sinner J.,De Minaur A.,Sinner J.,...,7,10000,3935,1.05,10.00,7-5 6-2,1,2025,11,15
66679,Masters Cup,2025-11-15,Masters Cup,0,Hard,Semifinals,3,Auger-Aliassime F.,Alcaraz C.,Alcaraz C.,...,1,3845,11050,4.50,1.20,2-6 4-6,0,2025,11,15
