# Djokovic ML model

Here we want to analyze Djokovic's data and use ML to predict some stats. We choose Djokovic because he has the highest number of played matches (= more data) 

We start by importing some general modules. More modules will be imported throughout the project, when needed.

In [None]:
import sqlite3
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

Let us query all Djokovic's matches from the database and let us list all the features with the corresponding type. Let us then visualize the dataset.

In [None]:
# Open the project database and pull every row of the `djokovic` table,
# ordered by the `win` column.
conn = sqlite3.connect('../data.db')
query = "SELECT * FROM djokovic ORDER BY win"
df = pd.read_sql_query(query, conn)

# List each feature together with its dtype.
print(df.dtypes)

In [None]:
from IPython.display import display, HTML

# Generate HTML table
def display_scrollable_table(df):
    """Render ``df`` as an HTML table inside a scrollable container.

    The table markup comes from ``df.to_html(index=False)`` and is wrapped in
    a ``div`` with a maximum height of 500px, vertical scrolling and a sticky
    header row. Every CSS rule is scoped to the ``.dataframe-container``
    wrapper so the styling does not leak into other tables rendered in the
    same notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to display.
    """
    html = f"""
    <style>
        .dataframe-container {{
            max-height: 500px;
            overflow-y: scroll;
            border: 1px solid #ccc;
        }}
        .dataframe-container table {{
            width: 100%;
            border-collapse: collapse;
        }}
        .dataframe-container th, .dataframe-container td {{
            padding: 8px;
            text-align: left;
            border-bottom: 1px solid #ddd;
        }}
        .dataframe-container thead th {{
            position: sticky;
            top: 0;
            background: white;
            z-index: 2;
        }}
    </style>
    <div class="dataframe-container">
        {df.to_html(index=False)}
    </div>
    """
    display(HTML(html))

# Display table
display_scrollable_table(df)

Before diving into the investigation of the dataset, let us do some basic operations to better represent the data we need.

Split date into month and year

In [None]:
# Separate year from month: characters 0-3 of `tourney_date` hold the year,
# characters 4-5 hold the month.
date_strings = df["tourney_date"]
df["yyyy"] = [int(stamp[:4]) for stamp in date_strings]
df["mm"] = [int(stamp[4:6]) for stamp in date_strings]

# The raw date string is no longer needed once it has been split.
df = df.drop(columns="tourney_date")

Remove rows with NaN output

In [None]:
# Keep only the matches whose outcome is known.
df = df.dropna(subset=['win'])

Stats normalization from absolute counts to relative percentage

In [None]:
# Serve statistics become rates: each count is divided by the number of serve
# points of the same player (Djokovic first, then the opponent).
rate_groups = [
    (['ace', 'df', 'firstWon', 'firstIn'], 'serve_points'),
    (['opponent_ace', 'opponent_df', 'opponent_firstWon', 'opponent_firstIn'], 'opponent_serve_points'),
]
for stat_names, total_name in rate_groups:
    for stat in stat_names:
        df[stat + "_rate"] = df[stat] / df[total_name]

# Match-length statistics are divided by the `best_of` column.
per_set_stats = ['minutes', 'tot_games', 'serve_games', 'serve_points', 'opponent_serve_games', 'opponent_serve_points']
for stat in per_set_stats:
    df[stat + "_per_set"] = df[stat] / df["best_of"]

Encode victory (yes/no) into number (1/0)

In [None]:
# Encode the match outcome as a float: 1.0 for a victory ("yes"), 0.0 for
# anything else. The positive class must be the win, because later cells
# treat `win == 1` as a victory.
encoded_y = [1. if outcome == "yes" else 0. for outcome in df['win']]

# Reuse df's index so that y stays row-aligned with df (rows were filtered
# above, so df's index is not a plain 0..n-1 range).
y = pd.DataFrame({"win": encoded_y}, index=df.index)  # dependent variable
df["win"] = encoded_y

## Data exploration

Long-tailed distributions are typically not very nice to work with. Hence, we measure the kurtosis of each numerical feature and, if it is larger than 3, we flag the feature as long-tailed and plot its histogram.

In [None]:
from scipy.stats import kurtosis
from sklearn.impute import SimpleImputer

# Numerical features to inspect, selected once and reused by the loop below.
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

# Names of the features whose kurtosis exceeds the threshold.
longtail_features = []

for feature in numerical_columns:
    # Replace missing values with the column mean; kurtosis() propagates NaN
    # by default, so a single gap would otherwise make the result NaN.
    df[feature] = df[feature].fillna(value=df[feature].mean())

    # NOTE: scipy's kurtosis() uses Fisher's definition by default (a normal
    # distribution scores 0), so the threshold of 3 applies to excess kurtosis.
    kurt = kurtosis(df[feature])
    if kurt > 3:
        print(feature, kurt)
        longtail_features.append(feature)
        plt.hist(df[feature])
        plt.title(feature)
        plt.show()

Correlation matrix

In [None]:
print(df.columns)

# Pairwise correlations between the float64 columns only.
float_features = df.select_dtypes(include="float64")
corr_matrix = float_features.corr()

# Last expression of the cell: shown as a colour-coded table.
corr_matrix.style.background_gradient(cmap='coolwarm')

In [None]:
# Rank the features by their correlation with the target. The target's own
# entry (always 1.0) is removed first, otherwise it would take one of the
# four slots without carrying any information.
win_correlations = corr_matrix["win"].drop("win")
top4 = win_correlations.sort_values(ascending=False).head(4)
print(top4.index)

In [None]:
from sklearn.linear_model import LogisticRegression

# Split the matches by the encoded outcome so each class gets its own marker.
win_mask = df["win"] == 1
lose_mask = df["win"] == 0
win_data = df[win_mask]
lose_data = df[lose_mask]

f1, f2 = "firstIn_rate", "firstWon_rate"

# Data visualization
plt.plot(win_data[f1], win_data[f2], "bo", label="win")
plt.plot(lose_data[f1], lose_data[f2], "rx", label="lose")

# Hand-picked straight line through (0.5, 0.35) and (0.8, 0.66), drawn as an
# illustrative decision boundary; it is not fitted by any model.
x_lo, y_lo = 0.5, 0.35
x_hi, y_hi = 0.8, 0.66
m = (y_lo - y_hi) / (x_lo - x_hi)
q = y_lo - m * x_lo
xvals = np.linspace(x_lo, x_hi, 100)
yvals = m * xvals + q
plt.plot(xvals, yvals, "g-", label="decision boundary")

plt.xlabel(f1)
plt.ylabel(f2)
plt.legend()
plt.show()

## Data engineering

The first thing we do is to encode the tournament round into numbers, to create a hierarchy.

In [None]:

print(df["round"].value_counts())

# Rounds that are not part of the hierarchy below are removed in one pass.
excluded_rounds = ["Q1", "Q2", "Q3", "BR"]
df = df[~df["round"].isin(excluded_rounds)].copy()
print(df["round"].value_counts())

# Ordinal encoding of the round, from the earliest (R128) to the final (F).
# Each round appears exactly once: a repeated key in a dict literal silently
# overwrites the earlier entry and leaves a hole in the scale.
round_dict = {"R128": 0, "R64": 1, "R32": 2, "R16": 3, "RR": 4, "QF": 5, "SF": 6, "F": 7}

# A round missing from round_dict raises KeyError instead of being silently
# encoded as NaN.
df["round_n"] = [round_dict[round_key] for round_key in df["round"]]
df = df.drop(columns="round")


Divide ranks into bins

In [None]:
# Bin edges for the rankings; pd.cut labels the resulting intervals
# 0 .. len(edges) - 2, and the labels are then stored as strings.
edges = [0, 1, 2, 3, 6, 10, 20, 50, 100, 1000]
bin_labels = range(len(edges) - 1)
for rank_column in ('rank', 'opponent_rank'):
    binned = pd.cut(df[rank_column], bins=edges, labels=bin_labels)
    df[rank_column] = binned.astype(str)

We have to encode the date in some way. Here I chose to store only the year and the month of the match, and I encoded the month as a cyclical feature (December close to January). Other choices are possible: for example, one could consider one unique monotonic feature such as date = year*12 + month

In [None]:
from feature_engine.creation import CyclicalFeatures
df_date = pd.DataFrame({"year": df["yyyy"], "month": df["mm"]})

# Convert only the month to a cyclical (sin/cos) feature. With variables=None
# the transformer would pick up every numerical column, including the year,
# which is not periodic. The period is fixed to 12 explicitly so it does not
# depend on whether a December match happens to be present in the data.
cyclical = CyclicalFeatures(variables=["month"], max_values={"month": 12}, drop_original=True)
df_date = cyclical.fit_transform(df_date)

# Merge into dataframe: the raw yyyy/mm columns are replaced by the year and
# the encoded month. df_date shares df's index, so concat aligns row by row.
df = df.drop(columns=["yyyy", "mm"])
df = pd.concat([df, df_date], axis=1)

Now we want to create data pipelines. I decided to make two different pipelines: A) numerical data, B) categorical data  
- numerical data -> replace NaN with mean value -> scale to zero mean unit variance distribution  
- categorical data -> replace NaN with category "missing" -> onehot encoding

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

# Numerical branch: mean imputation followed by standardisation.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Categorical branch: missing values become the literal category "missing",
# then the column is one-hot encoded (categories unseen at fit time are ignored).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
from sklearn.compose import ColumnTransformer


# Partition the columns by dtype. The target `win` is numerical, so it is
# removed from the numerical column list (not from the categorical one).
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
numerical_columns = df.select_dtypes(include=['int', 'float']).columns.drop('win')

# Route each group of columns to its transformer; any column that belongs to
# neither group is passed through untouched.
column_routes = [
    ('num', numerical_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='passthrough')

# Pipeline made of the preprocessing step only.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Target vector and feature matrix.
y = df["win"]
X = df.drop(columns='win')

# Sanity checks on the target: class balance and number of missing values.
print(df["win"].value_counts())
print(df["win"].isna().sum())

# NOTE(review): the preprocessor is fitted on the full dataset here, before
# the train/test split performed in a later cell.
X_preprocessed = pipeline.fit_transform(X)
print(y.value_counts())

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers, each with its own fixed random seed.
svm_model = SVC(random_state=29)
logistic_model = LogisticRegression(random_state=4)
tree_model = DecisionTreeClassifier(random_state=99)
forest_model = RandomForestClassifier(random_state=42)

models = {
    'SVM': svm_model,
    'Logistic': logistic_model,
    'Tree': tree_model,
    'Forest': forest_model,
}

# Define the hyperparameter grids for each model
svm_grid = {'kernel': ['linear', 'poly', 'sigmoid']}
logistic_grid = {'solver': ['liblinear', 'newton-cholesky', 'saga']}
tree_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [30, 50, None],
}
forest_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [70, 50, None],
}

# Keys match those of `models` so the two dicts can be looked up together.
param_grids = {
    'SVM': svm_grid,
    'Logistic': logistic_grid,
    'Tree': tree_grid,
    'Forest': forest_grid,
}

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the preprocessed matches as a test set (fixed seed).
split_parts = train_test_split(
    X_preprocessed,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Train and tune the models
from sklearn.model_selection import GridSearchCV

# One fitted grid search per model, tuned on accuracy.
grids = {}
train_labels = y_train.values.ravel()
for model_name, model in models.items():
    print(f'Training and tuning {model_name}...')
    search = grids[model_name] = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        scoring='accuracy',
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, train_labels)

    print(f'Best parameters for {model_name}: {search.best_params_}')
    print(f'Best accuracy for {model_name}: {search.best_score_}\n')

# Share of class 1 in the test set, printed as a reference value.
counts = y_test.value_counts()
print(counts[1] / (counts[0] + counts[1]))

In [None]:
# Score every tuned search object on the held-out test set.
test_labels = y_test.values.ravel()
for model_name in models:
    print(f'Testing {model_name}...')
    print(grids[model_name].score(X_test, test_labels))

In [None]:
# Repeat the tuning with precision as the selection metric. This replaces the
# accuracy-tuned searches previously stored in `grids`.
grids = {}
train_labels = y_train.values.ravel()
for model_name, model in models.items():
    print(f'Training and tuning {model_name}...')
    search = grids[model_name] = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        scoring='precision',
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, train_labels)
    best_params = search.best_params_
    best_score = search.best_score_

    print(f'Best parameters for {model_name}: {best_params}')
    # best_score_ is the cross-validated value of the `scoring` metric, so it
    # is reported as precision here.
    print(f'Best precision for {model_name}: {best_score}\n')

# Share of class 1 in the test set, printed as a reference value.
counts = y_test.value_counts()
print(counts[1] / (counts[0] + counts[1]))

In [None]:
# Score each refitted best estimator on the held-out test set.
# NOTE(review): score() is called on best_estimator_ rather than on the search
# object; a classifier's own score() reports mean accuracy, not the precision
# used for tuning - confirm this is intended.
test_labels = y_test.values.ravel()
for model_name in models:
    print(f'Testing {model_name}...')
    best_model = grids[model_name].best_estimator_
    print(best_model.score(X_test, test_labels))

In [None]:
# The forest was trained on the preprocessed matrix (scaled numerical columns
# plus one-hot encoded categories), so its importances must be labelled with
# the preprocessor's output names rather than with the raw X.columns.
feature_names = preprocessor.get_feature_names_out()
importances = pd.Series(
    grids["Forest"].best_estimator_.feature_importances_,
    index=feature_names,
)
print(importances.sort_values(ascending=False))

In [None]:
# Bundle the preprocessor with the best random forest currently stored in
# `grids`, so raw feature rows can be fed to a single object.
best_forest = grids['Forest'].best_estimator_
final_steps = [
    ('preprocessor', preprocessor),
    ('model', best_forest),
]
pipeline = Pipeline(steps=final_steps)

In [None]:
import pickle

# Serialize the complete pipeline (preprocessing + model) to model.pkl in the
# current working directory.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)