## Neural Network Model to Predict Tennis Outcome, Model 3

### Read in csv data

In [1]:
import pandas as pd


In [2]:
df = pd.read_csv('../Resources/matches_cleaned.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)
df.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,player_id,player_seed,player_name,...,player_bpSaved,player_bpFaced,player_rank,player_rank_points,set_1_score,set_2_score,set_3_score,set_4_score,set_5_score,winner
0,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,300,209098.0,6.0,Hamad Medjedovic,...,4.0,4.0,110.0,582.0,3.0,4.0,4.0,3.0,4.0,1
1,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,300,209950.0,1.0,Arthur Fils,...,1.0,4.0,36.0,1158.0,4.0,1.0,2.0,4.0,1.0,0
2,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,299,209950.0,1.0,Arthur Fils,...,2.0,3.0,36.0,1158.0,2.0,4.0,4.0,4.0,,1
3,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,299,209414.0,2.0,Luca Van Assche,...,2.0,3.0,70.0,756.0,4.0,1.0,3.0,3.0,,0
4,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,298,209098.0,6.0,Hamad Medjedovic,...,0.0,0.0,110.0,582.0,4.0,2.0,,,,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76434 entries, 0 to 76433
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tourney_id          76434 non-null  object 
 1   tourney_name        76434 non-null  object 
 2   surface             76434 non-null  object 
 3   draw_size           76434 non-null  int64  
 4   tourney_level       76434 non-null  object 
 5   tourney_date        76434 non-null  object 
 6   match_num           76434 non-null  int64  
 7   player_id           76434 non-null  float64
 8   player_seed         27936 non-null  float64
 9   player_name         76434 non-null  object 
 10  player_hand         76434 non-null  object 
 11  player_ht           75798 non-null  float64
 12  player_ioc          76434 non-null  object 
 13  player_age          76432 non-null  float64
 14  score               76434 non-null  object 
 15  best_of             76434 non-null  int64  
 16  roun

### Select Features and Target(s)

In [4]:
#if we want to restrict features columns later: 

# %% OPTIMIZATION AND EVALUATION %%
# Dropping even more columns: best_of and round


# next time we can try dropping a lot of the game numerical data and seeing what happens to accuracy

#drop unwanted columns
columns_to_drop = ['tourney_id', 'tourney_date', 'tourney_name', 'match_num', 'score', 'player_name', 'player_ioc', 'best_of', 'round']
df.drop(columns=columns_to_drop, inplace=True)

In [5]:
X = df.copy()
X.drop('winner', axis=1, inplace=True)

X.head()

Unnamed: 0,surface,draw_size,tourney_level,player_id,player_seed,player_hand,player_ht,player_age,minutes,player_ace,...,player_SvGms,player_bpSaved,player_bpFaced,player_rank,player_rank_points,set_1_score,set_2_score,set_3_score,set_4_score,set_5_score
0,Hard,8,F,209098.0,6.0,R,185.0,20.3,131.0,19.0,...,15.0,4.0,4.0,110.0,582.0,3.0,4.0,4.0,3.0,4.0
1,Hard,8,F,209950.0,1.0,R,185.0,19.4,131.0,11.0,...,13.0,1.0,4.0,36.0,1158.0,4.0,1.0,2.0,4.0,1.0
2,Hard,8,F,209950.0,1.0,R,185.0,19.4,97.0,9.0,...,12.0,2.0,3.0,36.0,1158.0,2.0,4.0,4.0,4.0,
3,Hard,8,F,209414.0,2.0,R,,19.5,97.0,6.0,...,11.0,2.0,3.0,70.0,756.0,4.0,1.0,3.0,3.0,
4,Hard,8,F,209098.0,6.0,R,185.0,20.3,32.0,5.0,...,4.0,0.0,0.0,110.0,582.0,4.0,2.0,,,


In [6]:
y = df['winner']
y.head()

0    1
1    0
2    1
3    0
4    1
Name: winner, dtype: int64

### Set Up Preprocessing Pipeline

Potential optimization steps: can try to not standardize the different set scores, because those should be really strong indicators. 

Target column is just a 1/0  at a 50/50 split by definition so it shouldn't need standardization - it makes no sense to convert something binary into something continuous. 

In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [8]:
categorical_columns = ['surface', 'tourney_level', 'player_hand' ]  
numerical_columns = X.columns.difference(categorical_columns)

In [9]:
#potential to change/drop set columns later/modify imputing strategy for optimization in the future

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values for numerical columns
            ('scaler', StandardScaler())  # Standardize numerical columns
        ]), numerical_columns),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values for categorical columns
            ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical columns
        ]), categorical_columns)
    ])

### Confirm Separation of Dataframe into Features/Target

In [10]:
from sklearn.model_selection import train_test_split


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Run Preprocessor Fit/Transform 

In [12]:

# Fit and transform the training data, transform the testing data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

### Creating the Model

In [13]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout


In [14]:
#note I had to pip install scikeras 

from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV


In [15]:
# Define the neural network model
#room for optimization here - GridSearchCV should help us find the optimal parameters. Or we can use keras-tuner

def create_model(optimizer='adam', dropout_rate=0.5):
    model = Sequential()
    model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Assuming binary classification
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [16]:
# Use KerasClassifier wrapper
model = KerasClassifier(build_fn=create_model, verbose=0)

# Training and Evaluating the Model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)
score = model.score(X_test, y_test)
print(f"Test accuracy: {score}")

  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Test accuracy: 0.9795250866749525


### Evaluate Model

In [17]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

In [18]:
# Make predictions
predictions = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability estimates for ROC AUC


In [19]:

accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
roc_auc = roc_auc_score(y_test, y_pred_proba)
class_report = classification_report(y_test, predictions)


print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'\nClassification Report:\n{classification_report(y_test, predictions)}')
print(f"ROC AUC Score: {roc_auc}")

Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1-Score: 0.98
Confusion Matrix:
[[7404  165]
 [ 148 7570]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      7569
           1       0.98      0.98      0.98      7718

    accuracy                           0.98     15287
   macro avg       0.98      0.98      0.98     15287
weighted avg       0.98      0.98      0.98     15287

ROC AUC Score: 0.9981660474519793


### Hyperparameter Tuning

In [20]:
# Hyperparameter Tuning with GridSearchCV
param_grid = {
    'model__optimizer': ['adam', 'rmsprop'],
    'model__dropout_rate': [0.5],
    'epochs': [100],
    'batch_size': [32]
}

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train, y_train)

# Best model and parameters
best_model = grid_result.best_estimator_
best_params = grid_result.best_params_
print(f"Best parameters: {best_params}")

# Evaluate the best model on the test set
best_score = best_model.score(X_test, y_test)
print(f"Best model test accuracy: {best_score}")

  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Best parameters: {'batch_size': 32, 'epochs': 100, 'model__dropout_rate': 0.5, 'model__optimizer': 'adam'}
Best model test accuracy: 0.9803100673775103


### Best Model Evaluation

In [None]:
#an improvement over model 1, almost 1 percent