# Dependencias

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from scipy.cluster.hierarchy import dendrogram, linkage
from pathlib import Path

# Constantes

In [None]:
# Project directories, expressed relative to the notebook's working directory.
# PATH_PROJECT_DATA is where the input CSV files are read from;
# PATH_PROJECT_MODELS is where the pickled artefacts are written.
PATH_PROJECT_DATA = Path('..', 'data')
PATH_PROJECT_MODELS = Path('..', 'objects')

# Lectura de los datos

In [None]:
# Load the two input tables from the data directory:
#   - players_cleaned.csv             -> df_players_cleaned
#   - matches_grand_slam_cleaned.csv  -> df_matches
df_players_cleaned, df_matches = (
    pd.read_csv(PATH_PROJECT_DATA / csv_name)
    for csv_name in ('players_cleaned.csv', 'matches_grand_slam_cleaned.csv')
)

# Transformamos el target a numérico

In [None]:
# Encode the per-player match outcome as an integer indicator column.
# drop_first=True removes the indicator of the first category, so a
# two-valued column collapses to a single 0/1 indicator.
# dtype=int pins the result type: without it get_dummies returns bool on
# pandas >= 2.0 (uint8 before), and this column is later accumulated with
# cumsum and subtracted, which needs a genuinely numeric target.
df_players_cleaned['player_victory'] = (
    pd.get_dummies(df_players_cleaned['player_victory'],
                   drop_first=True,
                   dtype=int)
)

# Creamos la agrupación de los jugadores con sus estadísticas acumuladas

DataFrame a nivel de jugador y partido: cada fila acumula las estadísticas del jugador en los partidos previos a ese partido (sin incluir el propio partido)

In [None]:
# Build one row per (player, match) holding the player's statistics
# accumulated over the matches *before* that one.
match_key_columns = ['player_name', 'year', 'start_date', 'round_num']
chronological_columns = ['year', 'start_date', 'round_num']
non_stat_columns = ['tournament', 'seed', 'retirement']

# Chronologically ordered rows, keeping the first row for each
# (player, year, start_date, round_num) combination.
players_one_row_per_match = (
    df_players_cleaned
    .drop(columns=non_stat_columns)
    .sort_values(chronological_columns)
    .drop_duplicates(subset=match_key_columns, keep='first')
)

# Running total of every statistic per player, in chronological order.
# The running total includes the current match at this point.
stats_running_total = (
    players_one_row_per_match
    .drop(columns=chronological_columns)
    .groupby('player_name')
    .transform('cumsum')
)

# The current match's own statistics, taken from the original frame.
# The subtraction below aligns on the row index, which removes the current
# match from the running total and leaves only the prior matches.
stats_current_match = df_players_cleaned.drop(
    columns=match_key_columns + non_stat_columns
)

df_players_cleaned_grouped = pd.concat(
    [
        players_one_row_per_match[match_key_columns],
        stats_running_total - stats_current_match,
    ],
    axis=1,
)

# Tratamos un poco los datos de partidos

In [None]:
# Keep only the match-level columns needed downstream and rename the
# outcome column of player 1 to 'player_victory_target' (the model target).
df_matches_subset = (
    df_matches[['court_surface', 'year',
                'tournament', 'start_date',
                'round', 'round_num',
                'player_1', 'player_2',
                'player_victory_1']]
    .rename(columns={'player_victory_1': 'player_victory_target'})
)

# Encode the target as an integer 0/1 indicator. drop_first=True drops the
# indicator of the first category; dtype=int pins the dtype, because
# get_dummies otherwise returns bool on pandas >= 2.0 (uint8 before).
df_matches_subset['player_victory_target'] = (
    pd.get_dummies(df_matches_subset['player_victory_target'],
                   drop_first=True,
                   dtype=int)
)

In [None]:
def _attach_player_stats(matches, player_stats, player_column):
    """Inner-join accumulated player statistics onto the matches frame.

    Every column of ``player_stats`` is suffixed with ``_<player_column>``
    (e.g. ``_player_1``) on a renamed copy, so the caller's frame is never
    modified. The join matches the player named in ``player_column`` plus
    year, start_date and round_num; the suffixed key columns are dropped
    from the result because they duplicate the match-side keys.

    Parameters
    ----------
    matches : pandas.DataFrame
        Match-level frame containing ``player_column``, 'year',
        'start_date' and 'round_num'.
    player_stats : pandas.DataFrame
        Player-level frame containing 'player_name', 'year', 'start_date',
        'round_num' and the statistic columns.
    player_column : str
        Name of the column in ``matches`` that holds the player name; also
        used to build the suffix added to the statistic columns.

    Returns
    -------
    pandas.DataFrame
        ``matches`` with the suffixed statistic columns appended.
    """
    suffix = f'_{player_column}'
    stat_key_columns = [f'{key}{suffix}'
                        for key in ('player_name', 'year',
                                    'start_date', 'round_num')]
    return (
        matches
        .merge(player_stats.add_suffix(suffix),
               left_on=[player_column, 'year', 'start_date', 'round_num'],
               right_on=stat_key_columns,
               how='inner')
        .drop(columns=stat_key_columns)
    )


# Unsuffixed column names of the player statistics frame. The frame itself
# is no longer renamed in place, so a failed or interrupted run cannot leave
# it with suffixed columns.
original_columns = df_players_cleaned_grouped.columns

# Statistics of player 1 first, then statistics of player 2 on top of that.
df_matches_subset_joined_player_1 = _attach_player_stats(
    df_matches_subset, df_players_cleaned_grouped, 'player_1'
)

df_matches_subset_joined = _attach_player_stats(
    df_matches_subset_joined_player_1, df_players_cleaned_grouped, 'player_2'
)

In [None]:
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.model_selection import train_test_split


# Feature matrix: every joined column except the match identifiers,
# the categorical match descriptors and the target itself.
X = df_matches_subset_joined.drop(columns=['player_1',
                                           'player_2',
                                           'start_date',
                                           'year',
                                           'round',
                                           'court_surface',
                                           'tournament',
                                           'player_victory_target'])

y = df_matches_subset_joined['player_victory_target']

# 75% / 25% split with a fixed seed so the split is reproducible.
train_features, test_features, train_labels, test_labels = (
    train_test_split(X, y, test_size = 0.25, random_state = 42)
)

# The scikit-learn wrapper takes the number of boosting rounds as
# `n_estimators`. `num_round` is not a constructor parameter, so the intended
# 200 rounds were previously ignored in favour of the default.
# `feature_names` is not a constructor parameter either; feature names are
# taken from the DataFrame columns passed to fit().
params_xgb = dict(n_estimators=200,
                  max_depth=4,
                  random_state=0,
                  reg_lambda=15,
                  min_child_weight=10,
                  objective='binary:logistic',
                  colsample_bytree=.8)

xgb = XGBClassifier(**params_xgb)

xgb.fit(train_features, train_labels)

# Show the 10 most important features of the fitted model.
plot_importance(xgb, max_num_features=10)

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score


# Predicted probability of the positive class (column 1 of predict_proba)
# on the train and test partitions.
xgb_train_preds_proba = xgb.predict_proba(train_features)[:, 1]
xgb_test_preds_proba = xgb.predict_proba(test_features)[:, 1]

# ROC AUC on both partitions, rounded to two decimals.
auc_report = [
    'AUC:',
    f'TRAIN: {roc_auc_score(train_labels, xgb_train_preds_proba):.2f}',
    f'TEST: {roc_auc_score(test_labels, xgb_test_preds_proba):.2f}',
]
print('\n'.join(auc_report))

# Hard class predictions, used for the accuracy figures.
xgb_train_preds = xgb.predict(train_features)
xgb_test_preds = xgb.predict(test_features)

# Accuracy expressed as a percentage of matches predicted correctly.
train_accuracy_pct = accuracy_score(train_labels, xgb_train_preds) * 100
test_accuracy_pct = accuracy_score(test_labels, xgb_test_preds) * 100
accuracy_report = [
    "PORCENTAJE DE PARTIDOS QUE ACIERTA:",
    f'TRAIN: {train_accuracy_pct:.2f}%',
    f'TEST: {test_accuracy_pct:.2f}%',
]
print('\n'.join(accuracy_report))

# Reducimos la dimensión del dataset creando las variables diferencia (para cada estadística, valor del jugador 1 menos valor del jugador 2)

In [None]:
# Statistic columns of each player: every column ending in '_1' / '_2'
# except the player-name columns themselves.
stats_player_1, stats_player_2 = (
    [name
     for name in df_matches_subset_joined.columns
     if name.endswith(suffix) and name != player_column]
    for suffix, player_column in (('_1', 'player_1'), ('_2', 'player_2'))
)

stats_columns = stats_player_1 + stats_player_2

# Pairs are formed by position, so both lists are expected to list the
# statistics in the same order.
columns_to_subtract = zip(stats_player_1, stats_player_2)

# One difference feature per statistic (player 1 minus player 2), named
# after the statistic with the '_player_1' suffix removed.
difference_features = {
    stat_1.replace('_player_1', ''): (df_matches_subset_joined[stat_1]
                                      - df_matches_subset_joined[stat_2])
    for stat_1, stat_2 in columns_to_subtract
}

# Append the differences and discard the per-player statistic columns.
df_matches_subset_joined_subtract = (
    df_matches_subset_joined
    .assign(**difference_features)
    .drop(columns=stats_columns)
)

In [None]:
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.model_selection import train_test_split


# Feature matrix built from the difference features: every column except
# the match identifiers, the categorical match descriptors and the target.
X = df_matches_subset_joined_subtract.drop(columns=['player_1',
                                                    'player_2',
                                                    'start_date',
                                                    'year',
                                                    'round',
                                                    'court_surface',
                                                    'tournament',
                                                    'player_victory_target'])

y = df_matches_subset_joined_subtract['player_victory_target']

# 75% / 25% split with a fixed seed so the split is reproducible.
train_features, test_features, train_labels, test_labels = (
    train_test_split(X, y, test_size = 0.25, random_state = 42)
)

# The scikit-learn wrapper takes the number of boosting rounds as
# `n_estimators`. `num_round` is not a constructor parameter, so the intended
# 200 rounds were previously ignored in favour of the default.
# `feature_names` is not a constructor parameter either; feature names are
# taken from the DataFrame columns passed to fit().
params_xgb = dict(n_estimators=200,
                  max_depth=4,
                  random_state=0,
                  reg_lambda=15,
                  min_child_weight=10,
                  objective='binary:logistic',
                  colsample_bytree=.8)

xgb = XGBClassifier(**params_xgb)

xgb.fit(train_features, train_labels)

# Show the 10 most important features of the fitted model.
plot_importance(xgb, max_num_features=10)

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score


# Predicted probability of the positive class (column 1 of predict_proba)
# on the train and test partitions.
xgb_train_preds_proba = xgb.predict_proba(train_features)[:, 1]
xgb_test_preds_proba = xgb.predict_proba(test_features)[:, 1]

# ROC AUC on both partitions, rounded to two decimals.
auc_report = [
    'AUC:',
    f'TRAIN: {roc_auc_score(train_labels, xgb_train_preds_proba):.2f}',
    f'TEST: {roc_auc_score(test_labels, xgb_test_preds_proba):.2f}',
]
print('\n'.join(auc_report))

# Hard class predictions, used for the accuracy figures.
xgb_train_preds = xgb.predict(train_features)
xgb_test_preds = xgb.predict(test_features)

# Accuracy expressed as a percentage of matches predicted correctly.
train_accuracy_pct = accuracy_score(train_labels, xgb_train_preds) * 100
test_accuracy_pct = accuracy_score(test_labels, xgb_test_preds) * 100
accuracy_report = [
    "PORCENTAJE DE PARTIDOS QUE ACIERTA:",
    f'TRAIN: {train_accuracy_pct:.2f}%',
    f'TEST: {test_accuracy_pct:.2f}%',
]
print('\n'.join(accuracy_report))

# Pickle para hacer el deploy en la app de Flask

In [None]:
import pickle


# Serialise the fitted classifier to <models dir>/model.pkl.
# The models directory must already exist.
model_pickle_path = PATH_PROJECT_MODELS / 'model.pkl'
with model_pickle_path.open('wb') as model_file:
    pickle.dump(xgb, model_file)

# Guardamos los datos para las predicciones en la interfaz

Últimas estadísticas de cada jugador

In [None]:
# Columns exported for the prediction interface: the player identifier
# followed by the statistic columns, in the order given here.
features_model = [
    # identifier
    'player_name',
    # sets / games
    'sets_won',
    'games_won',
    'tiebreaks_won',
    # serve
    'serve_rating',
    'aces',
    'double_faults',
    'first_serve_made',
    'first_serve_attempted',
    'first_serve_points_made',
    'first_serve_points_attempted',
    'second_serve_points_made',
    'second_serve_points_attempted',
    'break_points_saved',
    'break_points_against',
    'service_games_won',
    # return
    'return_rating',
    'first_serve_return_points_made',
    'first_serve_return_points_attempted',
    'second_serve_return_points_made',
    'second_serve_return_points_attempted',
    'break_points_made',
    'break_points_attempted',
    'return_games_played',
    # point totals
    'service_points_won',
    'service_points_attempted',
    'return_points_won',
    'return_points_attempted',
    'total_points_won',
    # outcome
    'player_victory',
]

# Per-player snapshot for the interface: for every player, the maximum of
# each accumulated statistic across that player's rows.
# NOTE(review): the maximum equals the most recent value only if the
# accumulated statistics never decrease over time — confirm.
player_stats_snapshot = (
    df_players_cleaned_grouped
    .loc[:, features_model]
    .groupby('player_name')
    .max()
    .reset_index()
)
player_stats_snapshot.to_pickle(PATH_PROJECT_MODELS / 'data.pkl')

# Interpretabilidad de las predicciones

[Shap repo](https://github.com/slundberg/shap)

![](https://raw.githubusercontent.com/slundberg/shap/master/docs/artwork/shap_header.png)

In [None]:
import xgboost
import shap

# Load the JS visualization code into the notebook.
shap.initjs()

# Explain the model's predictions using SHAP
# (same syntax works for LightGBM, CatBoost, scikit-learn and spark models).
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X)

# Rows matching the 2008 Wimbledon match with round_num == 7.
is_final_nadal_federer = (
    (df_matches_subset_joined_subtract['tournament'] == 'wimbledon')
    & (df_matches_subset_joined_subtract['round_num'] == 7)
    & (df_matches_subset_joined_subtract['year'] == 2008)
)

# Row *positions* of that match. `shap_values[...]` and `X.iloc[...]` are
# positional, so using index labels only works while the frame carries a
# default 0..n-1 index. X keeps the row order of the frame the mask was
# built on, so these positions are valid for both.
index_final_nadal_federer = is_final_nadal_federer.to_numpy().nonzero()[0]

# Visualize the explanation of that prediction
# (use matplotlib=True to avoid Javascript).
shap.force_plot(explainer.expected_value,
                shap_values[index_final_nadal_federer, :],
                X.iloc[index_final_nadal_federer, :])

## Los shap values (sumados al valor base) se transforman a probabilidades aplicándoles una sigmoide

In [None]:
import math


def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of a scalar.

    Parameters
    ----------
    x : float
        Input value (here: a log-odds margin).

    Returns
    -------
    float
        A value in [0, 1].

    Notes
    -----
    The two branches are algebraically identical. Each one only ever
    exponentiates a non-positive number, so large-magnitude inputs saturate
    to 0.0 or 1.0 instead of raising OverflowError from math.exp.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)


# Compare the sigmoid of a margin of -0.61 with the positive-class
# probability the model predicts for the selected match.
# NOTE(review): -0.61 looks hand-copied from the force plot output; it will
# go stale if the model is retrained — confirm.
probability_from_margin = sigmoid(-0.61)
probability_from_model = (
    xgb.predict_proba(X.iloc[index_final_nadal_federer, :])[0, 1]
)
probability_from_margin, probability_from_model

# Top 3 variables que más influyen en una predicción específica

In [None]:
# SHAP values of the first selected row, one value per feature column.
selected_match_shap_values = explainer.shap_values(
    X.iloc[index_final_nadal_federer]
)[0]

# Map each feature name to its SHAP value and rank from largest to smallest.
# NOTE(review): the ranking uses the signed value, so strongly negative
# contributions never make the top 3 — confirm this is intended.
shap_value_by_feature = dict(zip(X.columns, selected_match_shap_values))
ranked_features = sorted(shap_value_by_feature.items(),
                         key=lambda item: item[1],
                         reverse=True)

# Names of the three features with the largest SHAP values.
top_features = [feature_name for feature_name, _ in ranked_features[:3]]

top_features

## pickle del shap explainer

In [None]:
# Serialise the SHAP explainer to <models dir>/shap_explainer.pkl.
# The models directory must already exist.
explainer_pickle_path = PATH_PROJECT_MODELS / 'shap_explainer.pkl'
with explainer_pickle_path.open('wb') as explainer_file:
    pickle.dump(explainer, explainer_file)