# Dependencias

- `pandas` para tratamiento de datos
- `seaborn` para visualización -> https://seaborn.pydata.org/api.html
- `networkx` para temas de grafos

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
from pathlib import Path

# Constantes

In [None]:
# Relative location of the directory that holds the project's data files.
PATH_PROJECT_DATA = Path('..') / 'data'

# Tournament identifier -> matplotlib colour name used when plotting it.
# Insertion order is the order in which the tournaments are iterated below.
GRAND_SLAMS_AND_COLORS = {
    'wimbledon': 'green',
    'roland-garros': 'orange',
    'us-open': 'blue',
    'australian-open': 'black',
}

# Leemos los datos y hacemos la primera visualización que nos interesa

In [None]:
# Load the cleaned Grand Slam matches table from the project data directory.
df_matches_grand_slam_cleaned = pd.read_csv(
    PATH_PROJECT_DATA.joinpath('matches_grand_slam_cleaned.csv')
)

# Visualización de correlaciones

In [None]:
# Columns fed to the pairwise scatter-plot matrix.
# NOTE(review): 'tournament' is selected here but is not passed as `hue`;
# confirm whether colouring the plots by tournament was intended.
pairplot_columns = [
    'tournament',
    'games_won_1',
    'games_won_2',
    'first_serve_made_1',
    'first_serve_made_2',
    'first_serve_return_points_made_1',
    'first_serve_return_points_made_2',
    'player_victory_1',
]

sns.pairplot(df_matches_grand_slam_cleaned[pairplot_columns])

## En este caso vemos cómo se relacionan los aces y los juegos ganados

In [None]:
# Joint plot with a regression fit of player 1's aces against player 1's
# games won.
# x and y are passed by keyword: seaborn >= 0.12 makes every jointplot
# argument other than `data` keyword-only, so the positional form
# sns.jointplot('aces_1', 'games_won_1', ...) raises TypeError there.
# The keyword form also works on older seaborn releases.
g = sns.jointplot(data=df_matches_grand_slam_cleaned,
                  x='aces_1',
                  y='games_won_1',
                  kind="reg")

## En la visualización se ve un outlier, por lo que estudiamos el caso y vemos que, efectivamente, es un partido de Wimbledon en 2010 entre Isner y Mahut.

In [None]:
# Rows where player 1 served more than 100 aces (the outlier seen in the plot).
has_extreme_aces = df_matches_grand_slam_cleaned['aces_1'] > 100
outlier_columns = ['player_1', 'player_2', 'tournament', 'year']

# Show who played, in which tournament and in which year.
df_matches_grand_slam_cleaned.loc[has_extreme_aces, outlier_columns]

## Comprobamos que los partidos están balanceados entre torneos

In [None]:
# Number of rows (matches) recorded per tournament.
df_matches_grand_slam_cleaned['tournament'].value_counts()

# Comprobamos si la relación entre aces y juegos ganados es diferente para los distintos torneos

In [None]:
# Draw one regression joint plot (aces vs. games won, player 1) per
# tournament, each in the colour assigned to that tournament.
for grand_slam, color in GRAND_SLAMS_AND_COLORS.items():
    # Keep only the matches played at the current tournament.
    df_matches_grand_slam_cleaned_filtered = (
        df_matches_grand_slam_cleaned[
            df_matches_grand_slam_cleaned['tournament'] == grand_slam
        ]
    )

    # x and y are passed by keyword: seaborn >= 0.12 makes every jointplot
    # argument other than `data` keyword-only, so passing them positionally
    # raises TypeError there. The keyword form also works on older releases.
    g = sns.jointplot(
        data=df_matches_grand_slam_cleaned_filtered,
        x='aces_1',
        y='games_won_1',
        color=color,
        # Identical axis limits on every figure so the plots are comparable.
        xlim=(0, 60),
        ylim=(0, 50),
        kind="reg")

## ¡Vemos que la pendiente de Roland Garros es mayor! Pero claro, allí se hacen muchos menos aces; aquí debajo lo comprobamos.

In [None]:
# Print, for each tournament, the combined number of aces of both players.
for grand_slam in GRAND_SLAMS_AND_COLORS:
    # Row mask for the current tournament, shared by both column sums.
    is_current_slam = df_matches_grand_slam_cleaned['tournament'] == grand_slam

    aces_1 = df_matches_grand_slam_cleaned.loc[is_current_slam, 'aces_1'].sum()
    aces_2 = df_matches_grand_slam_cleaned.loc[is_current_slam, 'aces_2'].sum()

    print(grand_slam, aces_1 + aces_2)