# Data Wrangling

### Consideraciones

#### Borrar columnas - variables

    Unnamed: 33_level_0_Matches
    Unnamed: 24_level_0_Matches
    Unnamed: 0_level_0_Player   - Squad Total
                                - Opponent Total
                                
#### Seleccionar:
        
        Season: 2022-2023
        
        League: 'Bundesliga', 'EPL', 'LaLiga', 'Ligue1', 'SeriaA'
    

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

ruta_archivo = '../DataSets/player_stats/standard_stats.csv'

df = pd.read_csv(ruta_archivo, delimiter=',')

# Seleccionar la temporada 2022-2023
df = df.loc[df['Season'] == '2022-2023']

# Seleccionar las ligas
df = df.loc[df['League'].isin(['Bundesliga', 'EPL', 'LaLiga', 'Ligue1', 'SeriaA'])]

# Eliminar las columnas
df = df.drop(columns=['Unnamed: 33_level_0_Matches', 'Unnamed: 24_level_0_Matches', 'Unnamed: 4_level_0_MP', 'Playing Time_MP'])

# Eliminar las filas
df = df.loc[df['Unnamed: 0_level_0_Player'] != "Squad Total"]
df = df.loc[df['Unnamed: 0_level_0_Player'] != "Opponent Total"]

# Obtener información sobre el conjunto de datos
df.info()

# Calcular estadísticas descriptivas
df.describe()

# Ver las primeras filas del dataset
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3361 entries, 91649 to 95203
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Season                     3361 non-null   object 
 1   League                     3361 non-null   object 
 2   Team                       3361 non-null   object 
 3   Unnamed: 0_level_0_Player  3361 non-null   object 
 4   Unnamed: 1_level_0_Nation  3358 non-null   object 
 5   Unnamed: 2_level_0_Pos     3361 non-null   object 
 6   Unnamed: 3_level_0_Age     3359 non-null   object 
 7   Playing Time_Starts        3361 non-null   float64
 8   Playing Time_Min           2758 non-null   float64
 9   Playing Time_90s           2761 non-null   float64
 10  Performance_Gls            2761 non-null   float64
 11  Performance_Ast            2761 non-null   float64
 12  Performance_G+A            2761 non-null   float64
 13  Performance_G-PK           2761 non-null   

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,Season,League,Team,Unnamed: 0_level_0_Player,Unnamed: 1_level_0_Nation,Unnamed: 2_level_0_Pos,Unnamed: 3_level_0_Age,Playing Time_Starts,Playing Time_Min,Playing Time_90s,...,Per 90 Minutes_G+A-PK,Expected_xG,Expected_npxG,Expected_xAG,Expected_npxG+xAG,Per 90 Minutes_xG,Per 90 Minutes_xAG,Per 90 Minutes_xG+xAG,Per 90 Minutes_npxG,Per 90 Minutes_npxG+xAG
91649,2022-2023,EPL,Arsenal,Gabriel Dos Santos,br BRA,DF,25-084,27.0,2430.0,27.0,...,0.11,3.5,3.5,0.2,3.6,0.13,0.01,0.14,0.13,0.14
91650,2022-2023,EPL,Arsenal,Aaron Ramsdale,eng ENG,GK,24-303,27.0,2430.0,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91651,2022-2023,EPL,Arsenal,William Saliba,fr FRA,DF,21-354,27.0,2415.0,26.8,...,0.11,0.9,0.9,1.1,2.0,0.03,0.04,0.08,0.03,0.08
91652,2022-2023,EPL,Arsenal,Bukayo Saka,eng ENG,FW,21-189,27.0,2313.0,25.7,...,0.66,8.7,7.3,6.1,13.4,0.35,0.24,0.59,0.29,0.54
91653,2022-2023,EPL,Arsenal,Martin Ødegaard,no NOR,MF,24-086,26.0,2216.0,24.6,...,0.65,7.0,7.0,6.3,13.4,0.3,0.27,0.57,0.3,0.57


#### Cambiar Valores
    
        David Ozoh:     Unnamed: 1_level_0_Nation: 'eng ENG'


#### Borrar Filas

        Alphadjo Cissè
        Matteo Fiorenza
        Sammy Braybroke
        
        Eliminar las filas con Playing Time_Min = NaN
        
         Eliminar las filas con Progression_PrgC = NaN


In [56]:
# Cambiar el valor de la celda en la fila 386 y columna "Nation"
df.loc[92057, 'Unnamed: 1_level_0_Nation'] = 'en ENG'

In [57]:
# Eliminar las filas con Playing Time_Min = NaN
df = df.dropna(subset=['Playing Time_Min'])

# Eliminar las filas con Progression_PrgC = NaN
df = df.dropna(subset=['Progression_PrgC'])

# Obtener información sobre el conjunto de datos
df.info()

# Calcular estadísticas descriptivas
df.describe()

# Ver las primeras filas del dataset
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2745 entries, 91649 to 95194
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Season                     2745 non-null   object 
 1   League                     2745 non-null   object 
 2   Team                       2745 non-null   object 
 3   Unnamed: 0_level_0_Player  2745 non-null   object 
 4   Unnamed: 1_level_0_Nation  2745 non-null   object 
 5   Unnamed: 2_level_0_Pos     2745 non-null   object 
 6   Unnamed: 3_level_0_Age     2745 non-null   object 
 7   Playing Time_Starts        2745 non-null   float64
 8   Playing Time_Min           2745 non-null   float64
 9   Playing Time_90s           2745 non-null   float64
 10  Performance_Gls            2745 non-null   float64
 11  Performance_Ast            2745 non-null   float64
 12  Performance_G+A            2745 non-null   float64
 13  Performance_G-PK           2745 non-null   

Unnamed: 0,Season,League,Team,Unnamed: 0_level_0_Player,Unnamed: 1_level_0_Nation,Unnamed: 2_level_0_Pos,Unnamed: 3_level_0_Age,Playing Time_Starts,Playing Time_Min,Playing Time_90s,...,Per 90 Minutes_G+A-PK,Expected_xG,Expected_npxG,Expected_xAG,Expected_npxG+xAG,Per 90 Minutes_xG,Per 90 Minutes_xAG,Per 90 Minutes_xG+xAG,Per 90 Minutes_npxG,Per 90 Minutes_npxG+xAG
91649,2022-2023,EPL,Arsenal,Gabriel Dos Santos,br BRA,DF,25-084,27.0,2430.0,27.0,...,0.11,3.5,3.5,0.2,3.6,0.13,0.01,0.14,0.13,0.14
91650,2022-2023,EPL,Arsenal,Aaron Ramsdale,eng ENG,GK,24-303,27.0,2430.0,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91651,2022-2023,EPL,Arsenal,William Saliba,fr FRA,DF,21-354,27.0,2415.0,26.8,...,0.11,0.9,0.9,1.1,2.0,0.03,0.04,0.08,0.03,0.08
91652,2022-2023,EPL,Arsenal,Bukayo Saka,eng ENG,FW,21-189,27.0,2313.0,25.7,...,0.66,8.7,7.3,6.1,13.4,0.35,0.24,0.59,0.29,0.54
91653,2022-2023,EPL,Arsenal,Martin Ødegaard,no NOR,MF,24-086,26.0,2216.0,24.6,...,0.65,7.0,7.0,6.3,13.4,0.3,0.27,0.57,0.3,0.57


In [59]:
# Cambiar los datos de la columna Unnamed: 3_level_0_Age
df['Unnamed: 3_level_0_Age'] = df['Unnamed: 3_level_0_Age'].str[:2].astype('float64')

# Obtener información sobre el conjunto de datos
df.info()

# Calcular estadísticas descriptivas
df.describe()

# Ver las primeras filas del dataset
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2745 entries, 91649 to 95194
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Season                     2745 non-null   object 
 1   League                     2745 non-null   object 
 2   Team                       2745 non-null   object 
 3   Unnamed: 0_level_0_Player  2745 non-null   object 
 4   Unnamed: 1_level_0_Nation  2745 non-null   object 
 5   Unnamed: 2_level_0_Pos     2745 non-null   object 
 6   Unnamed: 3_level_0_Age     2745 non-null   float64
 7   Playing Time_Starts        2745 non-null   float64
 8   Playing Time_Min           2745 non-null   float64
 9   Playing Time_90s           2745 non-null   float64
 10  Performance_Gls            2745 non-null   float64
 11  Performance_Ast            2745 non-null   float64
 12  Performance_G+A            2745 non-null   float64
 13  Performance_G-PK           2745 non-null   

Unnamed: 0,Season,League,Team,Unnamed: 0_level_0_Player,Unnamed: 1_level_0_Nation,Unnamed: 2_level_0_Pos,Unnamed: 3_level_0_Age,Playing Time_Starts,Playing Time_Min,Playing Time_90s,...,Per 90 Minutes_G+A-PK,Expected_xG,Expected_npxG,Expected_xAG,Expected_npxG+xAG,Per 90 Minutes_xG,Per 90 Minutes_xAG,Per 90 Minutes_xG+xAG,Per 90 Minutes_npxG,Per 90 Minutes_npxG+xAG
91649,2022-2023,EPL,Arsenal,Gabriel Dos Santos,br BRA,DF,25.0,27.0,2430.0,27.0,...,0.11,3.5,3.5,0.2,3.6,0.13,0.01,0.14,0.13,0.14
91650,2022-2023,EPL,Arsenal,Aaron Ramsdale,eng ENG,GK,24.0,27.0,2430.0,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91651,2022-2023,EPL,Arsenal,William Saliba,fr FRA,DF,21.0,27.0,2415.0,26.8,...,0.11,0.9,0.9,1.1,2.0,0.03,0.04,0.08,0.03,0.08
91652,2022-2023,EPL,Arsenal,Bukayo Saka,eng ENG,FW,21.0,27.0,2313.0,25.7,...,0.66,8.7,7.3,6.1,13.4,0.35,0.24,0.59,0.29,0.54
91653,2022-2023,EPL,Arsenal,Martin Ødegaard,no NOR,MF,24.0,26.0,2216.0,24.6,...,0.65,7.0,7.0,6.3,13.4,0.3,0.27,0.57,0.3,0.57


## Guardar los datos en un archivo nuevo CSV

In [60]:
df.to_csv("../DataSets/player_stats/standard_stats_Corr.csv")

## Extraer los datos de nuevo archivo

In [61]:
ruta_archivo = '../DataSets/player_stats/standard_stats_Corr.csv'

df = pd.read_csv(ruta_archivo, delimiter=',')

# Obtener información sobre el conjunto de datos
df.info()

# Calcular estadísticas descriptivas
df.describe()

# Ver las primeras filas del dataset
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2745 entries, 0 to 2744
Data columns (total 36 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 2745 non-null   int64  
 1   Season                     2745 non-null   object 
 2   League                     2745 non-null   object 
 3   Team                       2745 non-null   object 
 4   Unnamed: 0_level_0_Player  2745 non-null   object 
 5   Unnamed: 1_level_0_Nation  2745 non-null   object 
 6   Unnamed: 2_level_0_Pos     2745 non-null   object 
 7   Unnamed: 3_level_0_Age     2745 non-null   float64
 8   Playing Time_Starts        2745 non-null   float64
 9   Playing Time_Min           2745 non-null   float64
 10  Playing Time_90s           2745 non-null   float64
 11  Performance_Gls            2745 non-null   float64
 12  Performance_Ast            2745 non-null   float64
 13  Performance_G+A            2745 non-null   float

Unnamed: 0.1,Unnamed: 0,Season,League,Team,Unnamed: 0_level_0_Player,Unnamed: 1_level_0_Nation,Unnamed: 2_level_0_Pos,Unnamed: 3_level_0_Age,Playing Time_Starts,Playing Time_Min,...,Per 90 Minutes_G+A-PK,Expected_xG,Expected_npxG,Expected_xAG,Expected_npxG+xAG,Per 90 Minutes_xG,Per 90 Minutes_xAG,Per 90 Minutes_xG+xAG,Per 90 Minutes_npxG,Per 90 Minutes_npxG+xAG
0,91649,2022-2023,EPL,Arsenal,Gabriel Dos Santos,br BRA,DF,25.0,27.0,2430.0,...,0.11,3.5,3.5,0.2,3.6,0.13,0.01,0.14,0.13,0.14
1,91650,2022-2023,EPL,Arsenal,Aaron Ramsdale,eng ENG,GK,24.0,27.0,2430.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,91651,2022-2023,EPL,Arsenal,William Saliba,fr FRA,DF,21.0,27.0,2415.0,...,0.11,0.9,0.9,1.1,2.0,0.03,0.04,0.08,0.03,0.08
3,91652,2022-2023,EPL,Arsenal,Bukayo Saka,eng ENG,FW,21.0,27.0,2313.0,...,0.66,8.7,7.3,6.1,13.4,0.35,0.24,0.59,0.29,0.54
4,91653,2022-2023,EPL,Arsenal,Martin Ødegaard,no NOR,MF,24.0,26.0,2216.0,...,0.65,7.0,7.0,6.3,13.4,0.3,0.27,0.57,0.3,0.57
