<a href="https://colab.research.google.com/github/JoaquinJustelP/Python_UB_2024/blob/main/Pandas_snippets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Pandas Snippets

Author: Sebastian Raschka


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load a sample dataset (soccer player statistics) from a URL.

data_url = 'https://raw.githubusercontent.com/rasbt/python_reference/master/Data/some_soccer_data.csv'
df = pd.read_csv(data_url)
df.head()

Unnamed: 0,PLAYER,SALARY,GP,G,A,SOT,PPG,P
0,Sergio Agüero\n Forward — Manchester City,$19.2m,16.0,14,3.0,34,13.12,209.98
1,Eden Hazard\n Midfield — Chelsea,$18.9m,21.0,8,4.0,17,13.05,274.04
2,Alexis Sánchez\n Forward — Arsenal,$17.6m,,12,7.0,29,11.19,223.86
3,Yaya Touré\n Midfield — Manchester City,$16.6m,18.0,7,1.0,19,10.99,197.91
4,Ángel Di María\n Midfield — Manchester United,$15.0m,13.0,3,,13,10.17,132.23


In [None]:
# Normalize every column name to lowercase.
# Consistent casing makes column access easier and avoids case-related typos.

df.columns = list(map(str.lower, df.columns))

# Alternative (returns a new DataFrame instead of modifying `df`):
# df.rename(columns=lambda x : x.lower())

df.tail(3)

Unnamed: 0,player,salary,gp,g,a,sot,ppg,p
7,Cesc Fàbregas\n Midfield — Chelsea,$14.0m,20.0,2,14.0,10,10.47,209.49
8,Saido Berahino\n Forward — West Brom,$13.8m,21.0,9,0.0,20,7.02,147.43
9,Steven Gerrard\n Midfield — Liverpool,$13.8m,20.0,5,1.0,11,7.5,150.01


In [None]:
# Replace the abbreviated column names with descriptive ones.
# Self-explanatory names make the code and the later analysis easier to read.

descriptive_names = {
    'gp': 'games',
    'g': 'goals',
    'a': 'assists',
    'sot': 'shots_on_target',
    'ppg': 'points_per_game',
    'p': 'points',
}
df = df.rename(columns=descriptive_names)

df.tail(3)

Unnamed: 0,player,salary,games,goals,assists,shots_on_target,points_per_game,points
7,Cesc Fàbregas\n Midfield — Chelsea,$14.0m,20.0,2,14.0,10,10.47,209.49
8,Saido Berahino\n Forward — West Brom,$13.8m,21.0,9,0.0,20,7.02,147.43
9,Steven Gerrard\n Midfield — Liverpool,$13.8m,20.0,5,1.0,11,7.5,150.01


In [None]:
# Clean the 'salary' column by stripping '$' and 'm' characters from both
# ends of each value (e.g. '$19.2m' -> '19.2').
# The vectorized .str accessor is used instead of .apply(lambda ...) so that a
# missing salary propagates as NaN rather than raising AttributeError.
# Note: the values are still strings after this step.

df['salary'] = df['salary'].str.strip('$m')
df.tail()

Unnamed: 0,player,salary,games,goals,assists,shots_on_target,points_per_game,points
5,Santiago Cazorla\n Midfield — Arsenal,14.8,20.0,4,,20,9.97,
6,David Silva\n Midfield — Manchester City,14.3,15.0,6,2.0,11,10.35,155.26
7,Cesc Fàbregas\n Midfield — Chelsea,14.0,20.0,2,14.0,10,10.47,209.49
8,Saido Berahino\n Forward — West Brom,13.8,21.0,9,0.0,20,7.02,147.43
9,Steven Gerrard\n Midfield — Liverpool,13.8,20.0,5,1.0,11,7.5,150.01


In [None]:
# Insert a new column named 'position', filled with empty strings, at column
# index 8. It serves as a placeholder that can be filled in later with
# processed information.

position_idx = 8
df.insert(position_idx, 'position', '')

df.tail(3)

Unnamed: 0,player,salary,games,goals,assists,shots_on_target,points_per_game,points,position
7,Cesc Fàbregas\n Midfield — Chelsea,14.0,20.0,2,14.0,10,10.47,209.49,
8,Saido Berahino\n Forward — West Brom,13.8,21.0,9,0.0,20,7.02,147.43,
9,Steven Gerrard\n Midfield — Liverpool,13.8,20.0,5,1.0,11,7.5,150.01,


In [None]:
# Compare two ways of adding an empty-string column and time each of them.
# NOTE: %timeit runs each statement repeatedly against the live `df`, so both
# columns ('added1' and 'added3') are left in the DataFrame afterwards.

%timeit df['added1'] = pd.Series('', index=df.index)

%timeit df['added3'] = ''


107 µs ± 42 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
42.4 µs ± 7.19 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [None]:
# Show the whole DataFrame to inspect the columns added by the timing cell.
df

Unnamed: 0,player,salary,games,goals,assists,shots_on_target,points_per_game,points,position,added1,added3
0,Sergio Agüero\n Forward — Manchester City,19.2,16.0,14,3.0,34,13.12,209.98,,,
1,Eden Hazard\n Midfield — Chelsea,18.9,21.0,8,4.0,17,13.05,274.04,,,
2,Alexis Sánchez\n Forward — Arsenal,17.6,,12,7.0,29,11.19,223.86,,,
3,Yaya Touré\n Midfield — Manchester City,16.6,18.0,7,1.0,19,10.99,197.91,,,
4,Ángel Di María\n Midfield — Manchester United,15.0,13.0,3,,13,10.17,132.23,,,
5,Santiago Cazorla\n Midfield — Arsenal,14.8,20.0,4,,20,9.97,,,,
6,David Silva\n Midfield — Manchester City,14.3,15.0,6,2.0,11,10.35,155.26,,,
7,Cesc Fàbregas\n Midfield — Chelsea,14.0,20.0,2,14.0,10,10.47,209.49,,,
8,Saido Berahino\n Forward — West Brom,13.8,21.0,9,0.0,20,7.02,147.43,,,
9,Steven Gerrard\n Midfield — Liverpool,13.8,20.0,5,1.0,11,7.5,150.01,,,


In [None]:
# Remove the helper columns that were added earlier for the timing comparison.
aux_columns = ['added1', 'added3']
df.drop(columns=aux_columns, inplace=True)

In [None]:
# Parse the 'player' column to extract name, team and position.

def process_player_col(text):
    """Split a raw player entry into its name, team and position.

    The entry is expected to look like ``'<name>\\n <position> — <team>'``:
    exactly one newline separating the name from the rest, and exactly one
    ' — ' separating position from team.

    Parameters
    ----------
    text : str
        Raw value taken from the 'player' column.

    Returns
    -------
    pandas.Series
        Three values in the order ``[name, team, position]``.

    Raises
    ------
    ValueError
        If the entry does not split into exactly two parts at either step.
    """
    name, details = text.split('\n')
    # Position comes first in the raw text; surrounding whitespace is trimmed.
    position, team = map(str.strip, details.split(' — '))
    return pd.Series([name, team, position])

# Apply the parser to every 'player' entry and store the three resulting
# values in the 'player', 'team' and 'position' columns.

player_parts = df['player'].apply(process_player_col)
df[['player', 'team', 'position']] = player_parts


df.tail(3)

Unnamed: 0,player,salary,games,goals,assists,shots_on_target,points_per_game,points,position,team
7,Cesc Fàbregas,14.0,20.0,2,14.0,10,10.47,209.49,Midfield,Chelsea
8,Saido Berahino,13.8,21.0,9,0.0,20,7.02,147.43,Forward,West Brom
9,Steven Gerrard,13.8,20.0,5,1.0,11,7.5,150.01,Midfield,Liverpool


In [None]:
# Normalize the relevant text columns to lowercase.
# This avoids case-related mismatches in later comparisons and analysis.
# Each column is lowercased with the vectorized .str accessor rather than
# DataFrame.applymap, which is deprecated in recent pandas releases; .str.lower()
# also leaves missing values as NaN instead of raising AttributeError.
cols = ['player', 'position', 'team']
df[cols] = df[cols].apply(lambda col: col.str.lower())
df.head()

Unnamed: 0,player,salary,games,goals,assists,shots_on_target,points_per_game,points,position,team
0,sergio agüero,19.2,16.0,14,3.0,34,13.12,209.98,forward,manchester city
1,eden hazard,18.9,21.0,8,4.0,17,13.05,274.04,midfield,chelsea
2,alexis sánchez,17.6,,12,7.0,29,11.19,223.86,forward,arsenal
3,yaya touré,16.6,18.0,7,1.0,19,10.99,197.91,midfield,manchester city
4,ángel di maría,15.0,13.0,3,,13,10.17,132.23,midfield,manchester united


In [None]:
# Append a row of NaNs to the DataFrame.
# DataFrame.append was removed in pandas 2.0, so a one-row frame is
# concatenated with pd.concat instead.
# The result is only displayed here, not assigned back: `df` is unchanged.

nan_row = pd.DataFrame([[np.nan] * len(df.columns)], columns=df.columns)
pd.concat([df, nan_row], ignore_index=True)

Unnamed: 0,player,salary,games,goals,assists,shots_on_target,points_per_game,points,position,team
0,sergio agüero,19.2,16.0,14.0,3.0,34.0,13.12,209.98,forward,manchester city
1,eden hazard,18.9,21.0,8.0,4.0,17.0,13.05,274.04,midfield,chelsea
2,alexis sánchez,17.6,,12.0,7.0,29.0,11.19,223.86,forward,arsenal
3,yaya touré,16.6,18.0,7.0,1.0,19.0,10.99,197.91,midfield,manchester city
4,ángel di maría,15.0,13.0,3.0,,13.0,10.17,132.23,midfield,manchester united
5,santiago cazorla,14.8,20.0,4.0,,20.0,9.97,,midfield,arsenal
6,david silva,14.3,15.0,6.0,2.0,11.0,10.35,155.26,midfield,manchester city
7,cesc fàbregas,14.0,20.0,2.0,14.0,10.0,10.47,209.49,midfield,chelsea
8,saido berahino,13.8,21.0,9.0,0.0,20.0,7.02,147.43,forward,west brom
9,steven gerrard,13.8,20.0,5.0,1.0,11.0,7.5,150.01,midfield,liverpool


In [None]:
# Build one NaN per column and assign the list to the row labelled -1.
# Unlike the previous cell, this modifies `df` itself.
nans = [np.nan for _ in df.columns]
df.loc[-1] = nans

1.21 ms ± 84.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
# Display the DataFrame after assigning the all-NaN row at label -1.
df

Unnamed: 0,player,salary,games,goals,assists,shots_on_target,points_per_game,points,position,team
0,sergio agüero,19.2,16.0,14.0,3.0,34.0,13.12,209.98,forward,manchester city
1,eden hazard,18.9,21.0,8.0,4.0,17.0,13.05,274.04,midfield,chelsea
2,alexis sánchez,17.6,,12.0,7.0,29.0,11.19,223.86,forward,arsenal
3,yaya touré,16.6,18.0,7.0,1.0,19.0,10.99,197.91,midfield,manchester city
4,ángel di maría,15.0,13.0,3.0,,13.0,10.17,132.23,midfield,manchester united
5,santiago cazorla,14.8,20.0,4.0,,20.0,9.97,,midfield,arsenal
6,david silva,14.3,15.0,6.0,2.0,11.0,10.35,155.26,midfield,manchester city
7,cesc fàbregas,14.0,20.0,2.0,14.0,10.0,10.47,209.49,midfield,chelsea
8,saido berahino,13.8,21.0,9.0,0.0,20.0,7.02,147.43,forward,west brom
9,steven gerrard,13.8,20.0,5.0,1.0,11.0,7.5,150.01,midfield,liverpool
