# PANDAS: MULTIPLES DATAFRAMES

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
#Para mostrar multiples dataframes uno al lado del otro
class display(object):
    """Display HTML representation of multiple objects"""
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""
    def __init__(self, *args):
        self.args = args
        
    def _repr_html_(self):
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)
    
    def __repr__(self):
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

## CONCATENACIÓN

In [3]:
#Recordar: concatenación de arrays en Numpy
x=np.array([1,2,3])
y=np.array([4,5,6])
z=np.array([7,8,9])
mat=np.array([[1,2],[3,4]])

print(np.concatenate([x,y,z])) #En una dimension solo tenemos axis=0
print('-'*100)

print(np.vstack([x,y,z])) #Concatenado vertical
print('-'*100)

print(np.concatenate([mat,mat])) #Por defecto asix=0 (concatenación vertical)
print('-'*100)

print(np.concatenate([mat,mat],axis=1)) #Horizontal

[1 2 3 4 5 6 7 8 9]
----------------------------------------------------------------------------------------------------
[[1 2 3]
 [4 5 6]
 [7 8 9]]
----------------------------------------------------------------------------------------------------
[[1 2]
 [3 4]
 [1 2]
 [3 4]]
----------------------------------------------------------------------------------------------------
[[1 2 1 2]
 [3 4 3 4]]


### **pd.concat**

```python
pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
          keys=None, levels=None, names=None, verify_integrity=False,
          copy=True)
```

In [4]:
serie1=pd.Series(['A','B','C'],index=[1,2,3])
print(serie1)
print('-'*100)

serie2=pd.Series(['D','E','F'], index=[1,2,3])
print(serie2)
print('-'*100)

print(pd.concat([serie1,serie2])) #pd.concat([serie1,serie2],axis=0)
print('-'*100)

print(pd.concat([serie1,serie2]).reset_index(drop=True)) #Ya que los indices coinciden. Los reseteamos (numeros naturales desde el 0)
print('-'*100)

print(pd.concat([serie1,serie2],axis=0).set_axis(range(1,len(serie1)+len(serie2)+1))) #Para resetear indices de manera especifica
#CUIDADO: range va hasta el valor n-1

1    A
2    B
3    C
dtype: object
----------------------------------------------------------------------------------------------------
1    D
2    E
3    F
dtype: object
----------------------------------------------------------------------------------------------------
1    A
2    B
3    C
1    D
2    E
3    F
dtype: object
----------------------------------------------------------------------------------------------------
0    A
1    B
2    C
3    D
4    E
5    F
dtype: object
----------------------------------------------------------------------------------------------------
1    A
2    B
3    C
4    D
5    E
6    F
dtype: object


In [5]:
pd.concat([serie1,serie2],axis=1) #Concatenacion horizontal de series: dataframe

Unnamed: 0,0,1
1,A,D
2,B,E
3,C,F


In [6]:
#Funcion para crear DataFrames de manera especifica
def make_df(columnas,indices):
    data={c:[str(c)+str(i) for i in indices] for c in columnas}
    return pd.DataFrame(data,indices)

In [7]:
df1=make_df('AB',[1,2])
df2=make_df('AB',[1,2])

display('df1','df2','pd.concat([df1,df2])') #Por defecto concatenacion vertical. Si las columnas se llaman igual no hay problema

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
1,A1,B1
2,A2,B2


In [8]:
display('df1','df2','pd.concat([df1,df2],axis=1)')

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B,A.1,B.1
1,A1,B1,A1,B1
2,A2,B2,A2,B2


In [9]:
#Columnas con nombres diferentes
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
display('df3','df4','pd.concat([df3,df4])')

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,C,D
0,C0,D0
1,C1,D1

Unnamed: 0,A,B,C,D
0,A0,B0,,
1,A1,B1,,
0,,,C0,D0
1,,,C1,D1


#### Índices duplicados

In [10]:
x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])
y.index = x.index  # Hacer que los indices sean iguales
display('x', 'y', 'pd.concat([x, y], axis = 0)') # resetear indices

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
0,A2,B2
1,A3,B3

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A2,B2
1,A3,B3


In [11]:
#Para verificar que los indices no se superpongan
try:
    pd.concat([x, y], verify_integrity=True) 
except ValueError as e:
    print("ValueError:", e) #Si se superponen nos devuelve esto

ValueError: Indexes have overlapping values: Index([0, 1], dtype='int64')


In [12]:
#Ignorar el indice. ignore_index=True la concatenacion no tiene en cuenta los indices de los dataframes, crea un indice nuevo para el resueltado
display('x','y','pd.concat([x,y],ignore_index=True)')

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
0,A2,B2
1,A3,B3

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [13]:
#Claves multiindices (keys)
display('x', 'y', "pd.concat([x, y], keys=['x', 'y'])")

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
0,A2,B2
1,A3,B3

Unnamed: 0,Unnamed: 1,A,B
x,0,A0,B0
x,1,A1,B1
y,0,A2,B2
y,1,A3,B3


In [14]:
#Algunas columnas coindicen otras no
#Entradas sin datos se llenaran con NA
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
display('df5', 'df6', 'pd.concat([df5, df6])')

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [15]:
display('df5', 'df6',"pd.concat([df5, df6], join='inner',axis=1)") #Por defecto join='outer'
#axis=1: horizontal. filas.
#join='inner'. Muestra las filas sin ningun nan

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,A,B,C,B.1,C.1,D


In [16]:
display('df5', 'df6',"pd.concat([df5, df6], join='inner', axis=0)")
#Muestra columnas sin ningun nan
#axis=0. Vertical. columnas

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


### **pd.merge**


#### Unión uno a uno

In [17]:
#Dataframe con informacion de varios empleados de una empresa
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
                    'hire_date': [2004, 2008, 2012, 2014]})

#Combinar infor en un unico dataframe: pd.merge(). Parte de la columna que tienen en comun ambos dataframes
display('df1','df2','pd.merge(df1,df2)')

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR

Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


#### Unión muchos a uno

In [18]:
#Una de las dos columnas clave contiene entradas duplicadas
df3=pd.merge(df1,df2)
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR',],
                    'supervisor': ['Carly', 'Guido', 'Steve']})

display('df3','df4','pd.merge(df3,df4)')

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014

Unnamed: 0,group,supervisor
0,Accounting,Carly
1,Engineering,Guido
2,HR,Steve

Unnamed: 0,employee,group,hire_date,supervisor
0,Bob,Accounting,2008,Carly
1,Jake,Engineering,2012,Guido
2,Lisa,Engineering,2004,Guido
3,Sue,HR,2014,Steve


#### Unión muchos a muchos

In [19]:
df1=pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                  'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df5=pd.DataFrame({'group':['Accounting','Accounting', 'Engineering', 'Engineering', 'HR', 'HR'],
                  'skills':['math','spreadsheets','coding','linux','spreadsheets','organization']})

display('df1','df5','pd.merge(df1,df5)')

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR

Unnamed: 0,group,skills
0,Accounting,math
1,Accounting,spreadsheets
2,Engineering,coding
3,Engineering,linux
4,HR,spreadsheets
5,HR,organization

Unnamed: 0,employee,group,skills
0,Bob,Accounting,math
1,Bob,Accounting,spreadsheets
2,Jake,Engineering,coding
3,Jake,Engineering,linux
4,Lisa,Engineering,coding
5,Lisa,Engineering,linux
6,Sue,HR,spreadsheets
7,Sue,HR,organization


#### Palabras clave dentro de pd.merge()

##### ``on``

In [20]:
#Cuando los dos dataframes tienen el nombre de columna esfecificado
#on: donde queremos hacer la union.
display('df1', 'df2', "pd.merge(df1, df2, on='employee')")

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR

Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


##### ``left_on`` y ``right_on``

In [21]:
#Fusionar conjuntos con nombres de columna diferentes
#Por ejemplo name y employee guardan los mismos datos, solo que la etiketa (nombre) de la columna es diferente
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'salary': [70000, 80000, 120000, 90000]})
display('df1','df3','pd.merge(df1,df3,left_on="employee",right_on="name")') #Lefton hace referencia al primer dataframe identificado. 

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR

Unnamed: 0,name,salary
0,Bob,70000
1,Jake,80000
2,Lisa,120000
3,Sue,90000

Unnamed: 0,employee,group,name,salary
0,Bob,Accounting,Bob,70000
1,Jake,Engineering,Jake,80000
2,Lisa,Engineering,Lisa,120000
3,Sue,HR,Sue,90000


In [22]:
#Como no nos iinteresa tener datos repetidos
pd.merge(df1,df3,left_on='employee',right_on='name').drop('name',axis=1)

Unnamed: 0,employee,group,salary
0,Bob,Accounting,70000
1,Jake,Engineering,80000
2,Lisa,Engineering,120000
3,Sue,HR,90000


##### ``left_index`` y ``right_index``

In [23]:
display('df1','df2','df3')

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR

Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014

Unnamed: 0,name,salary
0,Bob,70000
1,Jake,80000
2,Lisa,120000
3,Sue,90000


In [24]:
#A veces es comveniente fusionar Dataframes por indices
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
display('df1a', 'df2a','pd.merge(df1,df2, left_index=True, right_index=True)') #Coje ambos indices y los concatena

Unnamed: 0_level_0,group
employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,Engineering
Sue,HR

Unnamed: 0_level_0,hire_date
employee,Unnamed: 1_level_1
Lisa,2004
Bob,2008
Jake,2012
Sue,2014

Unnamed: 0,employee_x,group,employee_y,hire_date
0,Bob,Accounting,Lisa,2004
1,Jake,Engineering,Bob,2008
2,Lisa,Engineering,Jake,2012
3,Sue,HR,Sue,2014


In [25]:
#Mezclar indices y columnas
display('df1a','df3','pd.merge(df1a,df3, left_index=True, right_on="name")')

Unnamed: 0_level_0,group
employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,Engineering
Sue,HR

Unnamed: 0,name,salary
0,Bob,70000
1,Jake,80000
2,Lisa,120000
3,Sue,90000

Unnamed: 0,group,name,salary
0,Accounting,Bob,70000
1,Engineering,Jake,80000
2,Engineering,Lisa,120000
3,HR,Sue,90000


In [26]:
#Poner columnas en un orden en concreto
pd.merge(df1a,df3,left_index=True,right_on='name')[['name','group','salary']] #DOBLE CORCHETE


Unnamed: 0,name,group,salary
0,Bob,Accounting,70000
1,Jake,Engineering,80000
2,Lisa,Engineering,120000
3,Sue,HR,90000


##### ``how`` : aritmetica de conjuntos

In [27]:
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],
                    'food': ['fish', 'beans', 'bread']},)
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
                    'drink': ['wine', 'beer']},)
display('df6', 'df7', 'pd.merge(df6, df7)') #Union. Los datos que estan en ambos conjuntos. Interseccion de los conjuntos

Unnamed: 0,name,food
0,Peter,fish
1,Paul,beans
2,Mary,bread

Unnamed: 0,name,drink
0,Mary,wine
1,Joseph,beer

Unnamed: 0,name,food,drink
0,Mary,bread,wine


In [28]:
#inner
pd.merge(df6, df7, how='inner') #Interseccion de los conjuntos

Unnamed: 0,name,food,drink
0,Mary,bread,wine


In [29]:
display('df6', 'df7', "pd.merge(df6, df7, how='outer')") #Union de todas las columnas de entrada. Devolviendo nan si no hay dato

Unnamed: 0,name,food
0,Peter,fish
1,Paul,beans
2,Mary,bread

Unnamed: 0,name,drink
0,Mary,wine
1,Joseph,beer

Unnamed: 0,name,food,drink
0,Peter,fish,
1,Paul,beans,
2,Mary,bread,wine
3,Joseph,,beer


In [30]:
display('df6', 'df7', "pd.merge(df6, df7, how='left')") #El left le da preferencia al dataframe de la izquierda. El dataframe de la izquierda lo tenemos en el resultado si o si. Despues intentara mapear con el de la derecha

Unnamed: 0,name,food
0,Peter,fish
1,Paul,beans
2,Mary,bread

Unnamed: 0,name,drink
0,Mary,wine
1,Joseph,beer

Unnamed: 0,name,food,drink
0,Peter,fish,
1,Paul,beans,
2,Mary,bread,wine


In [31]:
display('df6', 'df7', "pd.merge(df6, df7, how='right')") #En este caso la preferenica la tiene el dataframe de la derecha

Unnamed: 0,name,food
0,Peter,fish
1,Paul,beans
2,Mary,bread

Unnamed: 0,name,drink
0,Mary,wine
1,Joseph,beer

Unnamed: 0,name,food,drink
0,Mary,bread,wine
1,Joseph,,beer


##### ``Suffixes`` : nombres de columnas superpuestas

In [32]:
#Dos dataframes con nombres de columnas en conflicto
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'rank': [1, 2, 3, 4]})
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'rank': [3, 1, 4, 2]})

display('df8', 'df9', 'pd.merge(df8, df9, on="name")') #Hacemos el cruce con name. Pero tenemos variables que se repiten

Unnamed: 0,name,rank
0,Bob,1
1,Jake,2
2,Lisa,3
3,Sue,4

Unnamed: 0,name,rank
0,Bob,3
1,Jake,1
2,Lisa,4
3,Sue,2

Unnamed: 0,name,rank_x,rank_y
0,Bob,1,3
1,Jake,2,1
2,Lisa,3,4
3,Sue,4,2


In [33]:
#Renombrar
display('df8', 'df9', 'pd.merge(df8, df9, on="name", suffixes=["_df8", "_df9"])')

Unnamed: 0,name,rank
0,Bob,1
1,Jake,2
2,Lisa,3
3,Sue,4

Unnamed: 0,name,rank
0,Bob,3
1,Jake,1
2,Lisa,4
3,Sue,2

Unnamed: 0,name,rank_df8,rank_df9
0,Bob,1,3
1,Jake,2,1
2,Lisa,3,4
3,Sue,4,2


## AGREGACIÓN Y AGRUPACIÓN

La libreria seaborn tiene varios DataSets almacenados que pueden ser muy utiles a la hora de practicar. En este caso haremos uso del DataSet de planets. Acontinuacion se muestran todos los Datasets almacenados.

In [3]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [4]:
planetas=sns.load_dataset('planets')
planetas.shape

(1035, 6)

In [5]:
planetas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035 entries, 0 to 1034
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   method          1035 non-null   object 
 1   number          1035 non-null   int64  
 2   orbital_period  992 non-null    float64
 3   mass            513 non-null    float64
 4   distance        808 non-null    float64
 5   year            1035 non-null   int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 48.6+ KB


In [39]:
planetas.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [56]:
planetas.dropna().describe()
#dropna para que no tenga en cuenta los datos ausentes al hacer los calculos

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


### Agregaciones básicas en Pandas

| Agregación               | Descripción                     |
|--------------------------|---------------------------------|
| ``count()``              | Número total de items           |
| ``first()``, ``last()``  | Primer y último item            |
| ``mean()``, ``median()`` | Media y mediana                 |
| ``min()``, ``max()``     | Mínimo y máximo                 |
| ``std()``, ``var()``     | Desviación estandar y varianza  |
| ``mad()``                | Desviación absoluta media       |
| ``prod()``               | Producto de los items           |
| ``sum()``                | Suma de los items               |


In [6]:
semilla=np.random.RandomState(42)
serie=pd.Series(semilla.rand(5))

print(serie)
print('-'*100)

print(serie.sum()) #Asi funcionan los metodos presentados en la tabla de arriba

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64
----------------------------------------------------------------------------------------------------
2.811925491708157


In [7]:
df=pd.DataFrame({'A':semilla.rand(5), 'B': semilla.rand(5)})
print(df)
print('-'*100)

print(df.mean()) #Por defecto hace las operaciones por columnas. axis=0.
print('-'*100)

print(df.mean(axis='columns')) #Coje los datos por columnas, pero las operaciones se realizan FILA A FILA
print('-'*100)

print(df.mean(axis=1)) #Coje los datos por columnas, pero las operaciones se realizan FILA A FILA


          A         B
0  0.155995  0.020584
1  0.058084  0.969910
2  0.866176  0.832443
3  0.601115  0.212339
4  0.708073  0.181825
----------------------------------------------------------------------------------------------------
A    0.477888
B    0.443420
dtype: float64
----------------------------------------------------------------------------------------------------
0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64
----------------------------------------------------------------------------------------------------
0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64


### GroupBy

- **split**: dividir y agrupar un ``DataFrame`` dependiendo del valor de la clave especificada.
- **apply**: calcular alguna función, generalmente un agregado, una transformación o un filtrado, dentro de los grupos individuales.
- **combine**: fusiona los resultados de estas operaciones en una matriz de salida.


In [8]:
df = pd.DataFrame({'department': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'VV': range(6)})
df

Unnamed: 0,department,VV
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [9]:
df.groupby('department').sum() #Agrupa los valores de department cogiendolos como indice y aplica la suma

#Podemos hacer lo anterior con las agregaciones que queramos. Por ejemplo:
#df.groupby('department').mean()

Unnamed: 0_level_0,VV
department,Unnamed: 1_level_1
A,3
B,5
C,7


In [10]:
df.groupby('department', as_index=False).sum()

Unnamed: 0,department,VV
0,A,3
1,B,5
2,C,7


#### Indexación de columnas


In [11]:
#Volviendo al set de datos de planetas
print(planetas.head())
print('-'*100)

print(planetas['method'].unique()) #Sin repeticiones
print('-'*100)

print(planetas.groupby('method')['orbital_period'].mean()) #Agrupa la columna method y obtniene la media de la de orbital_period partiendo de la agrupacion

            method  number  orbital_period   mass  distance  year
0  Radial Velocity       1         269.300   7.10     77.40  2006
1  Radial Velocity       1         874.774   2.21     56.95  2008
2  Radial Velocity       1         763.000   2.60     19.84  2011
3  Radial Velocity       1         326.030  19.40    110.62  2007
4  Radial Velocity       1         516.220  10.50    119.47  2009
----------------------------------------------------------------------------------------------------
['Radial Velocity' 'Imaging' 'Eclipse Timing Variations' 'Transit'
 'Astrometry' 'Transit Timing Variations' 'Orbital Brightness Modulation'
 'Microlensing' 'Pulsar Timing' 'Pulsation Timing Variations']
----------------------------------------------------------------------------------------------------
method
Astrometry                          631.180000
Eclipse Timing Variations          4751.644444
Imaging                          118247.737500
Microlensing                       3153.571429
Orb

In [12]:
planetas.groupby('method',as_index=False)[['orbital_period','mass','distance']].mean()

#as_index=False. La variable que coje como indice groupby (en este caso method) lo mantiene como columna en la lista y no como indice
#Por defecto as_index=True

Unnamed: 0,method,orbital_period,mass,distance
0,Astrometry,631.18,,17.875
1,Eclipse Timing Variations,4751.644444,5.125,315.36
2,Imaging,118247.7375,,67.715937
3,Microlensing,3153.571429,,4144.0
4,Orbital Brightness Modulation,0.709307,,1180.0
5,Pulsar Timing,7343.021201,,1200.0
6,Pulsation Timing Variations,1170.0,,
7,Radial Velocity,823.35468,2.630699,51.600208
8,Transit,21.102073,1.47,599.29808
9,Transit Timing Variations,79.7835,,1104.333333


#### Iteración sobre grupos

In [13]:
planetas.groupby('method')['year'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Astrometry,2.0,2011.5,2.12132,2010.0,2010.75,2011.5,2012.25,2013.0
Eclipse Timing Variations,9.0,2010.0,1.414214,2008.0,2009.0,2010.0,2011.0,2012.0
Imaging,38.0,2009.131579,2.781901,2004.0,2008.0,2009.0,2011.0,2013.0
Microlensing,23.0,2009.782609,2.859697,2004.0,2008.0,2010.0,2012.0,2013.0
Orbital Brightness Modulation,3.0,2011.666667,1.154701,2011.0,2011.0,2011.0,2012.0,2013.0
Pulsar Timing,5.0,1998.4,8.38451,1992.0,1992.0,1994.0,2003.0,2011.0
Pulsation Timing Variations,1.0,2007.0,,2007.0,2007.0,2007.0,2007.0,2007.0
Radial Velocity,553.0,2007.518987,4.249052,1989.0,2005.0,2009.0,2011.0,2014.0
Transit,397.0,2011.236776,2.077867,2002.0,2010.0,2012.0,2013.0,2014.0
Transit Timing Variations,4.0,2012.5,1.290994,2011.0,2011.75,2012.5,2013.25,2014.0


In [14]:
planetas.groupby('method')['year'].describe().unstack()

       method                       
count  Astrometry                          2.0
       Eclipse Timing Variations           9.0
       Imaging                            38.0
       Microlensing                       23.0
       Orbital Brightness Modulation       3.0
                                         ...  
max    Pulsar Timing                    2011.0
       Pulsation Timing Variations      2007.0
       Radial Velocity                  2014.0
       Transit                          2014.0
       Transit Timing Variations        2014.0
Length: 80, dtype: float64

#### Aggregation: .aggregate()

Puede tomar una cadena/funcion/lista y calcular todos los agregados que queremos a al vez.

In [17]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'department': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'anio': [2020,2020,2020,2021,2021,2021],
                   'VV': rng.randint(0, 10, 6)},
                   columns = ['department', 'anio', 'VV'])
df

Unnamed: 0,department,anio,VV
0,A,2020,5
1,B,2020,0
2,C,2020,3
3,A,2021,3
4,B,2021,7
5,C,2021,9


In [88]:
df.groupby('department').aggregate(['min','median','max'])

Unnamed: 0_level_0,anio,anio,anio,VV,VV,VV
Unnamed: 0_level_1,min,median,max,min,median,max
department,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,2020,2020.5,2021,3,4.0,5
B,2020,2020.5,2021,0,3.5,7
C,2020,2020.5,2021,3,6.0,9


In [40]:
df.groupby('department').aggregate({'anio':['min','median','max']})
#df.groupby('department)['anio'].aggregate(['min','median','max']) es lo mismo

Unnamed: 0_level_0,anio,anio,anio
Unnamed: 0_level_1,min,median,max
department,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,2020,2020.5,2021
B,2020,2020.5,2021
C,2020,2020.5,2021


In [84]:
df.groupby('department').aggregate({'anio':'min','VV':'mean'})

Unnamed: 0_level_0,anio,VV
department,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2020,4.0
B,2020,3.5
C,2020,6.0


In [85]:
df.groupby('department').aggregate({'anio':'min','VV':'mean'}).rename(columns={'anio':'anio_min','VV':'VV_mean'})

Unnamed: 0_level_0,anio_min,VV_mean
department,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2020,4.0
B,2020,3.5
C,2020,6.0


#### Filtering

Permite eliminar datos según las propiedades del grupo.


In [89]:
df

Unnamed: 0,department,anio,VV
0,A,2020,5
1,B,2020,0
2,C,2020,3
3,A,2021,3
4,B,2021,7
5,C,2021,9


In [93]:
print(df['VV']>0) #devuelve booleanos
print('-'*100)

df[df['VV']>0] #Filtrado

0     True
1    False
2     True
3     True
4     True
5     True
Name: VV, dtype: bool
----------------------------------------------------------------------------------------------------


Unnamed: 0,department,anio,VV
0,A,2020,5
2,C,2020,3
3,A,2021,3
4,B,2021,7
5,C,2021,9


#### Transformation


In [95]:
df['ejemplo']=df['VV']-df['VV'].mean()
df

Unnamed: 0,department,anio,VV,ejemplo
0,A,2020,5,0.5
1,B,2020,0,-4.5
2,C,2020,3,-1.5
3,A,2021,3,-1.5
4,B,2021,7,2.5
5,C,2021,9,4.5


In [19]:
df

Unnamed: 0,department,anio,VV
0,A,2020,5
1,B,2020,0
2,C,2020,3
3,A,2021,3
4,B,2021,7
5,C,2021,9


Podemos hacer transformaciones definiendo una funcion general de la transformación deseada y aplicando el metodo .transform()

In [18]:
def funcion(x):
    return x-x.mean() #Siendo x un dataframe aplica lo indicado en todas las columnas

In [22]:
df.groupby('department').transform(funcion) #Aplikamos la funcion definida despues de agrupar por departamentos

Unnamed: 0,anio,VV
0,-0.5,1.0
1,-0.5,-3.5
2,-0.5,-3.0
3,0.5,-1.0
4,0.5,3.5
5,0.5,3.0


In [23]:
df.groupby('department')['VV'].transform(funcion) #Solo aplicamos la transformacion en la columnas VV

0    1.0
1   -3.5
2   -3.0
3   -1.0
4    3.5
5    3.0
Name: VV, dtype: float64

#### The apply() method

Permite aplicar una función a los resultados del grupo.


In [24]:
def norm_by_data(x): 
    x['anio'] /=x['VV'].sum() #La transformacion se hace en la columna de año
    return x

In [25]:
display('df', "df.groupby('department').apply(norm_by_data)")

Unnamed: 0,department,anio,VV
0,A,2020,5
1,B,2020,0
2,C,2020,3
3,A,2021,3
4,B,2021,7
5,C,2021,9

Unnamed: 0_level_0,Unnamed: 1_level_0,department,anio,VV
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,0,A,252.5,5
A,3,A,252.625,3
B,1,B,288.571429,0
B,4,B,288.714286,7
C,2,C,168.333333,3
C,5,C,168.416667,9


In [27]:
df.groupby('department').sum() #Se aplica en todos los valores numericos

Unnamed: 0_level_0,anio,VV
department,Unnamed: 1_level_1,Unnamed: 2_level_1
A,4041,8
B,4041,7
C,4041,12


#### Split


Podemos especificar grupos dentro de un dataframe de muchas maneras diferentes.

##### lista/array/serie/indice proporcionando claves de agrupación

In [28]:
clave=[0,1,0,1,2,0] #Cada numero presenta a un grupo.
display('df','df.groupby(clave).sum()')
#Los valores del dataframe que esten en la misma posicion de la clave formaran un grupo

Unnamed: 0,department,anio,VV
0,A,2020,5
1,B,2020,0
2,C,2020,3
3,A,2021,3
4,B,2021,7
5,C,2021,9

Unnamed: 0,department,anio,VV
0,ACC,6061,17
1,BA,4041,3
2,B,2021,7


In [30]:
df.groupby(df['department']).sum()

Unnamed: 0_level_0,anio,VV
department,Unnamed: 1_level_1,Unnamed: 2_level_1
A,4041,8
B,4041,7
C,4041,12


In [31]:
df.groupby('department').sum()

Unnamed: 0_level_0,anio,VV
department,Unnamed: 1_level_1,Unnamed: 2_level_1
A,4041,8
B,4041,7
C,4041,12


##### Ìndice de mapeo

In [32]:
df

Unnamed: 0,department,anio,VV
0,A,2020,5
1,B,2020,0
2,C,2020,3
3,A,2021,3
4,B,2021,7
5,C,2021,9


In [35]:
df2=df.set_index('department')
mapeo={'A': 'grupo_1','B':'grupo_2','C':'grupo_1'} #Agrupación de laa manera que queremos
display('df2','df2.groupby(mapeo).sum()')

Unnamed: 0_level_0,anio,VV
department,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2020,5
B,2020,0
C,2020,3
A,2021,3
B,2021,7
C,2021,9

Unnamed: 0_level_0,anio,VV
department,Unnamed: 1_level_1,Unnamed: 2_level_1
grupo_1,8082,20
grupo_2,4041,7


##### Cualquier función de python

In [36]:
display('df2','df2.groupby(str.lower).mean()') #str.lower se aplica en los indices en este caso: department

Unnamed: 0_level_0,anio,VV
department,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2020,5
B,2020,0
C,2020,3
A,2021,3
B,2021,7
C,2021,9

Unnamed: 0_level_0,anio,VV
department,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2020.5,4.0
b,2020.5,3.5
c,2020.5,6.0


##### Diferentes niveles de agrupación

In [37]:
df2.groupby([str.lower,mapeo]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,anio,VV
department,department,Unnamed: 2_level_1,Unnamed: 3_level_1
a,grupo_1,2020.5,4.0
b,grupo_2,2020.5,3.5
c,grupo_1,2020.5,6.0
