# Proyecto: Manipulación de índices y columnas anidadas.

Veremos cómo extraer nombres de columnas e índices.

Algunas de las funciones especiales que veremos en este script son:

`.aggregate()` <-- Permite aplicar funciones (incluso definidas por el usuario) a datos agrupados con `groupby()`

`.columns.values` <-- Permite extraer nombres de columnas de un dataframe

`.index.values` <-- Permite extraer nombres de renglones (índices) de un dataframe

`.index.get_level_values(level = ?).values.unique()` <-- permite extraer nombres de índices anidados (correspondientes a un nivel específico)

`.loc[]` <-- Sirve para especificar renglones (índices) al momento de extraer subconjuntos de un dataframe

In [1]:
import pandas as pd

import numpy as np

import seaborn as sns

In [2]:
# importamos datos de seaborn

df = sns.load_dataset('diamonds')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [4]:
# Extraemos el número de valores distintos en cada variable
df.nunique()

carat        273
cut            5
color          7
clarity        8
depth        184
table        127
price      11602
x            554
y            552
z            375
dtype: int64

In [5]:
# Definimos una función:
def mean_kilo(x):
    """Return the arithmetic mean of ``x`` divided by 1000.

    Intended as a custom aggregation function for ``.aggregate()``: it
    expresses the mean of the values in thousands.

    Parameters
    ----------
    x : array-like
        Values accepted by ``np.mean``.

    Returns
    -------
    The mean of ``x`` scaled down by a factor of 1000.
    """
    average = np.mean(x)
    return average / 1000

## Aplicación de funciones con `.aggregate()`

Cuando aplicamos `.groupby()` a un dataframe, éste va acompañado de una función como: `count()`, `mean()`, `sum()`, etc.,

sin embargo, podemos construir y aplicar funciones más generales usando `.aggregate()` como se muestra a continuación:

Supongamos un dataframe 'df' cuyas columnas son ['c1','c2','c3','c4'].

Si agrupamos con respecto a las columnas ['c1','c2'] y aplicamos un mismo conjunto de funciones `f1(x),...,fn(x)` a las columnas ['c3','c4'], ejecutamos:

`df.groupby(['c1','c2'])[['c3','c4']].aggregate([f1, ..., fn])`

Si agrupamos con respecto a las columnas ['c1','c2'] y aplicamos un conjunto de funciones `f1(x),...,fn(x)` a la columna ['c3']

y otro conjunto de funciones  `g1(x),...,gk(x)`  a la columna ['c4'], ejecutamos:

`df.groupby(['c1','c2']).aggregate({'c3': [f1, ..., fn], 'c4': [g1, ..., gk]})`



In [8]:
# Dataframe original
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [14]:
# Apply one set of functions to the 'price' column and a different set to
# the 'carat' column. The dict maps each column name to the list of
# functions aggregated over it within every ('cut', 'color') group.
(
    df.groupby(['cut', 'color'])
    .aggregate({
        'price': [np.min, np.max],
        'carat': [np.mean, mean_kilo],
    })
)

Unnamed: 0_level_0,Unnamed: 1_level_0,price,price,carat,carat
Unnamed: 0_level_1,Unnamed: 1_level_1,amin,amax,mean,mean_kilo
cut,color,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Ideal,D,367,18693,0.565766,0.000566
Ideal,E,326,18729,0.578401,0.000578
Ideal,F,408,18780,0.655829,0.000656
Ideal,G,361,18806,0.700715,0.000701
Ideal,H,357,18760,0.799525,0.0008
Ideal,I,348,18779,0.913029,0.000913
Ideal,J,340,18508,1.063594,0.001064
Premium,D,367,18575,0.721547,0.000722
Premium,E,326,18477,0.717745,0.000718
Premium,F,342,18791,0.827036,0.000827


In [15]:
# Apply the same list of functions to both the 'price' and 'carat' columns,
# grouped by ('cut', 'color'); the result is kept in P for the cells below.
P = (
    df.groupby(['cut', 'color'])[['price', 'carat']]
    .aggregate([np.min, np.mean, np.max, mean_kilo])
)
P

Unnamed: 0_level_0,Unnamed: 1_level_0,price,price,price,price,carat,carat,carat,carat
Unnamed: 0_level_1,Unnamed: 1_level_1,amin,mean,amax,mean_kilo,amin,mean,amax,mean_kilo
cut,color,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Ideal,D,367,2629.094566,18693,2.629095,0.2,0.565766,2.75,0.000566
Ideal,E,326,2597.55009,18729,2.59755,0.2,0.578401,2.28,0.000578
Ideal,F,408,3374.939362,18780,3.374939,0.23,0.655829,2.45,0.000656
Ideal,G,361,3720.706388,18806,3.720706,0.23,0.700715,2.54,0.000701
Ideal,H,357,3889.334831,18760,3.889335,0.23,0.799525,3.5,0.0008
Ideal,I,348,4451.970377,18779,4.45197,0.23,0.913029,3.22,0.000913
Ideal,J,340,4918.186384,18508,4.918186,0.23,1.063594,3.01,0.001064
Premium,D,367,3631.292576,18575,3.631293,0.2,0.721547,2.57,0.000722
Premium,E,326,3538.91442,18477,3.538914,0.2,0.717745,3.05,0.000718
Premium,F,342,4324.890176,18791,4.32489,0.2,0.827036,3.01,0.000827


## Extracción del nombre de columnas y subcolumnas:

In [16]:
# Extracción del nombre de columnas y subcolumnas
P.columns.values

# Podemos accesar a los valores con:
# P.columns.values[0] #  --> ('price', 'amin')
# P.columns.values[0][1] #  --> 'amin'

array([('price', 'amin'), ('price', 'mean'), ('price', 'amax'),
       ('price', 'mean_kilo'), ('carat', 'amin'), ('carat', 'mean'),
       ('carat', 'amax'), ('carat', 'mean_kilo')], dtype=object)

In [17]:
# Extracción del nombre de subcolumnas de una columna:
P['price'].columns.values

# Obtenemos lo mismo si ejecutamos:
# P.price.columns.values

array(['amin', 'mean', 'amax', 'mean_kilo'], dtype=object)

## Extracción del nombre de índices y multiíndices:

In [18]:
# Extracción del nombre de índices y multiíndices:
P.index.values

array([('Ideal', 'D'), ('Ideal', 'E'), ('Ideal', 'F'), ('Ideal', 'G'),
       ('Ideal', 'H'), ('Ideal', 'I'), ('Ideal', 'J'), ('Premium', 'D'),
       ('Premium', 'E'), ('Premium', 'F'), ('Premium', 'G'),
       ('Premium', 'H'), ('Premium', 'I'), ('Premium', 'J'),
       ('Very Good', 'D'), ('Very Good', 'E'), ('Very Good', 'F'),
       ('Very Good', 'G'), ('Very Good', 'H'), ('Very Good', 'I'),
       ('Very Good', 'J'), ('Good', 'D'), ('Good', 'E'), ('Good', 'F'),
       ('Good', 'G'), ('Good', 'H'), ('Good', 'I'), ('Good', 'J'),
       ('Fair', 'D'), ('Fair', 'E'), ('Fair', 'F'), ('Fair', 'G'),
       ('Fair', 'H'), ('Fair', 'I'), ('Fair', 'J')], dtype=object)

In [19]:
# Extraemos nombres de los índices correspondientes a un nivel específico:
P.index.get_level_values(level='cut').values

['Ideal', 'Ideal', 'Ideal', 'Ideal', 'Ideal', ..., 'Fair', 'Fair', 'Fair', 'Fair', 'Fair']
Length: 35
Categories (5, object): ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']

In [20]:
# Para evitar repetición de valores, agregamos '.unique()'
P.index.get_level_values(level='cut').values.unique()

['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']
Categories (5, object): ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']

In [21]:
# Extraemos nombres de los índices correspondientes a un nivel específico:
P.index.get_level_values(level='color').values

['D', 'E', 'F', 'G', 'H', ..., 'F', 'G', 'H', 'I', 'J']
Length: 35
Categories (7, object): ['D', 'E', 'F', 'G', 'H', 'I', 'J']

In [22]:
# Para evitar repetición de valores, agregamos '.unique()'
P.index.get_level_values(level='color').values.unique()

['D', 'E', 'F', 'G', 'H', 'I', 'J']
Categories (7, object): ['D', 'E', 'F', 'G', 'H', 'I', 'J']

## Extracción de subconjuntos especificados por columnas

In [23]:
P['price'].head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,amin,mean,amax,mean_kilo
cut,color,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ideal,D,367,2629.094566,18693,2.629095
Ideal,E,326,2597.55009,18729,2.59755
Ideal,F,408,3374.939362,18780,3.374939
Ideal,G,361,3720.706388,18806,3.720706
Ideal,H,357,3889.334831,18760,3.889335


In [24]:
# Si usamos doble corchete "[[ ]]" se muestra el nombre de la columna superior
P[ ['price'] ].head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,price,price,price,price
Unnamed: 0_level_1,Unnamed: 1_level_1,amin,mean,amax,mean_kilo
cut,color,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Ideal,D,367,2629.094566,18693,2.629095
Ideal,E,326,2597.55009,18729,2.59755
Ideal,F,408,3374.939362,18780,3.374939
Ideal,G,361,3720.706388,18806,3.720706
Ideal,H,357,3889.334831,18760,3.889335


## Extracción de subconjuntos especificados por renglones (uso de `.loc[]` )

In [25]:
P.loc['Very Good']

Unnamed: 0_level_0,price,price,price,price,carat,carat,carat,carat
Unnamed: 0_level_1,amin,mean,amax,mean_kilo,amin,mean,amax,mean_kilo
color,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
D,357,3470.467284,18542,3.470467,0.23,0.696424,2.58,0.000696
E,352,3214.652083,18731,3.214652,0.2,0.676317,2.51,0.000676
F,357,3778.82024,18777,3.77882,0.23,0.740961,2.48,0.000741
G,354,3872.753806,18818,3.872754,0.23,0.766799,2.52,0.000767
H,337,4535.390351,18803,4.53539,0.23,0.915948,3.0,0.000916
I,336,5255.879568,18500,5.25588,0.24,1.046952,4.0,0.001047
J,336,5103.513274,18430,5.103513,0.24,1.133215,2.74,0.001133


In [26]:
# Si usamos doble corchete "[[ ]]" se muestra el nombre del índice superior:
P.loc[['Very Good']]

# Da lo mismo si ejecutamos: 
# P.loc['Very Good',:,:]

Unnamed: 0_level_0,Unnamed: 1_level_0,price,price,price,price,carat,carat,carat,carat
Unnamed: 0_level_1,Unnamed: 1_level_1,amin,mean,amax,mean_kilo,amin,mean,amax,mean_kilo
cut,color,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Very Good,D,357,3470.467284,18542,3.470467,0.23,0.696424,2.58,0.000696
Very Good,E,352,3214.652083,18731,3.214652,0.2,0.676317,2.51,0.000676
Very Good,F,357,3778.82024,18777,3.77882,0.23,0.740961,2.48,0.000741
Very Good,G,354,3872.753806,18818,3.872754,0.23,0.766799,2.52,0.000767
Very Good,H,337,4535.390351,18803,4.53539,0.23,0.915948,3.0,0.000916
Very Good,I,336,5255.879568,18500,5.25588,0.24,1.046952,4.0,0.001047
Very Good,J,336,5103.513274,18430,5.103513,0.24,1.133215,2.74,0.001133


In [27]:
# Extraer una fila específica
P.loc[ 'Very Good' , 'F', :]

Unnamed: 0_level_0,Unnamed: 1_level_0,price,price,price,price,carat,carat,carat,carat
Unnamed: 0_level_1,Unnamed: 1_level_1,amin,mean,amax,mean_kilo,amin,mean,amax,mean_kilo
cut,color,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Very Good,F,357,3778.82024,18777,3.77882,0.23,0.740961,2.48,0.000741


In [28]:
# Extraer una fila específica y una columna específica
P.loc[ 'Very Good' , 'F', :]['price']

Unnamed: 0_level_0,Unnamed: 1_level_0,amin,mean,amax,mean_kilo
cut,color,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Very Good,F,357,3778.82024,18777,3.77882


In [29]:
P.loc[ 'Very Good' , 'F', :]['price']['mean_kilo']

cut        color
Very Good  F        3.77882
Name: mean_kilo, dtype: float64