In [1]:
## Importamos las librerías

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
## Leemos el archivo

planetas = sns.load_dataset('planets')
planetas.head(12)

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009
5,Radial Velocity,1,185.84,4.8,76.39,2008
6,Radial Velocity,1,1773.4,4.64,18.15,2002
7,Radial Velocity,1,798.5,,21.41,1996
8,Radial Velocity,1,993.3,10.3,73.1,2008
9,Radial Velocity,2,452.8,1.99,74.79,2010


In [3]:
planetas.shape

(1035, 6)

In [4]:
planetas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035 entries, 0 to 1034
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   method          1035 non-null   object 
 1   number          1035 non-null   int64  
 2   orbital_period  992 non-null    float64
 3   mass            513 non-null    float64
 4   distance        808 non-null    float64
 5   year            1035 non-null   int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 48.6+ KB


In [5]:
planetas.isnull().sum()*100/planetas.shape[0] ## vemos el porcentaje de datos faltantes

method             0.000000
number             0.000000
orbital_period     4.154589
mass              50.434783
distance          21.932367
year               0.000000
dtype: float64

In [6]:
planetas['method'].unique()  ## vemos los diferentes métodos de detección

array(['Radial Velocity', 'Imaging', 'Eclipse Timing Variations',
       'Transit', 'Astrometry', 'Transit Timing Variations',
       'Orbital Brightness Modulation', 'Microlensing', 'Pulsar Timing',
       'Pulsation Timing Variations'], dtype=object)

In [7]:
planetas['number'].unique()

array([1, 2, 3, 5, 4, 6, 7])

In [8]:
planetas.describe() ## información estadística de variables numéricas

Unnamed: 0,number,orbital_period,mass,distance,year
count,1035.0,992.0,513.0,808.0,1035.0
mean,1.785507,2002.917596,2.638161,264.069282,2009.070531
std,1.240976,26014.728304,3.818617,733.116493,3.972567
min,1.0,0.090706,0.0036,1.35,1989.0
25%,1.0,5.44254,0.229,32.56,2007.0
50%,1.0,39.9795,1.26,55.25,2010.0
75%,2.0,526.005,3.04,178.5,2012.0
max,7.0,730000.0,25.0,8500.0,2014.0


In [9]:
rango = planetas['year'].max()-planetas['year'].min() ## veamos el rango de años de descubrimiento 
rango

25

In [10]:
## Hagamos una operación condicionada

planetas['new-column'] = 0 ## Creamos una nueva columna llena de ceros

In [11]:
planetas.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year,new-column
0,Radial Velocity,1,269.3,7.1,77.4,2006,0
1,Radial Velocity,1,874.774,2.21,56.95,2008,0
2,Radial Velocity,1,763.0,2.6,19.84,2011,0
3,Radial Velocity,1,326.03,19.4,110.62,2007,0
4,Radial Velocity,1,516.22,10.5,119.47,2009,0


In [12]:
planetas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035 entries, 0 to 1034
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   method          1035 non-null   object 
 1   number          1035 non-null   int64  
 2   orbital_period  992 non-null    float64
 3   mass            513 non-null    float64
 4   distance        808 non-null    float64
 5   year            1035 non-null   int64  
 6   new-column      1035 non-null   int64  
dtypes: float64(3), int64(3), object(1)
memory usage: 56.7+ KB


In [13]:
cond1 = (planetas['mass'] > 5 ) & (planetas['mass'] < 10) ## Eligimos las masas entre 5 y 10, excluyendo 5 y 10
indice_cond1 = planetas[cond1].index  ## para obtener los índices donde la condición se cumple
planetas.iloc[indice_cond1] = 1

In [14]:
planetas.head(20)

Unnamed: 0,method,number,orbital_period,mass,distance,year,new-column
0,1,1,1.0,1.0,1.0,1,1
1,Radial Velocity,1,874.774,2.21,56.95,2008,0
2,Radial Velocity,1,763.0,2.6,19.84,2011,0
3,Radial Velocity,1,326.03,19.4,110.62,2007,0
4,Radial Velocity,1,516.22,10.5,119.47,2009,0
5,Radial Velocity,1,185.84,4.8,76.39,2008,0
6,Radial Velocity,1,1773.4,4.64,18.15,2002,0
7,Radial Velocity,1,798.5,,21.41,1996,0
8,Radial Velocity,1,993.3,10.3,73.1,2008,0
9,Radial Velocity,2,452.8,1.99,74.79,2010,0


In [15]:
planetas.groupby('method').count()  

Unnamed: 0_level_0,number,orbital_period,mass,distance,year,new-column
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,55,55,55,55,55,55
Astrometry,2,2,0,2,2,2
Eclipse Timing Variations,8,8,1,4,8,8
Imaging,38,12,0,32,38,38
Microlensing,23,7,0,10,23,23
Orbital Brightness Modulation,3,3,0,2,3,3
Pulsar Timing,5,5,0,1,5,5
Pulsation Timing Variations,1,1,0,0,1,1
Radial Velocity,499,499,456,479,499,499
Transit,397,397,1,224,397,397


In [16]:
planetas.groupby('method').distance.aggregate(['min', 'max', 'median', 'mean'])

Unnamed: 0_level_0,min,max,median,mean
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,1.0,1.0,1.0
Astrometry,14.98,20.77,17.875,17.875
Eclipse Timing Variations,130.72,500.0,315.36,315.36
Imaging,7.69,165.0,40.395,67.715937
Microlensing,1760.0,7720.0,3840.0,4144.0
Orbital Brightness Modulation,1180.0,1180.0,1180.0,1180.0
Pulsar Timing,1200.0,1200.0,1200.0,1200.0
Pulsation Timing Variations,,,,
Radial Velocity,1.35,354.0,39.39,49.558372
Transit,38.0,8500.0,341.0,599.29808


In [17]:
## Apliquemos el método transform

copia_planetas = planetas.copy() ## Hacemos una copia de los datos para no dañar el DataFrame original

## Creamos una columna sólo con números aleatorios
copia_planetas['ID'] = np.random.randint(0, 100 , copia_planetas.shape[0]) 

## Creamos dos columnas con los datos de distancia y periodo orbital transformados
copia_planetas['distance_mean'] = copia_planetas.groupby('method')[['distance']].transform('mean')
copia_planetas['orbital_period_mean'] = copia_planetas.groupby('method')[['orbital_period']].transform('mean')

## Transformamos la columna de la masa 
copia_planetas['mass'] = copia_planetas.groupby('method')[['mass']].transform('mean')

copia_planetas.iloc[15:20]

Unnamed: 0,method,number,orbital_period,mass,distance,year,new-column,ID,distance_mean,orbital_period_mean
15,Radial Velocity,3,14002.0,2.091048,14.08,2009,0,94,49.558372,759.585379
16,Radial Velocity,1,4.230785,2.091048,15.36,1995,0,48,49.558372,759.585379
17,Radial Velocity,5,14.651,2.091048,12.53,1996,0,62,49.558372,759.585379
18,Radial Velocity,5,44.38,2.091048,12.53,2004,0,45,49.558372,759.585379
19,Radial Velocity,5,4909.0,2.091048,12.53,2002,0,37,49.558372,759.585379


In [18]:
## Probemos la función filter
## Agrupemos por método y filtremos los que operan a una distancia media mayor a 500

def filtro_func(x):
  return x["distance"].mean() > 500 ## Tomamos los datos de la columna distancia cuya media sea mayor a 500

planetas.groupby('method').filter(filtro_func).method.value_counts()

Transit                          397
Microlensing                      23
Pulsar Timing                      5
Transit Timing Variations          4
Orbital Brightness Modulation      3
Name: method, dtype: int64

In [19]:
planetas.groupby('method').mean().distance

method
1                                   1.000000
Astrometry                         17.875000
Eclipse Timing Variations         315.360000
Imaging                            67.715937
Microlensing                     4144.000000
Orbital Brightness Modulation    1180.000000
Pulsar Timing                    1200.000000
Pulsation Timing Variations              NaN
Radial Velocity                    49.558372
Transit                           599.298080
Transit Timing Variations        1104.333333
Name: distance, dtype: float64

In [20]:
def filtro_ID(x):
    return x['ID'].median() < 50  ## Filtro que nos dará un valor booleano

copia_planetas.groupby('method').filter(filtro_ID).method.value_counts()

## Aplicamos el filtro para el método de dección y contamos para cuántos valores se cumple la condición (True)

Radial Velocity                499
1                               55
Imaging                         38
Microlensing                    23
Eclipse Timing Variations        8
Transit Timing Variations        4
Pulsation Timing Variations      1
Name: method, dtype: int64

In [21]:
planetas['year'].head(10)

0       1
1    2008
2    2011
3    2007
4    2009
5    2008
6    2002
7    1996
8    2008
9    2010
Name: year, dtype: int64

In [22]:
## Mostrar el número de planetas descubiertos por método de detección y por década

decada = (planetas['year']//10)*10   ## Hacemos la separación de las décadas ## // : división entera
decada = decada.astype(str) + 's'    ## Convertimos las décadas en strs y les agregamos la s 
planetas.groupby(['method', decada])['number'].sum()#.unstack().fillna(0)

method                         year 
1                              0s        55
Astrometry                     2010s      2
Eclipse Timing Variations      2000s      4
                               2010s     10
Imaging                        2000s     29
                               2010s     21
Microlensing                   2000s     12
                               2010s     15
Orbital Brightness Modulation  2010s      5
Pulsar Timing                  1990s      9
                               2000s      1
                               2010s      1
Pulsation Timing Variations    2000s      1
Radial Velocity                1980s      1
                               1990s     47
                               2000s    431
                               2010s    408
Transit                        2000s     64
                               2010s    712
Transit Timing Variations      2010s      9
Name: number, dtype: int64

In [23]:
planetas['year'].head()

0       1
1    2008
2    2011
3    2007
4    2009
Name: year, dtype: int64

In [24]:
planetas.method.head()

0                  1
1    Radial Velocity
2    Radial Velocity
3    Radial Velocity
4    Radial Velocity
Name: method, dtype: object

In [25]:
planetas.groupby(['method', decada])['number'].sum().unstack().fillna(0)

year,0s,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,55.0,0.0,0.0,0.0,0.0
Astrometry,0.0,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,0.0,4.0,10.0
Imaging,0.0,0.0,0.0,29.0,21.0
Microlensing,0.0,0.0,0.0,12.0,15.0
Orbital Brightness Modulation,0.0,0.0,0.0,0.0,5.0
Pulsar Timing,0.0,0.0,9.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,0.0,1.0,0.0
Radial Velocity,0.0,1.0,47.0,431.0,408.0
Transit,0.0,0.0,0.0,64.0,712.0


In [26]:
## Especifiquemos la separación del DataFrame

In [27]:
# La clave puede ser cualquier serie o lista con una longitud que coincida con la del DataFrame

## Creamos un nuevo DataFrame

df = pd.DataFrame({ 'key' : ['A', 'B' , 'C' , 'A' , 'B' , 'C'] , 'datos1': range(1,7) , 'datos2': np.random.RandomState(0).randint(0, 10, 6)  }, columns= ['key', 'datos1' , 'datos2'])

# Ahora crearemos una lista con la misma longitud de las filas, los datos que sean iguales serán con los que estén
# agrupados los datos

L = ['a' , 1 , 'a' , 1 , 10, 10] 

## Creamos un nuevo DataFrame agrupado de acuerdo a L 

df2 = df.groupby(L)

## Mostramos la forma de agrupar los datos de acuerdo a L

for llave, valor in df2:
    grupo = df2.get_group(llave)
    print (grupo)
    print ('')
    
## Calculemos la media con los nuevos grupos

df2.mean()


  key  datos1  datos2
1   B       2       0
3   A       4       3

  key  datos1  datos2
4   B       5       7
5   C       6       9

  key  datos1  datos2
0   A       1       5
2   C       3       3



Unnamed: 0,datos1,datos2
1,3.0,1.5
10,5.5,8.0
a,2.0,4.0


In [28]:
## Otro método es el de proporcionar un diccionario que asigne los valores de los índices a las claves de grupo

# Asignamos la columna "key" como índice
df3 = df.set_index('key')   
print(df3) 

# Mapeemos los índices
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant2'} 

df4 = df3.groupby(mapping)

#print(df2.groupby(mapping).sum())

     datos1  datos2
key                
A         1       5
B         2       0
C         3       3
A         4       3
B         5       7
C         6       9


In [29]:
for llave, valor in df4:
    grupo = df4.get_group(llave)
    print (grupo)
    print ('')


     datos1  datos2
key                
B         2       0
B         5       7

     datos1  datos2
key                
C         3       3
C         6       9

     datos1  datos2
key                
A         1       5
A         4       3



In [30]:
print(df3.groupby(mapping).sum())

            datos1  datos2
key                       
consonant        7       7
consonant2       9      12
vowel            5       8


In [31]:
# Análogamente al mapeo, es posible pasar cualquier función de Python que ingrese el valor del índice y 
# genere el grupo

df5 = df3.groupby(str.lower).mean()
df5

Unnamed: 0_level_0,datos1,datos2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.5,4.0
b,3.5,3.5
c,4.5,6.0


In [32]:
## Cualquiera de las opciones anteriores se pueden combinar para agrupar con índice múltiple

df6 = df3.groupby([str.lower, mapping]).mean()
df6

Unnamed: 0_level_0,Unnamed: 1_level_0,datos1,datos2
key,key,Unnamed: 2_level_1,Unnamed: 3_level_1
a,vowel,2.5,4.0
b,consonant,3.5,3.5
c,consonant2,4.5,6.0


In [33]:
## Se puede acceder a los diferentes niveles del índice múltiple mediante el argumento "level"

df6.groupby(level=0).mean()

Unnamed: 0_level_0,datos1,datos2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.5,4.0
b,3.5,3.5
c,4.5,6.0


In [34]:
df6.groupby(level=1).mean()

Unnamed: 0_level_0,datos1,datos2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
consonant,3.5,3.5
consonant2,4.5,6.0
vowel,2.5,4.0
