# DataFrames I

In [9]:
import pandas as pd
import os
import requests
import io

* Un **DataFrame** es una tabla 2-dimensional, es decir, compuesta de filas y columnas.
* Pandas utiliza la designación `NaN` (*not a number*) por defecto para las celdas con valores vacíos. Hay que tener cuidado al realizar operaciones que incluyan este tipo de valores.
* Similar a **Series**, Pandas asigna un índice/etiqueta para cada fila del DataFrame.
* Hay métodos en común entre Series y DataFrames, pero también hay métodos/atributos exclusivos de cada objeto. Un método puede llamarse igual en ambos, pero devolver cosas diferentes dependiendo de a qué objeto se aplique. Por ejemplo, el atributo `columns` solo existe en los DataFrames.


## Importación de datos

Los datos son obtenidos de [GitHub](https://github.com/joanby) (Juan Gabriel Gomila)

In [5]:
# The data can be loaded either from the local file or over the internet.
# Remote copy of the dataset:
urldata = "https://raw.githubusercontent.com/joanby/python-ml-course/master/datasets/titanic/titanic3.csv"

# Local copy: directory and file name are kept separate and joined into one path.
mainpath, filename = "../Datasets/", "titanic3.csv"
fullpath = os.path.join(mainpath, filename)

In [11]:
# Alternative (disabled): download the CSV from `urldata` and parse it from
# memory instead of reading the local file. Uncomment the lines below to use it.
#s = requests.get(urldata).content

#titanic_df = pd.read_csv(io.StringIO(s.decode('utf-8')))
#titanic_df.head(5)

In [14]:
# Read the local CSV file into a DataFrame.
titanic_df = pd.read_csv(fullpath)
titanic_df.head(5) # returns the first rows of the DataFrame (5 here)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [15]:
type(titanic_df)

pandas.core.frame.DataFrame

In [18]:
titanic_df.index # returns the index (labels) of the rows; by default it is a range of integers

RangeIndex(start=0, stop=1309, step=1)

In [20]:
titanic_df.values # returns the data as one 2-D array, with one inner array per row

array([[1, 1, 'Allen, Miss. Elisabeth Walton', ..., '2', nan,
        'St Louis, MO'],
       [1, 1, 'Allison, Master. Hudson Trevor', ..., '11', nan,
        'Montreal, PQ / Chesterville, ON'],
       [1, 0, 'Allison, Miss. Helen Loraine', ..., nan, nan,
        'Montreal, PQ / Chesterville, ON'],
       ...,
       [3, 0, 'Zakarian, Mr. Mapriededer', ..., nan, 304.0, nan],
       [3, 0, 'Zakarian, Mr. Ortin', ..., nan, nan, nan],
       [3, 0, 'Zimmerman, Mr. Leo', ..., nan, nan, nan]], dtype=object)

In [22]:
titanic_df.shape # dimensions of the table as (rows, columns)

(1309, 14)

In [24]:
titanic_df.dtypes # data type associated with each column/variable

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [25]:
# Example: selecting a single column returns a Series object

names = titanic_df['name']

In [26]:
type(names)

pandas.core.series.Series

In [28]:
names.hasnans # True if the Series contains NaN values

False

In [31]:
titanic_df.columns # returns the column labels of the DataFrame

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [33]:
titanic_df.axes # list holding both the row index and the column labels

[RangeIndex(start=0, stop=1309, step=1),
 Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
        'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
       dtype='object')]

In [34]:
titanic_df.info() # prints a summary of the DataFrame: columns, non-null counts, dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


## Algunas diferencias entre métodos

In [36]:
# Index the passengers by name and keep only the two numeric columns
# (age and fare) for the sums that follow.
titanic_df_filtered = titanic_df.set_index('name')[['age', 'fare']]
titanic_df_filtered

Unnamed: 0_level_0,age,fare
name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Allen, Miss. Elisabeth Walton",29.0000,211.3375
"Allison, Master. Hudson Trevor",0.9167,151.5500
"Allison, Miss. Helen Loraine",2.0000,151.5500
"Allison, Mr. Hudson Joshua Creighton",30.0000,151.5500
"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0000,151.5500
...,...,...
"Zabour, Miss. Hileni",14.5000,14.4542
"Zabour, Miss. Thamine",,14.4542
"Zakarian, Mr. Mapriededer",26.5000,7.2250
"Zakarian, Mr. Ortin",27.0000,7.2250


In [38]:
titanic_df_filtered.sum(axis='index') # adds down the rows: one total per column

age     31255.6667
fare    43550.4869
dtype: float64

In [39]:
titanic_df_filtered.sum(axis='columns') # adds across the columns: one total per row (passenger)

name
Allen, Miss. Elisabeth Walton                      240.3375
Allison, Master. Hudson Trevor                     152.4667
Allison, Miss. Helen Loraine                       153.5500
Allison, Mr. Hudson Joshua Creighton               181.5500
Allison, Mrs. Hudson J C (Bessie Waldo Daniels)    176.5500
                                                     ...   
Zabour, Miss. Hileni                                28.9542
Zabour, Miss. Thamine                               14.4542
Zakarian, Mr. Mapriededer                           33.7250
Zakarian, Mr. Ortin                                 34.2250
Zimmerman, Mr. Leo                                  36.8750
Length: 1309, dtype: float64

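Mientras que el método `sum` suma los valores de una serie, en un DataFrame por defecto (`axis=0` o `axis='index'`) suma a lo largo del índice, es decir, recorre las filas y devuelve un total por columna. El parámetro **axis** nos permite elegir la dirección de la suma: `axis=0` (`'index'`) da un total por columna y `axis=1` (`'columns'`) da un total por fila.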

## Seleccionar una columna del dataframe
* Podemos utilizar el atributo (`nombre_df.nombre_columna`) para seleccionar una columna del dataframe. Esta sintaxis no funcionará si el nombre de la columna incluye un espacio.
* También se puede utilizar la sintaxis `nombre_df["nombre_columna"]`, que sí funciona incluso si hay espacios.
* Pandas extrae una columna de un **DataFrame** como un objeto de tipo **Series**
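* Un cambio en la Serie **puede afectar** al DataFrame original, ya que la Serie extraída puede compartir datos con él (el comportamiento exacto depende de la versión de pandas).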
* Pandas puede mostrar una advertencia si modificas una Serie extraída de un DataFrame. En estos casos lo mejor es hacer una copia con `copy` y trabajar con ella.

In [42]:
# Rebuild the path to the local CSV and load a fresh copy of the dataset.
mainpath, filename = "../Datasets/", "titanic3.csv"
fullpath = os.path.join(mainpath, filename)

titanic_df = pd.read_csv(fullpath)
titanic_df.head()  # head() shows the first 5 rows by default

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [47]:
# Attribute-style access: the column label is used as an attribute name.
names = titanic_df.name
names

0                         Allen, Miss. Elisabeth Walton
1                        Allison, Master. Hudson Trevor
2                          Allison, Miss. Helen Loraine
3                  Allison, Mr. Hudson Joshua Creighton
4       Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
                             ...                       
1304                               Zabour, Miss. Hileni
1305                              Zabour, Miss. Thamine
1306                          Zakarian, Mr. Mapriededer
1307                                Zakarian, Mr. Ortin
1308                                 Zimmerman, Mr. Leo
Name: name, Length: 1309, dtype: object

In [48]:
type(names)

pandas.core.series.Series

In [49]:
ages = titanic_df.age
ages

0       29.0000
1        0.9167
2        2.0000
3       30.0000
4       25.0000
         ...   
1304    14.5000
1305        NaN
1306    26.5000
1307    27.0000
1308    29.0000
Name: age, Length: 1309, dtype: float64

In [50]:
names.iloc[0]

'Allen, Miss. Elisabeth Walton'

In [54]:
# Bracket-style access to the same column.
names = titanic_df["name"]
names

0                         Allen, Miss. Elisabeth Walton
1                        Allison, Master. Hudson Trevor
2                          Allison, Miss. Helen Loraine
3                  Allison, Mr. Hudson Joshua Creighton
4       Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
                             ...                       
1304                               Zabour, Miss. Hileni
1305                              Zabour, Miss. Thamine
1306                          Zakarian, Mr. Mapriededer
1307                                Zakarian, Mr. Ortin
1308                                 Zimmerman, Mr. Leo
Name: name, Length: 1309, dtype: object

In [55]:
print(titanic_df["name"].iloc[0])

Allen, Miss. Elisabeth Walton


## Seleccionar múltiples columnas de un DataFrame
* Usar la sintaxis con bracket [] para seleccionar columnas. Dentro de estos brackets introducir una lista con las etiquetas de las columnas que uno va a extraer.
* A diferencia de lo anterior, aquí el objeto que regresa tiene más de una columna, por lo que sería un objeto tipo DataFrame.

In [57]:
titanic_df = pd.read_csv('../Datasets/titanic3.csv')
titanic_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [58]:
titanic_df[['name','sex','age']]

Unnamed: 0,name,sex,age
0,"Allen, Miss. Elisabeth Walton",female,29.0000
1,"Allison, Master. Hudson Trevor",male,0.9167
2,"Allison, Miss. Helen Loraine",female,2.0000
3,"Allison, Mr. Hudson Joshua Creighton",male,30.0000
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000
...,...,...,...
1304,"Zabour, Miss. Hileni",female,14.5000
1305,"Zabour, Miss. Thamine",female,
1306,"Zakarian, Mr. Mapriededer",male,26.5000
1307,"Zakarian, Mr. Ortin",male,27.0000


In [59]:
# The labels of the wanted columns can be stored in a variable and used later
columns_to_select = ['name','sex','age']

titanic_df[columns_to_select]

Unnamed: 0,name,sex,age
0,"Allen, Miss. Elisabeth Walton",female,29.0000
1,"Allison, Master. Hudson Trevor",male,0.9167
2,"Allison, Miss. Helen Loraine",female,2.0000
3,"Allison, Mr. Hudson Joshua Creighton",male,30.0000
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000
...,...,...,...
1304,"Zabour, Miss. Hileni",female,14.5000
1305,"Zabour, Miss. Thamine",female,
1306,"Zakarian, Mr. Mapriededer",male,26.5000
1307,"Zakarian, Mr. Ortin",male,27.0000


## Añadir nueva columna a un DataFrame

* Podemos utilizar la misma sintaxis para referirnos a una columna por brackets [], pero con la diferencia que la etiqueta a utilizar es nueva o no se encuentra en el DF original.

In [71]:
titanic_df = pd.read_csv('../Datasets/titanic3.csv')
titanic_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [72]:
# Assigning a single value to a new label creates the column with that value in every row.
titanic_df['Destino'] = 'New York'

In [73]:
titanic_df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,Destino
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO",New York
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON",New York
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",New York
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",New York
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",New York
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,,New York
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,,New York
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,,New York
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,,New York


In [74]:
# An existing column can also be used in an operation to create a new Series/column.

titanic_df['age2'] = titanic_df['age']*2

In [75]:
titanic_df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,Destino,age2
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO",New York,58.0000
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON",New York,1.8334
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",New York,4.0000
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",New York,60.0000
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",New York,50.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,,New York,29.0000
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,,New York,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,,New York,53.0000
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,,New York,54.0000


## Recordatorio del método `value_counts`

In [76]:
# Reload the CSV; rebinding titanic_df replaces the frame that had the extra
# 'Destino' and 'age2' columns.
titanic_df = pd.read_csv('../Datasets/titanic3.csv')
titanic_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [77]:
# On a DataFrame, value_counts counts whole rows (unique combinations of all columns).
# The result is empty here; presumably because rows containing NaN are excluded by
# default and every row of this dataset has at least one NaN -- verify against the
# pandas version in use.
titanic_df.value_counts()

Series([], dtype: int64)

In [78]:
titanic_df['home.dest'].value_counts()

New York, NY                                    64
London                                          14
Montreal, PQ                                    10
Paris, France                                    9
Cornwall / Akron, OH                             9
                                                ..
Chelsea, London                                  1
Harrow-on-the-Hill, Middlesex                    1
Copenhagen, Denmark                              1
Guernsey / Montclair, NJ and/or Toledo, Ohio     1
Antwerp, Belgium / Stanton, OH                   1
Name: home.dest, Length: 369, dtype: int64

## Eliminar filas con valores nulos
* Pandas utiliza la designación **NaN** para celdas que contengan un valor nulo.
* El método `dropna` se utiliza para eliminar las filas que incluyan estos valores en alguna de sus columnas (esto es configurable).
* El parámetro **how** nos permite modificar esto.
* El parámetro **subset** nos permite poner las columnas que quiero utilizar para eliminar por valores nulos.
* Si asignamos el DataFrame de salida, este sale como una copia. Una modificación de este no afecta al original.

In [82]:
titanic_df = pd.read_csv('../Datasets/titanic3.csv')
titanic_df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [84]:
titanic_df.dropna()
# every observation has at least one null value, so no rows are left
# in this dataset, boat is reported for survivors and body for the deceased

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest


In [85]:
titanic_df.dropna(how='any') # drops a row if any of its columns is NaN (same result as the call with no arguments)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest


In [87]:
 titanic_df.dropna(how='all') # elimina la fila sólo si todas sus columnas son nan

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [88]:
titanic_df.dropna(subset=['boat']) # only the listed columns are checked: drops the rows where 'boat' is NaN

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48.0000,0,0,19952,26.5500,E12,S,3,,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0000,1,0,13502,77.9583,D7,S,10,,"Hudson, NY"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0000,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1260,3,1,"Turja, Miss. Anna Sofia",female,18.0000,0,0,4138,9.8417,,S,15,,
1261,3,1,"Turkula, Mrs. (Hedwig)",female,63.0000,0,0,4134,9.5875,,S,15,,
1277,3,1,"Vartanian, Mr. David",male,22.0000,0,0,2658,7.2250,,C,13 15,,
1286,3,1,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38.0000,0,0,2688,7.2292,,C,C,,


In [89]:
titanic_df.dropna(subset=['boat','age']) # with several columns in subset, a row is dropped if any of them is NaN

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48.0000,0,0,19952,26.5500,E12,S,3,,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0000,1,0,13502,77.9583,D7,S,10,,"Hudson, NY"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0000,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1260,3,1,"Turja, Miss. Anna Sofia",female,18.0000,0,0,4138,9.8417,,S,15,,
1261,3,1,"Turkula, Mrs. (Hedwig)",female,63.0000,0,0,4134,9.5875,,S,15,,
1277,3,1,"Vartanian, Mr. David",male,22.0000,0,0,2658,7.2250,,C,13 15,,
1286,3,1,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38.0000,0,0,2688,7.2292,,C,C,,


In [91]:
# dropna returns a new DataFrame; the result is bound to a new name and
# titanic_df is displayed to show that it still has all of its rows.
titanic_df_drop = titanic_df.dropna(subset=['boat','age'])
titanic_df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [92]:
titanic_df_drop

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48.0000,0,0,19952,26.5500,E12,S,3,,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0000,1,0,13502,77.9583,D7,S,10,,"Hudson, NY"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0000,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1260,3,1,"Turja, Miss. Anna Sofia",female,18.0000,0,0,4138,9.8417,,S,15,,
1261,3,1,"Turkula, Mrs. (Hedwig)",female,63.0000,0,0,4134,9.5875,,S,15,,
1277,3,1,"Vartanian, Mr. David",male,22.0000,0,0,2658,7.2250,,C,13 15,,
1286,3,1,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38.0000,0,0,2688,7.2292,,C,C,,


## Rellenar valores nulos con método `fillna`
* El método `fillna` remplaza los valores nulos por un argumento que elegimos.
* Este método está disponible tanto para **DataFrame** como **Series**
* De igual manera, una Serie o DataFrame devuelto por este método es una copia, y no afecta al original.

In [100]:
# Load the data and keep only the rows that have a 'boat' value.
titanic_df = pd.read_csv('../Datasets/titanic3.csv').dropna(subset=['boat'])
titanic_df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48.0000,0,0,19952,26.5500,E12,S,3,,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0000,1,0,13502,77.9583,D7,S,10,,"Hudson, NY"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0000,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1260,3,1,"Turja, Miss. Anna Sofia",female,18.0000,0,0,4138,9.8417,,S,15,,
1261,3,1,"Turkula, Mrs. (Hedwig)",female,63.0000,0,0,4134,9.5875,,S,15,,
1277,3,1,"Vartanian, Mr. David",male,22.0000,0,0,2658,7.2250,,C,13 15,,
1286,3,1,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38.0000,0,0,2688,7.2292,,C,C,,


In [101]:
titanic_df.fillna('Unknown') # when no column is selected, NaN is replaced in every column

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,Unknown,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,Unknown,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S,3,Unknown,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D7,S,10,Unknown,"Hudson, NY"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2,0,11769,51.4792,C101,S,D,Unknown,"Bayside, Queens, NY"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1260,3,1,"Turja, Miss. Anna Sofia",female,18.0,0,0,4138,9.8417,Unknown,S,15,Unknown,Unknown
1261,3,1,"Turkula, Mrs. (Hedwig)",female,63.0,0,0,4134,9.5875,Unknown,S,15,Unknown,Unknown
1277,3,1,"Vartanian, Mr. David",male,22.0,0,0,2658,7.2250,Unknown,C,13 15,Unknown,Unknown
1286,3,1,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38.0,0,0,2688,7.2292,Unknown,C,C,Unknown,Unknown


In [102]:
# fillna is more useful when applied to particular columns.
# Here a whole column is overwritten with its filled version.
# NOTE(review): 'body' is float64 in the dtypes listing; after filling with a
# string it presumably becomes an object column -- confirm.

titanic_df['body'] = titanic_df['body'].fillna('Unknown')

In [103]:
titanic_df.head(5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,Unknown,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,Unknown,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S,3,Unknown,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D7,S,10,Unknown,"Hudson, NY"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2,0,11769,51.4792,C101,S,D,Unknown,"Bayside, Queens, NY"


Una práctica común es reemplazar valores nulos por la media aritmética de la columna de la cual se trata.

In [104]:
# Replace the missing ages with the mean of the 'age' column.
titanic_df['age'] = titanic_df['age'].fillna(titanic_df['age'].mean())

In [105]:
titanic_df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,Unknown,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,Unknown,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48.0000,0,0,19952,26.5500,E12,S,3,Unknown,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0000,1,0,13502,77.9583,D7,S,10,Unknown,"Hudson, NY"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0000,2,0,11769,51.4792,C101,S,D,Unknown,"Bayside, Queens, NY"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1260,3,1,"Turja, Miss. Anna Sofia",female,18.0000,0,0,4138,9.8417,,S,15,Unknown,
1261,3,1,"Turkula, Mrs. (Hedwig)",female,63.0000,0,0,4134,9.5875,,S,15,Unknown,
1277,3,1,"Vartanian, Mr. David",male,22.0000,0,0,2658,7.2250,,C,13 15,Unknown,
1286,3,1,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38.0000,0,0,2688,7.2292,,C,C,Unknown,


## Método astype
* Convierte los valores de una serie a un tipo específico de objeto.
* Si hay valores nulos (NaN), la conversión a entero marcará error.


In [109]:
# Load the data and drop the rows with a missing 'age' before casting that column.
titanic_df = pd.read_csv('../Datasets/titanic3.csv').dropna(subset=['age'])
titanic_df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1301,3,0,"Youseff, Mr. Gerious",male,45.5000,0,0,2628,7.2250,,C,,312.0,
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [110]:
titanic_df.dtypes

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [111]:
# For example, the age variable could be an integer.
# Note that the cast drops the fractional part (0.9167 -> 0, 45.5 -> 45).

titanic_df['age'].astype(int)

0       29
1        0
2        2
3       30
4       25
        ..
1301    45
1304    14
1306    26
1307    27
1308    29
Name: age, Length: 1046, dtype: int64

In [113]:
# astype does not modify the original DataFrame; the column has to be reassigned
titanic_df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1301,3,0,"Youseff, Mr. Gerious",male,45.5000,0,0,2628,7.2250,,C,,312.0,
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [115]:
# Assign the converted Series back so the age column is stored as integers.
titanic_df['age'] = titanic_df['age'].astype("int")
titanic_df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1301,3,0,"Youseff, Mr. Gerious",male,45,0,0,2628,7.2250,,C,,312.0,
1304,3,0,"Zabour, Miss. Hileni",female,14,1,0,2665,14.4542,,C,,328.0,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27,0,0,2670,7.2250,,C,,,


El tipo de dato **category** de pandas es ideal para columnas con un número limitado de valores únicos. En este ejemplo, la columna `boat` podría tratarse como una etiqueta categórica.

## Método unique y nunique

In [116]:
titanic_df['boat'].unique() # returns an array with the unique values found (NaN is included).

array(['2', '11', nan, '3', '10', 'D', '4', '9', '6', 'B', '8', 'A', '5',
       '7', 'C', '14', '5 9', '13', '1', '15', '5 7', '8 10', '12', '16',
       '13 15 B', 'C D', '13 15'], dtype=object)

In [118]:
titanic_df['boat'].nunique() # returns only the number of unique values found (NaN is not counted)

26

In [119]:
# Called on the whole dataframe, nunique returns the number of unique values per column.
titanic_df.nunique()

pclass          3
survived        2
name         1044
sex             2
age            73
sibsp           7
parch           7
ticket        732
fare          256
cabin         174
embarked        3
boat           26
body          120
home.dest     348
dtype: int64

In [120]:
# Summary of the dataframe: index range, non-null count and dtype per column, memory usage.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1046 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1046 non-null   int64  
 1   survived   1046 non-null   int64  
 2   name       1046 non-null   object 
 3   sex        1046 non-null   object 
 4   age        1046 non-null   int64  
 5   sibsp      1046 non-null   int64  
 6   parch      1046 non-null   int64  
 7   ticket     1046 non-null   object 
 8   fare       1045 non-null   float64
 9   cabin      272 non-null    object 
 10  embarked   1044 non-null   object 
 11  boat       417 non-null    object 
 12  body       120 non-null    float64
 13  home.dest  685 non-null    object 
dtypes: float64(2), int64(5), object(7)
memory usage: 122.6+ KB


## Método sort_values
* Ordena el dataframe por los valores que hay en una o más columnas. Por default se tiene un orden ascendente (o alfabético en el caso de strings).
* Se tiene que especificar el parámetro **by** para definir cuál va a ser la columna a tomar en cuenta en el ordenamiento.
* Si se especifican varias columnas, la ordenación irá de acuerdo al orden de éstas.

In [121]:
# Reload the dataset so that age is float again (an earlier cell converted it to
# int), and drop the rows whose age is missing.
# Reuse `fullpath` from the configuration cell at the top of the notebook instead
# of repeating the literal path, so the data location is defined in one place.
titanic_df = pd.read_csv(fullpath).dropna(subset=['age'])
titanic_df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1301,3,0,"Youseff, Mr. Gerious",male,45.5000,0,0,2628,7.2250,,C,,312.0,
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [125]:
titanic_df.sort_values(by='age')
# note that the index labels were not rewritten: each row keeps its original label

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
763,3,1,"Dean, Miss. Elizabeth Gladys ""Millvina""",female,0.1667,1,2,C.A. 2315,20.5750,,S,10,,"Devon, England Wichita, KS"
747,3,0,"Danbom, Master. Gilbert Sigvard Emanuel",male,0.3333,0,2,347080,14.4000,,S,,,"Stanton, IA"
1240,3,1,"Thomas, Master. Assad Alexander",male,0.4167,0,1,2625,8.5167,,C,16,,
427,2,1,"Hamalainen, Master. Viljo",male,0.6667,1,1,250649,14.5000,,S,4,,"Detroit, MI"
1111,3,0,"Peacock, Master. Alfred Edward",male,0.7500,1,1,SOTON/O.Q. 3101315,13.7750,,S,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,1,0,"Goldschmidt, Mr. George B",male,71.0000,0,0,PC 17754,34.6542,A5,C,,,"New York, NY"
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0000,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"
1235,3,0,"Svensson, Mr. Johan",male,74.0000,0,0,347060,7.7750,,S,,,
61,1,1,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76.0000,1,0,19877,78.8500,C46,S,6,,"Little Onn Hall, Staffs"


In [126]:
# ascending=False reverses the order: the largest ages come first.
titanic_df.sort_values(by='age', ascending=False)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
14,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0000,0,0,27042,30.0000,A23,S,B,,"Hessle, Yorks"
61,1,1,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76.0000,1,0,19877,78.8500,C46,S,6,,"Little Onn Hall, Staffs"
1235,3,0,"Svensson, Mr. Johan",male,74.0000,0,0,347060,7.7750,,S,,,
135,1,0,"Goldschmidt, Mr. George B",male,71.0000,0,0,PC 17754,34.6542,A5,C,,,"New York, NY"
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0000,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
657,3,1,"Baclini, Miss. Eugenie",female,0.7500,2,1,2666,19.2583,,C,C,,"Syria New York, NY"
427,2,1,"Hamalainen, Master. Viljo",male,0.6667,1,1,250649,14.5000,,S,4,,"Detroit, MI"
1240,3,1,"Thomas, Master. Assad Alexander",male,0.4167,0,1,2625,8.5167,,C,16,,
747,3,0,"Danbom, Master. Gilbert Sigvard Emanuel",male,0.3333,0,2,347080,14.4000,,S,,,"Stanton, IA"


In [129]:
titanic_df.sort_values(by='age', ascending=False, na_position='last') # na_position tells pandas where to place the null values it finds

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
14,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0000,0,0,27042,30.0000,A23,S,B,,"Hessle, Yorks"
61,1,1,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76.0000,1,0,19877,78.8500,C46,S,6,,"Little Onn Hall, Staffs"
1235,3,0,"Svensson, Mr. Johan",male,74.0000,0,0,347060,7.7750,,S,,,
135,1,0,"Goldschmidt, Mr. George B",male,71.0000,0,0,PC 17754,34.6542,A5,C,,,"New York, NY"
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0000,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
657,3,1,"Baclini, Miss. Eugenie",female,0.7500,2,1,2666,19.2583,,C,C,,"Syria New York, NY"
427,2,1,"Hamalainen, Master. Viljo",male,0.6667,1,1,250649,14.5000,,S,4,,"Detroit, MI"
1240,3,1,"Thomas, Master. Assad Alexander",male,0.4167,0,1,2625,8.5167,,C,16,,
747,3,0,"Danbom, Master. Gilbert Sigvard Emanuel",male,0.3333,0,2,347080,14.4000,,S,,,"Stanton, IA"


In [131]:
# Sort by several columns: boat ascending first, then age descending within each boat.
# boat holds strings, so its order is lexicographic ('1' is followed by '10', not '2');
# rows without a boat are placed at the end (na_position='last').
titanic_df.sort_values(by=['boat', 'age'], ascending=[True,False], na_position='last') 

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
280,1,1,"Stengel, Mr. Charles Emil Henry",male,54.0000,1,0,11778,55.4417,C116,C,1,,"Newark, NJ"
100,1,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",male,49.0000,1,0,PC 17485,56.9292,A20,C,1,,London / Paris
99,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherl...",female,48.0000,1,0,11755,39.6000,A16,C,1,,London / Paris
117,1,1,"Francatelli, Miss. Laura Mabel",female,30.0000,0,0,PC 17485,56.9292,E36,C,1,,
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0000,1,0,13502,77.9583,D7,S,10,,"Hudson, NY"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
826,3,0,"Goodwin, Master. Sidney Leonard",male,1.0000,5,2,CA 2144,46.9000,,S,,,"Wiltshire, England Niagara Falls, NY"
937,3,0,"Klasen, Miss. Gertrud Emilia",female,1.0000,1,1,350405,12.1833,,S,,,
1101,3,0,"Panula, Master. Eino Viljami",male,1.0000,4,1,3101295,39.6875,,S,,,
1111,3,0,"Peacock, Master. Alfred Edward",male,0.7500,1,1,SOTON/O.Q. 3101315,13.7750,,S,,,


## Método sort_index

In [133]:
titanic_df.sort_index(ascending=False) # this method sorts by the index labels instead of by column values.

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1308,3,0,"Zimmerman, Mr. Leo",male,29.0000,0,0,315082,7.8750,,S,,,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1301,3,0,"Youseff, Mr. Gerious",male,45.5000,0,0,2628,7.2250,,C,,312.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


## Método rank

* Asigna un ranking numérico a los valores de una serie/columna.
* Valores iguales reciben el mismo ranking (por defecto, el promedio de las posiciones que ocupan, de ahí valores como 554.5), y el siguiente valor distinto continúa la numeración después de ese "gap".

In [134]:
# Rank the ages from smallest to largest. Tied ages share one rank, which is why
# fractional values such as 554.5 appear in the result.
titanic_df['age'].rank(ascending=True)

0       554.5
1        11.5
2        28.5
3       589.5
4       426.5
        ...  
1301    892.5
1304    108.5
1306    474.0
1307    489.5
1308    554.5
Name: age, Length: 1046, dtype: float64

In [135]:
# se puede agregar al dataframe original
titanic_df['age_rank'] = titanic_df['age'].rank(ascending=True).astype(int)
titanic_df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,age_rank
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO",554
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON",11
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",28
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",589
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",426
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1301,3,0,"Youseff, Mr. Gerious",male,45.5000,0,0,2628,7.2250,,C,,312.0,,892
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,,108
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,,474
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,,489
