### IS727272 - Cordero Hernández, Marco Ricardo

# 11. Manipulación de datos

En Aprendizaje Automático es muy importante manipular las bases de datos. El éxito de un buen aprendizaje radica en la calidad de la base de datos que se tenga. En este documento explicaremos algunas funciones útiles para manipular datos.

In [1]:
import pandas as pd

## 11.1 Unión de dataframes

Crearemos dos dataframes que compartan una columna y después los uniremos utilizando la función `merge()`.

In [2]:
# Left-hand table for the merge examples: a value column 'c1' plus the join key 'clave'.
df1 = pd.DataFrame(
    {
        'c1': ['1', '2', '3'],
        'clave': ['a', 'b', 'c'],
    }
)
df1

Unnamed: 0,c1,clave
0,1,a
1,2,b
2,3,c


In [3]:
# Right-hand table for the merge examples: a value column 'c2' plus the join key 'clave'.
df2 = pd.DataFrame(
    {
        'c2': ['4', '5', '6'],
        'clave': ['c', 'b', 'e'],
    }
)
df2

Unnamed: 0,c2,clave
0,4,c
1,5,b
2,6,e


In [4]:
# Inner join (the default) on every column name the two frames have in common.
# pd.merge is the public function; calling DataFrame.merge through the class
# only works because df1 is passed in the position of `self`.
df3 = pd.merge(df1, df2)
df3

Unnamed: 0,c1,clave,c2
0,2,b,5
1,3,c,4


In [5]:
# Inner join with the key column named explicitly instead of inferred.
# pd.merge is used instead of calling DataFrame.merge through the class.
df3 = pd.merge(df1, df2, on='clave')
df3

Unnamed: 0,c1,clave,c2
0,2,b,5
1,3,c,4


In [6]:
# Left join: every row of df1 is kept; keys with no match in df2 get NaN in the
# columns coming from df2.
# pd.merge is used instead of calling DataFrame.merge through the class.
df3 = pd.merge(df1, df2, on='clave', how='left')
df3

Unnamed: 0,c1,clave,c2
0,1,a,
1,2,b,5.0
2,3,c,4.0


In [7]:
# Right join: every row of df2 is kept; keys with no match in df1 get NaN in the
# columns coming from df1.
# pd.merge is used instead of calling DataFrame.merge through the class.
df3 = pd.merge(df1, df2, on='clave', how='right')
df3

Unnamed: 0,c1,clave,c2
0,3.0,c,4
1,2.0,b,5
2,,e,6


In [8]:
# Outer join: the union of the keys of both frames; a key missing on one side
# gets NaN in that side's columns.
# pd.merge is used instead of calling DataFrame.merge through the class.
df3 = pd.merge(df1, df2, on='clave', how='outer')
df3

Unnamed: 0,c1,clave,c2
0,1.0,a,
1,2.0,b,5.0
2,3.0,c,4.0
3,,e,6.0


## 11.2 Concatenación de datos

A veces es necesario manipular los datos y guardarlos en un array para que los algoritmos de aprendizaje automático puedan procesarlos posteriormente.

In [9]:
import numpy as np

### 11.2.1 Concatenar arrays

In [10]:
# 3x3 integer matrix holding 0..8 in row-major order.
array1 = np.reshape(np.arange(9), (3, 3))
array1

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [11]:
# Stack the matrix on top of itself: concatenation along axis 0 (rows), the default.
matriz = np.concatenate((array1, array1), axis=0)
matriz

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8],
       [0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [12]:
# Place the matrix next to itself: concatenation along axis 1 (columns).
matriz = np.concatenate((array1, array1), axis=1)
matriz

array([[0, 1, 2, 0, 1, 2],
       [3, 4, 5, 3, 4, 5],
       [6, 7, 8, 6, 7, 8]])

### 11.2.2 Concatenar Series


In [13]:
# Build two integer Series that carry the same index labels.
serie1 = pd.Series(data=[1, 2, 3], index=['a', 'b', 'c'])
serie2 = pd.Series(data=[4, 5, 6], index=['a', 'b', 'c'])
# Show both, separated by a blank line.
print(serie1, serie2, sep='\n\n')

a    1
b    2
c    3
dtype: int64

a    4
b    5
c    6
dtype: int64


In [14]:
# Append serie2 below serie1 (axis 0); the index labels of both are kept as-is.
pd.concat([serie1, serie2], axis=0)

a    1
b    2
c    3
a    4
b    5
c    6
dtype: int64

In [15]:
# Put the two Series side by side as the columns of a DataFrame, aligned on their index.
pd.concat([serie1, serie2], axis='columns')

Unnamed: 0,0,1
a,1,4
b,2,5
c,3,6


### 11.2.3 Concatenar Dataframes

In [16]:
# Build two 3x3 frames of random values that share the same column labels.
df1 = pd.DataFrame(data=np.random.rand(3, 3), columns=['a', 'b', 'c'])
df2 = pd.DataFrame(data=np.random.rand(3, 3), columns=['a', 'b', 'c'])
# Show both, separated by a blank line.
print(df1, df2, sep='\n\n')

          a         b         c
0  0.298309  0.039003  0.652660
1  0.502605  0.055490  0.027191
2  0.283510  0.352335  0.870007

          a         b         c
0  0.506286  0.569339  0.361196
1  0.661528  0.232846  0.375496
2  0.109429  0.496161  0.967532


In [17]:
# Append df2 below df1; each frame keeps its own row labels, so labels repeat.
df3 = pd.concat([df1, df2], axis=0)
df3

Unnamed: 0,a,b,c
0,0.298309,0.039003,0.65266
1,0.502605,0.05549,0.027191
2,0.28351,0.352335,0.870007
0,0.506286,0.569339,0.361196
1,0.661528,0.232846,0.375496
2,0.109429,0.496161,0.967532


In [18]:
# Append df2 below df1 and discard the original row labels in favour of a fresh 0..n-1 index.
df3 = pd.concat([df1, df2], axis=0, ignore_index=True)
df3

Unnamed: 0,a,b,c
0,0.298309,0.039003,0.65266
1,0.502605,0.05549,0.027191
2,0.28351,0.352335,0.870007
3,0.506286,0.569339,0.361196
4,0.661528,0.232846,0.375496
5,0.109429,0.496161,0.967532


## 11.3 Combinar Dataframes

In [19]:
# First of two single-column frames; this one has a missing value in its last row.
df1 = pd.DataFrame(data=[1, 2, np.nan])
df1

Unnamed: 0,0
0,1.0
1,2.0
2,


In [20]:
# Second single-column frame, with no missing values.
df2 = pd.DataFrame(data=[4, 5, 6])
df2

Unnamed: 0,0
0,4
1,5
2,6


In [21]:
# Keep df1's values and fill only its null entries with the matching values from df2.
df3 = df1.combine_first(other=df2)
df3

Unnamed: 0,0
0,1.0
1,2.0
2,6.0


## 11.4 Eliminar duplicados en Dataframes

In [22]:
# Build a frame with one fully duplicated row (m/n) and one repeated 'valor1' value (o/p).
lista_valores = [[1, 2], [1, 2], [5, 6], [5, 8]]
lista_indices = list('mnop')
lista_columnas = ['valor1', 'valor2']

df = pd.DataFrame(
    data=lista_valores,
    index=lista_indices,
    columns=lista_columnas,
)
df

Unnamed: 0,valor1,valor2
m,1,2
n,1,2
o,5,6
p,5,8


In [23]:
# Drop rows that repeat an earlier row in every column, keeping the first occurrence.
df2 = df.drop_duplicates(keep='first')
df2

Unnamed: 0,valor1,valor2
m,1,2
o,5,6
p,5,8


In [24]:
# Drop rows whose 'valor1' repeats an earlier row; only that column is compared.
df2 = df.drop_duplicates(subset=['valor1'])
df2

Unnamed: 0,valor1,valor2
m,1,2
o,5,6


In [25]:
# Same de-duplication on 'valor1', but the last occurrence of each value is the one kept.
df2 = df.drop_duplicates(subset=['valor1'], keep='last')
df2

Unnamed: 0,valor1,valor2
n,1,2
p,5,8


## 11.5 Agrupar datos en categorías


In [26]:
precios = [42, 55, 48, 23, 5, 21, 88, 34, 26]
# Bin edges 10, 20, ..., 100: nine right-closed intervals (10, 20] ... (90, 100].
rango = list(range(10, 110, 10))

# Assign each price to its interval. A value that falls outside every interval
# (here 5, which is below the first edge) comes back as NaN instead of raising.
precios_con_rango = pd.cut(precios, rango)
print(precios_con_rango)

[(40.0, 50.0], (50.0, 60.0], (40.0, 50.0], (20.0, 30.0], NaN, (20.0, 30.0], (80.0, 90.0], (30.0, 40.0], (20.0, 30.0]]
Categories (9, interval[int64, right]): [(10, 20] < (20, 30] < (30, 40] < (40, 50] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]


In [27]:
# Count the prices per interval, most frequent first; empty intervals appear
# with a count of 0 and out-of-range prices (NaN) are not counted.
# Series.value_counts is used because the top-level pd.value_counts is
# deprecated in recent pandas releases.
pd.Series(precios_con_rango).value_counts()

(20, 30]     3
(40, 50]     2
(30, 40]     1
(50, 60]     1
(80, 90]     1
(10, 20]     0
(60, 70]     0
(70, 80]     0
(90, 100]    0
dtype: int64

## 11.6 Filtrar datos en DataFrame

In [28]:
# Build a 10x3 frame of random values with default integer row and column labels.
df = pd.DataFrame(data=np.random.rand(10, 3))
df

Unnamed: 0,0,1,2
0,0.221522,0.810347,0.954294
1,0.870314,0.534296,0.583741
2,0.114781,0.503904,0.131772
3,0.411645,0.97095,0.955473
4,0.209041,0.959609,0.195256
5,0.538791,0.439572,0.102843
6,0.81779,0.266869,0.016774
7,0.43326,0.18717,0.429774
8,0.582466,0.398709,0.059569
9,0.721351,0.228843,0.59092


In [29]:
# Select the column labelled 0 as a Series.
col1 = df.loc[:, 0]
col1

0    0.221522
1    0.870314
2    0.114781
3    0.411645
4    0.209041
5    0.538791
6    0.817790
7    0.433260
8    0.582466
9    0.721351
Name: 0, dtype: float64

In [30]:
# Keep only the entries of the column that are strictly greater than 0.4.
col1[col1.gt(0.4)]

1    0.870314
3    0.411645
5    0.538791
6    0.817790
7    0.433260
8    0.582466
9    0.721351
Name: 0, dtype: float64

In [31]:
# Keep the full rows whose value in column 0 is strictly greater than 0.4.
df.loc[df[0] > 0.4]

Unnamed: 0,0,1,2
1,0.870314,0.534296,0.583741
3,0.411645,0.97095,0.955473
5,0.538791,0.439572,0.102843
6,0.81779,0.266869,0.016774
7,0.43326,0.18717,0.429774
8,0.582466,0.398709,0.059569
9,0.721351,0.228843,0.59092


In [32]:
# Keep the full rows whose value in column 1 is strictly greater than 0.6.
df.loc[df[1] > 0.6]

Unnamed: 0,0,1,2
0,0.221522,0.810347,0.954294
3,0.411645,0.97095,0.955473
4,0.209041,0.959609,0.195256


## 11.7 Permutar filas de un Dataframe

In [33]:
# Build a 5x5 frame holding 0..24 in row-major order.
df = pd.DataFrame(data=np.reshape(np.arange(25), (5, 5)))
df

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [34]:
# Generate a random permutation of the integers 0 through 4 (5 - 1):
# every value appears exactly once, in random order.
rd_cmb = np.random.permutation(5)
rd_cmb

array([2, 0, 1, 4, 3])

In [35]:
# Reorder the rows by position, following the order given in rd_cmb.
df.take(rd_cmb, axis=0)

Unnamed: 0,0,1,2,3,4
2,10,11,12,13,14
0,0,1,2,3,4
1,5,6,7,8,9
4,20,21,22,23,24
3,15,16,17,18,19


## 11.8 Agrupar Dataframes

In [36]:
#Crear el dataframe


In [37]:
#Agrupar los datos de la columna 'datos1' utilizando la columna 'clave1'



In [38]:
#Calcular el promedio de cada grupo


## 11.9 Agregar en Dataframes

In [39]:
# Build a four-row frame with columns a, b, c whose last row is entirely NaN.
lista_valores = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 6, 8],
    [np.nan] * 3,
]
lista_columnas = list('abc')
df = pd.DataFrame(data=lista_valores, columns=lista_columnas)
df

Unnamed: 0,a,b,c
0,1.0,2.0,3.0
1,4.0,5.0,6.0
2,7.0,6.0,8.0
3,,,


In [40]:
# Sum, minimum and mean of each column (axis 0, the default); NaN entries are skipped.
df.agg(['sum', 'min', 'mean'], axis=0)

Unnamed: 0,a,b,c
sum,12.0,13.0,17.0
min,1.0,2.0,3.0
mean,4.0,4.333333,5.666667


In [41]:
# Sum, minimum and mean of each row; the all-NaN row gives sum 0 and NaN for min and mean.
df.agg(['sum', 'min', 'mean'], axis='columns')

Unnamed: 0,sum,min,mean
0,6.0,1.0,2.0
1,15.0,4.0,5.0
2,21.0,6.0,7.0
3,0.0,,
