# PANDAS - ESTRUCTURAS DE DATOS




---


### Crear una Serie


---



**A partir de un arreglo de Numpy**

In [1]:
import numpy as np
import pandas as pd

# No index given: pandas labels the values with a default 0..n-1 RangeIndex
random_values = np.random.randn(5)
s = pd.Series(random_values)
print(s, s.index, sep='\n')

0    0.497014
1   -0.027115
2    0.377139
3    0.272720
4    1.806928
dtype: float64
RangeIndex(start=0, stop=5, step=1)


In [2]:
# Explicit index: one label per value
labels = ['a', 'b', 'c', 'd', 'e']
s = pd.Series(np.random.randn(5), index=labels)
print(s, s.index, sep='\n')

a   -1.236403
b   -0.605476
c   -3.346858
d   -1.153626
e   -1.045281
dtype: float64
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')


**A partir de un diccionario**

In [3]:
# Dictionary keys become the index labels and the values become the data
d = {'b' : 1, 'a' : 0, 'c' : 2}
s1 = pd.Series(data=d)
print(s1, s1.index, sep='\n')


b    1
a    0
c    2
dtype: int64
Index(['b', 'a', 'c'], dtype='object')


**A partir de un escalar**

In [4]:
# A scalar value is repeated for every label of the given index
scalar_labels = ['a', 'b', 'c', 'd', 'e']
s2 = pd.Series(8., index=scalar_labels)
print(s2)

a    8.0
b    8.0
c    8.0
d    8.0
e    8.0
dtype: float64




---

### Operando Series como arreglos de Numpy


---



In [5]:
# Series of five random draws labelled 'a'..'e'
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
print("s=\n",s)
# Positional access goes through .iloc. Using plain s[1] / s[[4, 3, 1]] on a
# string-labelled Series relies on the deprecated "integer key as position"
# fallback of Series.__getitem__, which newer pandas versions no longer honor.
print(s.iloc[1])        # second element
print(s.iloc[:3])       # first three elements
print(s.iloc[[4, 3, 1]])  # elements picked by position, in the given order

s=
 a    1.159953
b   -1.098942
c   -0.354343
d    0.559005
e    0.730893
dtype: float64
-1.0989417232820933
a    1.159953
b   -1.098942
c   -0.354343
dtype: float64
e    0.730893
d    0.559005
b   -1.098942
dtype: float64


In [6]:
print("s=\n",s)
# Arithmetic operators and NumPy ufuncs are applied element by element and
# keep the index labels of `s`
for result in (s + s, s*2, np.exp(s), np.sin(s)):
    print(result)

s=
 a    1.159953
b   -1.098942
c   -0.354343
d    0.559005
e    0.730893
dtype: float64
a    2.319906
b   -2.197883
c   -0.708685
d    1.118010
e    1.461785
dtype: float64
a    2.319906
b   -2.197883
c   -0.708685
d    1.118010
e    1.461785
dtype: float64
a    3.189783
b    0.333224
c    0.701635
d    1.748931
e    2.076934
dtype: float64
a    0.916784
b   -0.890727
c   -0.346974
d    0.530343
e    0.667534
dtype: float64


In [7]:
median_value = s.median()
print(median_value)
# Boolean mask used to filter data (a topic covered later on)
above_median = s > median_value
print(s[above_median])

0.5590047968379082
a    1.159953
e    0.730893
dtype: float64


In [8]:
# Series.append was removed in pandas 2.0; pd.concat is the supported way to
# put two Series together. `s` itself is not modified, only the result is printed.
print(pd.concat([s, pd.Series(7, index=['f'])]))

a    1.159953
b   -1.098942
c   -0.354343
d    0.559005
e    0.730893
f    7.000000
dtype: float64




---
### Operando Series como Diccionarios


---




In [9]:
print("s=\n",s)
# Dictionary-style lookup and membership tests work on the index labels
print(s['a'])
for label in ('a', 'f'):
    print(label in s)
# Assigning to a label that is not in the index adds a new entry to `s`
s['g']=50
print("s=\n",s)


s=
 a    1.159953
b   -1.098942
c   -0.354343
d    0.559005
e    0.730893
dtype: float64
1.159952800323188
True
False
s=
 a     1.159953
b    -1.098942
c    -0.354343
d     0.559005
e     0.730893
g    50.000000
dtype: float64




---
### Salvando a un csv


---




In [10]:
# Write the Series `s` to a CSV file (relative path)
series_csv_path = 'serie.csv'
s.to_csv(series_csv_path)



---


### Crear un DataFrame a partir de un diccionario


---



In [11]:
# Column-oriented dictionary: each key is a column name, each list holds that
# column's values (one entry per row)
d = {
    'Código': [20152300120, 20153300123, 20172400322, 20172400436],
    'Nota1': [3.3, 4.1, 1.5, 2.0],
    'Nota2': [2.1, 3.8, 3.5, 3.6],
    'Nota3': [3.3, 4.1, 1.5, 4.1],
}
df = pd.DataFrame(d)
print(df)

        Código  Nota1  Nota2  Nota3
0  20152300120    3.3    2.1    3.3
1  20153300123    4.1    3.8    4.1
2  20172400322    1.5    3.5    1.5
3  20172400436    2.0    3.6    4.1


**Estableciendo índices**

In [12]:
# Use the 'Código' column as the row index. set_index returns a new
# DataFrame, which is bound back to `df`.
df = df.set_index(keys='Código')
print(df)

             Nota1  Nota2  Nota3
Código                          
20152300120    3.3    2.1    3.3
20153300123    4.1    3.8    4.1
20172400322    1.5    3.5    1.5
20172400436    2.0    3.6    4.1


In [13]:
# Show the row labels, then the column labels
print(df.index, df.columns, sep='\n')

Int64Index([20152300120, 20153300123, 20172400322, 20172400436], dtype='int64', name='Código')
Index(['Nota1', 'Nota2', 'Nota3'], dtype='object')


**Agregando índices**

In [14]:
d = {'one' : [1., 2., 3., 4.],'two' : [4., 3., 2., 1.]}
row_labels = ['a', 'b', 'c', 'd']

# No index given: rows get default integer labels
df1 = pd.DataFrame(d)
print(df1)

# Row labels supplied at construction time
df2 = pd.DataFrame(d, index=row_labels)
print(df2)

# Row labels attached afterwards: the nested list is passed as an array of
# labels, not as a list of column names
df1 = df1.set_index([row_labels])
print(df1)

   one  two
0  1.0  4.0
1  2.0  3.0
2  3.0  2.0
3  4.0  1.0
   one  two
a  1.0  4.0
b  2.0  3.0
c  3.0  2.0
d  4.0  1.0
   one  two
a  1.0  4.0
b  2.0  3.0
c  3.0  2.0
d  4.0  1.0


**Creando un DF de un diccionario de Series**

In [15]:
# Two Series with different label sets; the resulting frame is indexed by the
# union of the labels and the missing entry ('one' at 'd') shows up as NaN
series_one = pd.Series([1., 2., 3.], index=['a', 'b', 'c'])
series_two = pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])
d = {'one' : series_one, 'two' : series_two}
df = pd.DataFrame(d)
print(df)

   one  two
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0


In [16]:
# index= picks the rows to keep, by label, in the order listed
selected_rows = ['d', 'b', 'a']
df1 = pd.DataFrame(d, index=selected_rows)
print(df1)

   one  two
d  NaN  4.0
b  2.0  2.0
a  1.0  1.0


In [17]:
# columns= picks the columns to keep; 'three' is not a key of `d`, so that
# column comes out filled with NaN
selected_rows = ['d', 'b', 'a']
selected_columns = ['two', 'three']
df2 = pd.DataFrame(d, index=selected_rows, columns=selected_columns)
print(df2)

   two three
d  4.0   NaN
b  2.0   NaN
a  1.0   NaN




---

### Crear un DataFrame a partir de una lista de diccionarios


---



In [18]:
# Each dictionary is one row; a key missing from a row ('c' in the first one)
# shows up as NaN
data2 = [
    {'a': 1, 'b': 2},
    {'a': 5, 'b': 10, 'c': 20},
]
df = pd.DataFrame(data=data2)
print(df)

   a   b     c
0  1   2   NaN
1  5  10  20.0


In [19]:
# Same rows as before, now labelled 'first' and 'second'
row_names = ['first', 'second']
df1 = pd.DataFrame(data2, index=row_names)
print(df1)

        a   b     c
first   1   2   NaN
second  5  10  20.0


In [20]:
# Keep only columns 'a' and 'b'; the 'c' key of the second row is dropped
row_names = ['first', 'second']
kept_columns = ['a', 'b']
df2 = pd.DataFrame(data2, index=row_names, columns=kept_columns)
print(df2)

        a   b
first   1   2
second  5  10




---

### Crear un DataFrame a partir de un arreglo de Numpy


---



In [21]:
# 5x5 array of random integers (low=0 inclusive, high=10 exclusive)
a = np.random.randint(0, 10, size=(5, 5))
print(a)
# With no labels given, rows and columns both get default integer labels
df2 = pd.DataFrame(a)
print(df2)

[[6 1 6 8 2]
 [5 9 8 9 1]
 [6 3 5 4 8]
 [4 1 2 5 0]
 [5 9 7 6 0]]
   0  1  2  3  4
0  6  1  6  8  2
1  5  9  8  9  1
2  6  3  5  4  8
3  4  1  2  5  0
4  5  9  7  6  0


**Definiendo las Columnas**

In [22]:
# Name the columns of the frame built from array `a`
column_names = ['punt1', 'punt2', 'punt3', 'punt4', 'punt5']
df3 = pd.DataFrame(a, columns=column_names)
df3

Unnamed: 0,punt1,punt2,punt3,punt4,punt5
0,6,1,6,8,2
1,5,9,8,9,1
2,6,3,5,4,8
3,4,1,2,5,0
4,5,9,7,6,0


**Definiendo los Índices**

In [23]:
# Name both the columns and the rows of the frame built from array `a`
column_names = ['punt1', 'punt2', 'punt3', 'punt4', 'punt5']
row_names = ['est1', 'est2', 'est3', 'est4', 'est5']
df3 = pd.DataFrame(a, index=row_names, columns=column_names)
df3

Unnamed: 0,punt1,punt2,punt3,punt4,punt5
est1,6,1,6,8,2
est2,5,9,8,9,1
est3,6,3,5,4,8
est4,4,1,2,5,0
est5,5,9,7,6,0


**Renombrando Índices y Columnas**

In [24]:
# rename returns a relabelled copy, which is displayed here; it is not
# assigned, so `df2` keeps its integer column labels
column_map = {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"}
df2.rename(columns=column_map)

Unnamed: 0,a,b,c,d,e
0,6,1,6,8,2
1,5,9,8,9,1
2,6,3,5,4,8
3,4,1,2,5,0
4,5,9,7,6,0


In [25]:
# Same idea for the row labels: the relabelled copy is displayed but not
# assigned back to `df2`
index_map = {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"}
df2.rename(index=index_map)

Unnamed: 0,0,1,2,3,4
a,6,1,6,8,2
b,5,9,8,9,1
c,6,3,5,4,8
d,4,1,2,5,0
e,5,9,7,6,0


In [26]:
# Only 'punt2' is an actual column of `df3`; the integer keys match nothing
# and are ignored, so just that one column is relabelled in the displayed copy
partial_column_map = {0: "a", 'punt2': "b", 2: "c", 3: "d", 4: "e"}
df3.rename(columns=partial_column_map)

Unnamed: 0,punt1,b,punt3,punt4,punt5
est1,6,1,6,8,2
est2,5,9,8,9,1
est3,6,3,5,4,8
est4,4,1,2,5,0
est5,5,9,7,6,0




---
### Salvar a CSV


---




In [27]:
# Write the DataFrame `df3` to a CSV file (relative path)
grades_csv_path = 'notas.csv'
df3.to_csv(grades_csv_path)



---

### Crear un DataFrame desde un CSV


---



https://www.datos.gov.co/ -> Descubre -> Docentes de planta

In [0]:
# Load the teachers data set from a CSV file given by relative path.
# NOTE(review): the recorded output of this cell is a FileNotFoundError, so the
# file must be placed next to the notebook before running it.
docentes_csv_path = 'Docentes_De_Planta_2017-2.csv'
docentes = pd.read_csv(docentes_csv_path)

FileNotFoundError: ignored

In [0]:
# Display the loaded DataFrame as the cell's output (requires `docentes` to
# have been loaded successfully by the read_csv cell)
docentes

**Conociendo el Data Set importado**

In [0]:
# Peek at the first rows of the data set
docentes.head()

In [0]:
# Peek at the last rows of the data set
docentes.tail()

In [0]:
# Column labels of the data set
docentes.columns

In [0]:
# Row labels of the data set.
# NOTE(review): the recorded output is a NameError, i.e. `docentes` was not
# defined when this cell ran — the read_csv cell has to succeed first.
docentes.index

NameError: ignored

In [0]:
# Data type of each column
docentes.dtypes

In [0]:
# Summary statistics of the data set
docentes.describe()

**Configurar un índice**

In [28]:
# Load the CSV using its 'ID de caso' column as the row index.
# NOTE(review): this is the COVID-19 cases file, yet the result is still bound
# to the name `docentes`; the name is kept unchanged here.
covid_csv_path = 'Casos_positivos_de_COVID-19_en_Colombia.csv'
docentes = pd.read_csv(covid_csv_path, index_col='ID de caso')
docentes

Unnamed: 0_level_0,Fecha de notificación,Codigo DIVIPOLA,Ciudad de ubicación,Departamento o Distrito,atención,Edad,Sexo,Tipo,Estado,País de procedencia,FIS,Fecha de muerte,Fecha diagnostico,Fecha recuperado,fecha reporte web
ID de caso,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,2020-03-02T00:00:00.000,11001,Bogotá D.C.,Bogotá D.C.,Recuperado,19,F,Importado,Leve,Italia,2020-02-27T00:00:00.000,- -,2020-03-06T00:00:00.000,2020-03-13T00:00:00.000,2020-03-06T00:00:00.000
2,2020-03-06T00:00:00.000,76111,Guadalajara de Buga,Valle del Cauca,Recuperado,34,M,Importado,Leve,España,2020-03-04T00:00:00.000,- -,2020-03-09T00:00:00.000,2020-03-19T00:00:00.000,2020-03-09T00:00:00.000
3,2020-03-07T00:00:00.000,5001,Medellín,Antioquia,Recuperado,50,F,Importado,Leve,España,2020-02-29T00:00:00.000,- -,2020-03-09T00:00:00.000,2020-03-15T00:00:00.000,2020-03-09T00:00:00.000
4,2020-03-09T00:00:00.000,5001,Medellín,Antioquia,Recuperado,55,M,Relacionado,Leve,Colombia,2020-03-06T00:00:00.000,- -,2020-03-11T00:00:00.000,2020-03-26T00:00:00.000,2020-03-11T00:00:00.000
5,2020-03-09T00:00:00.000,5001,Medellín,Antioquia,Recuperado,25,M,Relacionado,Leve,Colombia,2020-03-08T00:00:00.000,- -,2020-03-11T00:00:00.000,2020-03-23T00:00:00.000,2020-03-11T00:00:00.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10531,2020-05-07T00:00:00.000,11001,Bogotá D.C.,Bogotá D.C.,Casa,21,M,En estudio,Leve,Colombia,2020-05-03T00:00:00.000,- -,2020-05-09T00:00:00.000,- -,2020-05-09T00:00:00.000
10532,2020-05-07T00:00:00.000,11001,Bogotá D.C.,Bogotá D.C.,Casa,29,M,En estudio,Leve,Colombia,2020-05-03T00:00:00.000,- -,2020-05-09T00:00:00.000,- -,2020-05-09T00:00:00.000
10533,2020-05-07T00:00:00.000,11001,Bogotá D.C.,Bogotá D.C.,Casa,36,F,Relacionado,Leve,Colombia,2020-05-04T00:00:00.000,- -,2020-05-09T00:00:00.000,- -,2020-05-09T00:00:00.000
10534,2020-04-26T00:00:00.000,91001,Leticia,Amazonas,Fallecido,64,M,En estudio,Fallecido,Colombia,2020-04-26T00:00:00.000,2020-05-07T00:00:00.000,2020-05-09T00:00:00.000,- -,2020-05-09T00:00:00.000
