# DataFrames

#### Os DataFrames são o burro de carga do pandas e foram fortemente inspirados na linguagem de programação R. Podemos pensar em um DataFrame como um conjunto de objetos Series colocados juntos para compartilhar o mesmo index. Vamos utilizar o pandas para explorar melhor esse tópico!

In [1]:
import numpy as np
import pandas as pd

In [2]:
from numpy.random import randn
# Seed NumPy's global random generator so the random values drawn below are reproducible.
np.random.seed(101)

In [3]:
# 5x4 frame of randn draws: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)

In [4]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


## Selecionando e Indexando

Existem várias formas de se obter os dados que estão em um DataFrame

In [5]:
df['W']  # Bracket selection with a single column label returns that column

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [6]:
type(df['W']) # The column 'W' is a Series

pandas.core.series.Series

In [7]:
type(df) # df itself is a DataFrame

pandas.core.frame.DataFrame

In [8]:
df[['W', 'Z']] # Passing a list of column labels returns another DataFrame

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


### Criando uma nova coluna:

In [9]:
df['new'] = df['W'] + df['Y'] # Add a new column computed element-wise from two existing columns

In [10]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [11]:
# Returns a new DataFrame without 'new'. df itself still has the column:
# the removal only sticks with inplace=True (or by assigning the result back).
df.drop('new', axis = 1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [12]:
# Remove the 'new' column from df itself. pandas only mutates the frame when
# inplace=True is passed explicitly, which guards against accidental data loss.
df.drop(columns='new', inplace=True)

In [13]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [14]:
# axis=0 targets rows and axis=1 targets columns. No inplace=True here, so df keeps row 'E'.
df.drop('E', axis = 0)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [15]:
df.shape  # (rows, columns); still 5 rows because the row drop above was not in place

(5, 4)

### Selecionando linhas:

In [16]:
df.loc['A'] # .loc selects by label; row 'A' comes back as a Series

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [17]:
df.iloc[0] # .iloc selects by integer position; the first row comes back as a Series

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [18]:
df.loc['B','Y'] # Scalar value at row 'B', column 'Y'

-0.8480769834036315

In [19]:
df.loc[['A', 'B'], ['W', 'Y']] # Lists of row labels and column labels select a sub-DataFrame

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


### Seleção Condicional

In [20]:
df > 0  # Element-wise comparison; the result is a DataFrame of booleans

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [21]:
booldf = df > 0  # Keep the boolean mask in a variable so it can be used as an indexer

In [22]:
booldf

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [23]:
df[booldf]  # Indexing with a boolean frame keeps values where the mask is True and yields NaN elsewhere

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [24]:
# The mask does not need its own variable: the condition can be applied directly in one line.
df[df>0] 

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [25]:
df['W'] > 0 # A condition on a single column returns a boolean Series

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [26]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [27]:
# A boolean Series filters ROWS: only the rows where the condition is True
# make it into the resulting DataFrame (every column is kept).
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [28]:
df[df['Z']<0] # Select the rows where the value of 'Z' is less than 0

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [29]:
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [30]:
resultdf = df[df['W']>0]

In [31]:
resultdf

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [32]:
resultdf['X'] 

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [33]:
# The three previous steps (filter, store the result, pick a column) fit in a single expression:
df[df['W']>0]['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [34]:
# The filtered result is a DataFrame, so a list of labels selects several columns at once:
df[df['W']>0][['X', 'Y', 'Z']]

Unnamed: 0,X,Y,Z
A,0.628133,0.907969,0.503826
B,-0.319318,-0.848077,0.605965
D,-0.758872,-0.933237,0.955057
E,1.978757,2.605967,0.683509


### Múltiplas Condições

In [35]:
# Python's `and` needs a single True/False from each operand, but each comparison
# here produces a whole boolean Series, so pandas raises ValueError ("truth value
# of a Series is ambiguous"). The error is the point of this cell, so it is caught
# and printed rather than left to halt a Restart & Run All.
try:
    df[(df['W']>0) and (df['Y']>1)]
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [36]:
# To combine several conditions on a DataFrame, use the element-wise '&' operator,
# wrapping each condition in parentheses:
df[(df['W']>0) & (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [37]:
# The element-wise 'or' is written '|':
df[(df['W']>0) | (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### Mais detalhes de index:

Vamos discutir algumas questões sobre indexação, como resetar o index e trabalhar com hierarquia de index:

In [38]:
df.reset_index() # Returns a frame with a fresh integer index; the old index
# becomes a regular column (named 'index'). df itself is not modified.

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [39]:
newind = 'CA NY WY OR CO'.split() # List of labels to be added as a column and then used as the new index

In [40]:
newind

['CA', 'NY', 'WY', 'OR', 'CO']

In [41]:
df['States'] = newind  # Attach the labels as a regular column, one per row

In [42]:
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [43]:
df.set_index('States')  # Returns a frame indexed by the 'States' column; df is left as-is without inplace=True

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509


In [44]:
# To change the DataFrame permanently, pass inplace=True (or assign the result back to df).

### Multi-index e Hierarquia de Index

Vamos ver como se trabalha com multi-index, começando com um exemplo da aparência de um DataFrame com multi-index:

In [45]:
# Index levels: `outside` is the outer (group) label of each row and `inside` the
# inner number. Pairing them position by position gives the two-level row index.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [46]:
# pd.DataFrame arguments used here:
#   data    -> the values that populate the frame (a 6x2 array of randn draws)
#   index   -> the row labels, here the two-level MultiIndex `hier_index`
#   columns -> the column labels
# Passing a MultiIndex as `index` is how a frame with hierarchical row labels is built.
# Note: this rebinds `df`; the 5x4 frame from the earlier sections is replaced.
df = pd.DataFrame(data=randn(6, 2), index=hier_index, columns=['A', 'B'])

In [47]:
df
# This is what is called an index hierarchy (a multi-level index on the rows)

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [48]:
df.loc['G1']

Unnamed: 0,A,B
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [49]:
df.loc['G1'].loc[1] # Outer level first, then the inner level: the result is a Series

A    0.302665
B    1.693723
Name: 1, dtype: float64

In [50]:
df.index.names  # Names of the index levels (none have been assigned at this point)

FrozenList([None, None])

In [51]:
df.index.names = ['Groups', 'Num']  # Name the outer and inner index levels, in that order

In [52]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [53]:
df.loc['G2'].loc[2]['B']  # Drill down: group 'G2', then row 2, then column 'B' (a single value)

0.07295967531703869

In [54]:
# To retrieve everything stored under the outer label 'G1':
df.loc['G1']

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [55]:
# The 'xs' (cross-section) method is another way to make the same selection:
df.xs('G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [56]:
# xs can also select on an inner level directly: here, every row whose 'Num' is 1,
# across all groups.
df.xs(1, level = 'Num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.302665,1.693723
G2,0.166905,0.184502
