In [14]:
import pandas as pd
import numpy as np

# Series

Series são bastante similares aos arrays Numpy, com a exceção de que Series podem armazenar qualquer tipo de valor válido em python e não apenas números. Series também possuem labels, que podem ser valores não-numéricos escolhidos pelo usuário para o index, algo que também não é possível nos arrays numpy

## Creating Series

Series podem ser criados a partir de listas, numpy arrays ou dicionários

In [50]:
# A partir de uma lista
labels = ['a', 'b', 'c']

pd.Series(labels)

0    a
1    b
2    c
dtype: object

In [51]:
# A partir de um dicionário
# As chaves do dicionário tornam-se os index do Series

dic = {'a': 1, 'b': 2, 'c': 3}

pd.Series(dic)

a    1
b    2
c    3
dtype: int64

In [52]:
# A partir de um array Numpy
arr = np.arange(0,6,2)

pd.Series(arr)

0    0
1    2
2    4
dtype: int64

In [53]:
# É possível passar indexes e labels na inicialização do dataframe
pd.Series(data=arr, index=labels)

a    0
b    2
c    4
dtype: int64

## Using the index

In [54]:
ser1 = pd.Series(data=[1,2,3,4], index=['Brazil', 'Norway', 'Indonesia', 'Iceland'])
ser1

Brazil       1
Norway       2
Indonesia    3
Iceland      4
dtype: int64

In [55]:
ser2 = pd.Series(data=[1,2,5,4], index=['Brazil', 'Norway', 'Chile', 'Iceland'])
ser2

Brazil     1
Norway     2
Chile      5
Iceland    4
dtype: int64

In [56]:
ser1['Brazil']

1

In [57]:
# Operações são feitas com base nos indexes
# Caso ambos os Series não possuam os mesmos indexes, NaN será alocado nos indexes que não se correspondem
ser1 + ser2

Brazil       2.0
Chile        NaN
Iceland      8.0
Indonesia    NaN
Norway       4.0
dtype: float64

# Dataframes

Dataframes são um simplesmente um conjunto de Series agrupadas. Ou seja, dataframes são compostos de Series

In [96]:
df = pd.DataFrame(data=np.random.randn(5,4), index='A B C D E'.split(), columns='W X Y Z'.split())
df

Unnamed: 0,W,X,Y,Z
A,-0.827682,-0.108426,-0.786819,-0.25429
B,0.653695,1.009565,0.516073,-0.397787
C,-0.317639,1.09711,0.401081,-0.407352
D,-1.270196,-0.686383,0.950081,-0.775337
E,-0.712557,0.909844,1.690329,-0.829519


## Selection and Indexing

df[] -> Para selecionar colunas

df.loc[] -> para selecionar linhas

In [97]:
df['W']

A   -0.827682
B    0.653695
C   -0.317639
D   -1.270196
E   -0.712557
Name: W, dtype: float64

In [98]:
df[['W', 'Z']]

Unnamed: 0,W,Z
A,-0.827682,-0.25429
B,0.653695,-0.397787
C,-0.317639,-0.407352
D,-1.270196,-0.775337
E,-0.712557,-0.829519


In [99]:
# Colunas de um DataFrame são apenas Series
type(df['W'])

pandas.core.series.Series

In [100]:
# Criar uma nova coluna
df['H'] = df['W'] + df['Z']
df

Unnamed: 0,W,X,Y,Z,H
A,-0.827682,-0.108426,-0.786819,-0.25429,-1.081972
B,0.653695,1.009565,0.516073,-0.397787,0.255908
C,-0.317639,1.09711,0.401081,-0.407352,-0.724991
D,-1.270196,-0.686383,0.950081,-0.775337,-2.045533
E,-0.712557,0.909844,1.690329,-0.829519,-1.542076


In [101]:
# Remover colunas
# As alterações não são feitas no DataFrame de origem a menos que especificado
df.drop(labels='H', axis=1, inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,-0.827682,-0.108426,-0.786819,-0.25429
B,0.653695,1.009565,0.516073,-0.397787
C,-0.317639,1.09711,0.401081,-0.407352
D,-1.270196,-0.686383,0.950081,-0.775337
E,-0.712557,0.909844,1.690329,-0.829519


In [102]:
# Remover linhas
# Basta alterar o valor do axis/eixo
df.drop(labels='A', axis=0, inplace=True)
df

Unnamed: 0,W,X,Y,Z
B,0.653695,1.009565,0.516073,-0.397787
C,-0.317639,1.09711,0.401081,-0.407352
D,-1.270196,-0.686383,0.950081,-0.775337
E,-0.712557,0.909844,1.690329,-0.829519


In [103]:
# Selecionar linhas de acordo com o label
df.loc[['B', 'C']]

Unnamed: 0,W,X,Y,Z
B,0.653695,1.009565,0.516073,-0.397787
C,-0.317639,1.09711,0.401081,-0.407352


In [104]:
# Selecionar linhas de acordo com o index
df.iloc[[1,2]]

Unnamed: 0,W,X,Y,Z
C,-0.317639,1.09711,0.401081,-0.407352
D,-1.270196,-0.686383,0.950081,-0.775337


In [105]:
# Selecionar subconjuntos de linhas e colunas
df.loc[['B', 'C'], ['W', 'Z']]

# Ou

df[['W', 'Z']].loc[['B', 'C']]

Unnamed: 0,W,Z
B,0.653695,-0.397787
C,-0.317639,-0.407352


In [106]:
# Selecionar um valor específico
df.loc['B', 'Z']

-0.39778702814823913

## Conditional Selecting

Selecionar subconjuntos de colunas e linhas de acordo a condição indicada, semelhante ao numpy


In [107]:
df > 0

Unnamed: 0,W,X,Y,Z
B,True,True,True,False
C,False,True,True,False
D,False,False,True,False
E,False,True,True,False


In [108]:
# Os valores que não satisfazem a condição serão substituídos por NaN
df[df>0]

Unnamed: 0,W,X,Y,Z
B,0.653695,1.009565,0.516073,
C,,1.09711,0.401081,
D,,,0.950081,
E,,0.909844,1.690329,


In [109]:
# Filtrar apenas as linhas do DataFrame onde a condição foi satisfeita
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
B,0.653695,1.009565,0.516073,-0.397787


In [111]:
df[df['W'] > 0][['X', 'Y']]

Unnamed: 0,X,Y
B,1.009565,0.516073


In [112]:
#Para mútimas condições, utilizar a sintaxe abaixo
df[(df['W'] > 0) & (df['X'] > 0)]

Unnamed: 0,W,X,Y,Z
B,0.653695,1.009565,0.516073,-0.397787


In [113]:
df[(df['W']>0) | (df['X']>0)]

Unnamed: 0,W,X,Y,Z
B,0.653695,1.009565,0.516073,-0.397787
C,-0.317639,1.09711,0.401081,-0.407352
E,-0.712557,0.909844,1.690329,-0.829519


## Indexing features

In [114]:
# Resetar o index para o valor numérico padrão (0,1,2,etc)
# Alterações não são feitas no DataFrame original a menos que especificado com inplace=True
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,B,0.653695,1.009565,0.516073,-0.397787
1,C,-0.317639,1.09711,0.401081,-0.407352
2,D,-1.270196,-0.686383,0.950081,-0.775337
3,E,-0.712557,0.909844,1.690329,-0.829519


In [115]:
# Especificar uma coluna para o index
df['Novo index'] = 'BA CE DF GO'.split()
df

Unnamed: 0,W,X,Y,Z,Novo index
B,0.653695,1.009565,0.516073,-0.397787,BA
C,-0.317639,1.09711,0.401081,-0.407352,CE
D,-1.270196,-0.686383,0.950081,-0.775337,DF
E,-0.712557,0.909844,1.690329,-0.829519,GO


In [116]:
df.set_index(keys='Novo index', inplace=True)
df

Unnamed: 0_level_0,W,X,Y,Z
Novo index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BA,0.653695,1.009565,0.516073,-0.397787
CE,-0.317639,1.09711,0.401081,-0.407352
DF,-1.270196,-0.686383,0.950081,-0.775337
GO,-0.712557,0.909844,1.690329,-0.829519


In [117]:
# A coluna se torna um index
df.columns

Index(['W', 'X', 'Y', 'Z'], dtype='object')

## Multi-index and Index Hierarchy

In [129]:
# Index Levels
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside,inside))
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [130]:
df = pd.DataFrame(data=np.random.randn(6,6), columns=hier_index)
df

Unnamed: 0_level_0,G1,G1,G1,G2,G2,G2
Unnamed: 0_level_1,1,2,3,1,2,3
0,-0.348314,-0.110165,-0.317371,0.801565,-0.288864,-0.412595
1,1.209428,0.471026,0.632286,0.54285,0.206851,1.45002
2,1.352358,0.559334,1.760988,-0.639983,1.874042,-0.547077
3,-0.552415,-0.435224,1.679858,0.445369,-0.598145,-0.081073
4,0.354442,0.067879,0.73565,0.552043,0.066962,0.601251
5,0.017123,-1.256741,-0.438706,-1.688275,0.812066,-0.576715


In [132]:
df['G1']

Unnamed: 0,1,2,3
0,-0.348314,-0.110165,-0.317371
1,1.209428,0.471026,0.632286
2,1.352358,0.559334,1.760988
3,-0.552415,-0.435224,1.679858
4,0.354442,0.067879,0.73565
5,0.017123,-1.256741,-0.438706


In [133]:
df['G1'].loc[[2,3]]

Unnamed: 0,1,2,3
2,1.352358,0.559334,1.760988
3,-0.552415,-0.435224,1.679858


In [134]:
#Ver os nomes dos indexes
df.index.names

FrozenList([None])

In [159]:
#Dar nome aos indexes
df.index.names = ['Index']
df

Unnamed: 0_level_0,Company,Person,Sales
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


# Missing Values

In [135]:
d = {
    'A':[1, 2, np.nan],
    'B': [5, np.nan, np.nan],
    'C': [1, 2, 3]
}

df = pd.DataFrame(data=d)
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [136]:
# Remover todas as linhas/colunas que contenham NaN
# As modificações não serão aplicadas no dataframe a menos que especificado com o parâmetro inplace=True
# Axis 0 faz referências às linhas e 1 às colunas
df.dropna(axis=0)

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [137]:
#Indica quantos NaN serão removidos
df.dropna(thresh=3)

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [138]:
#Substituir NaN por um valor padrão definido
df['A'].fillna(value=df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

# Group By

In [139]:
data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
       'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
       'Sales':[200,120,340,124,243,350]}
       
df = pd.DataFrame(data=data)
df

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


In [140]:
#Retorna um objeto do tipo DataFrameGroupBy
df_g = df.groupby('Company')
df_g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f16309aaed0>

In [141]:
# Funções de agregação podem então ser chamadas desse objeto
df_g.count() #NaN não são contatos

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,2,2
GOOG,2,2
MSFT,2,2


In [142]:
# Exemplos de funções de agregação
df_g.sum()
df_g.max()
df_g.min()
df_g.mean()
df_g.std()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,75.660426
GOOG,56.568542
MSFT,152.735065


In [143]:
# O mais comum é fazer tudo isso em uma única linha
df.groupby('Company').mean()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,296.5
GOOG,160.0
MSFT,232.0


In [144]:
# Para criar um dataframe contendo todos os resultados das funções de agregação
df.groupby('Company').describe() #resultados por linhas
df.groupby('Company').describe().transpose() #resultados por colunas

Unnamed: 0,Company,FB,GOOG,MSFT
Sales,count,2.0,2.0,2.0
Sales,mean,296.5,160.0,232.0
Sales,std,75.660426,56.568542,152.735065
Sales,min,243.0,120.0,124.0
Sales,25%,269.75,140.0,178.0
Sales,50%,296.5,160.0,232.0
Sales,75%,323.25,180.0,286.0
Sales,max,350.0,200.0,340.0


# Merging, Joining and concatenating



Existem 3 principais formas de juntar dataframes

## Concatenating

Simplesmente combina os dados dos dataframes

In [145]:
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']},
                        index=[0, 1, 2, 3])

df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                        'B': ['B4', 'B5', 'B6', 'B7'],
                        'C': ['C4', 'C5', 'C6', 'C7'],
                        'D': ['D4', 'D5', 'D6', 'D7']},
                         index=[4, 5, 6, 7]) 

df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                        'B': ['B8', 'B9', 'B10', 'B11'],
                        'C': ['C8', 'C9', 'C10', 'C11'],
                        'D': ['D8', 'D9', 'D10', 'D11']},
                        index=[8, 9, 10, 11])

In [146]:
pd.concat([df1, df2, df3])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [147]:
# Para que a junção ocorra nas colunas
# É importante sempre verificar se a estrutura dos dataframes são equivalentes
pd.concat([df1, df2, df3], axis=1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,D.2
0,A0,B0,C0,D0,,,,,,,,
1,A1,B1,C1,D1,,,,,,,,
2,A2,B2,C2,D2,,,,,,,,
3,A3,B3,C3,D3,,,,,,,,
4,,,,,A4,B4,C4,D4,,,,
5,,,,,A5,B5,C5,D5,,,,
6,,,,,A6,B6,C6,D6,,,,
7,,,,,A7,B7,C7,D7,,,,
8,,,,,,,,,A8,B8,C8,D8
9,,,,,,,,,A9,B9,C9,D9


## Merging

Junta dois dataframes com base em uma coluna em comum

In [148]:
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
   
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                          'C': ['C0', 'C1', 'C2', 'C3'],
                          'D': ['D0', 'D1', 'D2', 'D3']})    

In [149]:
pd.merge(left, right, how='inner', on='key')

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3


In [150]:
#Exemplos mais complexos
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                        'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3']})
    
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                               'key2': ['K0', 'K0', 'K0', 'K0'],
                                  'C': ['C0', 'C1', 'C2', 'C3'],
                                  'D': ['D0', 'D1', 'D2', 'D3']})

In [151]:
pd.merge(left, right, on=['key1', 'key2'])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2


In [152]:
pd.merge(left, right, how='outer', on=['key1', 'key2'])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K0,K1,A1,B1,,
2,K1,K0,A2,B2,C1,D1
3,K1,K0,A2,B2,C2,D2
4,K2,K1,A3,B3,,
5,K2,K0,,,C3,D3


In [153]:
pd.merge(left, right, how='right', on=['key1', 'key2'])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2
3,K2,K0,,,C3,D3


In [154]:
pd.merge(left, right, how='left', on=['key1', 'key2'])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K0,K1,A1,B1,,
2,K1,K0,A2,B2,C1,D1
3,K1,K0,A2,B2,C2,D2
4,K2,K1,A3,B3,,


## Joining

Combina dados de dataframes com indexes potencialmente diferentes

In [155]:
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                      index=['K0', 'K1', 'K2']) 

right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                    'D': ['D0', 'D2', 'D3']},
                      index=['K0', 'K2', 'K3'])

In [156]:
left.join(right)

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2


In [157]:
left.join(right, how='outer')

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2
K3,,,C3,D3
