# Manipulação de Dados com Pandas - Python

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample values with one missing entry (NaN). This is a plain Python list,
# not a pandas Series.
series = [7, 4, 2, np.nan, 6, 9]
series

[7, 4, 2, nan, 6, 9]

In [3]:
# Check which container type `series` actually is.
type(series)

list

In [4]:
# Build an index of six consecutive daily dates starting on 2018-01-01.
data = pd.date_range(start='20180101', periods=6)

In [5]:
# Display the DatetimeIndex created above.
data

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:
# Create a DataFrame of random numbers, using the dates created earlier as the
# row index. randn(6, 4) draws 6 rows by 4 columns of standard-normal samples;
# the columns are labelled A-D.
# Seeding NumPy's global generator first makes the values reproducible on
# every Restart & Run All.
np.random.seed(42)
df = pd.DataFrame(np.random.randn(6, 4), index=data, columns=list('ABCD'))

In [8]:
# Display the DataFrame.
df

Unnamed: 0,A,B,C,D
2018-01-01,-1.083591,-0.849522,0.439021,-1.771952
2018-01-02,0.72209,0.645839,0.602345,-0.248874
2018-01-03,-0.505782,0.189015,0.014541,-0.841502
2018-01-04,0.950462,0.365604,-0.930167,0.470656
2018-01-05,-0.263711,0.208123,-0.258834,-0.161585
2018-01-06,0.488903,-1.00988,2.07923,-0.484692


In [10]:
# Check the type of `df`.

type(df)

pandas.core.frame.DataFrame

In [11]:
# Another way to build a DataFrame: from a dict mapping column labels to
# values. Scalar entries (A, B, F) are repeated on every row; the array-like
# entries (C, D, E) each supply four values.
df2 = pd.DataFrame(
    {
        'A': 7,
        'B': pd.Timestamp('20190101'),
        'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        'D': np.array([3, 3, 3, 3], dtype='int32'),
        'E': pd.Categorical(['test', 'train', 'test', 'train']),
        'F': 'Python',
    }
)

In [12]:
# Display the dict-built DataFrame.
df2

Unnamed: 0,A,B,C,D,E,F
0,7,2019-01-01,1.0,3,test,Python
1,7,2019-01-01,1.0,3,train,Python
2,7,2019-01-01,1.0,3,test,Python
3,7,2019-01-01,1.0,3,train,Python


In [14]:
# Inspect the dtype of each column.
df2.dtypes

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Criando um Terceiro DataFrame

In [18]:
# Third example: 60 daily dates starting on 2019-01-01, used as the row index
# of a 60x5 frame of standard-normal samples with columns A-E.
# Note: this rebinds both `data` and `df` from the earlier cells.
# Seeding NumPy's global generator makes the values reproducible on every
# Restart & Run All.
np.random.seed(42)
data = pd.date_range('20190101', periods=60, freq='D')

df = pd.DataFrame(np.random.randn(60, 5), index=data, columns=list('ABCDE'))

In [19]:
# Preview only the first rows rather than rendering the whole 60-row frame.
df.head(10)

Unnamed: 0,A,B,C,D,E
2019-01-01,1.279378,1.714901,-2.74828,0.563366,-0.781734
2019-01-02,-1.923841,-0.116341,-0.753045,0.921399,-0.685622
2019-01-03,0.819105,1.664087,0.76896,1.256874,0.197748
2019-01-04,0.092787,-0.525227,0.244183,1.970564,0.938943
2019-01-05,0.341919,-0.715365,1.363344,1.447022,1.215355
2019-01-06,1.22949,-0.653691,-1.671514,0.340238,0.98158
2019-01-07,-0.632821,1.608482,-0.648588,1.190914,0.240317
2019-01-08,-1.916005,0.089007,1.035161,1.117194,0.367738
2019-01-09,0.196397,-1.427208,-0.196197,-1.891822,0.493924
2019-01-10,0.937249,-0.662815,-1.231659,-1.109656,0.475241


In [20]:
# Check the dimensions of the DataFrame as (rows, columns).

df.shape

(60, 5)

## Inserindo uma Nova Coluna

In [21]:
# Add a new column 'F' holding the constant 1 on every row (modifies `df` in place).
df['F'] = 1

In [22]:
# Preview the first 10 rows to confirm the new column.
df.head(10)

Unnamed: 0,A,B,C,D,E,F
2019-01-01,1.279378,1.714901,-2.74828,0.563366,-0.781734,1
2019-01-02,-1.923841,-0.116341,-0.753045,0.921399,-0.685622,1
2019-01-03,0.819105,1.664087,0.76896,1.256874,0.197748,1
2019-01-04,0.092787,-0.525227,0.244183,1.970564,0.938943,1
2019-01-05,0.341919,-0.715365,1.363344,1.447022,1.215355,1
2019-01-06,1.22949,-0.653691,-1.671514,0.340238,0.98158,1
2019-01-07,-0.632821,1.608482,-0.648588,1.190914,0.240317,1
2019-01-08,-1.916005,0.089007,1.035161,1.117194,0.367738,1
2019-01-09,0.196397,-1.427208,-0.196197,-1.891822,0.493924,1
2019-01-10,0.937249,-0.662815,-1.231659,-1.109656,0.475241,1


In [23]:
# Insert a column whose values follow a running sequence 0, 1, 2, ...
# The range is sized from the frame itself, so this keeps working (instead of
# raising a length-mismatch error) if the number of rows changes.
df['G'] = range(len(df))

In [24]:
# Preview the first 15 rows to check the sequence column.
df.head(15)

Unnamed: 0,A,B,C,D,E,F,G
2019-01-01,1.279378,1.714901,-2.74828,0.563366,-0.781734,1,0
2019-01-02,-1.923841,-0.116341,-0.753045,0.921399,-0.685622,1,1
2019-01-03,0.819105,1.664087,0.76896,1.256874,0.197748,1,2
2019-01-04,0.092787,-0.525227,0.244183,1.970564,0.938943,1,3
2019-01-05,0.341919,-0.715365,1.363344,1.447022,1.215355,1,4
2019-01-06,1.22949,-0.653691,-1.671514,0.340238,0.98158,1,5
2019-01-07,-0.632821,1.608482,-0.648588,1.190914,0.240317,1,6
2019-01-08,-1.916005,0.089007,1.035161,1.117194,0.367738,1,7
2019-01-09,0.196397,-1.427208,-0.196197,-1.891822,0.493924,1,8
2019-01-10,0.937249,-0.662815,-1.231659,-1.109656,0.475241,1,9


In [25]:
# Create a 'Produto' (product) column: the element-wise product of A and B.
df['Produto'] = df['A'].mul(df['B'])

In [26]:
# Preview the first 10 rows including the new product column.
df.head(10)

Unnamed: 0,A,B,C,D,E,F,G,Produto
2019-01-01,1.279378,1.714901,-2.74828,0.563366,-0.781734,1,0,2.194007
2019-01-02,-1.923841,-0.116341,-0.753045,0.921399,-0.685622,1,1,0.223822
2019-01-03,0.819105,1.664087,0.76896,1.256874,0.197748,1,2,1.363062
2019-01-04,0.092787,-0.525227,0.244183,1.970564,0.938943,1,3,-0.048734
2019-01-05,0.341919,-0.715365,1.363344,1.447022,1.215355,1,4,-0.244597
2019-01-06,1.22949,-0.653691,-1.671514,0.340238,0.98158,1,5,-0.803707
2019-01-07,-0.632821,1.608482,-0.648588,1.190914,0.240317,1,6,-1.017881
2019-01-08,-1.916005,0.089007,1.035161,1.117194,0.367738,1,7,-0.170539
2019-01-09,0.196397,-1.427208,-0.196197,-1.891822,0.493924,1,8,-0.2803
2019-01-10,0.937249,-0.662815,-1.231659,-1.109656,0.475241,1,9,-0.621223


In [27]:
# List the column labels now present in the frame.
df.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'Produto'], dtype='object')

In [32]:
# Extract only the data values as a NumPy array, without the index.

df.to_numpy()

array([[ 1.27937850e+00,  1.71490091e+00, -2.74828024e+00,
         5.63366160e-01, -7.81733676e-01,  1.00000000e+00,
         0.00000000e+00,  2.19400734e+00],
       [-1.92384133e+00, -1.16341373e-01, -7.53044777e-01,
         9.21398988e-01, -6.85621708e-01,  1.00000000e+00,
         1.00000000e+00,  2.23822341e-01],
       [ 8.19104695e-01,  1.66408732e+00,  7.68960493e-01,
         1.25687351e+00,  1.97748159e-01,  1.00000000e+00,
         2.00000000e+00,  1.36306174e+00],
       [ 9.27871624e-02, -5.25227343e-01,  2.44182834e-01,
         1.97056394e+00,  9.38942926e-01,  1.00000000e+00,
         3.00000000e+00, -4.87343548e-02],
       [ 3.41919122e-01, -7.15365044e-01,  1.36334435e+00,
         1.44702201e+00,  1.21535514e+00,  1.00000000e+00,
         4.00000000e+00, -2.44596988e-01],
       [ 1.22949011e+00, -6.53691296e-01, -1.67151414e+00,
         3.40238037e-01,  9.81580253e-01,  1.00000000e+00,
         5.00000000e+00, -8.03706981e-01],
       [-6.32820606e-01,  1.608482

In [33]:
# Transpose the data set: column labels become rows and dates become columns.

df.T

Unnamed: 0,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,2019-01-06,2019-01-07,2019-01-08,2019-01-09,2019-01-10,...,2019-02-20,2019-02-21,2019-02-22,2019-02-23,2019-02-24,2019-02-25,2019-02-26,2019-02-27,2019-02-28,2019-03-01
A,1.279378,-1.923841,0.819105,0.092787,0.341919,1.22949,-0.632821,-1.916005,0.196397,0.937249,...,-0.104012,-0.444725,1.019039,0.03152,-0.708232,-0.136171,0.640434,0.632602,0.348968,0.000335
B,1.714901,-0.116341,1.664087,-0.525227,-0.715365,-0.653691,1.608482,0.089007,-1.427208,-0.662815,...,0.563192,-0.255854,-0.775222,1.760072,-0.325284,1.124502,1.133563,-0.574251,0.580816,1.027228
C,-2.74828,-0.753045,0.76896,0.244183,1.363344,-1.671514,-0.648588,1.035161,-0.196197,-1.231659,...,-0.276563,-0.566205,0.6065,-0.24201,1.046064,0.334121,1.59309,1.547414,-0.283067,1.017993
D,0.563366,0.921399,1.256874,1.970564,1.447022,0.340238,1.190914,1.117194,-1.891822,-1.109656,...,-2.596532,-1.83405,0.081105,0.480712,1.397138,-0.476751,-0.585275,-0.601216,-0.339407,-1.88501
E,-0.781734,-0.685622,0.197748,0.938943,1.215355,0.98158,0.240317,0.367738,0.493924,0.475241,...,0.148667,-1.190525,-0.293647,0.737353,0.806718,0.610292,-0.765074,0.209687,1.387002,0.91615
F,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
G,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,50.0,51.0,52.0,53.0,54.0,55.0,56.0,57.0,58.0,59.0
Produto,2.194007,0.223822,1.363062,-0.048734,-0.244597,-0.803707,-1.017881,-0.170539,-0.2803,-0.621223,...,-0.058579,0.113785,-0.789981,0.055477,0.230377,-0.153125,0.725972,-0.363272,0.202686,0.000344


## Combinando DataFrames

In [34]:
# First frame to combine: columns A-D, rows labelled 0-3, and each cell
# holding the text '<column label><row label>' (e.g. 'A0').
df3 = pd.DataFrame(
    {col: [col + str(row) for row in range(4)] for col in 'ABCD'},
    index=list(range(4)),
)

In [35]:
# Second frame to combine: columns A-D, rows labelled 4-7, and each cell
# holding the text '<column label><row label>' (e.g. 'A4').
df4 = pd.DataFrame(
    {col: [col + str(row) for row in range(4, 8)] for col in 'ABCD'},
    index=list(range(4, 8)),
)

In [36]:
# Third frame to combine: columns A-D, rows labelled 8-11, and each cell
# holding the text '<column label><row label>' (e.g. 'A8').
df5 = pd.DataFrame(
    {col: [col + str(row) for row in range(8, 12)] for col in 'ABCD'},
    index=list(range(8, 12)),
)

In [37]:
# One way to concatenate: first collect the frames in a list.

frames = [df3, df4, df5]

In [38]:
# Display the list; it still holds three separate DataFrames.
frames

[    A   B   C   D
 0  A0  B0  C0  D0
 1  A1  B1  C1  D1
 2  A2  B2  C2  D2
 3  A3  B3  C3  D3,
     A   B   C   D
 4  A4  B4  C4  D4
 5  A5  B5  C5  D5
 6  A6  B6  C6  D6
 7  A7  B7  C7  D7,
       A    B    C    D
 8    A8   B8   C8   D8
 9    A9   B9   C9   D9
 10  A10  B10  C10  D10
 11  A11  B11  C11  D11]

In [40]:
# Confirm that `frames` is a plain list, not a DataFrame.
type(frames)

list

In [42]:
# Concatenate the list of frames into a single DataFrame.

framescombinados = pd.concat(frames)
framescombinados

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [43]:
# The concatenated result is a DataFrame, which is easier to work with than a list of frames.

type(framescombinados)

pandas.core.frame.DataFrame

In [46]:
# Another way to concatenate: label each source frame with a key. Passing a
# mapping to pd.concat uses its keys as the outer level of the row index.
grupo = pd.concat({'f1': df3, 'f2': df4, 'f3': df5})
grupo

Unnamed: 0,Unnamed: 1,A,B,C,D
f1,0,A0,B0,C0,D0
f1,1,A1,B1,C1,D1
f1,2,A2,B2,C2,D2
f1,3,A3,B3,C3,D3
f2,4,A4,B4,C4,D4
f2,5,A5,B5,C5,D5
f2,6,A6,B6,C6,D6
f2,7,A7,B7,C7,D7
f3,8,A8,B8,C8,D8
f3,9,A9,B9,C9,D9


In [47]:
# Filter by key: select the rows that came from the frame labelled 'f2'.

grupo.loc['f2']

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [48]:
# Another way to concatenate the three frames row-wise.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so
# the chained .append() calls are replaced by a single pd.concat, which
# produces the same stacked result on every pandas version.
g2 = pd.concat([df3, df4, df5])
g2

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9
