# Aprendendo Pandas com Pandas


In [None]:
# Os imports são feitos, normalmente, desta forma
import numpy as np
import pandas as pd

# Build a Series from a plain list and let pandas generate the default
# integer index; the NaN entry makes the resulting dtype float64.
s = pd.Series(
    data=[1, 2, 3, np.nan, 6, 7],
)
s

0    1.0
1    2.0
2    3.0
3    NaN
4    6.0
5    7.0
dtype: float64

## Criando um DataFrame

Além do DataFrame, índices cronológicos (datas) são criados usando `date_range()`.

In [None]:
import pandas as pd

# Chronological index: six consecutive days starting on 2013-01-01
dates = pd.date_range(start="20130101", periods=6)

dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [None]:
# Draw the 6x4 standard-normal sample from a locally seeded Generator so that
# "Restart Kernel -> Run All" rebuilds exactly the same frame on every run.
# A local Generator is used (rather than np.random.seed) so the global NumPy
# random state is left untouched.
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((6, 4)),
    index=dates,           # one row per day of the DatetimeIndex built earlier
    columns=list("ABCD"),  # four columns labelled A, B, C, D
)

df

Unnamed: 0,A,B,C,D
2013-01-01,-0.285647,1.808935,-0.200066,1.102188
2013-01-02,-0.305765,0.518399,0.76847,0.030492
2013-01-03,-0.618378,-0.805839,0.68589,-1.797472
2013-01-04,-0.751577,-0.263927,0.705221,0.43653
2013-01-05,-0.340801,0.383806,-1.762262,-0.686816
2013-01-06,-2.014722,0.644343,1.722291,-1.420497


Criando um `DataFrame` passando um dicionário de objetos.

In [None]:
# Build a DataFrame from a mapping of column name -> object. Scalars are
# broadcast to every row; the Series in "C" carries the 0..3 row index.
df2 = pd.DataFrame(
    data=dict(
        A=1.0,
        B=pd.Timestamp("20020912"),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        D=np.array([3, 3, 3, 3], dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2002-09-12,1.0,3,test,foo
1,1.0,2002-09-12,1.0,3,train,foo
2,1.0,2002-09-12,1.0,3,test,foo
3,1.0,2002-09-12,1.0,3,train,foo


## Visualizando Dados

In [None]:
# Show the top of the DataFrame (first 3 rows)
df.head(n=3)

Unnamed: 0,A,B,C,D
2013-01-01,0.494731,0.348467,-1.789281,-2.51965
2013-01-02,1.911679,-1.055254,-1.084973,0.865618
2013-01-03,0.112174,0.225342,-0.422357,-1.163655


In [None]:
# Show the bottom of the DataFrame (last 3 rows)
df.tail(n=3)

Unnamed: 0,A,B,C,D
2013-01-04,0.630929,-0.402589,0.027569,-0.561873
2013-01-05,1.267571,-0.032908,0.98237,-1.430458
2013-01-06,-1.327789,0.469253,-0.592701,-0.452456


In [None]:
# Row labels of df (its index object)
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [None]:
# Column labels of df
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

Ordenação em ordem decrescente do eixo de índice 1 (colunas).

In [None]:
# Sort by the column labels (axis 1) in descending order: D, C, B, A
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,1.102188,-0.200066,1.808935,-0.285647
2013-01-02,0.030492,0.76847,0.518399,-0.305765
2013-01-03,-1.797472,0.68589,-0.805839,-0.618378
2013-01-04,0.43653,0.705221,-0.263927,-0.751577
2013-01-05,-0.686816,-1.762262,0.383806,-0.340801
2013-01-06,-1.420497,1.722291,0.644343,-2.014722


Ordenação dos valores a partir da coluna **B**

In [None]:
# Reorder the rows by the values in column "B", smallest first
df.sort_values("B", ascending=True)

Unnamed: 0,A,B,C,D
2013-01-03,-0.618378,-0.805839,0.68589,-1.797472
2013-01-04,-0.751577,-0.263927,0.705221,0.43653
2013-01-05,-0.340801,0.383806,-1.762262,-0.686816
2013-01-02,-0.305765,0.518399,0.76847,0.030492
2013-01-06,-2.014722,0.644343,1.722291,-1.420497
2013-01-01,-0.285647,1.808935,-0.200066,1.102188


In [None]:
# Selecting a single column by label yields a Series (here, column "A")
df["A"]

2013-01-01   -0.285647
2013-01-02   -0.305765
2013-01-03   -0.618378
2013-01-04   -0.751577
2013-01-05   -0.340801
2013-01-06   -2.014722
Freq: D, Name: A, dtype: float64

Filtragem de valores com `isin()`.

In [None]:
# New frame: a copy of df with an extra string column "E" (df itself is not modified)
df2 = df.assign(E=["one", "two", "three", "four", "five", "six"])

# Keep only the rows whose "E" value is one of the listed labels
df2.loc[df2["E"].isin(["two", "three"])]

Unnamed: 0,A,B,C,D,E
2013-01-02,-0.305765,0.518399,0.76847,0.030492,two
2013-01-03,-0.618378,-0.805839,0.68589,-1.797472,three


## Inserindo informações em colunas

Podemos fazer isto de várias maneiras, seja por atribuição direta, seja pela passagem dos índices.

In [None]:
# Series indexed by six days starting on 2013-01-02, i.e. one day after the
# first label of `dates`
s1 = pd.Series(
    list(range(1, 7)),
    index=pd.date_range(start="20130102", periods=6),
)

# Assigning a Series as a column aligns it on df's index, so a row without a
# matching label in s1 gets a missing value
df["F"] = s1

# Set a single value by label (row dates[0], column "A")
df.at[dates[0], "A"] = 0

# Set a single value by integer position (row 0, column 1 -> "B")
df.iat[0, 1] = 0

# Overwrite the whole "D" column with a NumPy array of fives
df.loc[:, "D"] = np.array([5] * df.shape[0])

df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.200066,5,
2013-01-02,-0.305765,0.518399,0.76847,5,1.0
2013-01-03,-0.618378,-0.805839,0.68589,5,2.0
2013-01-04,-0.751577,-0.263927,0.705221,5,3.0
2013-01-05,-0.340801,0.383806,-1.762262,5,4.0
2013-01-06,-2.014722,0.644343,1.722291,5,5.0


Uma operação `onde` (ou `where`):

In [None]:
# Work on a copy so df itself is left unchanged
df2 = df.copy()

# Wherever a value is strictly greater than 0, replace it with its negation
df2[df2.gt(0)] = -df2

df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.200066,-5,
2013-01-02,-0.305765,-0.518399,-0.76847,-5,-1.0
2013-01-03,-0.618378,-0.805839,-0.68589,-5,-2.0
2013-01-04,-0.751577,-0.263927,-0.705221,-5,-3.0
2013-01-05,-0.340801,-0.383806,-1.762262,-5,-4.0
2013-01-06,-2.014722,-0.644343,-1.722291,-5,-5.0


## Missing Data

Dados ausentes (*missing data*) podem ser tratados de maneira simples.


In [None]:
# Reindex to the first four dates and add a new column "E" after the existing ones
df1 = df.reindex(index=dates[:4], columns=[*df.columns, "E"])

# Label-based slicing includes both endpoints, so this sets "E" on the first two rows
df1.loc[dates[0]:dates[1], "E"] = 1

df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.200066,5,,1.0
2013-01-02,-0.305765,0.518399,0.76847,5,1.0,1.0
2013-01-03,-0.618378,-0.805839,0.68589,5,2.0,
2013-01-04,-0.751577,-0.263927,0.705221,5,3.0,


O método `dropna()` retira cada linha que contém um NaN como valor.

In [None]:
# Drop every row (axis 0) that contains at least one missing value
df1.dropna(axis="index", how="any")

Unnamed: 0,A,B,C,D,F,E
2013-01-02,-0.305765,0.518399,0.76847,5,1.0,1.0


O método `fillna()` preenche os dados vazios com algum valor.

In [None]:
# Replace every missing value with the constant 5 (returns a new frame)
df1.fillna(5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.200066,5,5.0,1.0
2013-01-02,-0.305765,0.518399,0.76847,5,1.0,1.0
2013-01-03,-0.618378,-0.805839,0.68589,5,2.0,5.0
2013-01-04,-0.751577,-0.263927,0.705221,5,3.0,5.0


In [None]:
# Boolean mask with True wherever df1 holds a missing value
df1.isna()

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True
