In [30]:
import pandas as pd
import numpy as np
from pprint import pprint as pp

[Series e DataFrames](#criação-de-objetos-series-e-dataframe)

[Visualização e informações](#visualização-e-informações)

[Seleção](#seleção)

[Operações](#operações)

[Mesclando](#mesclando)

## Criação de objetos Series e DataFrame

In [23]:
# Series: one-dimensional labelled array. The np.nan entry makes the whole
# Series float64, which is why the integers print as 1.0, 2.0, ...
s = pd.Series(
    [1, 2, 4, np.nan, 9, 10]
)

# DataFrame: six consecutive days starting on 2024-02-19 become the row index.
datas = pd.date_range(
    '20240219',
    periods=6
)

# Seeded generator: an unseeded np.random.randn call produced a different
# table on every run, so no saved output could be reproduced with
# Restart Kernel -> Run All.
rng = np.random.default_rng(42)

df = pd.DataFrame(
    rng.standard_normal((6, 5)),
    index=datas,
    columns=list('ABCDE')
)

# DataFrame built from a dict: each key is a column. Scalars ("A", "B", "F")
# are repeated to match the four-element columns, and every column keeps its
# own dtype (float, timestamp, float32, int32, categorical, string).
df_com_dicionario = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

print('### Series ###')
display(s)

print('### DataFrame ###')
display(df)

print('### DataFrame com dicionario ###')
display(df_com_dicionario)

### Series ###


0     1.0
1     2.0
2     4.0
3     NaN
4     9.0
5    10.0
dtype: float64

### DataFrame ###


Unnamed: 0,A,B,C,D,E
2024-02-19,-0.080209,-0.319142,-0.171387,0.667153,0.460364
2024-02-20,-0.152605,0.236786,0.165279,-0.317727,-1.35978
2024-02-21,-1.08873,-0.399587,-1.369993,0.052073,-0.527293
2024-02-22,-1.046419,0.149291,1.214983,0.674852,1.014127
2024-02-23,0.615652,-0.346969,-0.123025,2.125909,0.021837
2024-02-24,-0.059367,-0.565885,0.836691,0.987279,2.166143


### DataFrame com dicionario ###


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


## Visualização e informações

In [25]:
# Preview the first two rows of df.
df.head(2)

Unnamed: 0,A,B,C,D,E
2024-02-19,-0.080209,-0.319142,-0.171387,0.667153,0.460364
2024-02-20,-0.152605,0.236786,0.165279,-0.317727,-1.35978


In [26]:
# Preview the last two rows of df.
df.tail(2)

Unnamed: 0,A,B,C,D,E
2024-02-23,0.615652,-0.346969,-0.123025,2.125909,0.021837
2024-02-24,-0.059367,-0.565885,0.836691,0.987279,2.166143


In [32]:
# Row labels of df: the date range `datas` that was passed as `index`.
df.index

DatetimeIndex(['2024-02-19', '2024-02-20', '2024-02-21', '2024-02-22',
               '2024-02-23', '2024-02-24'],
              dtype='datetime64[ns]', freq='D')

In [36]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [39]:
# Export the values of df as a NumPy array; row and column labels are dropped.
df.to_numpy()

array([[-0.0802092 , -0.31914183, -0.1713868 ,  0.66715302,  0.46036394],
       [-0.15260481,  0.23678575,  0.16527938, -0.31772655, -1.3597802 ],
       [-1.08872979, -0.39958651, -1.36999297,  0.0520727 , -0.52729298],
       [-1.04641912,  0.14929132,  1.21498288,  0.67485231,  1.01412728],
       [ 0.61565173, -0.34696921, -0.12302462,  2.12590855,  0.02183672],
       [-0.05936673, -0.56588474,  0.83669123,  0.98727946,  2.16614255]])

In [46]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D,E
count,6.0,6.0,6.0,6.0,6.0
mean,-0.301946,-0.207584,0.092092,0.698257,0.2959
std,0.655068,0.323092,0.902669,0.845342,1.227578
min,-1.08873,-0.565885,-1.369993,-0.317727,-1.35978
25%,-0.822966,-0.386432,-0.159296,0.205843,-0.390011
50%,-0.116407,-0.333056,0.021127,0.671003,0.2411
75%,-0.064577,0.032183,0.668838,0.909173,0.875686
max,0.615652,0.236786,1.214983,2.125909,2.166143


In [48]:
# Transpose df: the column labels become rows and the dates become columns.
df.T

Unnamed: 0,2024-02-19,2024-02-20,2024-02-21,2024-02-22,2024-02-23,2024-02-24
A,-0.080209,-0.152605,-1.08873,-1.046419,0.615652,-0.059367
B,-0.319142,0.236786,-0.399587,0.149291,-0.346969,-0.565885
C,-0.171387,0.165279,-1.369993,1.214983,-0.123025,0.836691
D,0.667153,-0.317727,0.052073,0.674852,2.125909,0.987279
E,0.460364,-1.35978,-0.527293,1.014127,0.021837,2.166143


In [56]:
# Two orderings of df, shown one after the other:
#   1. columns arranged by label, ascending;
#   2. rows arranged by the values of column B, largest first.
display(
    df.sort_index(axis='columns', ascending=True),
    df.sort_values('B', ascending=False),
)

Unnamed: 0,A,B,C,D,E
2024-02-19,-0.080209,-0.319142,-0.171387,0.667153,0.460364
2024-02-20,-0.152605,0.236786,0.165279,-0.317727,-1.35978
2024-02-21,-1.08873,-0.399587,-1.369993,0.052073,-0.527293
2024-02-22,-1.046419,0.149291,1.214983,0.674852,1.014127
2024-02-23,0.615652,-0.346969,-0.123025,2.125909,0.021837
2024-02-24,-0.059367,-0.565885,0.836691,0.987279,2.166143


Unnamed: 0,A,B,C,D,E
2024-02-20,-0.152605,0.236786,0.165279,-0.317727,-1.35978
2024-02-22,-1.046419,0.149291,1.214983,0.674852,1.014127
2024-02-19,-0.080209,-0.319142,-0.171387,0.667153,0.460364
2024-02-23,0.615652,-0.346969,-0.123025,2.125909,0.021837
2024-02-21,-1.08873,-0.399587,-1.369993,0.052073,-0.527293
2024-02-24,-0.059367,-0.565885,0.836691,0.987279,2.166143


## Seleção

In [87]:
# A single column can be taken with brackets or with attribute access;
# both forms are displayed so the results can be compared.
print('###\nModos de seleção única\n###')
display(df['A'], df.A)

# Label-based selection with .loc: the row labelled with the first date.
primeira_data = datas[0]
print('###\nModos de seleção por etiqueta\n###')
display(df.loc[primeira_data])

###
Modos de seleção única
###


2024-02-19   -0.080209
2024-02-20   -0.152605
2024-02-21   -1.088730
2024-02-22   -1.046419
2024-02-23    0.615652
2024-02-24   -0.059367
Freq: D, Name: A, dtype: float64

2024-02-19   -0.080209
2024-02-20   -0.152605
2024-02-21   -1.088730
2024-02-22   -1.046419
2024-02-23    0.615652
2024-02-24   -0.059367
Freq: D, Name: A, dtype: float64

###
Modos de seleção por etiqueta
###


A   -0.080209
B   -0.319142
C   -0.171387
D    0.667153
E    0.460364
Name: 2024-02-19 00:00:00, dtype: float64

In [88]:
# Each entry pairs a section banner with a multi-column selection: first a
# list of column names inside [], then the label-based .loc form.
for titulo, selecao in (
    ('###\nModos de seleção múltiplas\n###', df[['A', 'D']]),
    ('###\nModos de seleção múltiplas por etiqueta\n###', df.loc[:, ['A', 'B']]),
):
    print(titulo)
    display(selecao)

###
Modos de seleção múltiplas
###


Unnamed: 0,A,D
2024-02-19,-0.080209,0.667153
2024-02-20,-0.152605,-0.317727
2024-02-21,-1.08873,0.052073
2024-02-22,-1.046419,0.674852
2024-02-23,0.615652,2.125909
2024-02-24,-0.059367,0.987279


###
Modos de seleção múltiplas por etiqueta
###


Unnamed: 0,A,B
2024-02-19,-0.080209,-0.319142
2024-02-20,-0.152605,0.236786
2024-02-21,-1.08873,-0.399587
2024-02-22,-1.046419,0.149291
2024-02-23,0.615652,-0.346969
2024-02-24,-0.059367,-0.565885


In [100]:
# Date labels bounding the first three rows; label slices include both ends.
inicio, fim = '2024-02-19', '2024-02-21'

# Positional slice on the rows.
print('Por linha, pelo número de index')
display(df[:3])

# Same rows, sliced by their index labels.
print('Por linha, pelo nome do index')
display(df[inicio:fim])

# .loc with a row-label slice and a column-label slice.
print('Por linha, por etiqueta')
display(df.loc[inicio:fim, 'A':'E'])

# .loc with a single row label and a single column label gives one scalar.
print('Seleção única, dia 19-02-2024 da coluna A')
display(df.loc[inicio, 'A'])

print('Existe também a por posição, df.iloc[:3, :2]')

Por linha, pelo número de index


Unnamed: 0,A,B,C,D,E
2024-02-19,-0.080209,-0.319142,-0.171387,0.667153,0.460364
2024-02-20,-0.152605,0.236786,0.165279,-0.317727,-1.35978
2024-02-21,-1.08873,-0.399587,-1.369993,0.052073,-0.527293


Por linha, pelo nome do index


Unnamed: 0,A,B,C,D,E
2024-02-19,-0.080209,-0.319142,-0.171387,0.667153,0.460364
2024-02-20,-0.152605,0.236786,0.165279,-0.317727,-1.35978
2024-02-21,-1.08873,-0.399587,-1.369993,0.052073,-0.527293


Por linha, por etiqueta


Unnamed: 0,A,B,C,D,E
2024-02-19,-0.080209,-0.319142,-0.171387,0.667153,0.460364
2024-02-20,-0.152605,0.236786,0.165279,-0.317727,-1.35978
2024-02-21,-1.08873,-0.399587,-1.369993,0.052073,-0.527293


Seleção única, dia 19-02-2024 da coluna A


-0.0802091975233623

Existe também a por posição, df.iloc[:3, :2]


In [120]:
# Row filter: keep only the rows whose value in column A is positive.
a_positivo = df['A'] > 0
print('Por Booleana')
display(df[a_positivo])

# Frame-wide mask: cells that fail the test become NaN, and fillna('')
# replaces those NaN with empty strings for display.
celulas_positivas = df > 0
print('É possível dar um "fillna("")"')
display(df[celulas_positivas].fillna(''))

# Same masking idea with isin(): work on a copy that gains an integer
# column F, then keep only the cells equal to 4.
print('Com o "isin([4].fillna(""))"')
df2 = df.copy()
df2['F'] = [2, 3, 4, 4, 5, 4]
iguais_a_quatro = df2.isin([4])
display(df2[iguais_a_quatro].fillna(''))

Por Booleana


Unnamed: 0,A,B,C,D,E
2024-02-23,0.615652,-0.346969,-0.123025,2.125909,0.021837


É possível dar um "fillna("")"


Unnamed: 0,A,B,C,D,E
2024-02-19,,,,0.667153,0.460364
2024-02-20,,0.236786,0.165279,,
2024-02-21,,,,0.052073,
2024-02-22,,0.149291,1.214983,0.674852,1.014127
2024-02-23,0.615652,,,2.125909,0.021837
2024-02-24,,,0.836691,0.987279,2.166143


Com o "isin([4].fillna(""))"


Unnamed: 0,A,B,C,D,E,F
2024-02-19,,,,,,
2024-02-20,,,,,,
2024-02-21,,,,,,4.0
2024-02-22,,,,,,4.0
2024-02-23,,,,,,
2024-02-24,,,,,,4.0


In [129]:
# Element-wise missing-value mask: True wherever a cell of df is missing.
pd.isna(df)

Unnamed: 0,A,B,C,D,E
2024-02-19,False,False,False,False,False
2024-02-20,False,False,False,False,False
2024-02-21,False,False,False,False,False
2024-02-22,False,False,False,False,False
2024-02-23,False,False,False,False,False
2024-02-24,False,False,False,False,False


## Operações

In [134]:
# Compute the three means first, then report them in order.
media_por_coluna = df.mean(axis='index')    # one value per column
media_por_linha = df.mean(axis='columns')   # one value per row (date)
media_de_a = df['A'].mean()                 # scalar for column A only

print('Média de cada coluna')
display(media_por_coluna)

print('Média de cada linha')
display(media_por_linha)

print('Média em uma coluna específica, "A"')
display(media_de_a)



Média de cada coluna


A   -0.301946
B   -0.207584
C    0.092092
D    0.698257
E    0.295900
dtype: float64

Média de cada linha


2024-02-19    0.111356
2024-02-20   -0.285609
2024-02-21   -0.666706
2024-02-22    0.401367
2024-02-23    0.458681
2024-02-24    0.672972
Freq: D, dtype: float64

Média em uma coluna específica, "A"


-0.30194632038959357

In [143]:
# Show df as it is, followed by the result of applying abs() through
# transform, for a side-by-side comparison.
display(df, df.transform(abs))

Unnamed: 0,A,B,C,D,E
2024-02-19,-0.080209,-0.319142,-0.171387,0.667153,0.460364
2024-02-20,-0.152605,0.236786,0.165279,-0.317727,-1.35978
2024-02-21,-1.08873,-0.399587,-1.369993,0.052073,-0.527293
2024-02-22,-1.046419,0.149291,1.214983,0.674852,1.014127
2024-02-23,0.615652,-0.346969,-0.123025,2.125909,0.021837
2024-02-24,-0.059367,-0.565885,0.836691,0.987279,2.166143


Unnamed: 0,A,B,C,D,E
2024-02-19,0.080209,0.319142,0.171387,0.667153,0.460364
2024-02-20,0.152605,0.236786,0.165279,0.317727,1.35978
2024-02-21,1.08873,0.399587,1.369993,0.052073,0.527293
2024-02-22,1.046419,0.149291,1.214983,0.674852,1.014127
2024-02-23,0.615652,0.346969,0.123025,2.125909,0.021837
2024-02-24,0.059367,0.565885,0.836691,0.987279,2.166143


In [151]:
# How many times each distinct value occurs in column A.
display(df["A"].value_counts())

# Display every value of column A, one output per element. The column is
# converted to a NumPy array once and iterated directly, instead of
# rebuilding the array and indexing into it on every pass of a
# range(len(df)) loop.
for valor in df["A"].to_numpy():
    display(valor)

A
-0.080209    1
-0.152605    1
-1.088730    1
-1.046419    1
 0.615652    1
-0.059367    1
Name: count, dtype: int64

-0.0802091975233623

-0.1526048072411058

-1.0887297949113017

-1.0464191207652336

0.6156517259969942

-0.05936672789355238

## Mesclando

In [155]:
# Input frames for the merge examples.
# left / right: both rows share the key "foo" on each side.
left = pd.DataFrame(dict(key=["foo", "foo"], lval=[1, 2]))
right = pd.DataFrame(dict(key=["foo", "foo"], rval=[4, 5]))

# left2 / right2: each row has its own key ("foo", "bar").
left2 = pd.DataFrame(dict(key=["foo", "bar"], lval=[1, 2]))
right2 = pd.DataFrame(dict(key=["foo", "bar"], rval=[4, 5]))

In [156]:
# Merge each left/right pair on "key" and show the result under its title.
# With the repeated key every left row matches every right row; with
# distinct keys each row matches exactly one partner.
for titulo, (esquerda, direita) in (
    ('Por chaves identicas', (left, right)),
    ('Por chaves exclusivas', (left2, right2)),
):
    print(titulo)
    display(pd.merge(esquerda, direita, on="key"))

Por chaves identicas


Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


Por chaves exclusivas


Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5
