# Pandas

In [9]:
import pandas as pd
from datetime import datetime
import yfinance as yf

In [12]:
dados_acao = yf.download(tickers = ['CASH3.SA', 'VALE3.SA'], start='2015-01-01', end=datetime.now())['Adj Close']
dados_acao

[*********************100%%**********************]  2 of 2 completed


Unnamed: 0_level_0,CASH3.SA,VALE3.SA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-02,,12.725255
2015-01-05,,12.533892
2015-01-06,,13.036206
2015-01-07,,13.514600
2015-01-08,,13.658116
...,...,...
2023-10-05,6.59,65.900002
2023-10-06,6.46,66.860001
2023-10-09,6.55,66.379997
2023-10-10,6.73,66.779999


In [13]:
#criando tabela com dados ausentes
dados_teste = pd.DataFrame({
    'empresa': ['Weg', 'Petrobras', 'Vale', pd.NA],
    'cotacao': [20, 50, pd.NA, pd.NA],
    'volume': [3000, pd.NA, pd.NA, pd.NA]
    })
dados_teste

Unnamed: 0,empresa,cotacao,volume
0,Weg,20.0,3000.0
1,Petrobras,50.0,
2,Vale,,
3,,,


In [14]:
#removendo dados ausentes
print(dados_acao.dropna())
#remove todas as linhas que contenham na's
#independente de ter valores em outras colunas

             CASH3.SA   VALE3.SA
Date                            
2020-11-09  15.814895  46.553234
2020-11-10  16.331503  46.531139
2020-11-11  16.414827  46.855286
2020-11-12  15.914884  46.170143
2020-11-13  16.248180  46.597439
...               ...        ...
2023-10-05   6.590000  65.900002
2023-10-06   6.460000  66.860001
2023-10-09   6.550000  66.379997
2023-10-10   6.730000  66.779999
2023-10-11   6.660000  67.360001

[728 rows x 2 columns]


In [21]:
print(dados_teste)

print('\nqualquer na')
print(dados_teste.dropna())

print('\n remove somente as linhas em que TODAS as colunas são NA')
print(dados_teste.dropna(how= 'all'))

print('\n remove somente as linhas as colunas predefinidas são NA')
print(dados_teste.dropna(subset=['empresa', 'cotacao']))

     empresa cotacao volume
0        Weg      20   3000
1  Petrobras      50   <NA>
2       Vale    <NA>   <NA>
3       <NA>    <NA>   <NA>

qualquer na
  empresa cotacao volume
0     Weg      20   3000

 remove somente as linhas em que TODAS as colunas são NA
     empresa cotacao volume
0        Weg      20   3000
1  Petrobras      50   <NA>
2       Vale    <NA>   <NA>

 remove somente as linhas as colunas predefinidas são NA
     empresa cotacao volume
0        Weg      20   3000
1  Petrobras      50   <NA>


In [24]:
#preenchendo dados ausentes
print('preenchendo com 0')
print(dados_teste.fillna(0))

print('\n preenchendo com condicional de tabelas')
print(dados_teste.fillna({'empresa': 'Nenhuma', 'cotacao': 0, 'volume': -10}))

preenchendo com 0
     empresa  cotacao  volume
0        Weg       20    3000
1  Petrobras       50       0
2       Vale        0       0
3          0        0       0

 preenchendo com condicional de tabelas
     empresa  cotacao  volume
0        Weg       20    3000
1  Petrobras       50     -10
2       Vale        0     -10
3    Nenhuma        0     -10


In [25]:
#com series / colunas unicas
cotacoes = pd.Series(
    [20, pd.NA, pd.NA, pd.NA, 23, 24, pd.NA, 25], 
    index = pd.date_range('30/05/2023', freq= 'M', periods= 8)
    )

cotacoes

2023-05-31      20
2023-06-30    <NA>
2023-07-31    <NA>
2023-08-31    <NA>
2023-09-30      23
2023-10-31      24
2023-11-30    <NA>
2023-12-31      25
Freq: M, dtype: object

In [32]:
print('preenchendo repetindo o valor anterior')
print(cotacoes.fillna(method='ffill'))

print('\npreenchendo usando o valor anterior PORÉM com limite de 2 preenchimentos')
print(cotacoes.fillna(method='ffill', limit= 2))

print('\npreenchendo usando o valor médio da série')
print(cotacoes.fillna(cotacoes.mean()))


preenchendo repetindo o valor anterior
2023-05-31    20
2023-06-30    20
2023-07-31    20
2023-08-31    20
2023-09-30    23
2023-10-31    24
2023-11-30    24
2023-12-31    25
Freq: M, dtype: int64

preenchendo usando o valor anterior PORÉM com limite de 2 preenchimentos
2023-05-31      20
2023-06-30      20
2023-07-31      20
2023-08-31    <NA>
2023-09-30      23
2023-10-31      24
2023-11-30      24
2023-12-31      25
Freq: M, dtype: object

preenchendo usando o valor médio da série
2023-05-31    20.0
2023-06-30    23.0
2023-07-31    23.0
2023-08-31    23.0
2023-09-30    23.0
2023-10-31    24.0
2023-11-30    23.0
2023-12-31    25.0
Freq: M, dtype: float64


  print(cotacoes.fillna(method='ffill'))
  print(cotacoes.fillna(method='ffill', limit= 2))


### Desafio

In [None]:
#PETR4.SA LJQQ3.SA CASH3.SA - valor ajus -> 2015 sem NA
#... LJQQ3 sem NA CASH3 com NA

#EUCA3 ffill 2 dias

In [37]:
acoes = ['PETR4.SA', 'LJQQ3.SA', 'CASH3.SA']
desafio = yf.download(tickers = acoes, start='2015-01-01', end=datetime.now())['Adj Close']
desafio.dropna()

[*********************100%%**********************]  3 of 3 completed


Unnamed: 0_level_0,CASH3.SA,LJQQ3.SA,PETR4.SA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-11-09,15.814895,14.748903,8.259479
2020-11-10,16.331503,14.011458,8.821322
2020-11-11,16.414827,14.650577,8.744880
2020-11-12,15.914884,14.208110,8.374140
2020-11-13,16.248180,14.660409,8.649329
...,...,...,...
2023-10-05,6.590000,3.490000,32.730000
2023-10-06,6.460000,3.460000,33.509998
2023-10-09,6.550000,3.510000,34.950001
2023-10-10,6.730000,3.680000,35.209999


In [38]:
desafio.dropna(subset=['LJQQ3.SA'])

Unnamed: 0_level_0,CASH3.SA,LJQQ3.SA,PETR4.SA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-08-10,,12.575897,8.962737
2020-08-11,,12.536568,8.821322
2020-08-12,,12.438240,8.974204
2020-08-13,,12.684055,8.729591
2020-08-14,,13.274013,8.664617
...,...,...,...
2023-10-05,6.59,3.490000,32.730000
2023-10-06,6.46,3.460000,33.509998
2023-10-09,6.55,3.510000,34.950001
2023-10-10,6.73,3.680000,35.209999


In [56]:
df = pd.read_excel('dados_euca3.xlsx', na_values='nd')
print(df)
df = df.fillna(method='ffill', limit= 2)
df

         Data  EUCA3
0  2022-09-22    NaN
1  2022-09-21  12.50
2  2022-09-20  12.78
3  2022-09-19  12.78
4  2022-09-16    NaN
..        ...    ...
80 2022-05-31  15.64
81 2022-05-30  15.68
82 2022-05-27  15.77
83 2022-05-26  15.43
84 2022-05-25  15.28

[85 rows x 2 columns]


  df = df.fillna(method='ffill', limit= 2)


Unnamed: 0,Data,EUCA3
0,2022-09-22,
1,2022-09-21,12.50
2,2022-09-20,12.78
3,2022-09-19,12.78
4,2022-09-16,12.78
...,...,...
80,2022-05-31,15.64
81,2022-05-30,15.68
82,2022-05-27,15.77
83,2022-05-26,15.43
