# Data Extraction and Preprocessing

In [1]:
# !pip install yfinance

import numpy as np
import pandas as pd
from datetime import datetime
import yfinance as yf

In [2]:
# Yahoo Finance tickers to download, one per market series used in this notebook.
symbols = [
    '^GSPC',  # S&P500 Index
    '^DJI',   # Dow Jones Index
    '^IXIC',  # Nasdaq Index
    'CL=F',   # Crude Oil
    'GC=F',   # Gold
    '^BVSP',  # IBOV
]

# Options shared by the single bulk request: full available history at daily
# resolution, with columns grouped by ticker (ticker on the outer column level,
# price field on the inner one).
download_options = dict(
    period='max',
    interval='1d',
    auto_adjust=True,
    threads=True,
    group_by='ticker',
)

# yfinance takes the tickers as one space-separated string.
data = yf.download(tickers=' '.join(symbols), **download_options)

[*********************100%***********************]  6 of 6 completed


In [3]:
# Discard everything before 2002; label-based slicing on the date index keeps
# the start date itself and every later row, with all columns retained.
start_date = '2002-01-01'
data = data.loc[start_date:]
data

Unnamed: 0_level_0,CL=F,CL=F,CL=F,CL=F,CL=F,^BVSP,^BVSP,^BVSP,^BVSP,^BVSP,...,^GSPC,^GSPC,^GSPC,^GSPC,^GSPC,^IXIC,^IXIC,^IXIC,^IXIC,^IXIC
Unnamed: 0_level_1,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,...,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2002-01-02,20.000000,21.049999,19.850000,21.010000,69560.0,13586.0,13902.0,13571.0,13872.0,0.0,...,1148.079956,1154.670044,1136.229980,1154.670044,1.171000e+09,1965.180054,1979.260010,1936.560059,1979.250000,1.517670e+09
2002-01-03,20.780001,21.100000,20.150000,20.370001,64010.0,14125.0,14286.0,14121.0,14265.0,0.0,...,1154.670044,1165.270020,1154.010010,1165.270020,1.398900e+09,1987.060059,2044.560059,1987.060059,2044.270020,2.209630e+09
2002-01-04,20.850000,21.700001,20.400000,21.620001,84832.0,14250.0,14358.0,14103.0,14332.0,0.0,...,1165.270020,1176.550049,1163.420044,1172.510010,1.513000e+09,2061.830078,2077.889893,2033.560059,2059.379883,2.205610e+09
2002-01-07,21.500000,22.000000,21.200001,21.480000,64080.0,14327.0,14413.0,14273.0,14379.0,0.0,...,1172.510010,1176.969971,1163.550049,1164.890015,1.308300e+09,2075.239990,2081.090088,2036.859985,2037.099976,2.121110e+09
2002-01-08,21.200001,21.500000,21.000000,21.250000,59541.0,14385.0,14385.0,14099.0,14168.0,0.0,...,1164.890015,1167.599976,1157.459961,1160.709961,1.258800e+09,2039.420044,2060.229980,2027.339966,2055.739990,1.873670e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-12,86.250000,89.099998,85.160004,87.779999,275104.0,112307.0,114160.0,112305.0,113407.0,11707100.0,...,4083.669922,4119.279785,4083.669922,4110.410156,3.814200e+09,12174.940430,12270.190430,12169.280273,12266.410156,4.146680e+09
2022-09-13,88.089996,89.309998,85.059998,87.309998,347998.0,113398.0,113400.0,110522.0,110794.0,12612500.0,...,4037.120117,4037.120117,3921.280029,3932.689941,4.224550e+09,11908.809570,11957.969727,11604.429688,11633.570312,5.188380e+09
2022-09-14,87.940002,90.190002,86.180000,88.480003,335381.0,110794.0,111504.0,110118.0,110547.0,11630900.0,...,3940.729980,3961.939941,3912.179932,3946.010010,4.293240e+09,11680.410156,11746.830078,11602.759766,11719.679688,4.861530e+09
2022-09-15,88.910004,89.150002,84.529999,85.099998,235358.0,110547.0,111100.0,109524.0,109954.0,11622500.0,...,3932.409912,3959.139893,3888.280029,3901.350098,4.441830e+09,11633.240234,11760.730469,11497.110352,11552.360352,4.805910e+09


In [4]:
# Build a gap-free calendar-day index spanning the first to the last row of
# `data`, so that non-trading days (weekends, holidays) get their own rows.
index = pd.date_range(start=data.index.min(), end=data.index.max(), freq='D')

# reindex() returns a new frame (leaving `data` untouched) with NaN rows for the
# inserted dates; linear interpolation then fills them for every column,
# including the Volume columns.
data2 = data.reindex(index).interpolate(method='linear')

In [5]:
# Sanity check: number of columns that still contain at least one missing value.
data2.isnull().any().sum()

0

In [6]:
# Readable labels that replace the raw Yahoo Finance ticker symbols.
ticker_labels = {
    '^GSPC': 'S&P500',
    '^DJI': 'Dow_Jones',
    '^IXIC': 'Nasdaq',
    'CL=F': 'Crude_Oil',
    'GC=F': 'Gold',
    '^BVSP': 'IBOV',
}

# Columns kept in the final dataset: the full IBOV bar plus the closing value
# of every other series.
selected_columns = [
    'IBOV_Open',
    'IBOV_High',
    'IBOV_Low',
    'IBOV_Close',
    'IBOV_Volume',
    'Crude_Oil_Close',
    'Gold_Close',
    'Nasdaq_Close',
    'Dow_Jones_Close',
    'S&P500_Close',
]

# rename() without inplace returns a new frame, so data2 is left unmodified.
data3 = data2.rename(columns=ticker_labels)

# Collapse the two-level (ticker, field) column index into flat
# '<ticker>_<field>' names.
data3.columns = [f'{col[0]}_{col[1]}' for col in data3.columns.to_flat_index()]

data3 = data3[selected_columns]

In [7]:
# Write the preprocessed dataset to disk, relative to the working directory.
# NOTE(review): the index was rebuilt from pd.date_range in an earlier cell, so
# it presumably carries no name and the first CSV column header will be blank;
# confirm that downstream readers expect this.
output_csv = 'dataset2.csv'
data3.to_csv(output_csv)