# Data Extraction and Preprocessing

In [1]:
# Requires yfinance; if it is missing, run: %pip install yfinance

import numpy as np
import pandas as pd
from datetime import datetime
import yfinance as yf

In [19]:
# Symbols requested from Yahoo Finance by the yf.download call that follows.
# NOTE(review): the list appears to mix individual equities, ETFs, and market
# indices (the '^'-prefixed entries), plus a few symbols carrying what look
# like exchange suffixes ('.SS', '.SZ', '.ME', '.TA', '.JO') -- confirm against
# the intended universe before relying on any grouping.
symbols = [
    'AMD', 'AAPL', 'NVDA', 'NIO', 'F', 'AMZN', 'CCL', 'ITUB', 'INTC', 'AMC', 'BAC', 'TWTR', 'ABEV',
    'GOOGL', 'SNAP', 'BBD', 'SWN', 'AAL', 'PLTR', 'C', 'SHOP', 'VALE', 'IBM', 'GOOG', 'WFC', 'TSLA',
    'SHO', 'MSFT', 'DIDIY', 'UBER', 'META', 'CSCO', 'FCX', 'LI', 'T', 'OXY', 'NFLX', 'NOK', 'RBLX',
    'NCLH', 'IQ', 'NLY', 'SOFI', 'KGC', 'BABA', 'FTCH', 'GOLD', 'COIN', 'XOM', 'SIRI', 'MNDT', 'GRAB',
    'DNA', 'KHC', 'HAL', 'PLUG', 'TEVA', 'RIVN', 'DKNG', 'PBR', 'SQ', 'MU', 'IS', 'DAL', 'CLF', 'KEY',
    'NU', 'CMCSA', 'CPNG', 'WBD', 'LYG', 'OPEN', 'VZ', 'AUY', 'PTON', 'BA', 'BEKE', 'MRVL', 'VTRS', 'CS',
    'ET', 'NVAX', 'MRO', 'U', 'JNJ', 'TELL', 'GM', 'AFRM', 'CSX', 'KMI', 'PINS', 'HBAN', 'XPEV', 'M', 'DIS',
    'JPM', 'PYPL', 'BMY', 'CVNA', 'LCID', 'TAL', 'KO', 'X', 'UAA', 'GGB', 'SLB', 'PCG', 'PSTH', 'TSM', 'DVN',
    'UAL', 'HOOD', 'HST', 'BP', 'BTG', 'CPG', 'PFE', 'BSX', 'MS', 'UMC', 'ON', 'PARA', 'QCOM', 'AGNC', 'FITB',
    'CNK', 'DB', 'STLA', 'PBR-A', 'CVX', 'LYFT', 'HTA', 'CVE', 'NKE', 'HR', 'JBLU', 'BKR', 'SYF', 'NCR', 'HPQ',
    'MDLZ', 'ROKU', 'ALLY', 'LUMN', 'SU', 'MRK', 'AVTR', 'AMAT', 'TFC', 'GPS', 'KOS', 'BCS', 'JWN', 'APA', 'RCL',
    'HL', 'VICI', 'TME', 'NEM', 'MO', 'EXC', 'FTNT', 'ORCL', 'KDP', 'ERIC', 'DOW', 'APLS', 'SABR', 'AGI', 'SE',
    'SCHW', 'CVS', 'COP', 'UPST', 'TECK', 'NEE', 'SAN', 'WY', 'GE', 'AMCR', 'LVS', 'AES', 'NKLA', 'TOST',
    'PDD', 'TCOM', 'SBUX', 'PG', 'CHPT', 'STNE', 'CFG', 'TJX', 'CTRA', 'HPE', 'ABT', 'SNOW', 'RUN', 'AEO', 'EPD', 'ATVI',
    'USB', 'WMT', 'ING', 'ABBV', 'GS', 'SO', 'WMB', 'OTLY', 'AA', 'SID', 'PM', 'VIPS', 'WBA', 'TXN', 'BILI', 'FTI',
    'MPW', 'BX', 'MT', 'MET', 'AR', 'HBI', 'CNQ', 'AZN', 'ABNB', 'LUV', 'NLOK', 'RTX', 'RF', '^GSPC', '^DJI', '^IXIC',
    '^NYA', '^XAX', '^BUK100P', '^RUT', '^VIX', '^FTSE', '^GDAXI', '^FCHI', '^STOXX50E', '^N100', '^BFX', 'IMOEX.ME',
    '^N225', '^HSI', '000001.SS', '399001.SZ', '^STI', '^AXJO', '^AORD', '^BSESN', '^JKSE', '^KLSE', '^NZ50', '^KS11',
    '^TWII', '^GSPTSE', '^MXX', '^IPSA', '^MERV', '^TA125.TA', '^CASE30', '^JN0U.JO', 'EWCO', 'IHF', 'VAMO', 'USAI',
    'JHMF', 'JHMH', 'JHMI', 'KBWP', 'IYH', 'KBND', 'FHLC', 'XLV', 'FXO', 'VHT', 'IAI', 'RYH', 'XLF', 'BIZD', 'GAA',
    'VFH', 'RYF', 'XHS', 'DXJS', 'ZROZ', 'FLTR', 'IYG', 'FLRN', 'EMCB', 'IAK', 'FDM', 'WOMN', 'SPVM', 'RAVI', 'ULST',
    'BOCT', 'FLOT', 'SRLN', 'GDMA', 'BSBE', 'VNLA', 'PULS', 'OPER', 'BSDE', 'BFOR', 'BSCE', 'SPXV', 'SPXE', 'FLAX', 'VPC',
    'CHIU', 'VRIG', 'HTAB', 'QARP', 'FUMB', 'SPXN', 'SQLV', 'SMMU', 'ICSH', 'GSY', 'NEAR', 'JPST', 'RDVY', 'XMVM', 'EMHY',
    'BLCN', 'JMST', 'HYEM', 'PKW', 'LSST', 'WTV', 'SLQD', 'FLBL', 'HMOP', 'SPSB', 'TAXF', 'SCHO', 'VGSH', 'LMBS', 'VCSH',
    'XMHQ', 'FPEI', 'FTSD', 'BAB', 'GTO', 'SPTS', 'HAWX', 'FCAL', 'EMTL', 'FSMB', 'FMB', 'EYLD', 'LDUR', 'MMIT', 'MBSD',
    'VGLT', 'CEMB', 'CCOR', 'WINC', 'FMHI', 'FLMI', '^BVSP'
]

# One batched request for every symbol: the list is joined into a single
# space-separated string, and the longest available daily history is fetched
# with yfinance's threaded downloader enabled.
data = yf.download(' '.join(symbols), period='max', interval='1d', threads=True)

[*********************100%***********************]  365 of 365 completed


In [20]:
start_date = '2002-01-03'

# Closing prices only, restricted to rows from start_date onward.
data2 = data['Close'].loc[start_date:]

# Re-index onto every calendar day between the first and last remaining rows;
# dates that were absent from the download show up as all-NaN rows.
index = pd.date_range(data2.index.min(), data2.index.max(), freq='D')
data2 = data2.reindex(index=index)

In [21]:
# Linearly interpolate the missing values into a new frame. interpolate()
# returns a fresh object, so data2 keeps its NaNs. With pandas' default
# forward direction, NaNs before a column's first valid value stay NaN.
data3 = data2.interpolate(method='linear')

In [23]:
# Discard every column that still contains at least one NaN, then display
# the remaining frame as the cell output.
data3 = data3.dropna(axis='columns')
data3

Unnamed: 0,AA,AAPL,ABEV,ABT,AEO,AES,AMAT,AMD,AMZN,APA,...,^MERV,^MXX,^N100,^NYA,^RUT,^STI,^TA125.TA,^TWII,^VIX,^XAX
2002-01-03,86.844421,0.421071,0.60,25.089069,8.970000,16.500000,22.745001,19.370001,0.595000,22.623810,...,340.600006,6603.750000,803.770020,6266.310059,495.510010,1653.709961,466.299988,5526.319824,21.340000,842.250000
2002-01-04,89.631897,0.423036,0.60,25.044180,9.893333,17.450001,22.610001,20.000000,0.612500,23.114286,...,343.200012,6612.080078,803.099976,6307.970215,499.299988,1678.670044,466.954994,5638.529785,20.450001,841.440002
2002-01-05,90.320758,0.418334,0.60,24.985823,9.923333,17.580001,22.530001,19.993333,0.614000,23.304763,...,345.353858,6596.533366,800.326660,6295.916829,497.259989,1684.430013,467.610001,5703.983236,20.946667,840.070007
2002-01-06,91.009618,0.413631,0.60,24.927465,9.953333,17.710000,22.450001,19.986666,0.615500,23.495239,...,347.507705,6580.986654,797.553345,6283.863444,495.219991,1690.189982,468.265007,5769.436686,21.443334,838.700012
2002-01-07,91.698479,0.408929,0.60,24.869108,9.983333,17.840000,22.370001,19.980000,0.617000,23.685715,...,349.661551,6565.439941,794.780029,6271.810059,493.179993,1695.949951,468.920013,5834.890137,21.940001,837.330017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-11,52.433333,161.409993,3.02,109.020002,10.976666,27.536667,96.370003,84.909999,135.389999,39.363333,...,144690.130208,47587.967448,1218.986694,15298.383138,1898.343302,3270.796631,2066.100098,14751.427246,23.510001,4273.056641
2022-09-12,52.340000,163.429993,3.03,109.290001,11.120000,27.590000,96.300003,84.639999,136.449997,40.000000,...,144714.093750,47860.480469,1225.790039,15352.179688,1906.089966,3274.719971,2069.760010,14807.429688,23.870001,4278.189941
2022-09-13,50.290001,153.839996,2.94,105.839996,10.560000,26.830000,90.389999,77.029999,126.820000,39.110001,...,142941.593750,47039.738281,1207.589966,14820.790039,1831.569946,3290.080078,2046.069946,14894.410156,27.270000,4273.419922
2022-09-14,44.810001,155.309998,2.92,105.779999,10.790000,27.030001,90.639999,77.449997,128.550003,41.740002,...,146750.000000,46745.148438,1202.890015,14843.209961,1838.459961,3258.020020,2016.599976,14658.309570,26.160000,4401.390137


In [26]:
# Persist the cleaned frame as CSV (relative path; the index is written too,
# since to_csv includes it by default).
data3.to_csv(path_or_buf='dataset.csv')