# Scraping Stock Data

In [1]:
import pandas as pd
import io
import requests
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import yfinance as yf
import os

In [2]:
# Fetch the metadata for AAPL through yfinance and display it.
# The 'sector' entry of the displayed dict is what is used to decide which
# sector group a ticker belongs to in the cells below.
aapl = yf.Ticker("AAPL")
aapl.info

{'zip': '95014',
 'sector': 'Technology',
 'fullTimeEmployees': 164000,
 'longBusinessSummary': 'Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories worldwide. It also sells various related services. In addition, the company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose tablets; and wearables, home, and accessories comprising AirPods, Apple TV, Apple Watch, Beats products, and HomePod. Further, it provides AppleCare support and cloud services store services; and operates various platforms, including the App Store that allow customers to discover and download applications and digital content, such as books, music, video, games, and podcasts. Additionally, the company offers various services, such as Apple Arcade, a game subscription service; Apple Fitness+, a personalized fitness service; Apple Music, which offers users a curated listening experience with on-demand rad

In [3]:
# Technology 10
# Download five years of price history for each technology ticker and tag
# every row with its ticker symbol and sector name.
technology_tickers = ["AAPL","MSFT","NVDA","TSM","ORCL","ASML","AVGO","CSCO","ACN","IBM"]
technology_dict = {}
for t in technology_tickers:
    # auto_adjust=False pins the unadjusted OHLC columns instead of relying on
    # the library default (NOTE(review): newer yfinance releases appear to
    # default to adjusted prices -- confirm against the installed version).
    prices = yf.download(tickers=t, period='5y', auto_adjust=False)
    # errors="ignore" keeps the cell working when no "Adj Close" column is
    # returned; the non-inplace drop leaves the downloaded frame untouched.
    prices = prices.drop(columns=["Adj Close"], errors="ignore")
    prices.insert(0, "Company", t)
    prices.insert(1, "Sector", "Technology")
    technology_dict[t] = prices

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [4]:
# Preview the AAPL frame to check the added Company/Sector columns and the date range.
technology_dict["AAPL"]

Unnamed: 0_level_0,Company,Sector,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-11-17,AAPL,Technology,42.759998,42.847500,42.410000,42.537498,87598000
2017-11-20,AAPL,Technology,42.572498,42.639999,42.389999,42.494999,65049600
2017-11-21,AAPL,Technology,42.695000,43.424999,42.695000,43.285000,100525200
2017-11-22,AAPL,Technology,43.340000,43.750000,43.262501,43.740002,102355600
2017-11-24,AAPL,Technology,43.775002,43.875000,43.662498,43.742500,56106800
...,...,...,...,...,...,...,...
2022-11-10,AAPL,Technology,141.240005,146.869995,139.500000,146.869995,118854000
2022-11-11,AAPL,Technology,145.820007,150.009995,144.369995,149.699997,93903800
2022-11-14,AAPL,Technology,148.970001,150.279999,147.429993,148.279999,73374100
2022-11-15,AAPL,Technology,152.220001,153.589996,148.559998,150.039993,89868300


In [5]:
# Healthcare 10
# Download five years of price history for each healthcare ticker and tag
# every row with its ticker symbol and sector name.
healthcare_tickers = ["UNH","JNJ","LLY","PFE","ABBV","MRK","NVO","TMO","DHR","AZN"]
healthcare_dict = {}
for t in healthcare_tickers:
    # auto_adjust=False pins the unadjusted OHLC columns instead of relying on
    # the library default (NOTE(review): newer yfinance releases appear to
    # default to adjusted prices -- confirm against the installed version).
    prices = yf.download(tickers=t, period='5y', auto_adjust=False)
    # errors="ignore" keeps the cell working when no "Adj Close" column is
    # returned; the non-inplace drop leaves the downloaded frame untouched.
    prices = prices.drop(columns=["Adj Close"], errors="ignore")
    prices.insert(0, "Company", t)
    prices.insert(1, "Sector", "Healthcare")
    healthcare_dict[t] = prices

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [6]:
# Preview the UNH frame to check the added Company/Sector columns and the date range.
healthcare_dict["UNH"]

Unnamed: 0_level_0,Company,Sector,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-11-17,UNH,Healthcare,210.460007,211.460007,209.149994,209.899994,1887500
2017-11-20,UNH,Healthcare,210.000000,210.679993,209.580002,210.250000,2040400
2017-11-21,UNH,Healthcare,211.130005,212.850006,210.750000,212.600006,2327200
2017-11-22,UNH,Healthcare,212.259995,212.809998,210.759995,211.220001,2257800
2017-11-24,UNH,Healthcare,212.000000,212.929993,211.000000,212.509995,744000
...,...,...,...,...,...,...,...
2022-11-10,UNH,Healthcare,548.929993,551.700012,533.950012,544.169983,3426400
2022-11-11,UNH,Healthcare,546.280029,546.280029,509.649994,522.080017,7032900
2022-11-14,UNH,Healthcare,522.000000,528.979980,513.409973,513.750000,5238400
2022-11-15,UNH,Healthcare,512.809998,516.640015,500.769989,503.010010,5103800


In [7]:
# Consumer Cyclical 12
# Download five years of price history for each consumer-cyclical ticker and
# tag every row with its ticker symbol and sector name.
cyclical_tickers = ["AMZN","TSLA","HD","MCD","TM","BABA","NKE","LOW","SBUX","ABNB","LULU","EBAY"]
cyclical_dict = {}
for t in cyclical_tickers:
    # auto_adjust=False pins the unadjusted OHLC columns instead of relying on
    # the library default (NOTE(review): newer yfinance releases appear to
    # default to adjusted prices -- confirm against the installed version).
    prices = yf.download(tickers=t, period='5y', auto_adjust=False)
    # errors="ignore" keeps the cell working when no "Adj Close" column is
    # returned; the non-inplace drop leaves the downloaded frame untouched.
    prices = prices.drop(columns=["Adj Close"], errors="ignore")
    prices.insert(0, "Company", t)
    prices.insert(1, "Sector", "Consumer Cyclical")
    cyclical_dict[t] = prices

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [8]:
# Preview the AMZN frame to check the added Company/Sector columns and the date range.
cyclical_dict["AMZN"]

Unnamed: 0_level_0,Company,Sector,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-11-17,AMZN,Consumer Cyclical,56.914001,56.939999,56.290501,56.493999,48268000
2017-11-20,AMZN,Consumer Cyclical,56.488499,56.671001,56.127499,56.315498,43278000
2017-11-21,AMZN,Consumer Cyclical,56.643002,57.000000,56.410000,56.974499,49588000
2017-11-22,AMZN,Consumer Cyclical,57.049999,58.013500,57.049999,57.807999,71106000
2017-11-24,AMZN,Consumer Cyclical,58.035000,59.341999,58.035000,59.299999,70560000
...,...,...,...,...,...,...,...
2022-11-10,AMZN,Consumer Cyclical,92.940002,98.690002,91.650002,96.629997,173414900
2022-11-11,AMZN,Consumer Cyclical,97.879997,101.190002,96.660004,100.790001,111481700
2022-11-14,AMZN,Consumer Cyclical,98.769997,100.120003,97.290001,98.489998,99533100
2022-11-15,AMZN,Consumer Cyclical,103.209999,103.790001,97.339996,98.940002,111336300


In [9]:
# Industrials 2
# Download five years of price history for each industrials ticker and tag
# every row with its ticker symbol and sector name.
industrials_tickers = ["RTX","BA"]
industrials_dict = {}
for t in industrials_tickers:
    # auto_adjust=False pins the unadjusted OHLC columns instead of relying on
    # the library default (NOTE(review): newer yfinance releases appear to
    # default to adjusted prices -- confirm against the installed version).
    prices = yf.download(tickers=t, period='5y', auto_adjust=False)
    # errors="ignore" keeps the cell working when no "Adj Close" column is
    # returned; the non-inplace drop leaves the downloaded frame untouched.
    prices = prices.drop(columns=["Adj Close"], errors="ignore")
    prices.insert(0, "Company", t)
    prices.insert(1, "Sector", "Industrials")
    industrials_dict[t] = prices

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [10]:
# Preview the RTX frame to check the added Company/Sector columns and the date range.
industrials_dict["RTX"]

Unnamed: 0_level_0,Company,Sector,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-11-17,RTX,Industrials,73.939583,74.027687,73.303963,73.335434,4463024.0
2017-11-20,RTX,Industrials,73.524231,73.744492,73.190689,73.241035,4631776.0
2017-11-21,RTX,Industrials,73.297668,74.027687,73.297668,73.656387,3961377.0
2017-11-22,RTX,Industrials,73.631218,73.750786,73.348015,73.461296,4545493.0
2017-11-24,RTX,Industrials,73.631218,73.731911,73.222153,73.574577,2361890.0
...,...,...,...,...,...,...,...
2022-11-10,RTX,Industrials,96.900002,98.019997,95.699997,97.839996,4764100.0
2022-11-11,RTX,Industrials,97.139999,97.199997,92.779999,93.650002,6936100.0
2022-11-14,RTX,Industrials,93.650002,94.540001,92.919998,92.949997,4942400.0
2022-11-15,RTX,Industrials,93.620003,95.949997,93.339996,95.790001,7506500.0


In [11]:
# Financial Services 8
# Download five years of price history for each financial-services ticker and
# tag every row with its ticker symbol and sector name.
financial_tickers = ["JPM","BAC","GS","MS","V","HSBC","C","UBS"]
financial_dict = {}
for t in financial_tickers:
    # auto_adjust=False pins the unadjusted OHLC columns instead of relying on
    # the library default (NOTE(review): newer yfinance releases appear to
    # default to adjusted prices -- confirm against the installed version).
    prices = yf.download(tickers=t, period='5y', auto_adjust=False)
    # errors="ignore" keeps the cell working when no "Adj Close" column is
    # returned; the non-inplace drop leaves the downloaded frame untouched.
    prices = prices.drop(columns=["Adj Close"], errors="ignore")
    prices.insert(0, "Company", t)
    prices.insert(1, "Sector", "Financial Services")
    financial_dict[t] = prices

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [12]:
# Preview the C (Citigroup ticker) frame to check the added Company/Sector columns and the date range.
financial_dict["C"]

Unnamed: 0_level_0,Company,Sector,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-11-17,C,Financial Services,71.160004,71.830002,70.940002,71.330002,11813000
2017-11-20,C,Financial Services,71.769997,72.300003,71.279999,72.050003,10000000
2017-11-21,C,Financial Services,72.500000,72.589996,72.129997,72.379997,9632100
2017-11-22,C,Financial Services,72.449997,73.150002,72.260002,72.260002,9146200
2017-11-24,C,Financial Services,72.410004,72.519997,71.989998,72.019997,4676100
...,...,...,...,...,...,...,...
2022-11-10,C,Financial Services,46.709999,48.759998,46.560001,48.419998,27915500
2022-11-11,C,Financial Services,48.500000,50.570000,48.439999,50.189999,26003700
2022-11-14,C,Financial Services,49.939999,49.980000,49.020000,49.020000,19631600
2022-11-15,C,Financial Services,49.830002,50.419998,48.490002,49.029999,19839400


In [13]:
# Communication Services 5
# Download five years of price history for each communication-services ticker
# and tag every row with its ticker symbol and sector name.
communication_tickers = ["GOOG","META","NFLX","BIDU","DIS"]
communication_dict = {}
for t in communication_tickers:
    # auto_adjust=False pins the unadjusted OHLC columns instead of relying on
    # the library default (NOTE(review): newer yfinance releases appear to
    # default to adjusted prices -- confirm against the installed version).
    prices = yf.download(tickers=t, period='5y', auto_adjust=False)
    # errors="ignore" keeps the cell working when no "Adj Close" column is
    # returned; the non-inplace drop leaves the downloaded frame untouched.
    prices = prices.drop(columns=["Adj Close"], errors="ignore")
    prices.insert(0, "Company", t)
    prices.insert(1, "Sector", "Communication Services")
    communication_dict[t] = prices

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [14]:
# Preview the GOOG frame to check the added Company/Sector columns and the date range.
communication_dict["GOOG"]

Unnamed: 0_level_0,Company,Sector,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-11-17,GOOG,Communication Services,51.700500,51.721001,50.887501,50.954498,27942000
2017-11-20,GOOG,Communication Services,51.013000,51.130501,50.875000,50.918999,19070000
2017-11-21,GOOG,Communication Services,51.165501,51.755501,51.132751,51.724499,21940000
2017-11-22,GOOG,Communication Services,51.750000,51.985298,51.571499,51.798000,14926000
2017-11-24,GOOG,Communication Services,51.793499,52.158901,51.750000,52.030499,10740000
...,...,...,...,...,...,...,...
2022-11-10,GOOG,Communication Services,92.339996,94.550003,91.650002,94.169998,42371200
2022-11-11,GOOG,Communication Services,94.709999,97.360001,94.160004,96.730003,30536500
2022-11-14,GOOG,Communication Services,95.500000,97.180000,95.112999,96.029999,24170100
2022-11-15,GOOG,Communication Services,98.669998,100.419998,97.019997,98.720001,31831000


In [15]:
# Energy 4
# Download five years of price history for each energy ticker and tag every
# row with its ticker symbol and sector name.
energy_tickers = ["XOM","CVX","SHEL","COP"]
energy_dict = {}
for t in energy_tickers:
    # auto_adjust=False pins the unadjusted OHLC columns instead of relying on
    # the library default (NOTE(review): newer yfinance releases appear to
    # default to adjusted prices -- confirm against the installed version).
    prices = yf.download(tickers=t, period='5y', auto_adjust=False)
    # errors="ignore" keeps the cell working when no "Adj Close" column is
    # returned; the non-inplace drop leaves the downloaded frame untouched.
    prices = prices.drop(columns=["Adj Close"], errors="ignore")
    prices.insert(0, "Company", t)
    prices.insert(1, "Sector", "Energy")
    energy_dict[t] = prices

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [16]:
# Preview the SHEL frame to check the added Company/Sector columns and the date range.
energy_dict["SHEL"]

Unnamed: 0_level_0,Company,Sector,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-11-17,SHEL,Energy,61.860001,62.070000,61.599998,61.900002,2507582.0
2017-11-20,SHEL,Energy,62.099998,62.139999,61.660000,61.720001,2702517.0
2017-11-21,SHEL,Energy,62.189999,62.285000,61.455002,61.480000,3359036.0
2017-11-22,SHEL,Energy,62.080002,62.340000,61.994999,62.150002,3215854.0
2017-11-24,SHEL,Energy,62.389999,62.650002,62.360001,62.439999,2044686.0
...,...,...,...,...,...,...,...
2022-11-10,SHEL,Energy,54.490002,54.740002,53.560001,53.880001,5155983.0
2022-11-11,SHEL,Energy,55.590000,55.994999,55.174999,55.770000,5480342.0
2022-11-14,SHEL,Energy,54.970001,55.410000,54.660000,54.660000,3393581.0
2022-11-15,SHEL,Energy,56.270000,56.669998,55.950001,56.279999,4948655.0


In [18]:
# Stack every per-ticker frame into one long table, keeping the sector order
# (technology first, energy last) and the ticker order within each sector.
data = pd.concat(
    [
        ticker_frame
        for sector_frames in (
            technology_dict,
            healthcare_dict,
            cyclical_dict,
            industrials_dict,
            financial_dict,
            communication_dict,
            energy_dict,
        )
        for ticker_frame in sector_frames.values()
    ]
)

In [19]:
# Display the combined frame (pandas truncates the middle rows in the rendered output).
data

Unnamed: 0_level_0,Company,Sector,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-11-17,AAPL,Technology,42.759998,42.847500,42.410000,42.537498,87598000.0
2017-11-20,AAPL,Technology,42.572498,42.639999,42.389999,42.494999,65049600.0
2017-11-21,AAPL,Technology,42.695000,43.424999,42.695000,43.285000,100525200.0
2017-11-22,AAPL,Technology,43.340000,43.750000,43.262501,43.740002,102355600.0
2017-11-24,AAPL,Technology,43.775002,43.875000,43.662498,43.742500,56106800.0
...,...,...,...,...,...,...,...
2022-11-10,COP,Energy,128.100006,129.770004,125.570000,129.460007,7543900.0
2022-11-11,COP,Energy,131.199997,134.850006,130.559998,133.960007,8660300.0
2022-11-14,COP,Energy,132.380005,135.679993,132.279999,133.029999,6163400.0
2022-11-15,COP,Energy,133.589996,134.839996,131.910004,134.089996,8300600.0


In [20]:
# Count missing values per column to confirm the combined frame has no gaps.
data.isna().sum()

Company    0
Sector     0
Open       0
High       0
Low        0
Close      0
Volume     0
dtype: int64

In [22]:
# Write the combined frame to CSV once; an existing file is left untouched so
# a previously saved snapshot is not overwritten by a later download.
filepath = "../../data/stocks.csv"
if not os.path.exists(filepath):
    # Create the target directory first so to_csv does not fail on a fresh checkout.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    data.to_csv(filepath)