<a href="https://colab.research.google.com/github/JoshuaThadi/Data-Science-Notes/blob/main/ExtractStockData(WebScraping).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extract Stock Data using a Web Scraping

In [1]:
  import pandas as pd
  import requests
  from bs4 import BeautifulSoup

In [2]:
import warnings
warnings.filterwarnings("ignore", category = FutureWarning)

## Using Web Scraping to Extract stock data

### Steps to extract stock data

#### step 1 : Send a HTTP requests to the web page

In [3]:
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0220EN-SkillsNetwork/labs/project/netflix_data_webpage.html"

In [71]:
data = requests.get(url).text

#### step 2. Parse the HTML content of the web page using BeautifulSoup.

In [7]:
soup = BeautifulSoup(data, 'html.parser')

#### step 3 : Identify the HTML tag that contain the data you want to extract

In [10]:
netflix_data = pd.DataFrame(columns = ["Data", "Open", "High", "Low", "Close", "Volume"])
netflix_data.head()

Unnamed: 0,Data,Open,High,Low,Close,Volume


#### Step 4 : Use BeautifulSoup methods to extract data from the HTML tags

In [17]:
# First we isolate the body of the table which contains all the information
# Then we loop through each row and find all the column values for each row

for row in soup.find("tbody").find_all('tr'):
    col = row.find_all("td")
    date = col[0].text
    open = col[1].text
    high = col[2].text
    low = col[3].text
    close = col[4].text
    adj_close = col[5].text
    volume = col[6].text

    # Finally we append the data of each row to the table
    netflix_data = pd.concat([netflix_data, pd.DataFrame({"Date ": [date], "Open ": [open], "High ": [high], "Low ": [low], "Close ": [close], "Adj_Close ": [adj_close], "Volume ": [volume]})], ignore_index = True)
netflix_data.tail()

Unnamed: 0,Data,Open,High,Low,Close,Volume,Date,Open.1,High.1,Low.1,Close.1,Adj_Close,Volume.1
205,,,,,,,"Jan 01, 2016",109.0,122.18,90.11,91.84,91.84,488193200
206,,,,,,,"Dec 01, 2015",124.47,133.27,113.85,114.38,114.38,319939200
207,,,,,,,"Nov 01, 2015",109.2,126.6,101.86,123.33,123.33,320321800
208,,,,,,,"Oct 01, 2015",102.91,115.83,96.26,108.38,108.38,446204400
209,,,,,,,"Sep 01, 2015",109.35,111.24,93.55,103.26,103.26,497401200


#### Step 5 : Print the Extracted data

#### Extracting data using pandas library

In [15]:
read_html_pandas_data = pd.read_html(url)

# convert into string
read_html_pandas_data = pd.read_html(str(soup))

In [16]:
netflix_dataframe = read_html_pandas_data[0]
netflix_dataframe.head()

Unnamed: 0,Date,Open,High,Low,Close*,Adj Close**,Volume
0,"Jun 01, 2021",504.01,536.13,482.14,528.21,528.21,78560600
1,"May 01, 2021",512.65,518.95,478.54,502.81,502.81,66927600
2,"Apr 01, 2021",529.93,563.56,499.0,513.47,513.47,111573300
3,"Mar 01, 2021",545.57,556.99,492.85,521.66,521.66,90183900
4,"Feb 01, 2021",536.79,566.65,518.28,538.85,538.85,61902300


In [19]:
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0220EN-SkillsNetwork/labs/project/amazon_data_webpage.html"

In [72]:
html_data = requests.get(url).text

In [23]:
soup1 = BeautifulSoup(html_data, "html.parser")

In [35]:
# step 1 to extract the title attribute
soup1.find_all('title')

[<title>Amazon.com, Inc. (AMZN) Stock Historical Prices &amp; Data - Yahoo Finance</title>]

In [36]:
# step 2 to extract the title attribute
title = soup1.title.string if soup1.title else "No title found"
print(f"Title of the webpage: {title}")

Title of the webpage: Amazon.com, Inc. (AMZN) Stock Historical Prices & Data - Yahoo Finance


In [52]:
amazon_data = pd.DataFrame(columns=["Date", "Open", "High", "Low", "Close", "Volume"])

for row in soup.find("tbody").find_all("tr"):
    col = row.find_all("td")
    date = col[0].text
    Open = col[1].text
    high = col[2].text
    low = col[3].text
    close = col[4].text
    adj_close = col[5].text
    volume = col[6].text

    amazon_data = pd.concat([amazon_data, pd.DataFrame({"Date":[date], "Open":[Open], "High":[high], "Low":[low], "Close":[close], "Adj Close":[adj_close], "Volume":[volume]})], ignore_index=True)
amazon_data.head(6)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,"Jun 01, 2021",504.01,536.13,482.14,528.21,78560600,528.21
1,"May 01, 2021",512.65,518.95,478.54,502.81,66927600,502.81
2,"Apr 01, 2021",529.93,563.56,499.0,513.47,111573300,513.47
3,"Mar 01, 2021",545.57,556.99,492.85,521.66,90183900,521.66
4,"Feb 01, 2021",536.79,566.65,518.28,538.85,61902300,538.85
5,"Jan 01, 2021",539.0,593.29,485.67,532.39,139988600,532.39


In [51]:
print(amazon_data.tail(5))

            Date    Open    High     Low   Close       Volume Adj Close
65  Jan 01, 2016  109.00  122.18   90.11   91.84  488,193,200     91.84
66  Dec 01, 2015  124.47  133.27  113.85  114.38  319,939,200    114.38
67  Nov 01, 2015  109.20  126.60  101.86  123.33  320,321,800    123.33
68  Oct 01, 2015  102.91  115.83   96.26  108.38  446,204,400    108.38
69  Sep 01, 2015  109.35  111.24   93.55  103.26  497,401,200    103.26


In [65]:
print(amazon_data.columns[0:3])

Index(['Date', 'Open', 'High'], dtype='object')


In [68]:
amazon_data.Open.tail()

Unnamed: 0,Open
65,109.0
66,124.47
67,109.2
68,102.91
69,109.35


In [69]:
# Get the 'Open' value from the last row
last_open_value = amazon_data.iloc[-1]['Open']
print(f"The 'Open' value of the last row is: {last_open_value}")


The 'Open' value of the last row is: 109.35


In [70]:
# Using tail() to get the last row and access the 'Open' column
last_open_value = amazon_data.tail(1)['Open'].values[0]
print(f"The 'Open' value of the last row is: {last_open_value}")

The 'Open' value of the last row is: 109.35
