In [22]:
# Extracting data using BeautifulSoup

# Download the web page Using Requests Library
# Parse HTML on a web page using BeautifulSoup
# Extract data and duild a data frame
# Extracting data using pandas

In [None]:
!pip install pandas
!pip install requests
!pip install bs4
!pip install html5lib
!pip install lxml
!pip install plotly

In [46]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
 # Set the display options in Jupyter Notebook
pd.options.display.max_rows = 10  # Set to desired number

In [47]:


import warnings
# Ignore all warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [48]:
# We will extract Netflix stock data using yahoo finance website

# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0220EN-SkillsNetwork/labs/project/netflix_data_webpage.html



In [27]:
# On the following web page we have a table with columns name (Date, Open, High, Low, close, adj close volume) 
# out of which we must extract following columns -->

# Date

# Open

# High

# Low

# Close

# Volume

In [28]:
# We will need to :
# Send an HTTP request to the web page using the requests library.---> 1
# Parse the HTML content of the web page using BeautifulSoup.     -->   2
# Identify the HTML tags that contain the data you want to extract.- ->  3
# Use BeautifulSoup methods to extract the data from the HTML tags. _->    4 
# Print the extracted data                                ->                 5
 

# will use the request library for sending an HTTP request to the web page.

# url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0220EN-SkillsNetwork/labs/project/netflix_data_webpage.html"

In [None]:
# The requests.get() method takes a URL as its first argument, which specifies the location of the resource to be retrieved. 
# In this case, the value of the url variable is passed as the argument to the requests.get() method, 
# because you will store a web page URL in a url variable.

# You use the .text method for extracting the HTML content as a string in order to make it readable.


url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0220EN-SkillsNetwork/labs/project/netflix_data_webpage.html"

data = requests.get(url).text
print(data)



# Send an HTTP request to the web page using the requests library.---> Step 1 done 

In [50]:
# Now we will need to parse the HTML content of the url
# Take the raw HTML content of a web page or a string of HTML code which needs to be parsed and transformed into a structured, 
# hierarchical format that can be more easily analyzed and manipulated in Python. 
# We will achieve this using a Python library called Beautiful Soup.


# Must create a new BeautifulSoup object, we will need to pass two arguments to its constructor
# 1. The HTML or XML Content that I want to parse into a string
# 2. The name of the parser that I want to use to parse the HTML or XML content. optional because Soup has a default HTML parser but I will use HTML5lib

soup = BeautifulSoup(data,'html5lib')


print('finished parsing successfully')


# Parse the HTML content of the web page using BeautifulSoup.     -->   Step 2 Completed


finished parsing successfully


In [51]:
# Step 3 --> now we need to identify the HTML tags

# We will scrape the content of the web page and convert the table into a data frame
# Will create an empty data frame using the pd.DataFrame() func with the specified columns -->


# "Date"
# "Open"
# "High"
# "Low"
# "Close"
# "Volume"

netflix_data = pd.DataFrame(columns=["Date","Open","High","Low","Close","Volume"])


print(netflix_data)
netflix_data

Empty DataFrame
Columns: [Date, Open, High, Low, Close, Volume]
Index: []


Unnamed: 0,Date,Open,High,Low,Close,Volume


In [32]:
# <table>: This tag is a root tag used to define the start and end of the table. All the content of the table is enclosed within these tags.

# <tr>: This tag is used to define a table row. Each row of the table is defined within this tag.

# <td>: This tag is used to define a table cell. Each cell of the table is defined within this tag. You can specify the content of the cell between the opening and closing tags.

# <th>: This tag is used to define a header cell in the table. The header cell is used to describe the contents of a column or row. By default, the text inside a tag is bold and centered.

# <tbody>: This is the main content of the table, which is defined using the tag. It contains one or more rows of elements.



In [52]:
# Step 4 ---> Use a BeautifulSoup method to extract the data
# We will use find() and find_all() methods of the BeautifulSoup object to locate the table body and table row respectively in the HTML.

# The find() method will return particular tag content.
# The find_all() method returns a list of all matching tags in the HTML.


# First we isolate the body of the table which contains all the information
# Then we loop through each row and find all the column values for each row
for row in soup.find("tbody").find_all('tr'):
    col = row.find_all("td")
    date = col[0].text
    Open = col[1].text
    high = col[2].text
    low = col[3].text
    close = col[4].text
    adj_close = col[5].text
    volume = col[6].text
    
    # Finally we append the data of each row to the table
netflix_data = pd.concat([netflix_data,pd.DataFrame({"Date":[date], "Open":[Open], "High":[high], "Low":[low], "Close":[close], "Adj Close":[adj_close], "Volume":[volume]})], ignore_index=True)  


In [53]:
# Boom now it's stored in appropriate df we want

netflix_data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,"Sep 01, 2015",109.35,111.24,93.55,103.26,497401200,103.26


In [54]:
# Step 5 ----> Print the extracted data

netflix_data.head()  

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,"Sep 01, 2015",109.35,111.24,93.55,103.26,497401200,103.26


In [None]:
# Use the request library to download webpage of Amazon stock data, save text of the response as a variable named html_data

url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0220EN-SkillsNetwork/labs/project/amazon_data_webpage.html"

amazon_data = requests.get(url).text

print(amazon_data)

In [56]:
# Parse the html data using beautiful_soup, we will pass our argument html_data, then define the parser we want to use.


soup = BeautifulSoup(amazon_data,'html5lib')   # we parse and store that parsed data in soup,
# to then pass into soup func to find title_tag etc


print('finished parsing successfully')


finished parsing successfully


In [57]:
# 1: What is the content of the title attribute?
# ----> We will need to identify the 


# Extract the content of the title attribute
title_tag = soup.title # Soup library automatically finds the title attr, just need to call it and assign
title_content = title_tag.string if title_tag else 'No title tag found'  # Conditional check to ensure if N/A to return N/A

# Print the content of the title tag
print("Title content:", title_content)

Title content: Amazon.com, Inc. (AMZN) Stock Historical Prices & Data - Yahoo Finance


In [70]:
# Using BeautifulSoup, extract the table with historical share prices and store it into a data frame named amazon_data. 
# The data frame should have columns -->  Date, Open, High, Low, Close, Adj Close, and Volume. 
# Fill in each variable with the correct data from the list col.

amazon_data = pd.DataFrame(columns=["Date", "Open", "High", "Low", "Close", "Volume"])

for row in soup.find("tbody").find_all("tr"):
    col = row.find_all("td")
    date = col[0].text
    Open = col[1].text
    high = col[2].text
    low = col[3].text
    close = col[4].text
    adj_close = col[5].text
    volume = col[6].text
    
    amazon_data = pd.concat([amazon_data, pd.DataFrame({"Date":[date], "Open":[Open], "High":[high], "Low":[low], "Close":[close], "Adj Close":[adj_close], "Volume":[volume]})], ignore_index=True)








In [59]:
# Printing out the first five rows of the dataframe we created

amazon_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,"Jan 01, 2021",3270.0,3363.89,3086.0,3206.2,71528900,3206.2
1,"Dec 01, 2020",3188.5,3350.65,3072.82,3256.93,77556200,3256.93
2,"Nov 01, 2020",3061.74,3366.8,2950.12,3168.04,90810500,3168.04
3,"Oct 01, 2020",3208.0,3496.24,3019.0,3036.15,116226100,3036.15
4,"Sep 01, 2020",3489.58,3552.25,2871.0,3148.73,115899300,3148.73


In [64]:
# What are the names of the columns in the data frame?
# We could just do amazon_data.head()
# A better way would be like this ---->

column_names = amazon_data.columns
print(column_names)

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close'], dtype='object')


In [65]:
# print the shape
print(amazon_data.shape)  # Rows, Columns


(61, 7)


In [43]:
print(amazon_data)

            Date      Open      High       Low     Close       Volume  \
0   Jan 01, 2021  3,270.00  3,363.89  3,086.00  3,206.20   71,528,900   
1   Dec 01, 2020  3,188.50  3,350.65  3,072.82  3,256.93   77,556,200   
2   Nov 01, 2020  3,061.74  3,366.80  2,950.12  3,168.04   90,810,500   
3   Oct 01, 2020  3,208.00  3,496.24  3,019.00  3,036.15  116,226,100   
4   Sep 01, 2020  3,489.58  3,552.25  2,871.00  3,148.73  115,899,300   
..           ...       ...       ...       ...       ...          ...   
56  May 01, 2016    663.92    724.23    656.00    722.79   90,614,500   
57  Apr 01, 2016    590.49    669.98    585.25    659.59   78,464,200   
58  Mar 01, 2016    556.29    603.24    538.58    593.64   94,009,500   
59  Feb 01, 2016    578.15    581.80    474.00    552.52  124,144,800   
60  Jan 01, 2016    656.29    657.72    547.18    587.00  130,200,900   

   Adj Close  
0   3,206.20  
1   3,256.93  
2   3,168.04  
3   3,036.15  
4   3,148.73  
..       ...  
56    722.79  
57 

In [66]:
# What is the Open of the last row of the amazon_data data frame?

# Access the last row's 'Open' value using iloc
last_row_open = amazon_data.iloc[-1]['Open']

# Print the value
print("Open value of the last row =", last_row_open)


Open value of the last row = 656.29


In [67]:
amazon_data.tail()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
56,"May 01, 2016",663.92,724.23,656.0,722.79,90614500,722.79
57,"Apr 01, 2016",590.49,669.98,585.25,659.59,78464200,659.59
58,"Mar 01, 2016",556.29,603.24,538.58,593.64,94009500,593.64
59,"Feb 01, 2016",578.15,581.8,474.0,552.52,124144800,552.52
60,"Jan 01, 2016",656.29,657.72,547.18,587.0,130200900,587.0
