In [1]:
# imports
# hides all warnings
import warnings
warnings.filterwarnings('ignore')
# http requests & error
import requests
from requests.exceptions import HTTPError
# beautiful soup
from bs4 import BeautifulSoup

In [2]:
# specify the url
print('\n*** URL Request ***')
# previous sample location: http://maexadata.in/sample
vURL = "https://maexadata.firstcloudit.com/sample.html"
# Reset first so that a failed request cannot leave a response object from an
# earlier run of this cell in place for the cells below.
response = None
# package the request, send it and catch the response
try:
    # timeout is in seconds; without one the call can wait indefinitely on a
    # server that stops answering. A timeout surfaces through the generic
    # handler below.
    response = requests.get(vURL, timeout=10)
    # If the response was successful, no Exception will be raised
    response.raise_for_status()
except HTTPError as http_err:
    # raised by raise_for_status(); `response` stays bound to the failed reply
    print(f'HTTP Error: {http_err}')
except Exception as err:
    # anything else raised while sending the request or reading the reply
    print(f'General Error occurred: {err}')
else:
    print('HTTP Read Success!')


*** URL Request ***
HTTP Read Success!


In [3]:
# Show the HTTP status code carried by the response, under its banner.
print(f'\n*** URL Response Status Code ***\n{response.status_code}')


*** URL Response Status Code ***
200


In [4]:
# proceed only if the status code is 200
if response.status_code != 200:
    print("Stop Program Execution")
    # `exit` is the helper meant for the interactive shell; raise SystemExit
    # directly so the stop request (exit status 1) does not depend on it.
    raise SystemExit(1)

In [5]:
# print response text
# a single call writes the banner and then the response body on its own line
print('\n*** Request Text ***', response.text, sep='\n')


*** Request Text ***
<!DOCTYPE html>
<html>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<body>
<Title>Sample File</Title>

<header class="entry-header">
<h1 class="entry-title">Sample</h1>
</header>

<div class="entry-content">
<h2>This is a sample page</h2>
<ol>
<li>Ordered List 1</li>
<li>Ordered List 1</li>
<li>Ordered List 1</li>
</ol>
<ul>
<li>Un-ordered List 1</li>
<li>Un-ordered List 1</li>
<li>Un-ordered List 1</li>
</ul>
<p>At&nbsp;<em>maexadata</em>, we make sense of the mega-exabytes-of-data!!! We use cutting data analytics technology &amp; tools to process this omni-generating data and provide you with information. Our goal is turn your data to usable information to insights and then finally to intelligence; be it Business Intelligence,&nbsp;Customer Intelligence,&nbsp;Supply Chain Intelligence,&nbsp;Sentiment Analysis, Click-stream Analytics, Cloud Analytics, IoT Analytics, Customer Decision Management, Fraud &amp; Security In

In [6]:
# Keep the response body in `htmldoc`, then show it together with its type.
print('\n*** HTML Text ***')
htmldoc = response.text
# body first, its Python type on the following line
print(htmldoc, type(htmldoc), sep='\n')



*** HTML Text ***
<!DOCTYPE html>
<html>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<body>
<Title>Sample File</Title>

<header class="entry-header">
<h1 class="entry-title">Sample</h1>
</header>

<div class="entry-content">
<h2>This is a sample page</h2>
<ol>
<li>Ordered List 1</li>
<li>Ordered List 1</li>
<li>Ordered List 1</li>
</ol>
<ul>
<li>Un-ordered List 1</li>
<li>Un-ordered List 1</li>
<li>Un-ordered List 1</li>
</ul>
<p>At&nbsp;<em>maexadata</em>, we make sense of the mega-exabytes-of-data!!! We use cutting data analytics technology &amp; tools to process this omni-generating data and provide you with information. Our goal is turn your data to usable information to insights and then finally to intelligence; be it Business Intelligence,&nbsp;Customer Intelligence,&nbsp;Supply Chain Intelligence,&nbsp;Sentiment Analysis, Click-stream Analytics, Cloud Analytics, IoT Analytics, Customer Decision Management, Fraud &amp; Security Intel

In [7]:
# Parse the HTML text into a BeautifulSoup object and display it.
print('\n*** Soup Object ***')
# Parser in use is 'lxml'; 'html.parser' is the alternative left disabled here.
soup = BeautifulSoup(htmldoc, features='lxml')
print(soup)


*** Soup Object ***
<!DOCTYPE html>
<html>
<head><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
</head><body>
<title>Sample File</title>
<header class="entry-header">
<h1 class="entry-title">Sample</h1>
</header>
<div class="entry-content">
<h2>This is a sample page</h2>
<ol>
<li>Ordered List 1</li>
<li>Ordered List 1</li>
<li>Ordered List 1</li>
</ol>
<ul>
<li>Un-ordered List 1</li>
<li>Un-ordered List 1</li>
<li>Un-ordered List 1</li>
</ul>
<p>At <em>maexadata</em>, we make sense of the mega-exabytes-of-data!!! We use cutting data analytics technology &amp; tools to process this omni-generating data and provide you with information. Our goal is turn your data to usable information to insights and then finally to intelligence; be it Business Intelligence, Customer Intelligence, Supply Chain Intelligence, Sentiment Analysis, Click-stream Analytics, Cloud Analytics, IoT Analytics, Customer Decision Management, Fraud &amp; Security Intelligence, Risk Management, Su

In [8]:
# Render the soup as an indented string: pretty_soup
pretty_soup = soup.prettify()
# Show the prettified markup under its banner
print('\n*** Soup Prettify ***', pretty_soup, sep='\n')


*** Soup Prettify ***
<!DOCTYPE html>
<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
 </head>
 <body>
  <title>
   Sample File
  </title>
  <header class="entry-header">
   <h1 class="entry-title">
    Sample
   </h1>
  </header>
  <div class="entry-content">
   <h2>
    This is a sample page
   </h2>
   <ol>
    <li>
     Ordered List 1
    </li>
    <li>
     Ordered List 1
    </li>
    <li>
     Ordered List 1
    </li>
   </ol>
   <ul>
    <li>
     Un-ordered List 1
    </li>
    <li>
     Un-ordered List 1
    </li>
    <li>
     Un-ordered List 1
    </li>
   </ul>
   <p>
    At
    <em>
     maexadata
    </em>
    , we make sense of the mega-exabytes-of-data!!! We use cutting data analytics technology &amp; tools to process this omni-generating data and provide you with information. Our goal is turn your data to usable information to insights and then finally to intelligence; be it Business Intelligence, Customer Intelligence, Supply Ch

In [9]:
# get the title of the webpage
print('\n*** Page Title ***')
# soup.title is None when the document has no <title> element; fall back to
# an empty string instead of failing on the attribute lookup.
title_tag = soup.title
page_title = title_tag.get_text() if title_tag is not None else ''
# print the title of the webpage
print(page_title)


*** Page Title ***
Sample File


In [10]:
# Collect all text of the parsed page into page_text.
page_text = soup.get_text()
# Banner first, then the full page text on the following line.
print('\n*** Page Full Text ***', page_text, sep='\n')



*** Page Full Text ***



Sample File

Sample


This is a sample page

Ordered List 1
Ordered List 1
Ordered List 1


Un-ordered List 1
Un-ordered List 1
Un-ordered List 1

At maexadata, we make sense of the mega-exabytes-of-data!!! We use cutting data analytics technology & tools to process this omni-generating data and provide you with information. Our goal is turn your data to usable information to insights and then finally to intelligence; be it Business Intelligence, Customer Intelligence, Supply Chain Intelligence, Sentiment Analysis, Click-stream Analytics, Cloud Analytics, IoT Analytics, Customer Decision Management, Fraud & Security Intelligence, Risk Management, Sustainability Management. Have developed various Quantitative, Qualitative, Linear Regression, Logistic Regression, Classification, Clustering, Moving Averages, Simulation, Decision Trees and Machine Learning models and various Hypothesis Testing.
Cyrus the Great
Cyrus II of Persia, commonly known as Cyrus the Great

In [11]:
# <p> tag
# Collect every 'p' element (paragraphs: <p>) of the parsed page.
print('\n*** Page <p> Text ***')
p_tags = soup.find_all('p')
# Write each paragraph's text followed by one blank separator line.
for para in p_tags:
    print(para.get_text(), end='\n\n')


*** Page <p> Text ***
At maexadata, we make sense of the mega-exabytes-of-data!!! We use cutting data analytics technology & tools to process this omni-generating data and provide you with information. Our goal is turn your data to usable information to insights and then finally to intelligence; be it Business Intelligence, Customer Intelligence, Supply Chain Intelligence, Sentiment Analysis, Click-stream Analytics, Cloud Analytics, IoT Analytics, Customer Decision Management, Fraud & Security Intelligence, Risk Management, Sustainability Management. Have developed various Quantitative, Qualitative, Linear Regression, Logistic Regression, Classification, Clustering, Moving Averages, Simulation, Decision Trees and Machine Learning models and various Hypothesis Testing.

Cyrus II of Persia, commonly known as Cyrus the Great, and also called Cyrus the Elder by the Greeks, was the founder of the Achaemenid Empire, the first Persian Empire. Wikipedia

Born: 600 BC, Anshan, IranDied: 4 Dece

In [12]:
# <li> tag
# Collect every list item, whichever kind of list it belongs to.
print('\n*** Page <li> Text ***')
li_tags = soup.find_all('li')
# Write the text of each list item on its own line.
for li in li_tags:
    print(li.get_text())


*** Page <li> Text ***
Ordered List 1
Ordered List 1
Ordered List 1
Un-ordered List 1
Un-ordered List 1
Un-ordered List 1


In [13]:
# <ol> tag
# The banner names the element this cell actually queries (<ol>); it was
# previously a copy of the <li> banner.
print('\n*** Page <ol> Text ***')
ol_tags = soup.find_all('ol')
# print the text of each ordered list; note that `li` is a whole <ol>
# element here, not a single list item
for li in ol_tags:
    print(li.text)


*** Page <li> Text ***

Ordered List 1
Ordered List 1
Ordered List 1



In [14]:
# <ul> tag
# The banner names the element this cell actually queries (<ul>); it was
# previously a copy of the <li> banner.
print('\n*** Page <ul> Text ***')
ul_tags = soup.find_all('ul')
# print the text of each un-ordered list; note that `li` is a whole <ul>
# element here, not a single list item
for li in ul_tags:
    print(li.text)


*** Page <li> Text ***

Un-ordered List 1
Un-ordered List 1
Un-ordered List 1



In [15]:
# <table> tag
# Collect every 'table' element (tables: <table>) of the parsed page.
print('\n*** Page <table> Text ***')
t_tags = soup.find_all('table')
# Write each table's text followed by one blank separator line.
# The loop variable keeps the name `table` because the next cell reads it.
for table in t_tags:
    print(table.get_text(), end='\n\n')


*** Page <table> Text ***


Firstname
Lastname
Age


Cyrus
Lentin
50


Vipul
Patel
52


Abbas
Contractor
54






Item Description
Jul-15
Aug-15
Sep-15
Oct-15


Basic Goods
5.159705
5.137615
6.223818
2.638037


Basic metals
6.812879
10.89866
9.526044
7.349455


Capital Goods
10.56231
-3.62024
2.192982
5.531915


Consumer Durables
11.36261
15.99811
-3.92082
1.292091


Consumer Goods
1.284297
6.590773
-2.13465
2.754821


Consumer Non-durables
-4.61437
1.342282
-0.86093
3.745072


Electrical Machinery
20.93233
-10.0166
2.882682
13.14297


Electricity
3.536453
1.321586
6.035889
-0.50533


General Index
4.219653
3.77907
2.509983
3.011002


Intermediate Goods
1.544402
0.793651
1.286174
2.338009


Manufacturing
4.665203
4.608551
2.016349
3.914002


Mining & Quarrying
1.290878
-0.3276
2.314445
-0.57096


Motor Vehicles
7.929515
8.59375
4.213865
6.773455


Publishing & Printing 
-9.46548
-11.4155
-9.23077
6.088993


Rubber & Plastics Products
-4.25311
0.104275
1.469098
5.604396


Textiles
-0.5

In [16]:
# Better: walk a table row by row (<tr>) and cell by cell (<td>).
print('\n*** Page Table Text Using <tr> & <td> ***')
# Select the table explicitly instead of relying on the loop variable left
# behind by the previous cell. That variable ended up on the last table of
# the page, so the last table is chosen here as well.
tables = soup.find_all('table')
table = tables[-1] if tables else None
# With no table on the page there are simply no rows to report.
t_rows = table.find_all('tr') if table is not None else []
print(t_rows)
# table_data keeps every row as a list of cell strings so the values can be
# reused after this cell instead of only being printed.
table_data = []
for tr in t_rows:
    td = tr.find_all('td')
    # a separate name for the cell avoids shadowing the row variable `tr`
    row = [cell.text for cell in td]
    print(row)
    table_data.append(row)



*** Page Table Text Using <tr> & <td> ***
[<tr>
<td>Item Description</td>
<td class="has-text-align-right" data-align="right">Jul-15</td>
<td class="has-text-align-right" data-align="right">Aug-15</td>
<td class="has-text-align-right" data-align="right">Sep-15</td>
<td class="has-text-align-right" data-align="right">Oct-15</td>
</tr>, <tr>
<td>Basic Goods</td>
<td class="has-text-align-right" data-align="right">5.159705</td>
<td class="has-text-align-right" data-align="right">5.137615</td>
<td class="has-text-align-right" data-align="right">6.223818</td>
<td class="has-text-align-right" data-align="right">2.638037</td>
</tr>, <tr>
<td>Basic metals</td>
<td class="has-text-align-right" data-align="right">6.812879</td>
<td class="has-text-align-right" data-align="right">10.89866</td>
<td class="has-text-align-right" data-align="right">9.526044</td>
<td class="has-text-align-right" data-align="right">7.349455</td>
</tr>, <tr>
<td>Capital Goods</td>
<td class="has-text-align-right" data-a

In [17]:
# <a ....> tag
print('\n*** Page <a ...> Text ***')
a_tags = soup.find_all('a')
# print the link target of every anchor; anchors without an href attribute
# are skipped so the listing never contains a literal "None" line
for link in a_tags:
    href = link.get('href')
    if href is not None:
        print(href)


*** Page <a ...> Text ***
https://en.wikipedia.org/wiki/Nabonidus#The_Persian_conquest_of_Babylonia
https://en.wikipedia.org/wiki/Cyrus%27s_edict
https://en.wikipedia.org/wiki/Temple_in_Jerusalem
https://en.wikipedia.org/wiki/Cyrus
https://en.wikipedia.org/wiki/Xerxes_I
https://en.wikipedia.org/wiki/Artaxerxes_I
https://en.wikipedia.org/wiki/Darius_II_of_Persia
https://en.wikipedia.org/wiki/Darius_II
https://en.wikipedia.org/wiki/Ecbatana
mailto:info@maexadata.in
https://www.geeksforgeeks.org/machine-learning/
https://www.geeksforgeeks.org/ml-linear-regression/
https://www.geeksforgeeks.org/basic-concept-classification-data-mining/
https://www.geeksforgeeks.org/clustering-in-machine-learning/
https://scikit-learn.org/stable/
