In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the simple demo page.
# requests applies no timeout by default, so a stalled connection would block
# the kernel indefinitely; give the request an explicit upper bound.
page = requests.get(
    'http://dataquestio.github.io/web-scraping-pages/simple.html',
    timeout=10,
)
# Fail loudly on a 4xx/5xx answer so later cells never parse an error page.
page.raise_for_status()
print(page.status_code)

200


In [3]:
# page.content is the raw response body as bytes (note the b'...' prefix in the output).
print(page.content)

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'


In [4]:
# Parse the downloaded bytes into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(page.content, 'html.parser')
# Two spellings for "the first <p> tag in the document". A cell only displays
# the value of its last expression, so the result of `soup.p` is discarded here.
soup.p
soup.find('p')

<p>Here is some simple content for this page.</p>

In [5]:
# Three ways to read the text inside the first <p>. Only the value of the last
# expression is displayed as the cell output; the first two results are discarded.
soup.p.string
soup.p.text
soup.p.get_text()

'Here is some simple content for this page.'

In [6]:
# Download the second demo page (the one with ids and classes on its tags).
# An explicit timeout keeps a stalled connection from hanging the kernel,
# because requests applies none by default.
page = requests.get(
    'http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html',
    timeout=10,
)
# Fail loudly on a 4xx/5xx answer so later cells never parse an error page.
page.raise_for_status()
page

<Response [200]>

In [7]:
# `page` was re-fetched in the previous cell, so rebuild the parse tree from
# the new response body; ending the cell with `soup` displays the parsed document.
soup = BeautifulSoup(markup=page.content, features='html.parser')
soup

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [8]:
# prettify() renders the tree as text with each tag and string on its own line,
# indented by nesting depth.
print(soup.prettify())

<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    First paragraph.
   </p>
   <p class="inner-text">
    Second paragraph.
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    First outer paragraph.
   </b>
  </p>
  <p class="outer-text">
   <b>
    Second outer paragraph.
   </b>
  </p>
 </body>
</html>



In [9]:
# find_all('p') returns a list of every <p> tag in the document (four on this page).
print(soup.find_all('p'))

[<p class="inner-text first-item" id="first">
                First paragraph.
            </p>, <p class="inner-text">
                Second paragraph.
            </p>, <p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>, <p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>]


In [10]:
# Text of the first <p>, with the surrounding newlines and indentation stripped.
# Uses find_all rather than the legacy camelCase alias findAll, matching every
# other cell in this notebook.
soup.find_all('p')[0].text.strip()

'First paragraph.'

In [11]:
# Filter by CSS class. The keyword is spelled `class_` because `class` is a
# reserved word in Python. A tag matches when any one of its classes is
# 'outer-text', so the "outer-text first-item" paragraph is included too.
soup.find_all(class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [12]:
# Filter by the id attribute; the result is still a list, here holding one tag.
soup.find_all(id='first')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

In [13]:
# Combine a tag name with a class filter. The keyword form and the attrs-dict
# form are shown side by side; only the last expression's result is displayed.
soup.find_all('p', class_='outer-text')
soup.find_all('p', {'class':'outer-text'})

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [14]:
# CSS selector: every <p> that is nested somewhere inside a <div>.
soup.select('div p')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>,
 <p class="inner-text">
                 Second paragraph.
             </p>]

In [15]:
# CSS selector: the element whose id is "first" (select() still returns a list).
soup.select('#first')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

In [16]:
# Wikipedia article whose tables are scraped in the cells below.
url = 'https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals_in_India'
# requests applies no timeout by default; without one a stalled connection
# would block the kernel indefinitely.
page = requests.get(url, timeout=10)
# Stop here on a 4xx/5xx answer so the parsing cells never see an error page.
page.raise_for_status()
page

<Response [200]>

In [17]:
# Parse the Wikipedia response into a navigable tree.
soup = BeautifulSoup(page.content, 'html.parser')
# The prettified article is very large; printing all of it buries the rest of
# the notebook. Show only the first 2000 characters as a preview of the markup.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-typography-survey-disabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of state and union territory capitals in India - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-

In [18]:
# Work on the second <table> element of the page (index 1).
table = soup.find_all('table')[1]

# One list per output column, filled in lock-step while walking the rows.
state, adm_capital, leg_capital = [], [], []
jud_capital, est_year, prev_capital = [], [], []
columns = (state, adm_capital, leg_capital, jud_capital, est_year, prev_capital)

for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    # Keep only rows with exactly one <td> per column (six); any other row
    # (presumably header or irregular rows) is skipped.
    if len(tds) != len(columns):
        continue
    # The i-th cell's stripped text goes into the i-th column list.
    for column, td in zip(columns, tds):
        column.append(td.text.strip())

In [19]:
# Pair each column header with the list scraped for it, preserving column order,
# then build the frame in one go and display it.
headers = ['State', 'Adm Capital', 'Leg Capital', 'Jud Capital', 'Est Year', 'Former Capital']
values = [state, adm_capital, leg_capital, jud_capital, est_year, prev_capital]
df = pd.DataFrame(dict(zip(headers, values)))
df

Unnamed: 0,State,Adm Capital,Leg Capital,Jud Capital,Est Year,Former Capital
0,Andhra Pradesh,Amaravati,Amaravati,Amaravati,2014,Hyderabad[a](1956–2014)
1,Arunachal Pradesh,Itanagar,Itanagar,Guwahati,1987,—
2,Assam,Dispur,Dispur,Guwahati,1972,Shillong[b] (1950–1972)
3,Bihar,Patna,Patna,Patna,1950,—
4,Chhattisgarh,Raipur[c],Raipur,Bilaspur,2000,—
5,Goa,Panaji[d],Porvorim,Mumbai,1987,—
6,Gujarat,Gandhinagar,Gandhinagar,Ahmedabad,1970,Ahmedabad (1960–1970)
7,Haryana,Chandigarh,Chandigarh,Chandigarh,1966,—
8,Himachal Pradesh,Shimla,Shimla (Summer)Dharamshala (Winter)[5],Shimla,1971,—
9,Jharkhand,Ranchi,Ranchi,Ranchi,2000,—


In [20]:
# Persist the scraped table. The frame only has the default 0..n-1 index, which
# carries no information; writing it would add a nameless first column that
# comes back as "Unnamed: 0" when the file is read again, so leave it out.
df.to_csv('table.csv', index=False)