In [4]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime as dt 
import statsmodels.formula.api as smf

## 1st method for web scraping (HTTP, HTML): `urllib`

In [5]:
# libraries for web scraping (Http, HTML) 

from urllib.request import urlopen, Request

In [6]:
# Download the Wikipedia portal page with the standard-library urllib client.
url = "https://www.wikipedia.org/"
request = Request(url)
# The context manager closes the connection even when read() raises, which a
# trailing response.close() call cannot guarantee. The timeout (in seconds)
# keeps the cell from blocking forever on an unresponsive server.
with urlopen(request, timeout=30) as response:
    html = response.read()  # raw bytes of the response body

## 2nd method for web scraping (HTTP, HTML): `requests`

In [35]:
import requests

# Download the Wikipedia portal page with the `requests` client.
url = "https://www.wikipedia.org/"
# requests applies no timeout by default, so an unresponsive server would
# hang the cell indefinitely; bound the wait (seconds).
r = requests.get(url, timeout=30)
# Raise on 4xx/5xx instead of silently keeping the body of an error page.
r.raise_for_status()
html_text = r.text  # response body decoded to str

# Beautiful Soup to parse HTML


In [72]:
from bs4 import BeautifulSoup

# Fetch the page; bound the wait and fail loudly on an HTTP error status.
url = "https://www.wikipedia.org/"
r = requests.get(url, timeout=30)
r.raise_for_status()
html_text = r.text

# Parse once and always name the parser. Calling BeautifulSoup(html_text)
# with no parser argument makes the library guess one (with a warning), so
# the resulting tree can differ between machines.
# "html.parser" (bundled with Python) is a drop-in alternative to "lxml"
# when lxml is not installed.
soup = BeautifulSoup(html_text, "lxml")

# Indented, human-readable rendering of the parsed document.
pretty_soup = soup.prettify()

In [73]:
# Display the page's <title> element; as the last expression in the cell it is
# rendered as the cell output, so no print() is needed.

soup.title

<title>Wikipedia</title>

In [74]:
# Collect every anchor (<a>) tag in the parsed page, then print the value of
# each one's href attribute.

anchors = soup.find_all("a")
for link in anchors:
    print(link.get("href"))

//en.wikipedia.org/
//ja.wikipedia.org/
//ru.wikipedia.org/
//de.wikipedia.org/
//es.wikipedia.org/
//fr.wikipedia.org/
//zh.wikipedia.org/
//it.wikipedia.org/
//pl.wikipedia.org/
//pt.wikipedia.org/
//pl.wikipedia.org/
//ar.wikipedia.org/
//de.wikipedia.org/
//en.wikipedia.org/
//es.wikipedia.org/
//fr.wikipedia.org/
//it.wikipedia.org/
//arz.wikipedia.org/
//nl.wikipedia.org/
//ja.wikipedia.org/
//pt.wikipedia.org/
//ru.wikipedia.org/
//ceb.wikipedia.org/
//sv.wikipedia.org/
//uk.wikipedia.org/
//vi.wikipedia.org/
//war.wikipedia.org/
//zh.wikipedia.org/
//af.wikipedia.org/
//sk.wikipedia.org/
//ast.wikipedia.org/
//az.wikipedia.org/
//bg.wikipedia.org/
//zh-min-nan.wikipedia.org/
//bn.wikipedia.org/
//be.wikipedia.org/
//ca.wikipedia.org/
//cs.wikipedia.org/
//cy.wikipedia.org/
//da.wikipedia.org/
//et.wikipedia.org/
//el.wikipedia.org/
//eo.wikipedia.org/
//eu.wikipedia.org/
//fa.wikipedia.org/
//gl.wikipedia.org/
//ko.wikipedia.org/
//hy.wikipedia.org/
//hi.wikipedia.org/
//hr.wik

# Find tables in HTML


In [75]:
# Fetch and parse the Portuguese Wikipedia list of Manaus neighbourhoods.
url = "https://pt.wikipedia.org/wiki/Lista_de_bairros_de_Manaus"
# Bound the wait (seconds) so an unresponsive server cannot hang the cell.
r = requests.get(url, timeout=30)
# Stop here on 4xx/5xx rather than parsing an error page as if it were data.
r.raise_for_status()
data = r.text
soup = BeautifulSoup(data, "lxml")

In [77]:
# Print the class attribute of each <table> in the page, to see which
# classes are available for picking out a specific table.

print('Classes of each table:')
tables = soup.find_all('table')
for table in tables:
    print(table.get('class'))

Classes of each table:
['box-Desatualizado', 'plainlinks', 'metadata', 'ambox', 'ambox-content']
['wikitable', 'sortable']
['nowraplinks', 'collapsible', 'collapsed', 'navbox-inner']


In [136]:
# Scrape the exchange-rate table from x-rates.com and print it row by row.
url = "https://www.x-rates.com/"
# Bound the wait (seconds) and fail loudly on an HTTP error status.
r = requests.get(url, timeout=30)
r.raise_for_status()
data = r.text
soup = BeautifulSoup(data, "lxml")

# First <table> in the document; stop with a clear message if the page
# layout changed and no table is present.
table = soup.find("table")
if table is None:
    raise ValueError(f"no <table> element found at {url}")

# tr >> table rows
# Search inside the selected table rather than the whole document, so rows
# belonging to any other table on the page are not mixed in.
table_rows = table.find_all("tr")

# td >> table data
# Header rows hold <th> cells only, so they print as an empty list.
for tr in table_rows:
    td = tr.find_all("td")
    row = [cell.text for cell in td]
    print(row)
    print("**********************")

[]
**********************
['1', '0.73808', '1.27244', '0.88405', '1.38098']
**********************
['1.35487', '1', '1.72398', '1.19778', '1.87104']
**********************
['0.78589', '0.58005', '1', '0.69477', '1.08530']
**********************
['1.13115', '0.83488', '1.43932', '1', '1.56210']
**********************
['0.72412', '0.53446', '0.92140', '0.64016', '1']
**********************


In [141]:

# Let pandas download the page and turn each <table> it finds into a
# DataFrame; the cell output is the number of tables found.
rates_url = "https://www.x-rates.com/"
dfs = pd.read_html(rates_url)
len(dfs)

1

In [167]:
# Take the first DataFrame from the list returned by pd.read_html.
df = dfs[0]
# Keep every row but drop the first column (positions 1 onward are kept).
dfss = df.iloc[:,1:]
# Bare last expression: rendered as the cell output.
dfss

Unnamed: 0,USD,GBP,CAD,EUR,AUD
0,1.0,0.73813,1.27264,0.8841,1.38116
1,1.35478,1.0,1.72414,1.19776,1.87117
2,0.78577,0.58,1.0,0.6947,1.08528
3,1.13109,0.83489,1.43947,1.0,1.56223
4,0.72403,0.53443,0.92142,0.64011,1.0


In [173]:
# Scrape the tables on CNN Money's currencies page and print them row by row.
url = "https://money.cnn.com/data/currencies/"

# Bound the wait (seconds) and fail loudly on an HTTP error status.
r = requests.get(url, timeout=30)
r.raise_for_status()
data = r.text
soup = BeautifulSoup(data, "lxml")
# First <table> only; not used by the loop below, kept for inspection.
table = soup.find("table")
# tr >> table rows
# Rows are collected from the whole document, so rows of every table on
# the page are printed one after another.
table_rows = soup.find_all("tr")

# td >> table data
# Header rows hold no <td> cells, so they print as an empty list.
for tr in table_rows:
    td = tr.find_all("td")
    row = [cell.text for cell in td]
    print(row)
    print("**********************")

[]
**********************
['Argentinean Peso', '107.1950', '+0.1150', '+0.107%', '88.96Today|||107.19']
**********************
['Brazilian Real', '5.0588', '-0.0451', '-0.884%', '4.89Today|||5.88']
**********************
['Canadian Dollar', '1.2770', '+0.0018', '+0.145%', '1.20Today|||1.30']
**********************
['Chilean Peso', '792.7150', '-5.8250', '-0.729%', '689.95Today|||875.40']
**********************
['Dominican Peso', '55.9200', '-0.7000', '-1.236%', '55.92Today|||58.00']
**********************
['Mexican Peso', '20.3050', '-0.0045', '-0.022%', '19.59Today|||22.15']
**********************
[]
**********************
['\xa0', 'NIKKEI 225 INDEX\xa0\xa0Japan', '-1.71%', '26,449.61']
**********************
['\xa0', 'HANG SENG INDEX\xa0\xa0China', '+0.62%', '23,665.25']
**********************
['\xa0', 'FTSE 100 IDX\xa0\xa0England', '+0.05%', '7,498.18']
**********************
['\xa0', 'CAC 40 INDEX\xa0\xa0France', '-0.10%', '6,780.67']
**********************
['\xa0', 'DEUTSCHE BORSE

In [174]:
# Parse the same page with pandas instead: read_html returns one DataFrame
# per <table>, and the cell output is how many it found.
url = "https://money.cnn.com/data/currencies/"
dfs = pd.read_html(io=url)
len(dfs)

2

In [175]:
# First DataFrame returned by pd.read_html (the currencies table in the
# captured output below).
dfs[0]

Unnamed: 0,Currencies,$1=,Change inU.S. dollars,% Change,52-week range
0,Argentinean Peso,107.195,0.115,+0.107%,88.96Today|||107.19
1,Brazilian Real,5.0588,-0.0451,-0.884%,4.89Today|||5.88
2,Canadian Dollar,1.277,0.0018,+0.145%,1.20Today|||1.30
3,Chilean Peso,792.715,-5.825,-0.729%,689.95Today|||875.40
4,Dominican Peso,55.92,-0.7,-1.236%,55.92Today|||58.00
5,Mexican Peso,20.305,-0.0045,-0.022%,19.59Today|||22.15


In [176]:
# Second DataFrame returned by pd.read_html (the stock-index table in the
# captured output below).
dfs[1]

Unnamed: 0.1,Unnamed: 0,Index,1 day change,Level
0,,NIKKEI 225 INDEX Japan,-1.71%,26449.61
1,,HANG SENG INDEX China,+0.62%,23665.25
2,,FTSE 100 IDX England,+0.05%,7498.18
3,,CAC 40 INDEX France,-0.10%,6780.67
4,,DEUTSCHE BORSE DAX INDEX Germany,-0.42%,14631.36
