# 2.2 - Web Scraping (bs4)

**[BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)**.

In [None]:
!pip install beautifulsoup4

In [1]:
import requests as req

from bs4 import BeautifulSoup as bs

### Ejemplos Wikipedia

**[Países europeos según esperanza de vida](https://en.wikipedia.org/wiki/List_of_European_countries_by_life_expectancy)**

In [2]:
url='https://en.wikipedia.org/wiki/List_of_European_countries_by_life_expectancy'

In [3]:
# Download the page with requests; `.content` is the response body as raw bytes

html = req.get(url).content

# Preview only the first 1000 bytes instead of dumping the whole document
html[:1000]

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of European countries by life expectancy - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"6b5ce1f3-6d13-4e06-a29c-89a556c46119","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_European_countries_by_life_expectancy","wgTitle":"List of European countries by life expectancy","wgCurRevisionId":1008355918,"wgRevisionId":1008355918,"wgArticleId":22175559,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is differen

In [5]:
# Hand the downloaded markup to bs4, parsed with Python's built-in HTML parser

soup = bs(html, features='html.parser')

type(soup)

bs4.BeautifulSoup

In [7]:
# Grab the table: find() returns only the first <table> of the document

tabla = soup.find('table')

type(tabla)

bs4.element.Tag

In [18]:
# Row extraction: take the text of every <tr>, trim it and split it on newlines

filas = [tr.text.strip().split('\n') for tr in tabla.find_all('tr')]

filas[:3]

[['Rank',
  '',
  'Country',
  'Life expectancy[1]',
  '',
  'Influenza vaccination rate, people aged 65 and over, 2016 (%)[2]'],
 ['1', '', '\xa0Monaco[3]', '', '89.4'],
 ['2', '', '\xa0San Marino[4]', '', '83.4']]

In [35]:
# Minimal cleanup: drop the empty strings and remove non-breaking spaces ('\xa0')

final = [[celda.replace('\xa0', '') for celda in fila if celda != '']
         for fila in filas]

final[:3]

[['Rank',
  'Country',
  'Life expectancy[1]',
  'Influenza vaccination rate, people aged 65 and over, 2016 (%)[2]'],
 ['1', 'Monaco[3]', '89.4'],
 ['2', 'San Marino[4]', '83.4']]

In [36]:
import pandas as pd

# The first scraped row holds the column headers; every remaining row is data
col_names, data = final[0], final[1:]

df = pd.DataFrame(data, columns=col_names)

df.head()

Unnamed: 0,Rank,Country,Life expectancy[1],"Influenza vaccination rate, people aged 65 and over, 2016 (%)[2]"
0,1,Monaco[3],89.4,
1,2,San Marino[4],83.4,
2,3,Switzerland,83.0,
3,4,Spain,82.8,56%
4,5,Liechtenstein,82.7,28%


$$$$

**[Medallero Barcelona'92](https://es.wikipedia.org/wiki/Juegos_Ol%C3%ADmpicos_de_Barcelona_1992)**

In [37]:
url='https://es.wikipedia.org/wiki/Juegos_Ol%C3%ADmpicos_de_Barcelona_1992'

In [38]:
response=req.get(url)

In [39]:
# Either `.text` or `.content` works here: bs4 handles both the same way
data = response.text

soup = bs(data, features='html.parser')

type(soup)

bs4.BeautifulSoup

In [40]:
# Every <table> in the page, in document order

tablas = soup.find_all('table')

# Keep a handle on the last one
ultima = tablas[-1]

In [41]:
# Inspect one element of the table: the first <a> found inside it

elem = ultima.find('a')

elem

<a href="/wiki/Control_de_autoridades" title="Control de autoridades">Control de autoridades</a>

In [42]:
elem.text    # what is inside the box: the tag's text

'Control de autoridades'

In [46]:
# NOTE: `.content` (singular) is not a Tag attribute. bs4 treats an unknown attribute
# as a child-tag lookup, i.e. like elem.find('content'), so this prints None.
# The children of the tag are available through `.contents` (plural).
print(elem.content)

None


In [47]:
elem.contents # what is inside the box, as a list

['Control de autoridades']

In [48]:
elem.attrs   # the stickers on the lid of the box: the tag's attributes

{'href': '/wiki/Control_de_autoridades', 'title': 'Control de autoridades'}

In [49]:
elem.attrs['href']

'/wiki/Control_de_autoridades'

In [50]:
# NOTE(review): the medal table is picked by position (fourth from the end);
# presumably this breaks if the page layout changes - confirm before reuse.
medallero = tablas[-4]

In [67]:
med_paises = []

# Walk every row of the medal table
for tr in medallero.find_all('tr'):

    celdas = tr.find_all('td')   # cells inside this table row

    # Rows without any <td> cell contribute nothing
    if not celdas:
        continue

    med_paises.append({'nombre': celdas[1].find('a').text.strip(),
                       'oros': celdas[2].text,
                       'platas': celdas[3].text,
                       'bronces': celdas[4].text})

med_paises

[{'nombre': 'Equipo Unificado', 'oros': '45', 'platas': '38', 'bronces': '29'},
 {'nombre': 'Estados Unidos', 'oros': '37', 'platas': '34', 'bronces': '37'},
 {'nombre': 'Alemania', 'oros': '33', 'platas': '21', 'bronces': '28'},
 {'nombre': 'China', 'oros': '16', 'platas': '22', 'bronces': '16'},
 {'nombre': 'Cuba', 'oros': '14', 'platas': '6', 'bronces': '11'},
 {'nombre': 'España', 'oros': '13', 'platas': '7', 'bronces': '2'},
 {'nombre': 'Corea del Sur', 'oros': '12', 'platas': '5', 'bronces': '12'},
 {'nombre': 'Hungría', 'oros': '11', 'platas': '12', 'bronces': '7'},
 {'nombre': 'Francia', 'oros': '8', 'platas': '5', 'bronces': '16'},
 {'nombre': 'Australia', 'oros': '7', 'platas': '9', 'bronces': '11'}]

In [68]:
df=pd.DataFrame(med_paises)

df

Unnamed: 0,nombre,oros,platas,bronces
0,Equipo Unificado,45,38,29
1,Estados Unidos,37,34,37
2,Alemania,33,21,28
3,China,16,22,16
4,Cuba,14,6,11
5,España,13,7,2
6,Corea del Sur,12,5,12
7,Hungría,11,12,7
8,Francia,8,5,16
9,Australia,7,9,11


### Ejemplo geolocalización por IP

https://tools.keycdn.com/geo

**¿Dónde estoy?**

In [69]:
url='https://tools.keycdn.com/geo'

In [70]:
# Fetch the geolocation page and parse the returned bytes
html = req.get(url).content

soup = bs(html, features='html.parser')

In [71]:
soup.find('div', {'id': 'geoResult'})

<div class="mt-4" id="geoResult">
<div class="bg-light medium rounded p-3">
<p class="small text-uppercase text-muted font-weight-semi-bold line-height-headings mb-2">Location</p> <dl class="row mb-0">
<dt class="col-4">Country</dt><dd class="col-8 text-monospace">Spain (ES)</dd><dt class="col-4">Continent</dt><dd class="col-8 text-monospace">Europe (EU)</dd><dt class="col-4">Coordinates</dt><dd class="col-8 text-monospace">40.4172 (lat) / -3.684 (long)</dd><dt class="col-4">Time</dt><dd class="col-8 text-monospace">2021-05-11 11:31:52 (Europe/Madrid)</dd> </dl>
<p class="small text-uppercase text-muted font-weight-semi-bold line-height-headings mt-4 mb-2">Network</p>
<dl class="row mb-0">
<dt class="col-4">IP address</dt><dd class="col-8 text-monospace">2.136.118.159</dd><dt class="col-4">Hostname</dt><dd class="col-8 text-monospace">159.red-2-136-118.staticip.rima-tde.net</dd><dt class="col-4">Provider</dt><dd class="col-8 text-monospace">Telefonica De Espana</dd><dt class="col-4">AS

In [72]:
# Container <div> that holds the geolocation result
tabla = soup.find('div', id='geoResult')

# Collect the text of every <dd> value cell, matched by its class string
mi_conexion = [dd.text for dd in tabla.find_all('dd', class_='col-8 text-monospace')]

mi_conexion

['Spain (ES)',
 'Europe (EU)',
 '40.4172 (lat) / -3.684 (long)',
 '2021-05-11 11:31:52 (Europe/Madrid)',
 '2.136.118.159',
 '159.red-2-136-118.staticip.rima-tde.net',
 'Telefonica De Espana',
 '3352']

**Búsqueda según IP**

https://tools.keycdn.com/geo?host=137.255.90.7

In [73]:
# Address to look up; it is sent in the `host` query parameter
ip = '137.255.90.7'

url = f'https://tools.keycdn.com/geo?host={ip}'

In [74]:
# Same fetch-and-parse steps, now against the URL that carries the `host` parameter
html = req.get(url).content

soup = bs(html, features='html.parser')

In [75]:
[e.text for e in soup.find_all('dd', {'class': 'col-8 text-monospace'})]

['Benin (BJ)',
 'Africa (AF)',
 '9.5 (lat) / 2.25 (long)',
 '2021-05-11 10:37:18 (Africa/Porto-Novo)',
 '137.255.90.7',
 '137.255.90.7',
 'Office des Postes et telecommunications du Benin',
 '28683']

### Ejemplo terremotos (EMSC)

https://www.emsc-csem.org/Earthquake/

In [76]:
url='https://www.emsc-csem.org/Earthquake/'

In [77]:
# Download the earthquake listing and parse the returned bytes
html = req.get(url).content

soup = bs(html, features='html.parser')

In [79]:
# CSS selectors: every <tr> under the element with id="tbody"
tabla = soup.select('#tbody tr')

# Cells of the first row
elem = tabla[0].select('td')

elem

[<td class="tabev0"></td>,
 <td class="tabev0"></td>,
 <td class="tabev0"></td>,
 <td class="tabev6"><b><i style="display:none;">earthquake</i><a href="/Earthquake/earthquake.php?id=982889">2021-05-11   09:21:15.5</a></b><i class="ago" id="ago0">18min ago</i></td>,
 <td class="tabev1">16.31 </td>,
 <td class="tabev2">S  </td>,
 <td class="tabev1">177.67 </td>,
 <td class="tabev2">W  </td>,
 <td class="tabev3">40</td>,
 <td class="tabev5" id="magtyp0">mb</td>,
 <td class="tabev2">4.8</td>,
 <td class="tb_region" id="reg0"> FIJI REGION</td>,
 <td class="comment updatetimeno" id="upd0" style="text-align:right;">2021-05-11 09:36</td>]

In [80]:
# One record built from the row's cells; each coordinate joins two adjacent cells
terremoto = dict(
    date=elem[3].find('a').text,
    region=elem[-2].text,
    lat=elem[4].text + elem[5].text,
    lng=elem[6].text + elem[7].text,
)

In [81]:
terremoto

{'date': '2021-05-11\xa0\xa0\xa009:21:15.5',
 'region': '\xa0FIJI REGION',
 'lat': '16.31\xa0S\xa0\xa0',
 'lng': '177.67\xa0W\xa0\xa0'}

In [None]:
# "encoding" problem: the scraped values contain '\xa0' (non-breaking space) characters

!pip install unidecode

In [83]:
import unidecode

# Pass every value through unidecode (the '\xa0' characters become plain spaces) and trim the ends
terremoto = dict((clave, unidecode.unidecode(valor).strip())
                 for clave, valor in terremoto.items())

terremoto

{'date': '2021-05-11   09:21:15.5',
 'region': 'FIJI REGION',
 'lat': '16.31 S',
 'lng': '177.67 W'}

In [84]:
soup.select('#tbody tr td a') # chained selector, going inwards: #tbody -> tr -> td -> a

[<a href="/Earthquake/earthquake.php?id=982889">2021-05-11   09:21:15.5</a>,
 <a href="/Earthquake/earthquake.php?id=982886">2021-05-11   08:54:52.8</a>,
 <a href="/Earthquake/earthquake.php?id=982885">2021-05-11   08:52:20.9</a>,
 <a href="/Earthquake/earthquake.php?id=982883">2021-05-11   08:47:40.0</a>,
 <a href="/Earthquake/earthquake.php?id=982884">2021-05-11   08:43:47.0</a>,
 <a href="/Earthquake/earthquake.php?id=982879">2021-05-11   08:25:40.3</a>,
 <a href="/Earthquake/earthquake.php?id=982878">2021-05-11   08:21:25.5</a>,
 <a href="/Earthquake/earthquake.php?id=982875">2021-05-11   08:02:36.7</a>,
 <a href="/Earthquake/earthquake.php?id=982872">2021-05-11   07:50:21.0</a>,
 <a href="/Earthquake/earthquake.php?id=982882">2021-05-11   07:25:52.4</a>,
 <a href="https://www.emsc-csem.org/Earthquake/Testimonies/comments.php?id=982835" onmouseout="info_b2('notshow','');" onmouseover="info_b2('show','See the &lt;b&gt;10 testimonies&lt;/b&gt; for this earthquake');"><span class="" s

### Ejemplo LinkedIn

https://www.linkedin.com/jobs/search/?f_E=4&f_TPR=r2592000&geoId=105646813&keywords=data%20analyst&location=Espa%C3%B1a&start=25

In [85]:
def _tag_string(tag):
    """Return ``tag.string``, or None when the tag itself was not found."""
    return tag.string if tag is not None else None


def linkedin_search(keywords, num_pages, country, n_secs, exp, timeout=30):
    """Scrape LinkedIn's public job search and return the job cards as a DataFrame.

    Parameters
    ----------
    keywords : str
        Search terms, sent as the ``keywords`` query parameter.
    num_pages : int
        Number of result pages to request; page ``i`` is requested with ``start=i*25``.
    country : str
        Sent as the ``location`` query parameter and copied into the ``country`` column.
    n_secs : int or str
        Sent unchanged as the ``f_TPR`` query parameter (time-posted filter).
        NOTE(review): the example URL in this notebook uses ``f_TPR=r2592000``
        (with an ``r`` prefix) - confirm whether a bare number is honoured.
    exp : int or str
        Sent as the ``f_E`` query parameter and copied into the ``experience`` column.
    timeout : float, optional
        Seconds to wait for each HTTP response before requests gives up.

    Returns
    -------
    pandas.DataFrame
        One row per ``div.result-card__contents`` element found. The columns are
        always title, country, location, date, description, company_link,
        experience and keywords, even when no card matched.
    """

    URL = 'https://www.linkedin.com/jobs/search/'

    COLUMNS = ['title', 'country', 'location', 'date',
               'description', 'company_link', 'experience', 'keywords']

    data = []

    for i in range(num_pages):

        # Let requests build the query string so every value is percent-encoded
        params = {'keywords': keywords,   # search keywords
                  'location': country,    # country
                  'f_TPR': n_secs,        # time-posted filter
                  'f_E': exp,             # experience
                  'start': i * 25}        # offset of the first result of this page

        response = req.get(URL, params=params, timeout=timeout)

        soup = bs(response.text, 'html.parser')

        for card in soup.select('div.result-card__contents'):

            # select() takes a CSS selector, so the class filter has to be part
            # of the selector itself (a `class_=` keyword is not a filter there).
            link_tag = card.select_one(
                'a.result-card__subtitle-link.job-result-card__subtitle-link')
            comp_link = link_tag.get('href', '') if link_tag is not None else ''

            title = card.find('h3', recursive=False)
            location = card.find('span', attrs={'class': 'job-result-card__location'})
            time_tag = card.find('time')
            desc_tag = card.find('p')

            # Missing pieces become None (or '' for the link) instead of
            # aborting the whole scrape on one incomplete card.
            data.append({'title': _tag_string(title),
                         'country': country,
                         'location': _tag_string(location),
                         'date': time_tag.get('datetime') if time_tag is not None else None,
                         'description': desc_tag.text if desc_tag is not None else None,
                         'company_link': comp_link,
                         'experience': exp,
                         'keywords': keywords})

    return pd.DataFrame(data, columns=COLUMNS)
            
            

In [86]:
linkedin_search('data', 3, 'españa', 30000, 3)

Unnamed: 0,title,country,location,date,description,company_link,experience,keywords
0,Data Scientist - IA,españa,Greater Madrid Metropolitan Area,2021-05-11,En artyco ofrecemos una propuesta de valor dat...,https://es.linkedin.com/company/artyco-the-dat...,3,data
1,Científico de datos - Navegación Web,españa,"Madrid, Community of Madrid, Spain",2021-04-27,Crear análisis predictivos teniendo presente l...,https://es.linkedin.com/company/elcorteingles?...,3,data
2,Data Scientist,españa,"Madrid, Community of Madrid, Spain",2021-05-10,1 or 2 years of experience as Data Scientist o...,https://es.linkedin.com/company/axactorspain?t...,3,data
3,Data Analyst,españa,"Barcelona, Catalonia, Spain",2021-05-11,Extensive knowledge and experience with report...,https://www.linkedin.com/company/raytheonprofe...,3,data
4,Data Miner,españa,"Madrid, Community of Madrid, Spain",2021-05-04,"Solid critical thinking, analytical and creati...",https://es.linkedin.com/company/villarroel-&-h...,3,data
...,...,...,...,...,...,...,...,...
67,Associate Data Research Analyst,españa,"Community of Madrid, Spain",2021-05-10,You will be collaborating with cross functiona...,,3,data
68,Data Analyst (Tarragona),españa,Greater Tarragona Area,2021-05-06,Empresa conocida situada en Tarragona busca in...,https://es.linkedin.com/company/first-talent-b...,3,data
69,Técnico/a Gestión de Datos (Data Management),españa,"Vacarisses, Catalonia, Spain",2021-04-28,"Integrado en el Departamento de Diseño, formar...",https://es.linkedin.com/company/manpowergroup-...,3,data
70,Data Scientist- Analista- Programador BI,españa,"Canary Islands, Spain",2021-05-07,Buscamos una persona que se incorpore a nuestr...,https://es.linkedin.com/company/grupo-disa?trk...,3,data
