In [24]:
import numpy as np
import pandas as pd
import requests
import warnings
warnings.filterwarnings("ignore")

In [25]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options

## Get links from table elements

In [81]:
def get_links(site,n_pages):
    """Collect the detail-page links found in the listing pages of ``site``.

    Pages ``site?PageId=1`` .. ``site?PageId=n_pages`` are opened in a single
    headless Chrome session. For every ``<td>`` on a page, the ``href`` of its
    first ``<a>`` is collected.

    Parameters
    ----------
    site : str
        Listing URL without the ``?PageId=`` query string.
    n_pages : int
        Number of listing pages to visit, starting at page 1.

    Returns
    -------
    list
        The collected ``href`` values, in page order. Duplicates are kept.

    Notes
    -----
    A page whose ``listing`` element is not visible within 30 seconds prints
    ``'Oh no'`` and contributes no links. Any other exception propagates, but
    the browser is always closed first.
    """
    links = []
    timeout = 30
    option = webdriver.ChromeOptions()
    option.add_argument('headless')
    driver = None
    try:
        for i in range(1,n_pages+1):
            if driver is None:
                # Started lazily so n_pages < 1 never launches a browser, and
                # only once so every page reuses the same session.
                # NOTE(review): passing the driver path positionally is the
                # pre-Service Selenium call style -- confirm against the
                # installed Selenium version.
                driver = webdriver.Chrome(ChromeDriverManager().install(),options=option)
            driver.get(site+f"?PageId={i}")
            try:
                WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "listing")))
                for el in driver.find_elements(By.TAG_NAME,"td"):
                    # find_elements returns [] instead of raising, so a cell
                    # without a link is skipped rather than aborting the run.
                    anchors = el.find_elements(By.TAG_NAME,'a')
                    if anchors:
                        links.append(anchors[0].get_attribute("href"))
            except TimeoutException:
                print('Oh no')
    finally:
        # Always release the browser, including on unexpected errors.
        if driver is not None:
            driver.quit()
    return links

In [83]:
# Gather detail-page links from the first 2 listing pages of the
# "centro de acolhimento temporario" category.
links = get_links(
    site='http://www.mapasocial.pt/pt/centro-de-acolhimento-temporario/V11',
    n_pages=2,
)



Current google-chrome version is 97.0.4692
Get LATEST chromedriver version for 97.0.4692 google-chrome
Driver [/Users/margaridacampos/.wdm/drivers/chromedriver/mac64/97.0.4692.71/chromedriver] found in cache


Current google-chrome version is 97.0.4692
Get LATEST chromedriver version for 97.0.4692 google-chrome
Driver [/Users/margaridacampos/.wdm/drivers/chromedriver/mac64/97.0.4692.71/chromedriver] found in cache


In [80]:
# De-duplicate while keeping first-seen order. list(set(...)) would give an
# order that can change from one kernel run to the next, and later cells pair
# these links with scraped rows by position.
unique_links = list(dict.fromkeys(links))
print(f'There are {len(unique_links)} Centros de Acolhimento Temporário')

There are 93 Centros de Acolhimento Temporário


## Get House Info from link

In [134]:
def scrape_page(links):
    """Scrape the detail page behind every URL in ``links``.

    Each page is opened in one shared headless Chrome session. From each page
    the function reads the ``main-heading`` text, the second text line of the
    first two ``col-md-6`` elements, the second cell of table rows 1-4, and
    the ``innerHTML`` of cells 1-4 of every table row from index 6 onwards.

    Parameters
    ----------
    links : iterable of str
        URLs of the detail pages to scrape.

    Returns
    -------
    dict
        Keys ``names``, ``owners``, ``juridicas``, ``moradas``, ``cps``,
        ``tels``, ``emails`` map to lists of strings, one entry per page
        scraped. Keys ``caps``, ``uts``, ``hors``, ``acts`` map to lists of
        lists, one inner list per page with one item per table row read.

    Notes
    -----
    A page whose ``table`` element is not visible within 30 seconds prints
    ``'Oh no'`` and adds nothing to the result, so the returned lists can be
    shorter than ``links``. Any other exception (for example ``IndexError``
    on a page with fewer rows or cells) propagates, but the browser is always
    closed first.
    """
    info = {'names':[],
        'owners':[],
        'juridicas':[],
        'moradas':[],
        'cps':[],
        'tels':[],
        'emails':[],
        'caps':[],
        'uts':[],
        'hors':[],
        'acts':[]
       }
    timeout = 30
    option = webdriver.ChromeOptions()
    option.add_argument('headless')
    driver = None
    try:
        for link in links:
            print(link)
            if driver is None:
                # Started lazily (nothing is launched for an empty input) and
                # only once, so every page reuses the same session.
                # NOTE(review): passing the driver path positionally is the
                # pre-Service Selenium call style -- confirm against the
                # installed Selenium version.
                driver = webdriver.Chrome(ChromeDriverManager().install(),options=option)
            driver.get(link)
            try:
                WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "table")))
                name = driver.find_element(By.CLASS_NAME,"main-heading").text
                props = driver.find_elements(By.CLASS_NAME,"col-md-6")
                owner = props[0].text.split('\n')[1]
                juridica = props[1].text.split('\n')[1]
                locs = driver.find_elements(By.TAG_NAME,'tr')
                morada = (locs[1].find_elements(By.TAG_NAME,'td')[1]).text
                cp = (locs[2].find_elements(By.TAG_NAME,'td')[1]).text
                tel = (locs[3].find_elements(By.TAG_NAME,'td')[1]).text
                email = (locs[4].find_elements(By.TAG_NAME,'td')[1]).text
                caps = []
                uts = []
                hors = []
                acts = []
                for row in locs[6:]:
                    # One cell lookup per row instead of one per column.
                    cells = row.find_elements(By.TAG_NAME,'td')
                    caps.append(cells[1].get_attribute('innerHTML'))
                    uts.append(cells[2].get_attribute('innerHTML'))
                    hors.append(cells[3].get_attribute('innerHTML'))
                    acts.append(cells[4].get_attribute('innerHTML'))
            except TimeoutException:
                print('Oh no')
                continue
            # Only reached once every field was read, so all lists in `info`
            # grow together and stay the same length.
            info['names'].append(name)
            info['owners'].append(owner)
            info['juridicas'].append(juridica)
            info['moradas'].append(morada)
            info['cps'].append(cp)
            info['tels'].append(tel)
            info['emails'].append(email)
            info['caps'].append(caps)
            info['uts'].append(uts)
            info['hors'].append(hors)
            info['acts'].append(acts)
    finally:
        # Always release the browser, including on unexpected errors.
        if driver is not None:
            driver.quit()
    return info

In [None]:
# Scrape every unique temporary-shelter detail page (prints each URL as it
# goes; pages that time out print 'Oh no' and are left out of `info`).
info = scrape_page(links=unique_links)

In [33]:
# Detail page that is removed from the link list in the next cell.
# NOTE(review): presumably this page could not be scraped -- confirm the reason.
unable = 'http://www.mapasocial.pt/pt/centro-de-acolhimento-temporario-associacao-a-terra-dos-homens/I7947'

In [34]:
# Work on a fresh list so `unique_links` stays intact, then drop the excluded
# page (list.remove raises ValueError if it is not present).
clean_links = list(unique_links)
clean_links.remove(unable)

In [153]:
def get_df_from_dict(info_dict,clean_links,tipo):
    """Turn scraped fields into a table with one row per establishment.

    For each establishment, the capacity record with the greatest ``acts``
    value (compared as strings) is kept; on ties the first such record wins.
    The ``cps`` field is split into its first whitespace-separated token
    (``Codigo Postal``) and the upper-cased remainder (``Localidade``).

    Parameters
    ----------
    info_dict : dict
        Equal-length lists under the keys ``names``, ``owners``,
        ``juridicas``, ``moradas``, ``cps``, ``tels``, ``emails``, ``caps``,
        ``uts``, ``hors`` and ``acts``. The last four hold, per
        establishment, parallel lists of strings (one item per record).
    clean_links : list
        One URL per establishment. It is only used as a length check and is
        not part of the returned table.
    tipo : str
        Value written to the ``Tipo`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Tipo``, ``Nome``, ``Localidade``, ``Email``, ``Telefone``,
        ``Capacidade`` (int), ``Utentes`` (int), ``Morada``, ``Dono``,
        ``Juridição``, ``Codigo Postal``, ``Horario``, ``Data Info``, in the
        input row order.

    Raises
    ------
    ValueError
        If ``clean_links`` does not have one entry per establishment, if an
        establishment has no capacity records, or if a capacity or occupancy
        value is not an integer string.
    """
    data = pd.DataFrame(info_dict).reset_index(drop=True)
    # Length check only: pandas raises ValueError when the number of links
    # differs from the number of scraped establishments.
    data['url'] = clean_links

    # The record is chosen row by row. Joining the records back on `names`
    # (as was done before) gave every establishment sharing a name the same
    # single record. The former `caps != '0'` filter compared list cells with
    # a string and never removed a row, so it is dropped without any effect.
    picked = {'caps': [], 'uts': [], 'hors': [], 'acts': []}
    without_records = []
    for name, caps, uts, hors, acts in zip(data['names'], data['caps'], data['uts'],
                                           data['hors'], data['acts']):
        records = list(zip(caps, uts, hors, acts))
        if not records:
            without_records.append(name)
            continue
        # A missing date sorts lowest, so it is never preferred over a real one.
        latest = max(records, key=lambda record: record[3] or '')
        for column, value in zip(picked, latest):
            picked[column].append(value)
    if without_records:
        raise ValueError(f'No capacity records found for: {without_records}')

    cp_tokens = data['cps'].map(lambda x : x.split())
    res = pd.DataFrame({
        'Tipo': tipo,
        'Nome': data['names'],
        'Localidade': cp_tokens.map(lambda tokens : ' '.join(tokens[1:]).upper()),
        'Email': data['emails'],
        'Telefone': data['tels'],
        'Capacidade': pd.Series(picked['caps'], index = data.index, dtype = object).astype(int),
        'Utentes': pd.Series(picked['uts'], index = data.index, dtype = object).astype(int),
        'Morada': data['moradas'],
        'Dono': data['owners'],
        'Juridição': data['juridicas'],
        # An empty `cps` string yields an empty postal code instead of IndexError.
        'Codigo Postal': cp_tokens.map(lambda tokens : tokens[0] if tokens else ''),
        'Horario': pd.Series(picked['hors'], index = data.index, dtype = object),
        'Data Info': pd.Series(picked['acts'], index = data.index, dtype = object),
    })
    return res

In [154]:
# Build the tidy table for the temporary shelters. `info` and `clean_links`
# must describe the same pages, in the same order.
casas_temp = get_df_from_dict(
    info_dict=info,
    clean_links=clean_links,
    tipo='Centro de Acolhimento Temporário',
)

In [155]:
# Report total capacity, total occupants, and the occupancy rate
# (occupants / capacity, as a whole-number percentage) for the temporary shelters.
print(f'Há capacidade para {casas_temp.Capacidade.sum()} das quais {casas_temp.Utentes.sum()}({100*(casas_temp.Utentes.sum()/casas_temp.Capacidade.sum()):.0f}%) estão ocupadas')

Há capacidade para 1969 das quais 1527(78%) estão ocupadas


In [None]:
# Gather detail-page links from the first 5 listing pages of the
# "lar de infancia e juventude" category.
links_lares = get_links(
    site='http://www.mapasocial.pt/pt/lar-de-infancia-e-juventude/V14',
    n_pages=5,
)

In [85]:
# De-duplicate while keeping first-seen order. list(set(...)) would give an
# order that can change from one kernel run to the next, and later cells pair
# these links with scraped rows by position.
unique_links_lares = list(dict.fromkeys(links_lares))
print(f'There are {len(unique_links_lares)} Lares de Infância e Juventude')

There are 196 Lares de Infância e Juventude


In [None]:
# Scrape every unique "lar" detail page (prints each URL as it goes; pages
# that time out print 'Oh no' and are left out of `info_lares`).
info_lares = scrape_page(links=unique_links_lares)

In [156]:
# Build the tidy table for the "lares". `info_lares` and `unique_links_lares`
# must have the same number of entries, or the length check inside the
# function raises.
lares = get_df_from_dict(
    info_dict=info_lares,
    clean_links=unique_links_lares,
    tipo='Lar de Infância e Juventude',
)

In [159]:
# Stack both establishment types into one table and write it next to the
# notebook. The row index is not written to the file.
casas = pd.concat(objs=[casas_temp, lares])
casas.to_csv(path_or_buf='./casas_mapa_social.csv', index=False)