In [156]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from time import perf_counter, sleep
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from multiprocessing import Pool

In [171]:
# Listing page for a single region ("zona"); used to develop the extractor functions.
START_URL = 'http://notasdecorte.es/zona/andalucia'


In [172]:
# Download and parse the region page that the extractor functions are developed against.
# A timeout keeps the cell from hanging forever if the server never answers
# (requests applies no timeout by default); 30 s is an arbitrary, generous bound.
r = requests.get(START_URL, timeout=30)
# Stop here on a 4xx/5xx answer instead of silently parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

In [14]:
def get_university(soup):
    """Return the link text of every second ``div.field-content`` on the page.

    Matches are taken in document order and only those at even positions
    (0, 2, 4, ...) contribute; the ones in between are never inspected.
    NOTE(review): presumably the skipped entries are not university names --
    confirm against the page markup.

    Parameters
    ----------
    soup : object exposing ``find_all`` (e.g. a parsed BeautifulSoup page)

    Returns
    -------
    list
        ``.a.text`` of each kept ``div``, in page order.
    """
    field_blocks = soup.find_all("div", class_="field-content")
    # A stride of 2 keeps exactly the entries the page alternates on.
    return [block.a.text for block in field_blocks[::2]]

In [15]:
def get_province(soup):
    """Return the ``<strong>`` text of each province cell on the page.

    Parameters
    ----------
    soup : object exposing ``find_all`` (e.g. a parsed BeautifulSoup page)

    Returns
    -------
    list
        ``.strong.text`` of every ``div`` whose class is
        ``"titul-list-provincia text-right"``, in page order.
    """
    province_cells = soup.find_all("div", class_="titul-list-provincia text-right")
    return [cell.strong.text for cell in province_cells]

In [16]:
def get_modality(soup):
    """Return the text of each modality cell on the page.

    Parameters
    ----------
    soup : object exposing ``find_all`` (e.g. a parsed BeautifulSoup page)

    Returns
    -------
    list
        ``.text`` of every ``div`` whose class is
        ``"titul-list-modalidad text-right"``, in page order.
    """
    modality_cells = soup.find_all("div", class_="titul-list-modalidad text-right")
    return [cell.text for cell in modality_cells]

In [17]:
def get_grade(soup):
    """Return the text of each cut-off grade cell on the page.

    The values are returned exactly as they appear in the markup (strings);
    no numeric conversion is attempted.

    Parameters
    ----------
    soup : object exposing ``find_all`` (e.g. a parsed BeautifulSoup page)

    Returns
    -------
    list
        ``.text`` of every ``div`` with class ``"titul-list-nota-corte-nota"``,
        in page order.
    """
    grade_cells = soup.find_all("div", class_="titul-list-nota-corte-nota")
    return [cell.text for cell in grade_cells]

In [18]:
def get_public(soup):
    """Return the text of each public/private label on the page.

    Parameters
    ----------
    soup : object exposing ``find_all`` (e.g. a parsed BeautifulSoup page)

    Returns
    -------
    list
        ``.text`` of every ``span`` whose class is ``"label label-primary"``,
        in page order.
    """
    ownership_labels = soup.find_all("span", class_="label label-primary")
    return [label.text for label in ownership_labels]

In [19]:
def get_degree(soup):
    """Return the text of every second link matched by ``div span a``.

    Matches are taken in document order and only those at even positions
    (0, 2, 4, ...) contribute.
    NOTE(review): presumably each degree is followed by one unrelated link
    matching the same selector -- confirm against the page markup.

    Parameters
    ----------
    soup : object exposing ``select`` (e.g. a parsed BeautifulSoup page)

    Returns
    -------
    list
        ``.text`` of each kept anchor, in page order.
    """
    anchors = soup.select("div span a")
    # A stride of 2 keeps the first, third, fifth, ... anchors.
    return [anchor.text for anchor in anchors[::2]]

In [20]:
def get_web(soup):
    """Return the text of each faculty web-site field on the page.

    Parameters
    ----------
    soup : object exposing ``select`` (e.g. a parsed BeautifulSoup page)

    Returns
    -------
    list
        ``.text`` of every ``span.field-content`` nested in a
        ``span.views-field.views-field-field-yaq-facultad-web``, in page order.
    """
    web_fields = soup.select("span.views-field.views-field-field-yaq-facultad-web span.field-content")
    return [field.text for field in web_fields]

In [21]:
def get_duration(soup):
    """Return the text of each degree-duration field on the page.

    Parameters
    ----------
    soup : object exposing ``select`` (e.g. a parsed BeautifulSoup page)

    Returns
    -------
    list
        ``.text`` of every ``span.field-content`` nested in a
        ``div.views-field.views-field-field-duracion``, in page order.
    """
    duration_fields = soup.select("div.views-field.views-field-field-duracion span.field-content")
    return [field.text for field in duration_fields]

In [22]:
def get_cost(soup):
    """Return the text of each first-year price field on the page.

    The values are returned exactly as they appear in the markup (strings);
    no numeric conversion is attempted.

    Parameters
    ----------
    soup : object exposing ``select`` (e.g. a parsed BeautifulSoup page)

    Returns
    -------
    list
        ``.text`` of every ``span.field-content`` nested in a
        ``div.views-field.views-field-field-precio-primer-ano``, in page order.
    """
    price_fields = soup.select("div.views-field.views-field-field-precio-primer-ano span.field-content")
    return [field.text for field in price_fields]

In [23]:
def aggregate(soup):
    """Run every extractor on ``soup`` and assemble the results into a DataFrame.

    Each extractor yields one list, which becomes one column. The length of
    the "University" list is taken as the expected row count; any column
    whose length differs is left out of the result without any message, so
    the returned frame may have fewer than nine columns.

    Parameters
    ----------
    soup : parsed page accepted by the ``get_*`` extractor functions

    Returns
    -------
    pandas.DataFrame
        One column per extractor that produced the expected number of values,
        in the order listed below.
    """
    columns = [
        ("University", get_university(soup)),
        ("Degree", get_degree(soup)),
        ("Grade", get_grade(soup)),
        ("Province", get_province(soup)),
        ("Modality", get_modality(soup)),
        ("Public/Private", get_public(soup)),
        ("Web", get_web(soup)),
        ("Duration", get_duration(soup)),
        ("Cost", get_cost(soup)),
    ]
    # The first column defines how many rows every other column must have.
    expected_rows = len(columns[0][1])
    aligned = {
        label: values
        for label, values in columns
        if len(values) == expected_rows
    }
    return pd.DataFrame.from_dict(aligned)

    
            

In [44]:
# Site root; also the prefix that the region links scraped from the home page are joined to.
START_URL1 = 'http://notasdecorte.es/'

In [119]:
r = requests.get(START_URL1)
soup = BeautifulSoup(r.text, 'html.parser')

In [68]:
col1 = soup.select("table.views-view-grid.cols-2 td.col-1.col-first div.field-content a")
col2 = soup.select("table.views-view-grid.cols-2 td.col-2.col-last div.field-content a")

In [91]:
%%time
# Absolute URL of every region page: first-column anchors first, then the
# second-column ones, each relative href appended to the site root.
links = ["http://notasdecorte.es/" + anchor["href"] for anchor in [*col1, *col2]]

CPU times: user 28 µs, sys: 1e+03 ns, total: 29 µs
Wall time: 32.9 µs


In [92]:
# Display the collected region URLs as a sanity check.
links

['http://notasdecorte.es/zona/andalucia',
 'http://notasdecorte.es/zona/aragon',
 'http://notasdecorte.es/zona/asturias',
 'http://notasdecorte.es/zona/cantabria',
 'http://notasdecorte.es/zona/castilla-y-leon',
 'http://notasdecorte.es/zona/castilla-la-mancha',
 'http://notasdecorte.es/zona/cataluna',
 'http://notasdecorte.es/zona/ceuta-y-melilla',
 'http://notasdecorte.es/zona/extremadura',
 'http://notasdecorte.es/zona/galicia',
 'http://notasdecorte.es/zona/islas-baleares',
 'http://notasdecorte.es/zona/islas-canarias',
 'http://notasdecorte.es/zona/la-rioja',
 'http://notasdecorte.es/zona/madrid',
 'http://notasdecorte.es/zona/navarra',
 'http://notasdecorte.es/zona/pais-vasco',
 'http://notasdecorte.es/zona/region-de-murcia',
 'http://notasdecorte.es/zona/valencia']

In [49]:
def linkscrape(url):
    """Download one region page and save its extracted table as a CSV file.

    The file is written relative to the current working directory and is named
    after whatever follows the last ``"zona/"`` in ``url``
    (``.../zona/asturias`` -> ``asturias.csv``). It is comma-separated,
    UTF-8 encoded and includes the DataFrame index as its first column.

    Parameters
    ----------
    url : str
        Address of a region listing page.

    Returns
    -------
    None

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, the connection fails, or the server answers
        with a 4xx/5xx status.
    """
    # requests applies no timeout by default; bound the wait so a stalled
    # server cannot block a worker forever (30 s is an arbitrary, generous limit).
    r = requests.get(url, timeout=30)
    # Refuse to parse an error page and write it out as if it were data.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    aggregate(soup).to_csv(url.rpartition("zona/")[2] + ".csv", sep=',', encoding="utf-8")


In [52]:
# Try the scraper on a single region (the third collected link) before running them all.
linkscrape(links[2])

In [106]:
%%time

# Scrape all regions concurrently with up to 18 worker threads.
# Executor.map only re-raises a worker's exception when its result is read from
# the returned iterator, so the iterator is drained with list(); otherwise a
# failed download would go completely unnoticed.
with ThreadPoolExecutor(18) as pool:
    list(pool.map(linkscrape, links))


CPU times: user 4.71 s, sys: 195 ms, total: 4.9 s
Wall time: 4.79 s


In [93]:
%%time
# Sequential baseline: scrape the regions one after another, to compare its
# wall time with the thread-pool run.
for link in links:
    linkscrape(link)

CPU times: user 4.66 s, sys: 98.8 ms, total: 4.76 s
Wall time: 13.7 s


In [203]:
%%time
def aggregate2(soup):
    """Thread-pool variant of ``aggregate``: run the extractors concurrently.

    All nine extractors are submitted to the pool before any result is
    awaited. Calling ``submit(...).result()`` back-to-back would block on each
    task before the next one is queued, which makes the pool run them strictly
    one at a time.

    The length of the "University" list is taken as the expected row count;
    any column whose length differs is left out of the result without any
    message.

    NOTE(review): the extractors are pure-Python parsing work, so threads are
    unlikely to give a real speed-up under the GIL -- measure before relying
    on this variant.

    Parameters
    ----------
    soup : parsed page accepted by the ``get_*`` extractor functions

    Returns
    -------
    pandas.DataFrame
        One column per extractor that produced the expected number of values,
        in the order listed below ("Public/Private" precedes "Modality").
    """
    extractors = [
        ("University", get_university),
        ("Degree", get_degree),
        ("Grade", get_grade),
        ("Province", get_province),
        ("Public/Private", get_public),
        ("Modality", get_modality),
        ("Web", get_web),
        ("Duration", get_duration),
        ("Cost", get_cost),
    ]
    with ThreadPoolExecutor(max_workers=len(extractors)) as pool:
        # Queue everything first so the tasks can actually overlap.
        pending = [(label, pool.submit(extract, soup)) for label, extract in extractors]
    # Leaving the `with` block waits for all tasks; result() now returns at
    # once, or re-raises the exception the extractor failed with.
    columns = [(label, future.result()) for label, future in pending]

    # The first column defines how many rows every other column must have.
    expected_rows = len(columns[0][1])
    aligned = {
        label: values
        for label, values in columns
        if len(values) == expected_rows
    }
    return pd.DataFrame.from_dict(aligned)


CPU times: user 8 µs, sys: 1 µs, total: 9 µs
Wall time: 14.1 µs


In [229]:
%%time
# Time the thread-pool variant on the page currently bound to `soup` and display the table.
aggregate2(soup)

CPU times: user 189 ms, sys: 3.75 ms, total: 193 ms
Wall time: 194 ms


Unnamed: 0,University,Degree,Grade,Province,Public/Private,Modality,Web,Duration,Cost
0,Universidad de Granada,Doble Grado en Matemáticas + Física,13164,Granada,Universidad Pública,Presencial,http://fciencias.ugr.es/index.php,"5,0 años",no disponible
1,Universidad de Sevilla,Doble Grado en Física + Matemáticas,13090,Sevilla,Universidad Pública,Presencial,http://fisica.us.es,"5,0 años",909 €
2,Universidad de Granada,Grado en Medicina,12750,Granada,Universidad Pública,Presencial,http://www.ugr.es/~facmed/,"6,0 años",757 €
3,Universidad de Granada,Grado en Traducción e Interpretación (Inglés),12750,Granada,Universidad Pública,Presencial,http://www.ugr.es/~factrad/,"4,0 años",757 €
4,Universidad de Sevilla,Grado en Medicina,12725,Sevilla,Universidad Pública,Presencial,https://medicina.us.es/,"6,0 años",757 €
5,Universidad de Sevilla,Doble Grado en Fisioterapia + Ciencias de la A...,12677,Sevilla,Universidad Pública,Presencial,http://www.fefp.us.es/,"5,0 años",no disponible
6,Universidad de Málaga,Grado en Medicina,12631,Málaga,Universidad Pública,Presencial,http://www.uma.es/facultad-de-medicina/,"6,0 años",757 €
7,Universidad de Córdoba,Grado en Medicina,12623,Córdoba,Universidad Pública,Presencial,http://www.uco.es/medicina/,"6,0 años",757 €
8,Universidad de Sevilla,Grado en Biomedicina Básica y Experimental,12562,Sevilla,Universidad Pública,Presencial,https://medicina.us.es/,"4,0 años",757 €
9,Universidad de Cádiz,Grado en Medicina,12483,Cádiz,Universidad Pública,Presencial,http://www.uca.es/centro/1C04,"6,0 años",757 €


In [230]:
%%time 
# Time the plain sequential variant on the same `soup` for comparison and display the table.
aggregate(soup)

CPU times: user 142 ms, sys: 3.2 ms, total: 145 ms
Wall time: 144 ms


Unnamed: 0,University,Degree,Grade,Province,Modality,Public/Private,Web,Duration,Cost
0,Universidad de Granada,Doble Grado en Matemáticas + Física,13164,Granada,Presencial,Universidad Pública,http://fciencias.ugr.es/index.php,"5,0 años",no disponible
1,Universidad de Sevilla,Doble Grado en Física + Matemáticas,13090,Sevilla,Presencial,Universidad Pública,http://fisica.us.es,"5,0 años",909 €
2,Universidad de Granada,Grado en Medicina,12750,Granada,Presencial,Universidad Pública,http://www.ugr.es/~facmed/,"6,0 años",757 €
3,Universidad de Granada,Grado en Traducción e Interpretación (Inglés),12750,Granada,Presencial,Universidad Pública,http://www.ugr.es/~factrad/,"4,0 años",757 €
4,Universidad de Sevilla,Grado en Medicina,12725,Sevilla,Presencial,Universidad Pública,https://medicina.us.es/,"6,0 años",757 €
5,Universidad de Sevilla,Doble Grado en Fisioterapia + Ciencias de la A...,12677,Sevilla,Presencial,Universidad Pública,http://www.fefp.us.es/,"5,0 años",no disponible
6,Universidad de Málaga,Grado en Medicina,12631,Málaga,Presencial,Universidad Pública,http://www.uma.es/facultad-de-medicina/,"6,0 años",757 €
7,Universidad de Córdoba,Grado en Medicina,12623,Córdoba,Presencial,Universidad Pública,http://www.uco.es/medicina/,"6,0 años",757 €
8,Universidad de Sevilla,Grado en Biomedicina Básica y Experimental,12562,Sevilla,Presencial,Universidad Pública,https://medicina.us.es/,"4,0 años",757 €
9,Universidad de Cádiz,Grado en Medicina,12483,Cádiz,Presencial,Universidad Pública,http://www.uca.es/centro/1C04,"6,0 años",757 €
