In [None]:
# -*- coding: utf-8 -*-
"""
Created on Sun Aug  1 10:43:34 2021

@author: IgVinçon
"""

############################ IMPORT LIBRARIES ################################
import json

import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
from tqdm.notebook import tqdm

import scrap

In [None]:
# Several lines below are fenced by rows of '#' characters: they hold
# commented-out sanity checks. Uncomment them to test each step yourself or to
# follow the line of reasoning.
############################# ANII SCRAPPING #################################
# ANII's website (https://anii.org.uy/proyectos/) uses JavaScript to show more
# projects. In the browser (right click > 'inspect' > 'network', filtered by
# XHR) there is a "getMoreProjects.php" request; its response contains the
# project data to scrape.

# First request to "getMoreProjects.php". The response body is JSON, so it is
# decoded into a dictionary.
anii_url = 'https://anii.org.uy/app/ajax/frontend/getMoreProjects.php'
r, sleep = scrap.make_request(anii_url)
content = r.json()
##############################################################################
# Check it is a dictionary and look at its keys:
#print(type(content))
#print(content.keys())
# The 'html' key has the data to extract, the 'numRows' key gives the rows
# currently returned and the 'total' key the number of rows in the whole table.
##############################################################################

# Payload that emulates the getMoreProjects.php request. Using content['total']
# as 'count' asks for the whole table in a single response. The remaining
# entries look like filters that can be changed if desired.
data = dict(
    offset='0',
    count=content['total'],
    order='anioconvocatoria',
    criteria='DESC',
    qry='',
    departamento='',
    estado='',
    fase='',
    instrumento='',
)

# Second request, this time with the payload, keeping only the parsed 'html'
# part of the JSON response.
r, sleep = scrap.make_request(anii_url, sleep, data)
content = BeautifulSoup(r.json()['html'], 'lxml')

# One empty list per column to fill while scrapping. The column order matters:
# later code slices the not-yet-filled columns by position.
anii_columns = (
    'instrumento',
    'beneficiario',
    'departamento',
    'subsidio',
    'fecha de inicio',
    'duracion',
    'codigo',
    'proyecto',
    'area',
    'anio',
    'fase_estado',
    'area de proy.',
    'sector',
    # 'resumen' is left out in this version: too many edge cases to handle.
    'enlace',
)
anii_dict = {column: [] for column in anii_columns}

# The data sits in a table: every row is wrapped in <tr></tr> and every cell
# of a row in <a></a>.
rows = content.select('tr')
##############################################################################
# Testing the total number of rows is correct.
#total_rows = int(data['count'])
#print(f"Nº of rows is correct: {len(rows) == total_rows}")
##############################################################################

# Columns that table_scrapper fills from the listing; the project link
# ('enlace') is requested through the extra {'enlace': 0} argument.
keys = ['proyecto', 'beneficiario', 'area', 'fase_estado', 'anio']
anii_dict = scrap.table_scrapper(rows, 'a', anii_dict, keys, {'enlace': 0})
keys += ['enlace']

# The last row of the table is a placeholder, so drop the last entry of every
# column filled above. To confirm before removing, print anii_dict[k][-1] for
# each k in keys before and after this statement.
anii_dict.update({k: anii_dict[k][:-1] for k in keys})

# Bookkeeping dictionary for debugging: which URLs were scrapped, whether each
# request was redirected and, in case of errors, quick access to the URL.
#testing_dict = {
#    'case_num': [],
#    'correct_url': [],
#    'url': []
#}
##############################################################################
######                                                                 #######
###### WARNING!!! Executing this loop will take between 24-48hs aprox. #######
###### To execute in parts, isolate the code in a notebook cell, loop  #######
###### over slices of the URL list, e.g.                               #######
###### anii_dict['enlace'][:len(anii_dict['enlace'])//4] and so on,    #######
###### and save intermediate results to the hard drive with the json   #######
###### module or some other. For instance:                             #######
#with open("anii.json", "w") as fp:       # To save intermediate results
#    json.dump(anii_dict, fp)
#with open("anii.json", "r") as fp:       # To load intermediate results
#    anii_dict = json.load(fp)

# Columns not filled from the listing table; they are looked up on each
# project's own page with linear_search.
keywords = [k for k in anii_dict.keys() if k not in keys]
# Keywords whose linear_search call receives an extra fourth positional True.
# NOTE: the second entry must match the anii_dict key exactly ('area de proy.',
# with spaces), otherwise its special case is never applied.
extra_flag_keywords = {'fecha de inicio', 'area de proy.'}

# i is only needed by the commented-out bookkeeping lines below.
for i, url in enumerate(tqdm(anii_dict['enlace'])):
    # Make request and parse it.
    r, sleep = scrap.make_request(url, sleep)
    content = BeautifulSoup(r.content, 'lxml')
##############################################################################
    #testing_dict['case_num'].append(i)
    #testing_dict['correct_url'].append(r.url == url)
    #testing_dict['url'].append(url)
##############################################################################

    # If there was redirection (final URL differs from the requested one),
    # append 'n/a' for every keyword so all columns keep the same length.
    if r.url != url:
        for keyword in keywords:
            anii_dict[keyword].append('n/a')
        continue

    # The page holds two 'content_details' lists (left and right half of a
    # block); the first one also contains an HTML comment with more <li> items.
    details = content.find_all('ul', class_ = 'content_details')
    left_half = details[0].select('li')
    right_half = details[1].select('li')
    comment = details[0].find(text = lambda text: isinstance(text, Comment))
    comment = BeautifulSoup(comment, 'lxml').select('li')
    #abstract = content.find('div', class_ = 'content_info')
    #abstract = scrap.extract_text(abstract)#.split(maxsplit = 2)[-1]

    # Each source is searched for its own consecutive slice of keywords:
    # the first three in the left half, the next three in the right half and
    # the remaining ones in the commented-out markup.
    sources = (
        (left_half, keywords[:3]),
        (right_half, keywords[3:6]),
        (comment, keywords[6:]),
    )
    for items, group in sources:
        for keyword in group:
            #print(f"keyword is: {keyword}")
            # Only pass the fourth argument when it is needed, so every other
            # call keeps relying on linear_search's own default.
            extra = (True,) if keyword in extra_flag_keywords else ()
            k, v = scrap.linear_search(items, keyword, True, *extra)
            #print(f"key: {k}, value: {v}")
            anii_dict[k].append(v)

##############################################################################
# Check percentage of webpages scrapped and, in case of errors, the last page
# that was accessed.
#true_count = sum(x == True for x in testing_dict['correct_url'])
#percentage = true_count/len(testing_dict['correct_url'])
#print(f"Percentage of retrieved url: {percentage:.2%}")
#print(f"Case number of last URL retrieved: {testing_dict['case_num'][-1]}")
#for_loop_result = testing_dict['case_num'][-1] == len(anii_dict['enlace']) - 1
#print(f"Have all URL been accessed? {for_loop_result}")
#if not(for_loop_result):
    #print(f"Error found in: {testing_dict['url'][-1]}")
##############################################################################

# Create DataFrame from the dictionary and save it as .csv
#anii_df = pd.DataFrame(data = anii_dict)
#anii_df.to_csv('anii.csv')

In [None]:
############################## SNI SCRAPPING #################################
# Make request to the SNI website and parse the content.
sni_url = 'https://sni.org.uy/buscador'
r, sleep = scrap.make_request(sni_url)
content = BeautifulSoup(r.content, 'lxml')
# Select the table rows ('tr'), except the first which is empty.
rows = content.select('tr')[1:]

# Create a SNI dictionary with the appropriate columns from the table.
sni_dict = {
    'nombre': [],
    'nivel': [],
    'categoria': [],
    'area': [],
    'subarea': []
}
##############################################################################
# Find the total rows number according to the site
#total_rows = content.find('div', class_ = 'encontados_buscador')
#total_rows = int(total_rows.text.split(maxsplit = 2)[1])
# And check that the total is equal to the length of the rows variable
#print(f"Nº of rows is correct: {len(rows) == total_rows}")
##############################################################################
# List of the dictionary keys to pass to the table_scrapper function.
# In this case, each cell is between a 'td' tag and there is no URL to extract.
keys = list(sni_dict)
sni_dict = scrap.table_scrapper(rows, 'td', sni_dict, keys)

# Create DataFrame from the dictionary and save it as .csv
# (pandas is imported as pd in the import cell at the top of the notebook).
sni_df = pd.DataFrame(data = sni_dict)
sni_df.to_csv('sni.csv')

In [None]:
############################## EI SCRAPPING ##################################
# The three EI listing pages share the same table format, so a single
# dictionary (with the same columns) is enough for all of them.

# Columns that hold links, mapped to the integer handed to table_scrapper for
# each of them.
href = dict(url_grupo=0, url_convo=2)
ei_dict = {
    column: []
    for column in ('grupo', 'convocatoria', 'periodo', *href)
}
# Text columns for table_scrapper: every listing column that is not a link.
# This must be computed before the detail columns are added below.
keys = [column for column in ei_dict if column not in href]

# Columns filled later from each group's own page.
keywords = [
    'descripcion',
    'objetivos',
    'servicios involucrados',
    'responsables',
]
for keyword in keywords:
    ei_dict[keyword] = []

# The listing URLs only differ in their last path segment; build them in a
# list to loop over later.
base_url = 'https://ei.udelar.edu.uy'
programme_root = base_url + '/programa-financiamiento/'
ei_urls = [
    programme_root + slug
    for slug in (
        'programa-de-apoyo-centros-interdisciplinarios-de-la-udelar',
        'programa-nucleos-interdisciplinarios',
        'semillero-de-iniciativas-interdisciplinarias',
    )
]

sleep = 0
for ei_url in tqdm(ei_urls):
    # url_copy is the page requested next; it starts at the first listing page
    # and is updated with each "next page" link.
    url_copy = ei_url
    # NOTE: The progress bar for each URL always shows total web pages - 1.
    # Thus, for the 3 pages for "Centros", the progress bar shows 2 (and so on).

    # scrap.generator() drives the pagination; the loop is left explicitly
    # with `break` once a page has no table or no "next page" link.
    for _ in tqdm(scrap.generator()):
        r, sleep = scrap.make_request(url_copy, sleep)
        content = BeautifulSoup(r.content, 'lxml')
        # Try to scrape and update URL to continue scrapping
        try:
            # Select the table rows ('tr'), except the first.
            rows = content.select('tr')[1:]
            # In this case, each cell is between a 'td' tag
            ei_dict = scrap.table_scrapper(
                rows, 'td', ei_dict, keys, href, base_url
            )
            # After extraction, get the next page link (IndexError when the
            # pager has no "next" item, AttributeError when the expected
            # main > section > div structure is missing),
            next_page = content.main.section.div.select(
                'li.pager__item.pager__item--next > a'
            )[0]
            # extract its URL and update the make_request URL
            next_page = scrap.extract_href(next_page)
            url_copy = ei_url + next_page

        # When there is nothing to scrape (a table or next page) finish the
        # loop. Catching Exception instead of using a bare `except` keeps this
        # best-effort stop while letting KeyboardInterrupt and SystemExit
        # propagate, so the run can still be interrupted by hand.
        except Exception:
            break
            
# Visit every group's page collected above and read the detail fields.
for group_url in tqdm(ei_dict['url_grupo']):
    r, sleep = scrap.make_request(group_url, sleep)
    content = BeautifulSoup(r.content, 'lxml')
    # Walk down to the <article> inside the fourth <div> found under
    # main > section > div, then gather every <div> below the article's first
    # <div>; those are the elements searched for the keywords.
    article = content.main.section.div.find_all('div')[3].div.article
    div = article.div.find_all('div')
    for keyword in keywords:
        # linear_search hands back the column to fill and the value found.
        field, text = scrap.linear_search(div, keyword)
        ei_dict[field].append(text)

# Create DataFrame from the dictionary and save it as .csv
#ei_df = pd.DataFrame(data = ei_dict)
#ei_df.to_csv('ei.csv')

In [None]:
############################# CSIC SCRAPPING #################################
url = 'https://www.csic.edu.uy/proyectos-financiados'
base_url = 'https://www.csic.edu.uy'

csic_dict = {
    'proyecto': [],
    # The programme URL could be extracted too, but it leads to the programme
    # page with all of its calls, not to the call of a specific year.
    'programa': [],
    'ano': [],
    'area proyecto': [],
    'resumen': [],
    'responsables': [],
    'monto total': [],
    'enlace': []
}

# Walk the paginated listing and collect the link of every project.
url_copy = url
sleep = 0
# scrap.generator() drives the pagination; the loop is left explicitly with
# `break` once a page offers no "next" link.
for _ in tqdm(scrap.generator()):
    r, sleep = scrap.make_request(url_copy, sleep)
    content = BeautifulSoup(r.content, 'lxml')

    try:
        # Every project link sits in an <a> inside a 'fusion-button-wrapper'
        # <div>; the hrefs are relative, hence the base_url prefix.
        buttons = content.find_all('div', class_ = 'fusion-button-wrapper')
        for button in buttons:
            scrapped_url = base_url + scrap.extract_href(button.find('a'))
            csic_dict['enlace'].append(scrapped_url)
        # IndexError here means there is no further page.
        next_page = content.select('li.next > a')[0]
        next_page = scrap.extract_href(next_page)
        url_copy = base_url + next_page

    # Stop when there is nothing else to scrape. Catching Exception instead of
    # using a bare `except` keeps this best-effort stop while letting
    # KeyboardInterrupt and SystemExit propagate.
    except Exception:
        break
##############################################################################
#total_entries = content.select('section > div > div.view-footer')[0]
#total_entries = scrap.extract_text(total_entries)
#total_entries = int(total_entries.split()[-1])
#for_loop_result = total_entries == len(csic_dict['enlace'])
#print(f"Scrapped URLs and total_entries are the same? {for_loop_result}")
##############################################################################

# Columns looked up by label on each project page. The title, the abstract and
# the link itself are obtained separately, so they are excluded here.
page_level_columns = ('proyecto', 'enlace', 'resumen')
keywords = [
    column for column in csic_dict if column not in page_level_columns
]

for project_url in tqdm(csic_dict['enlace']):
    r, sleep = scrap.make_request(project_url, sleep)
    content = BeautifulSoup(r.content, 'lxml')

    # Title: text of the first <h1>.
    title = scrap.extract_text(content.select('h1')[0])
    # All <div> elements of the article are the search space for the labels;
    # the abstract is the fourth 'field-item even' <div> of the same article.
    article = content.section.article
    div = article.select('div')
    abstract_node = article.select('div.field-item.even')[3]
    abstract = scrap.extract_text(abstract_node)

    for keyword in keywords:
        # The search term is the column name followed by a colon; the colon
        # (and anything after it) is stripped from the returned key so it
        # matches the csic_dict column again.
        label, value = scrap.linear_search(div, f'{keyword}:')
        csic_dict[label.partition(':')[0]].append(value)

    csic_dict['proyecto'].append(title)
    csic_dict['resumen'].append(abstract)

##############################################################################
#for k in csic_dict.keys():
#    length = len(csic_dict[k])
#    result = total_entries == length
#    print(f"length of {k} and total_entries are the same? {result}")
##############################################################################
# Create DataFrame from the dictionary and save it as .csv
#csic_df = pd.DataFrame(data = csic_dict)
#csic_df.to_csv('csic.csv')