# Initial setup

In [1]:
import asyncio
import os
import sys

from aiofile import AIOFile
from aiohttp import ClientSession
from bs4 import Comment

from src.data_obtaining.pdf_processing import convert_pdf_to_txt
from src.data_obtaining.webpage_interaction import download_document_and_save
from src.data_obtaining.webpage_interaction import get_webpage_as_bs

# Constants

In [2]:
# Listing pages to crawl. The three URLs share every query parameter except the
# trailing `tiporec` value (20, 17, 18) -- presumably a resource-type filter on
# the site; TODO confirm what each value selects.
_PORTLET = 'ResourcesPublicPE_WAR_ResourcesPublicPEportlet'
_LISTING_URL_PREFIX = (
    'http://www.perueduca.pe/materiales-educativos'
    f'?p_p_id={_PORTLET}'
    '&p_p_lifecycle=0'
    '&p_p_state=normal'
    '&p_p_mode=view'
    '&p_p_col_id=column-1'
    '&p_p_col_count=1'
    f'&_{_PORTLET}_jspPage=%2Farea.jsp'
    f'&_{_PORTLET}_areaId='
    f'&_{_PORTLET}_gradoId='
    f'&_{_PORTLET}_inicio=1'
    f'&_{_PORTLET}_fin=10'
    f'&_{_PORTLET}_tiporec='
)
URLS = [_LISTING_URL_PREFIX + tiporec for tiporec in ('20', '17', '18')]

In [3]:
# Education level label as read from a document's information table -> the
# folder name used for that level under data/raw/pdf.
LEVELS = dict(
    zip(
        ('Educación Inicial', 'Educación Primaria', 'Educación Secundaria'),
        ('1', '2', '3'),
    )
)

In [4]:
# Category label as read from a document's information table -> the folder the
# document is stored in. Every label maps to itself except the alias below,
# which shares the 'CTA' folder.
_CATEGORY_FOLDER_ALIASES = {'Ciencia y tecnología': 'CTA'}
CATEGORIES = {
    label: _CATEGORY_FOLDER_ALIASES.get(label, label)
    for label in (
        'Historia, Geografía y Economía',
        'CTA',
        'Ciencia y tecnología',
        'Arte',
        'Comunicación',
        'Personal social',
        'Formación Ciudadana y Cívica',
        'Educación Física',
    )
}

In [5]:
# Root folder of the project; the pdf/txt data folders are resolved relative to it.
# The default is the original author's machine path. Set the TESIS_CHATBOT_PATH
# environment variable to run the notebook elsewhere without editing this cell.
PROJECT_PATH = os.environ.get('TESIS_CHATBOT_PATH', '/home/hans/Documentos/Tesis_Chatbot')

# Helper functions

In [6]:
def is_accepted_document(document_language: str, document_category: str) -> bool:
    """
    This function checks if the document to download has the correct language and belongs to the correct category

    Parameters:
    document_language (str): Language the document was written in.
    document_category (str): Teaching category the document belongs to.

    Returns:
    bool: True only when the language is 'ES (Español)' and the category is one of the accepted areas.
    """
    accepted_language = 'ES (Español)'
    # Every entry ends with a comma: a missing comma between two adjacent string
    # literals silently concatenates them into one bogus entry, which previously
    # caused 'Personal social' and 'Educación Física' to be rejected.
    accepted_areas = (
        'Formación Ciudadana y Cívica',
        'Arte',
        'Comunicación',
        'CTA',
        'Ciencia y tecnología',
        'Personal social',
        'Educación Física',
        'Historia, Geografía y Economía',
    )
    return document_language == accepted_language and document_category in accepted_areas

In [7]:
async def obtain_and_save_document(session: ClientSession, url: str) -> None:
    """
    Download the document offered on a single perueduca document page, if it is accepted.

    The page's 'ficha-catalogo' table is read for the document's language, category
    and level. Documents rejected by is_accepted_document are skipped. For accepted
    documents the download link is taken from the 'hf_iframe' anchor and, when the
    link contains '.pdf', the file is stored under
    data/raw/pdf/<level>/<category folder>/<file name> inside PROJECT_PATH.

    Parameters:
    session (ClientSession): The async session used to obtain the webpage
    url (str): The url of the document page to inspect

    Returns:
    None

    Any exception raised while fetching or parsing the page propagates to the caller.
    """
    page = await get_webpage_as_bs(session, url)
    table_rows = page.find('div', id='ficha-catalogo').find_all('tr')

    def cell_text(row_index):
        # Text of the first <td> of the given table row, without surrounding whitespace.
        return table_rows[row_index].find('td').text.strip()

    language = cell_text(7)
    category = cell_text(3)
    if not is_accepted_document(language, category):
        return

    # The lookups below run before the '.pdf' check, so an unknown level or
    # category raises KeyError even for non-pdf links.
    level_folder = LEVELS[cell_text(1)]
    link = page.find('a', id='hf_iframe')['href']
    file_name = link.split('/')[-1]
    category_folder = CATEGORIES[category]
    destination = PROJECT_PATH + f'/data/raw/pdf/{level_folder}/{category_folder}/{file_name}'
    if '.pdf' not in link:
        return

    await download_document_and_save(session, link, destination)
    print(f'Document {file_name} downloaded successfully')

In [8]:
async def obtain_texts_from_perueduca_webpage(session: ClientSession, url: str) -> None:
    """
    This function traverses through an entire listing of peru educa, searching for documents to download.

    Starting at the given url, it obtains the webpage as a beautiful soup object, finds
    all the document sections ('box-result' divs) in the page and processes each one
    with obtain_and_save_document. It then follows the link inside the 'next' list item
    and repeats. Pages are visited in a loop rather than recursively, so the number of
    pages does not grow the call stack.

    The traversal stops when a page has no document sections, or when the page has no
    usable 'next' link.

    Parameters:
    session (ClientSession): The async session used to obtain the webpage
    url (str): The url of the first listing page to process

    Returns:
    None

    Any exception raised while fetching a page or processing a document propagates
    to the caller.
    """
    current_url = url
    while True:
        soup = await get_webpage_as_bs(session, current_url)
        texts_information = soup.find_all('div', class_='box-result')
        if not texts_information:
            # No more texts in this page. Extraction process finished
            return

        for text in texts_information:
            text_link = text.find('a')['href']
            await obtain_and_save_document(session, text_link)

        # find() returns None when the element is absent, so a missing 'next'
        # button (or one without a link) ends the traversal instead of raising
        # AttributeError.
        next_page_button = soup.find('li', class_='next')
        next_page_anchor = next_page_button.find('a') if next_page_button is not None else None
        next_page_url = next_page_anchor.get('href') if next_page_anchor is not None else None
        if not next_page_url:
            return
        current_url = next_page_url


In [9]:
async def convert_pdfs_to_txt_files() -> None:
    """
    This function scans all the files we just downloaded and converts them to a .txt format for easier processing

    For every level/category folder under data/raw/pdf inside PROJECT_PATH, each file
    whose name contains '.pdf' is converted (in sorted order) with convert_pdf_to_txt
    and written to the matching location under data/raw/txt. A folder that does not
    exist is skipped. A failure on one document is printed and does not stop the
    remaining conversions.

    Returns:
    None
    """
    levels = ['1', '2', '3']
    categories = ['Historia, Geografía y Economía',
                  'CTA',
                  'Arte',
                  'Comunicación',
                  'Personal social',
                  'Formación Ciudadana y Cívica',
                  'Educación Física']
    for level in levels:
        for category in categories:
            pdf_folder = f'{PROJECT_PATH}/data/raw/pdf/{level}/{category}'
            # A level/category with no folder must not abort the whole run:
            # os.listdir would raise here, outside the per-document try.
            if not os.path.isdir(pdf_folder):
                continue
            documents = sorted(file_name for file_name in os.listdir(pdf_folder) if '.pdf' in file_name)
            for document in documents:
                try:
                    pdf_path = f'{pdf_folder}/{document}'
                    # Swap the extension on the file name only, so a '.pdf'
                    # elsewhere in the path can never be rewritten.
                    txt_name = document.replace('.pdf', '.txt')
                    save_dir = f'{PROJECT_PATH}/data/raw/txt/{level}/{category}/{txt_name}'
                    await convert_pdf_to_txt(pdf_path, save_dir)
                    print(f'Text {document} converted from pdf to txt')
                except Exception as e:
                    # Best effort: report the failure and move on to the next document.
                    print(str(e))
                    continue

In [10]:
async def main() -> None:
    """
    The main function obtains the documents as pdfs and then converts them to text file format

    Each url in URLS is crawled with one shared session. The session is managed by
    `async with`, so it is closed whether or not crawling fails. A failure while
    crawling one url is printed and the next url is still attempted. The pdf to txt
    conversion always runs afterwards.

    Returns:
    None
    """
    async with ClientSession() as session:
        for url in URLS:
            try:
                await obtain_texts_from_perueduca_webpage(session, url)
            except Exception as e:
                # Best effort: keep going, but report the error instead of swallowing it.
                print(f'Could not finish crawling {url}: {e}')
    await convert_pdfs_to_txt_files()

In [12]:
# Run the whole pipeline (download, then pdf -> txt conversion).
# NOTE(review): a bare top-level `await` only works in a notebook/IPython cell,
# which already has a running event loop; a plain script would need
# asyncio.run(main()) instead.
await main()

Text abecedario.pdf converted from pdf to txt
Text antologia-de-poesia-para-ninos-y-ninas.pdf converted from pdf to txt
Text aprendemos-jugando-2018-4.pdf converted from pdf to txt
Text aprendemos-jugando-2018-5.pdf converted from pdf to txt
Text arojeitake-ashitarori-impoiji-amabentakotirori-aipatsite.pdf converted from pdf to txt
Text capacidades_comunica_ama.pdf converted from pdf to txt
Text con-los-ojos-abierto-yo-escucho.pdf converted from pdf to txt
Text con-los-ojos-abiertos-yo-veo.pdf converted from pdf to txt
Text el-muneco-de-brea.pdf converted from pdf to txt
Text el-viaje-al-cielo.pdf converted from pdf to txt
Text el-zorro-que-devoro-la-nube.pdf converted from pdf to txt
Text f_rutas_ini_bil_02.pdf converted from pdf to txt
Text juegos_para_ensenar_a_pensar_1.pdf converted from pdf to txt
Text juegos_para_ensenar_a_pensar_2.pdf converted from pdf to txt
Text kantilarya-q-uchuna.pdf converted from pdf to txt
Text la-fiesta-de-la-candelaria.pdf converted from pdf to txt
Tex

Text romeo_julieta.pdf converted from pdf to txt
Text santos_chocano.pdf converted from pdf to txt
Text seis_personajes_busca.pdf converted from pdf to txt
Text tres_mosqueteros.pdf converted from pdf to txt
Text viaje_centro_tierra.pdf converted from pdf to txt
Text viajes_gulliver.pdf converted from pdf to txt
Text vida_lazarillo_tormes.pdf converted from pdf to txt
Text vuelta_mundo_80.pdf converted from pdf to txt
Text trabajo-emprendimiento-unidad-4-portafolio-3-avanzado.pdf converted from pdf to txt
