# Initial setup

In [1]:
import asyncio
import os
import sys

from aiofile import AIOFile
from aiohttp import ClientSession
from bs4 import Comment

from src.data_obtaining.pdf_processing import convert_pdf_to_txt
from src.data_obtaining.webpage_interaction import download_document_and_save
from src.data_obtaining.webpage_interaction import get_webpage_as_bs

# Constants

In [2]:
# Listing pages to crawl: one shared perueduca query string, repeated once per `tiporec` value.
URLS = [
    'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet'
    '&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1'
    '&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp'
    '&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId='
    '&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId='
    '&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1'
    '&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10'
    '&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=' + tiporec
    for tiporec in ('20', '17', '18')
]

In [3]:
# Maps the education level text read from a document page to the folder code used on disk.
LEVELS = {
    level_name: str(code)
    for code, level_name in enumerate(
        ('Educación Inicial', 'Educación Primaria', 'Educación Secundaria'),
        start=1,
    )
}

In [4]:
# Maps the category text read from a document page to its storage folder name.
# Every category is stored under its own name, except 'Ciencia y tecnología',
# which shares the 'CTA' folder.
CATEGORIES = {
    area: ('CTA' if area == 'Ciencia y tecnología' else area)
    for area in (
        'Historia, Geografía y Economía',
        'CTA',
        'Ciencia y tecnología',
        'Arte',
        'Comunicación',
        'Personal social',
        'Formación Ciudadana y Cívica',
        'Educación Fisica',
        'Educación Religiosa',
    )
}

In [5]:
# Root folder of the project; PDFs and converted texts are read and written under <root>/data/raw.
# Defaults to the original author's location; set the TESIS_CHATBOT_PATH environment
# variable to run the notebook from a different checkout without editing this cell.
PROJECT_PATH = os.environ.get('TESIS_CHATBOT_PATH', '/home/hans/Documentos/Tesis_Chatbot')

# Helper functions

In [6]:
def is_accepted_document(document_language: str, document_category: str, document_name: str) -> bool:
    """
    Decide whether a document should be downloaded.

    A document is accepted when it is written in Spanish, belongs to one of the
    accepted teaching areas and is not one of the explicitly excluded files.

    Parameters:
    document_language (str): Language the document was written in.
    document_category (str): Teaching category the document belongs to.
    document_name (str): The file name of the document.

    Returns:
    bool: True if the document is accepted, False otherwise.
    """
    accepted_language = 'ES (Español)'
    # Every entry needs its own trailing comma: adjacent string literals are
    # silently concatenated by Python, which would merge two areas into one
    # bogus entry and reject both of them.
    accepted_areas = (
        'Formación Ciudadana y Cívica',
        'Arte',
        'Comunicación',
        'CTA',
        'Ciencia y tecnología',
        'Personal social',
        'Educación Fisica',
        'Historia, Geografía y Economía',
        'Educación Religiosa',
    )
    # Files skipped by name even when their language and category are accepted.
    texts_to_omit = (
        'unuchamanta-kawsaq.pdf',
        'sankenapatotantsi-ashaninka.pdf',
        'phuyu-raqeaq-atuq.pdf',
        'arojeitake-ashitarori-impoiji-amabentakotirori-aipatsite.pdf',
    )
    return (
        document_language == accepted_language
        and document_category in accepted_areas
        and document_name not in texts_to_omit
    )

In [7]:
async def obtain_and_save_document(session: ClientSession, url: str) -> None:
    """
    Read a perueduca document page and download its file when it qualifies.

    The language, category and level are taken from the rows of the
    'ficha-catalogo' table, and the download link from the 'hf_iframe' anchor.
    Accepted documents whose link contains '.pdf' are saved under
    PROJECT_PATH/data/raw/pdf/<level>/<category>/<file name>.

    Parameters:
    session (ClientSession): The async session used to obtain the webpage
    url (str): The url of the document page

    Returns:
    None

    Any error raised while fetching or parsing the page propagates to the caller.
    """
    page = await get_webpage_as_bs(session, url)
    table_rows = page.find('div', id='ficha-catalogo').find_all('tr')

    def cell_text(row_index: int) -> str:
        # Stripped text of the first <td> in the given table row.
        return table_rows[row_index].find('td').text.strip()

    document_language = cell_text(7)
    document_category = cell_text(3)
    download_link = page.find('a', id='hf_iframe')['href']
    document_name = download_link.split('/')[-1]

    if not is_accepted_document(document_language, document_category, document_name):
        return

    # Both lookups raise KeyError for a level or category missing from the constants.
    document_level = LEVELS[cell_text(1)]
    document_folder_category = CATEGORIES[document_category]
    storage_directory = PROJECT_PATH + f'/data/raw/pdf/{document_level}/{document_folder_category}/{document_name}'

    if '.pdf' not in download_link:
        return

    await download_document_and_save(session, download_link, storage_directory)
    print(f'Document {document_name} downloaded successfully')

In [8]:
async def obtain_texts_from_perueduca_webpage(session: ClientSession, url: str) -> None:
    """
    Walk a perueduca listing page by page, downloading every document it links to.

    For each page, every 'box-result' section is visited and handed to
    obtain_and_save_document. The walk then follows the link inside the 'next'
    pagination item and stops when a page has no results, when there is no
    usable next-page link, or when the next link points to a page already visited.

    Parameters:
    session (ClientSession): The async session used to obtain the webpage
    url (str): The url of the first listing page

    Returns:
    None

    Errors raised while fetching or parsing a page propagate to the caller.
    """
    page_url = url
    visited_urls = set()
    # Iterative instead of recursive so the number of pages is not bounded by
    # the interpreter's recursion limit.
    while True:
        visited_urls.add(page_url)
        soup = await get_webpage_as_bs(session, page_url)
        texts_information = soup.find_all('div', class_='box-result')
        if not texts_information:  # No more texts in this page. Extraction process finished
            return
        for text in texts_information:
            text_link = text.find('a')['href']
            await obtain_and_save_document(session, text_link)
        # find() returns None when the element is missing, so the pagination
        # lookup is done step by step instead of chaining calls on a possible None.
        next_page_button = soup.find('li', class_='next')
        next_page_anchor = next_page_button.find('a') if next_page_button is not None else None
        next_page_url = next_page_anchor.get('href') if next_page_anchor is not None else None
        if not next_page_url or next_page_url in visited_urls:
            return
        page_url = next_page_url


In [9]:
async def convert_pdfs_to_txt_files() -> None:
    """
    Convert every downloaded pdf into a .txt file for easier processing.

    Scans PROJECT_PATH/data/raw/pdf/<level>/<category> for each level code in
    LEVELS and each storage folder in CATEGORIES, and writes the result to the
    matching path under PROJECT_PATH/data/raw/txt. Folders that do not exist are
    skipped, and a failure on one document is reported without stopping the rest.

    Returns:
    None
    """
    # Derived from the module constants so the folders scanned here always match
    # the folders the download step writes to.
    levels = sorted(set(LEVELS.values()))
    categories = list(dict.fromkeys(CATEGORIES.values()))  # unique, original order
    for level in levels:
        for category in categories:
            pdf_folder = f'{PROJECT_PATH}/data/raw/pdf/{level}/{category}'
            txt_folder = f'{PROJECT_PATH}/data/raw/txt/{level}/{category}'
            # A level/category with no downloads may have no folder at all;
            # that must not abort the conversion of the remaining folders.
            if not os.path.isdir(pdf_folder):
                print(f'Folder {pdf_folder} not found, skipping')
                continue
            documents = sorted(file_name for file_name in os.listdir(pdf_folder) if '.pdf' in file_name)
            for document in documents:
                try:
                    pdf_path = f'{pdf_folder}/{document}'
                    # Swap the extension on the file name only, never on the folder part of the path.
                    save_dir = f'{txt_folder}/' + document.replace('.pdf', '.txt')
                    await convert_pdf_to_txt(pdf_path, save_dir)
                    print(f'Text {document} converted from pdf to txt')
                except Exception as e:
                    # Best effort: report which document failed and keep going.
                    print(f'Could not convert {document}: {e}')
                    continue

In [16]:
async def main() -> None:
    """
    Download the documents from every listing in URLS, then convert them to text files.

    Crawling is best effort: an error while processing one listing is reported
    and the remaining listings are still processed. The HTTP session is always
    closed before the conversion step starts.

    Returns:
    None
    """
    # `async with` closes the session on every exit path, and never refers to
    # a session that failed to be created.
    async with ClientSession() as session:
        for url in URLS:
            try:
                await obtain_texts_from_perueduca_webpage(session, url)
            except Exception as e:
                # Report the failure instead of hiding it, then move on to the next listing.
                print(f'Crawling stopped for {url}: {e!r}')
    await convert_pdfs_to_txt_files()

In [18]:
# Run the whole pipeline (download, then convert). A bare top-level `await` is
# not valid in a plain Python script; this cell assumes it runs in a notebook
# kernel that supports it.
await main()

Document juegos_para_ensenar_a_pensar_1.pdf downloaded successfully
Document juegos_para_ensenar_a_pensar_2.pdf downloaded successfully
Document f_rutas_ini_bil_02.pdf downloaded successfully
Document f_rutas_ini_bil_01.pdf downloaded successfully
Document arguedas.pdf downloaded successfully
Document clorinda_matto.pdf downloaded successfully
Document cesar_vallejo.pdf downloaded successfully
Document maria_granda.pdf downloaded successfully
Document augusto_leguia.pdf downloaded successfully
Document fernando_belaunde.pdf downloaded successfully
Document guia_produccion_audiolibros.pdf downloaded successfully
Document toribio_mogrovejo.pdf downloaded successfully
Document santos_chocano.pdf downloaded successfully
Document santa_rosa.pdf downloaded successfully
Document san_martin_porres.pdf downloaded successfully
Document ricardo_palma.pdf downloaded successfully
Document mariategui.pdf downloaded successfully
Document instructivowikipedia.pdf downloaded successfully
Document instr

Document programa-curricular-educacion-primaria.pdf downloaded successfully
Document taller-de-psicomotricidad-07.pdf downloaded successfully
Document yana-t-urumanta-t-iqirunacha.pdf downloaded successfully
Document proyectos-aprendizaje-rutas-aprendizaje.pdf downloaded successfully
Document zampona-siku.pdf downloaded successfully
Document maestros-padres-mejores-aliados-aprendizaje-ciclo3.pdf downloaded successfully
Document maniji-satiposati.pdf downloaded successfully
Document jugando-con-las-palabras.pdf downloaded successfully
Document cuidados-con-amor.pdf downloaded successfully
Document el-valor-educativo-de-la-observacion-del-desarrollo-del-nino.pdf downloaded successfully
Document entorno-educativo-calidad-educacion-inicial-guia-docentes.pdf downloaded successfully
Document laminas-para-padres-de-actividades-cotidianas.pdf downloaded successfully
Document qamaqimpi-wallatampi.pdf downloaded successfully
Document uno-dos-tres-escalones-de-taquile.pdf downloaded successfully


Text pegantatsiri-ogotaige-omagaro-2.pdf converted from pdf to txt
Text programa-curricular-educacion-primaria.pdf converted from pdf to txt
Text proyectos-aprendizaje-rutas-aprendizaje.pdf converted from pdf to txt
Text situaciones-comunicativas-comunicacion-texto-1-inicial.pdf converted from pdf to txt
Text situaciones-para-aprender-construir-portafolio-2-intermedio.pdf converted from pdf to txt
Text situaciones-para-aprender-construir-texto-2-intermedio.pdf converted from pdf to txt
Text situaciones-para-aprender-construir-texto-3-intermedio.pdf converted from pdf to txt
Text san_martin_porres.pdf converted from pdf to txt
Text santa_rosa.pdf converted from pdf to txt
Text toribio_mogrovejo.pdf converted from pdf to txt
Text augusto_leguia.pdf converted from pdf to txt
Text carta_jamaica.pdf converted from pdf to txt
Text fernando_belaunde.pdf converted from pdf to txt
Text haya_torre.pdf converted from pdf to txt
Text historia_geografia.pdf converted from pdf to txt
Text manuel_val