# Importación de módulos

In [8]:
import asyncio
import os
import re
import spacy
import sys

from aiofile import AIOFile
from aiohttp import ClientSession
from bs4 import Comment
from itertools import tee

from src.data_obtaining.pdf_processing import convert_pdf_to_txt
from src.data_obtaining.webpage_interaction import download_document_and_save
from src.data_obtaining.webpage_interaction import get_webpage_as_bs
from src.processing.constants import BASE_DIRECTORY
from src.processing.constants import ACCEPTED_LANGUAGES
from src.processing.utils.utils import split_doc_into_sentences
from src.preparation.models.obtained_text import ObtainedText
from src.preparation.models.sentence_pair import SentencePair
from src.preparation.data_access.obtained_text_da import ObtainedTextDA

# Constantes

In [2]:
'''URLS = ['http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=20',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=17',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=18']'''

'''URLS = ['http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=11&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=17&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=3&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=11&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=20&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=3&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=11&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=18&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=3&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=13&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=17&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=3&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=13&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=18&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=3&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=9&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=17&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=3&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=9&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=18&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=3&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=9&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=20&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=3&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=5&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=20&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=3&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=3&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=18&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=2&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=3&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=20&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=2&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=9&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=17&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=2&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=9&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=18&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=2&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=9&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=20&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=2&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=4&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=18&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=2&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=7&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=20&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=2&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=4&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=6&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=2&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=13&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=6&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=3&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10']'''
        

# Listing pages of perueduca.pe used as crawl entry points. They are only
# consumed by the download loop in main(), which is currently disabled.
# The two URLs differ only in their areaId (4 / 13) and gradoId (2 / 3)
# query parameters.
URLS = ['http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=4&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=6&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=2&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10',
        'http://www.perueduca.pe/materiales-educativos?p_p_id=ResourcesPublicPE_WAR_ResourcesPublicPEportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_jspPage=%2Farea.jsp&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_q=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_areaId=13&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_tiporec=6&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_gradoId=3&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_filter=&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_inicio=1&_ResourcesPublicPE_WAR_ResourcesPublicPEportlet_fin=10']

In [3]:
# Maps the education-level label read from a document's catalogue page to the
# folder name used for that level under data/raw/pdf/ (see
# obtain_and_save_document). Labels that are not listed here have no folder.
LEVELS = {
    'Educación Primaria': '1',
    'Educación Secundaria': '2'
}

In [4]:
# Maps the teaching-category label read from a document's catalogue page to
# the category folder the document is stored in. Several source categories
# are folded into one of the three storage folders
# ('Historia, Geografía y Economía', 'CTA', 'Comunicación').
CATEGORIES = {
    'Historia, Geografía y Economía': 'Historia, Geografía y Economía',
    'CTA': 'CTA',
    'Ciencia y tecnología': 'CTA',
    'Ciencia y Ambiente': 'CTA',
    'Arte': 'Historia, Geografía y Economía',
    'Comunicación': 'Comunicación',
    'Personal Social': 'Historia, Geografía y Economía',
    'Educación Religiosa': 'Historia, Geografía y Economía',
}

In [5]:
# Absolute path of the project checkout on the author's machine.
# NOTE(review): not referenced anywhere in the visible part of this file
# (paths below are built from BASE_DIRECTORY) - confirm it is still needed.
PROJECT_PATH = '/home/hans/Documentos/Tesis_Chatbot'

# Funciones extra

In [6]:
# Language label, as shown on the catalogue page, of the documents we keep.
_ACCEPTED_LANGUAGE = 'ES (Español)'

# Teaching categories whose documents we keep.
_ACCEPTED_AREAS = frozenset([
    'Formación Ciudadana y Cívica',
    'Comunicación',
    'CTA',
    'Ciencia y tecnología',
    'Ciencia y Ambiente',
    'Personal Social',
    'Historia, Geografía y Economía',
    'Ciencias sociales',
])

# File names that must never be downloaded even if language and category match.
# Every entry ends with a comma on purpose: adjacent string literals without a
# comma are silently concatenated by Python, which hides entries from the filter.
# NOTE(review): the last entry has no '.pdf' suffix - confirm that is intended.
_TEXTS_TO_OMIT = frozenset([
    'unuchamanta-kawsaq.pdf',
    'sankenapatotantsi-ashaninka.pdf',
    'phuyu-raqeaq-atuq.pdf',
    'arojeitake-ashitarori-impoiji-amabentakotirori-aipatsite.pdf',
    'kantilarya-q-uchuna.pdf',
    'qamaqimpi-wallatampi.pdf',
    'siku-phusa.pdf',
    'yana-t-urumanta-t-iqirunacha.pdf',
    'zampona-siku.pdf',
    'papa-mamanchikmanta.pdf',
    'santurantikuy.pdf',
    'awila-mikayla.pdf',
    'Ruta_Shawi_Inicial.pdf',
    'Ruta_aimara_inicial.pdf',
    'Ruta_ashaninka_Inicial.pdf',
    'Ruta_castellano_amazonico_inicial.pdf',
    'Ruta_Shawi_ComMat_IIIciclo.pdf',
    'Ruta_Shawi_ComMat_IVciclo.pdf',
    'Ruta_Shawi_ComMat_Vciclo.pdf',
    'Ruta_Shipibo_ComMat_IVciclo.pdf',
    'Ruta_Shipibo_ComMat_Vciclo.pdf',
    'Ruta_aimara_ComMat_III_ciclo.pdf',
    'Ruta_ashaninka_ComMat_IV_ciclo.pdf',
    'Ruta_ashaninka_ComMat_Vciclo.pdf',
    'Ruta_castellano_amazonico_IVciclo.pdf',
    'Ruta_quechuaCollao_ComMat_IVciclo.pdf',
    'Ruta_quechuaCollao_ComMat_Vciclo.pdf',
    'cartilla_ashaninka.pdf',
    'cartilla_matsigenka.pdf',
    'guia_alfabeto_cashinahua.pdf',
    'guia_alfabeto_ese_eja.pdf',
    'guia_alfabeto_jaqaru.pdf',
    'guia_alfabeto_kichwa.pdf',
    'guia_alfabeto_matsigenka.pdf',
    'guia_alfabeto_murui_muinani.pdf',
    'guia_alfabeto_secoya.pdf',
    'guia_alfabeto_sharanahua.pdf',
    'guia_alfabeto_wampis.pdf',
    'guia_alfabeto_yine.pdf',
    'manual_ashaninka.pdf',
    'manual_awajun.pdf',
    'manual_ese_eja.pdf',
    'manual_harakbut.pdf',
    'manual_jaqaru.pdf',
    'manual_kakataibo.pdf',
    'manual_quechua_collao.pdf',
    'manual_shawi.pdf',
    'manual_shipibo.pdf',
    'manual_yanesha.pdf',
    'manual_yine.pdf',
    'pegantatsiri-ogotaige-omagaro-2.pdf',
    'Text Ruta_quechuaCollao_ComMat_IVciclo.pdf',
    'Text Ruta_quechuaCollao_ComMat_Vciclo.pdf',
    'mi-cuaderno-autoaprendizaje-ps-4',
])


def is_accepted_document(document_language: str, document_category: str, document_name: str) -> bool:
    """
    Check whether a document should be downloaded.

    A document is accepted when its language is 'ES (Español)', its category
    is one of the accepted teaching areas and its file name is not in the
    list of texts to omit. The three partial results and the file name are
    printed as a progress trace.

    Parameters:
    document_language (str): Language the document was written in.
    document_category (str): Teaching category the document belongs to.
    document_name (str): The file name of the document.

    Returns:
    bool: True if the document is accepted, False otherwise.
    """
    has_accepted_language = document_language == _ACCEPTED_LANGUAGE
    has_accepted_area = document_category in _ACCEPTED_AREAS
    is_not_omitted = document_name not in _TEXTS_TO_OMIT
    print(has_accepted_language, has_accepted_area, is_not_omitted, document_name)
    return has_accepted_language and has_accepted_area and is_not_omitted

In [7]:
async def obtain_and_save_document(session: ClientSession, url: str) -> None:
    """
    Download the document linked from a "perueduca.pe" catalogue page, if it is accepted.

    The page is fetched, the language, category and level are read from the
    'ficha-catalogo' table, and the download link is taken from the
    'hf_iframe' anchor. The document is downloaded only when
    is_accepted_document() accepts it, both its level and category map to a
    storage folder, and the link contains '.pdf'.

    Parameters:
    session (ClientSession): The async session used to obtain the webpage
    url (str): The url of the document's catalogue page

    Returns:
    None

    Raises:
    Any exception raised while fetching or parsing the page, or while
    downloading, is propagated unchanged (for example when the page does not
    contain the expected table or anchor).
    """
    soup = await get_webpage_as_bs(session, url)
    # The table is read by row position: 1 = level, 3 = category, 7 = language.
    information_rows = soup.find('div', id='ficha-catalogo').find_all('tr')
    document_language = information_rows[7].find('td').text.strip()
    document_category = information_rows[3].find('td').text.strip()
    download_link = soup.find('a', id='hf_iframe')['href']
    document_name = download_link.split('/')[-1]
    if not is_accepted_document(document_language, document_category, document_name):
        return

    level_label = information_rows[1].find('td').text.strip()
    document_level = LEVELS.get(level_label)
    document_folder_category = CATEGORIES.get(document_category)
    if document_level is None or document_folder_category is None:
        # Some accepted categories (and any level other than primary/secondary)
        # have no storage folder; skip them instead of aborting the whole crawl
        # with a KeyError.
        print(f'Document {document_name} skipped: no storage folder for '
              f'level "{level_label}" / category "{document_category}"')
        return

    if '.pdf' not in download_link:
        return
    storage_directory = BASE_DIRECTORY + f'/data/raw/pdf/{document_level}/{document_folder_category}/{document_name}'
    await download_document_and_save(session, download_link, storage_directory)
    print(f'Document {document_name} downloaded successfully')

In [8]:
async def obtain_texts_from_perueduca_webpage(session: ClientSession, url: str) -> None:
    """
    Walk the result pages of peru educa starting at `url`, downloading every listed document.

    For each page, the webpage is obtained as a beautiful soup object, every
    'box-result' section found in it is handed to obtain_and_save_document(),
    and the link inside the 'next' list item is followed. Pages are visited in
    a loop rather than by recursion, so the number of pages is not limited by
    the interpreter's recursion depth.

    The walk stops when a page lists no documents, when a page has no usable
    'next' link, or when the 'next' link points to a page that was already
    visited.

    Parameters:
    session (ClientSession): The async session used to obtain the webpage
    url (str): The url of the first result page

    Returns:
    None

    Raises:
    Any exception raised while fetching a page or processing a document is
    propagated unchanged.
    """
    visited_urls = set()
    page_url = url
    while page_url not in visited_urls:
        visited_urls.add(page_url)
        soup = await get_webpage_as_bs(session, page_url)
        texts_information = soup.find_all('div', class_='box-result')
        if not texts_information:  # No more texts in this page. Extraction process finished
            return
        for text in texts_information:
            text_link = text.find('a')['href']
            await obtain_and_save_document(session, text_link)
        # Follow the pagination link, if the page still offers one
        next_page_button = soup.find('li', class_='next')
        next_page_anchor = next_page_button.find('a') if next_page_button is not None else None
        next_page_url = next_page_anchor.get('href') if next_page_anchor is not None else None
        if not next_page_url:
            return
        page_url = next_page_url


In [9]:
async def convert_pdfs_to_txt_files() -> None:
    """
    Convert every downloaded pdf into a .txt file for easier processing.

    For each level ('1', '2') and each storage category, the pdf folder under
    BASE_DIRECTORY/data/raw/pdf is listed, the file names containing '.pdf'
    are processed in sorted order, and each one is converted into the
    matching path under BASE_DIRECTORY/data/raw/txt. A failure on one
    document is printed and the remaining documents are still processed.
    """
    for level in ('1', '2'):
        for category in ('Historia, Geografía y Economía', 'CTA', 'Comunicación'):
            pdf_folder = BASE_DIRECTORY + f'/data/raw/pdf/{level}/{category}'
            pdf_names = sorted(name for name in os.listdir(pdf_folder) if '.pdf' in name)
            for pdf_name in pdf_names:
                try:
                    source_path = f'{BASE_DIRECTORY}/data/raw/pdf/{level}/{category}/{pdf_name}'
                    # The replacement is applied to the whole path, not only to the file name
                    target_path = f'{BASE_DIRECTORY}/data/raw/txt/{level}/{category}/{pdf_name}'.replace('.pdf', '.txt')
                    await convert_pdf_to_txt(source_path, target_path)
                    print(f'Text {pdf_name} converted from pdf to txt')
                except Exception as error:
                    # Best effort: report the problem and move on to the next document
                    print(str(error))

In [5]:
def clean_sentence(sentence: str) -> str:
    """
    Normalise a sentence so that it contains no line breaks.

    Leading and trailing whitespace is removed first; afterwards every run of
    consecutive newline characters is replaced by a single space.

    Parameters:
    sentence (str): The raw sentence text.

    Returns:
    str: The trimmed sentence with newline runs turned into spaces.
    """
    trimmed = sentence.strip()
    return re.sub('\n+', ' ', trimmed)

In [12]:
async def save_text_files_to_database() -> None:
    """
    Store every converted text file, together with its sentence pairs, through ObtainedTextDA.

    For each level ('1', '2') and each storage category, the .txt files under
    BASE_DIRECTORY/data/raw/txt are visited in sorted order. A file whose name
    matches the filename of a text already returned by select_all() is
    reported and skipped; an empty file is reported and skipped. Otherwise
    the text is split into sentences, consecutive sentences are paired, and
    the resulting ObtainedText is inserted.

    Any exception raised while walking the folders is printed and ends the
    whole run.
    """
    text_store = ObtainedTextDA()
    stored_texts = text_store.select_all()
    # Only sentence boundaries are needed, so the heavier pipeline components
    # are disabled and a sentencizer is added instead.
    nlp = spacy.load(ACCEPTED_LANGUAGES['es'], disable=['parser', 'tagger', 'ner'])
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    nlp.max_length = 5555827
    try:
        for level in ('1', '2'):
            for category in ('Historia, Geografía y Economía', 'CTA', 'Comunicación'):
                txt_folder = BASE_DIRECTORY + f'/data/raw/txt/{level}/{category}'
                documents = sorted(name for name in os.listdir(txt_folder) if '.txt' in name)
                for document in documents:
                    if [stored for stored in stored_texts if stored.filename == document]:
                        print(f'{document} has already been processed.')
                        continue
                    async with AIOFile(f'{BASE_DIRECTORY}/data/raw/txt/{level}/{category}/{document}', 'r') as text_file:
                        text = await text_file.read()
                        if len(text) == 0:
                            print(f'Text {document} was empty.')
                            continue
                        # Pair each sentence with the one that follows it
                        sentences = split_doc_into_sentences(nlp(text))
                        leading, trailing = tee(sentences)
                        next(trailing, None)
                        sentence_pairs = []
                        for first_sentence, second_sentence in zip(leading, trailing):
                            sentence_pairs.append(
                                SentencePair(first=clean_sentence(first_sentence.text),
                                             second=clean_sentence(second_sentence.text)))
                        # Build the text object and persist it
                        obtained_text = ObtainedText(text=text, grade=level, filename=document, category=category)
                        obtained_text.sentence_pair = sentence_pairs
                        text_store.insert(obtained_text)
                        print(f'Text {document} saved to dabatase.')
    except Exception as error:
        print(str(error))

In [13]:
async def main() -> None:
    """
    Convert the downloaded pdfs to text files and then save those text files to the database.

    The step that crawls URLS and downloads the pdfs is currently disabled: it
    is kept below as a string literal and is not executed.
    """
    # Disabled download step (a bare string literal, so it has no effect at runtime).
    '''try:
        session = ClientSession()
        for url in URLS:
            await obtain_texts_from_perueduca_webpage(session, url)
        await session.close()
    except Exception as e:
        await session.close()'''
    # The conversion must run first: the database step reads the .txt files it produces.
    await convert_pdfs_to_txt_files()
    await save_text_files_to_database()