# Imports and configuration

In [1]:
import asyncio
import os
import pandas as pd
from typing import Dict
import sys

from aiofile import AIOFile

from src.preparation.data_access import obtain_text_file_as_string
from src.processing.coh_metrix_indices.descriptive_indices import get_paragraph_count_from_text
from src.processing.coh_metrix_indices.descriptive_indices import get_sentence_count_from_text
from src.processing.coh_metrix_indices.descriptive_indices import get_word_count_from_text
from src.processing.coh_metrix_indices.descriptive_indices import get_mean_of_length_of_paragraphs
from src.processing.coh_metrix_indices.descriptive_indices import get_std_of_length_of_paragraphs
from src.processing.coh_metrix_indices.descriptive_indices import get_mean_of_length_of_sentences
from src.processing.coh_metrix_indices.descriptive_indices import get_std_of_length_of_sentences
from src.processing.coh_metrix_indices.descriptive_indices import get_mean_of_length_of_words
from src.processing.coh_metrix_indices.descriptive_indices import get_std_of_length_of_words

# Constants

In [2]:
# Absolute root of the project; the raw-text input directories and the processed CSV
# output path used in this file are all built from it.
# NOTE(review): this is a machine-specific path — adjust it when running elsewhere.
PROJECT_PATH = '/home/hans/Documentos/Tesis_Chatbot'

# Helper functions

In [3]:
async def txt_file_to_pandas_dataframe_row_only_difficulty(txt_path: str, difficulty: int) -> Dict:
    """
    Build one dataset row holding the descriptive indices of a text file.

    Parameters:
    txt_path(str): The path of the text file to read.
    difficulty(int): The difficulty label, stored unchanged under 'Difficulty'.

    Returns:
    Dict: One entry per descriptive index key, followed by the 'Difficulty' entry.
    """
    content = await obtain_text_file_as_string(txt_path)
    # (column name, function computing it); the order here is the column order of the row.
    index_functions = (
        ('DESPC', get_paragraph_count_from_text),
        ('DESSC', get_sentence_count_from_text),
        ('DESWC', get_word_count_from_text),
        ('DESPL', get_mean_of_length_of_paragraphs),
        ('DESPLd', get_std_of_length_of_paragraphs),
        ('DESSL', get_mean_of_length_of_sentences),
        ('DESSLd', get_std_of_length_of_sentences),
        ('DESWLlt', get_mean_of_length_of_words),
        ('DESWLltd', get_std_of_length_of_words),
    )
    row = {index_name: compute(content) for index_name, compute in index_functions}
    row['Difficulty'] = difficulty
    return row

In [4]:
async def convert_txt_files_to_dataframes() -> pd.DataFrame:
    """
    Compute the descriptive indices of every raw .txt document and save them as a CSV dataset.

    For each difficulty level and category, every .txt file found in
    PROJECT_PATH/data/raw/txt/<level>/<category> is turned into one row by
    txt_file_to_pandas_dataframe_row_only_difficulty. All rows are written to
    PROJECT_PATH/data/processed/data_with_difficulty.csv. A document that fails to
    convert is reported on stdout and skipped, so one bad file does not abort the run.

    Returns:
    pd.DataFrame: The dataframe that was written to disk, one row per converted document.
    """
    # Integer levels match the helper's `difficulty: int` parameter; they format into
    # the same '1', '2', '3' directory names as before.
    levels = [1, 2, 3]
    categories = ['Historia, Geografía y Economía',
                  'CTA',
                  'Arte',
                  'Comunicación',
                  'Personal social',
                  'Formación Ciudadana y Cívica',
                  'Educación Fisica',
                  'Educación Religiosa']
    dataframe_columns = ['DESPC', 'DESSC', 'DESWC', 'DESPL', 'DESPLd', 'DESSL', 'DESSLd', 'DESWLlt', 'DESWLltd', 'Difficulty']
    # Rows are collected in a list and the dataframe is built once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for level in levels:
        for category in categories:
            category_dir = f'{PROJECT_PATH}/data/raw/txt/{level}/{category}'
            # endswith avoids picking up names that merely contain '.txt' (e.g. 'a.txt.bak').
            documents = sorted(file_name for file_name in os.listdir(category_dir) if file_name.endswith('.txt'))
            for document in documents:
                txt_path = f'{category_dir}/{document}'
                try:
                    row = await txt_file_to_pandas_dataframe_row_only_difficulty(txt_path, level)
                except Exception as e:
                    # Deliberate best-effort: report which document failed and keep going.
                    print(f'Text {document} could not be converted: {e}')
                    continue
                rows.append(row)
                print(f'Text {document} converted  txt to dataframe row')
    dataframe_only_difficulty = pd.DataFrame(rows, columns=dataframe_columns)
    dataframe_only_difficulty.to_csv(f'{PROJECT_PATH}/data/processed/data_with_difficulty.csv')
    return dataframe_only_difficulty

In [5]:
async def main():
    """
    Entry point: run the txt-to-dataframe conversion and wait for it to finish.
    """
    conversion = convert_txt_files_to_dataframes()
    await conversion

In [6]:
# Top-level await: this only works where an event loop is already running (e.g. an
# IPython/Jupyter cell); a plain script would need asyncio.run(main()) instead.
await main()

Text Ruta_castellano_inicial.txt converted  txt to dataframe row
Text abecedario.txt converted  txt to dataframe row
Text aguita-vida.txt converted  txt to dataframe row
Text antologia-de-poesia-para-ninos-y-ninas.txt converted  txt to dataframe row
Text aprendemos-jugando-2018-4.txt converted  txt to dataframe row
Text aprendemos-jugando-2018-5.txt converted  txt to dataframe row
Text capacidades_comunica_ama.txt converted  txt to dataframe row
Text capacidades_comunica_ande.txt converted  txt to dataframe row
Text cartilla-acogida-comunidad-educativa-reinicio-clases.txt converted  txt to dataframe row
Text con-los-ojos-abierto-yo-escucho.txt converted  txt to dataframe row
Text con-los-ojos-abiertos-yo-veo.txt converted  txt to dataframe row
Text cuidados-con-amor.txt converted  txt to dataframe row
Text desarrollo-expresion-diversos-lenguajes.txt converted  txt to dataframe row
Text el-muneco-de-brea.txt converted  txt to dataframe row
Text el-muqui.txt converted  txt to dataframe r

Text kit-evaluacion-registro-logros-2do-primaria-comunicacion-2trimestre-proceso.txt converted  txt to dataframe row
Text kit-evaluacion-registro-logros-2do-primaria-comunicacion-3trimestre-salida.txt converted  txt to dataframe row
Text kit-evaluacion-registro-logros-4to-primaria-comunicacion-1trimestre-entrada.txt converted  txt to dataframe row
Text kit-evaluacion-registro-logros-4to-primaria-comunicacion-2trimestre-proceso.txt converted  txt to dataframe row
Text libros-biblioteca-aula-primaria-2do-grado-catalogo.txt converted  txt to dataframe row
Text libros-biblioteca-aula-primaria-3er-grado-catalogo.txt converted  txt to dataframe row
Text maestros-padres-mejores-aliados-aprendizaje-ciclo3.txt converted  txt to dataframe row
Text maestros-padres-mejores-aliados-aprendizaje-ciclo4.txt converted  txt to dataframe row
Text maestros-padres-mejores-aliados-aprendizaje-ciclo5.txt converted  txt to dataframe row
Text mama_ven.txt converted  txt to dataframe row
Text manual-familia-2.t

Text territorio-cultura-unidad-3-portafolio-2-avanzado.txt converted  txt to dataframe row
Text territorio-cultura-unidad-3-portafolio-4-avanzado.txt converted  txt to dataframe row
Text territorio-cultura-unidad-3-texto-2-avanzado.txt converted  txt to dataframe row
Text territorio-cultura-unidad-3-texto-3-avanzado.txt converted  txt to dataframe row
Text territorio-cultura-unidad-3-texto-4-avanzado.txt converted  txt to dataframe row
Text trabajo-emprendimiento-unidad-4-portafolio-2-avanzado.txt converted  txt to dataframe row
Text trabajo-emprendimiento-unidad-4-portafolio-3-avanzado.txt converted  txt to dataframe row
Text trabajo-emprendimiento-unidad-4-texto-2-avanzado.txt converted  txt to dataframe row
Text trabajo-emprendimiento-unidad-4-texto-3-avanzado.txt converted  txt to dataframe row
None
