# Imports and configuration

In [1]:
import asyncio
import os
import pandas as pd
import time
from typing import Dict
import sys


from src.processing.constants import BASE_DIRECTORY
from src.preparation.models.obtained_text import ObtainedText
from src.preparation.models.descriptive_index import DescriptiveIndex
from src.preparation.models.connective_index import ConnectiveIndex
from src.preparation.models.lexical_diversity_index import LexicalDiversityIndex
from src.preparation.models.readability_index import ReadabilityIndex
from src.preparation.models.referential_cohesion_index import ReferentialCohesionIndex
from src.preparation.models.syntactic_complexity_index import SyntacticComplexityIndex
from src.preparation.models.syntactic_pattern_density_index import SyntacticPatternDensityIndex
from src.preparation.models.word_information_index import WordInformationIndex
from src.preparation.data_access.obtained_text_da import ObtainedTextDA
from src.processing.text_complexity_analyzer import TextComplexityAnalyzer

  util.warn(


# Calculate text complexity indices

In [2]:
# Compute the text complexity indices for every stored text that does not have
# them yet, saving each record as soon as its indices are calculated.
tca = TextComplexityAnalyzer('es')  # 'es' is presumably the language code (Spanish) -- TODO confirm
da = ObtainedTextDA()
obtained_texts = da.select_all()

# Attributes of a record that each hold one group of indices. A record counts
# as already processed only when none of them is None. They are listed in the
# order in which they are checked.
index_attributes = (
    'descriptive_index',
    'word_information_index',
    'syntactic_pattern_density_index',
    'syntactic_complexity_index',
    'connective_index',
    'lexical_diversity_index',
    'readability_index',
    'referential_cohesion_index',
)

for ot in obtained_texts:
    # Every attribute is compared against None explicitly, so a populated index
    # object that happens to evaluate as falsy is not recomputed.
    if all(getattr(ot, attribute) is not None for attribute in index_attributes):
        print(f'{ot.filename} Ya ha sido procesado anteriormente.')
        continue

    try:
        start = time.time()
        # The descriptive indices go first: the word count, sentence length and
        # syllables-per-word values are passed to later calculations.
        descriptive_row = tca.calculate_descriptive_indices_for_one_text(ot.text)
        word_count = descriptive_row['DESWC']
        mean_words_per_sentence = descriptive_row['DESSL']
        mean_syllables_per_word = descriptive_row['DESWLsy']
        ot.descriptive_index = DescriptiveIndex(**descriptive_row)
        ot.word_information_index = WordInformationIndex(
            **tca.calculate_word_information_indices_for_one_text(ot.text, word_count=word_count)
        )
        ot.syntactic_pattern_density_index = SyntacticPatternDensityIndex(
            **tca.calculate_syntactic_pattern_density_indices_for_one_text(ot.text, word_count=word_count)
        )
        ot.syntactic_complexity_index = SyntacticComplexityIndex(
            **tca.calculate_syntactic_complexity_indices_for_one_text(ot.text)
        )
        ot.connective_index = ConnectiveIndex(
            **tca.calculate_connective_indices_for_one_text(ot.text, word_count=word_count)
        )
        ot.lexical_diversity_index = LexicalDiversityIndex(
            **tca.calculate_lexical_diversity_density_indices_for_one_text(ot.text)
        )
        ot.readability_index = ReadabilityIndex(
            **tca.calculate_readability_indices_for_one_text(
                ot.text,
                mean_words_per_sentence=mean_words_per_sentence,
                mean_syllables_per_word=mean_syllables_per_word,
            )
        )
        ot.referential_cohesion_index = ReferentialCohesionIndex(
            **tca.calculate_referential_cohesion_indices_for_one_text(text=ot.text)
        )
        # The clock stops before the update call, so the reported time covers
        # only the index calculations.
        end = time.time()
        da.update(ot)  # Save the indices for the current record
        print(f'Tiempo demorado para {ot.filename}: {end - start} segundos.')
    except Exception as e:
        # Deliberate best-effort: a text that fails is reported and skipped so
        # the remaining texts are still processed.
        print(f'{ot.filename} no pudo ser procesado debido a un error en el procesamiento.')
        print(str(e))

Tiempo demorado para atahualpa.txt: 10.968142747879028 segundos.
Tiempo demorado para avelino_caceres.txt: 10.779173374176025 segundos.
Tiempo demorado para cartilla_fenomeno_nino.txt: 6.239735126495361 segundos.
Tiempo demorado para cartilla_inundaciones.txt: 6.104085445404053 segundos.
Tiempo demorado para enrique_meiggs.txt: 11.803784608840942 segundos.
Tiempo demorado para epopeya_arica.txt: 11.596657037734985 segundos.
Tiempo demorado para francisco_bolognesi.txt: 10.316344261169434 segundos.
Tiempo demorado para francisco_orellana.txt: 12.81145167350769 segundos.
Tiempo demorado para francisco_toledo.txt: 12.537973880767822 segundos.
Tiempo demorado para guaman_poma.txt: 8.024553298950195 segundos.
Tiempo demorado para hipolito_unanue.txt: 8.98119044303894 segundos.
Tiempo demorado para ines_huaylas.txt: 5.445565938949585 segundos.
Tiempo demorado para jorge_basadre.txt: 9.3884859085083 segundos.
Tiempo demorado para jorge_chavez.txt: 7.572685718536377 segundos.
Tiempo demorado p

Tiempo demorado para historia_geografia.txt: 139.98936223983765 segundos.
Tiempo demorado para la_cuenca_del_amazonas.txt: 4.920232534408569 segundos.
Tiempo demorado para la_estructura_demografica_de_la_poblacion.txt: 5.112022161483765 segundos.
Tiempo demorado para la_organizacion_de_la_economia_en_el_imperio_inca.txt: 5.01183819770813 segundos.
Tiempo demorado para las_corrientes_marinas.txt: 5.394022703170776 segundos.
Tiempo demorado para las_proyecciones_cartograficas.txt: 6.3001792430877686 segundos.
Tiempo demorado para manuel_valdes.txt: 10.6248459815979 segundos.
Tiempo demorado para manuela_saenz.txt: 13.303950071334839 segundos.
Tiempo demorado para maria_granda.txt: 8.999867916107178 segundos.
Tiempo demorado para mesoamericanos_mayas_aztecas.txt: 4.839453458786011 segundos.
Tiempo demorado para polis_griega.txt: 4.7935097217559814 segundos.
Tiempo demorado para pueblos_mediterraneo_griegos.txt: 4.873147487640381 segundos.
Tiempo demorado para agua_suelo.txt: 55.0444509983

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


charles_dickens.txt no pudo ser procesado debido a un error en el procesamiento.
division by zero
Tiempo demorado para clorinda_matto.txt: 13.05292558670044 segundos.
Tiempo demorado para comunicacion-secundaria-rural-cuaderno-1.txt: 353.23198795318604 segundos.
Tiempo demorado para corsario_negro.txt: 406.71372866630554 segundos.
Tiempo demorado para cuaderno-nivelacion-competencias-com-vi.txt: 1182.1220614910126 segundos.
Tiempo demorado para cumbres_borrascosas.txt: 2194.599269628525 segundos.
Tiempo demorado para estacion-de-las-letras.txt: 12.187481164932251 segundos.
Tiempo demorado para extrano_caso_doctor_hyde.txt: 163.5951006412506 segundos.
Tiempo demorado para frankenstein.txt: 820.0782778263092 segundos.
Tiempo demorado para hora-literaria-1-m1.txt: 17.33555579185486 segundos.
Tiempo demorado para hora-literaria-1-m2.txt: 16.380207061767578 segundos.
Tiempo demorado para hora-literaria-1-m3.txt: 13.107982397079468 segundos.
Tiempo demorado para hora-literaria-1-m4.txt: 14.2

# Visualizing the obtained data

In [3]:
# Load every stored text together with its indices in tabular form and display it.
da = ObtainedTextDA()
# NOTE: this binds `obtained_texts` to the result of select_all_as_dataframe()
# (presumably a pandas DataFrame, judging by the method name and the rendered
# table -- TODO confirm), not to the result of select_all().
obtained_texts = da.select_all_as_dataframe()
# A bare expression on the last line of a notebook cell is rendered as the cell output.
obtained_texts

Unnamed: 0,CNCADC,CNCAdd,CNCAll,CNCCaus,CNCLogic,CNCTemp,CRFANP1,CRFANPa,CRFAO1,CRFAOa,...,WRDPRP2p,WRDPRP2s,WRDPRP3p,WRDPRP3s,WRDVERB,category,cluster_grade,filename,grade,id
0,5.595524,0.532907,56.754596,17.319478,30.109246,3.197442,0.099526,0.079630,0.236967,0.148887,...,0.000000,0.000000,2.398082,10.125233,110.578204,"Historia, Geografía y Economía",,atahualpa.txt,1.0,1.0
1,4.526167,0.848656,56.294201,13.861386,33.663366,3.394625,0.159722,0.119540,0.291667,0.197797,...,0.000000,0.848656,2.545969,8.769448,94.766620,"Historia, Geografía y Economía",,avelino_caceres.txt,1.0,2.0
2,1.737619,0.000000,59.079062,11.294526,40.834057,5.212858,0.026316,0.019822,0.197368,0.150034,...,0.000000,5.212858,1.737619,9.556907,149.435274,"Historia, Geografía y Economía",,cartilla_fenomeno_nino.txt,1.0,3.0
3,1.765225,1.765225,64.430715,12.356575,44.130627,4.413063,0.035714,0.039216,0.166667,0.142577,...,0.000000,9.708738,0.000000,9.708738,151.809356,"Historia, Geografía y Economía",,cartilla_inundaciones.txt,1.0,4.0
4,5.965697,0.248571,52.945563,13.422819,30.325628,2.982849,0.151515,0.171011,0.430303,0.273384,...,0.000000,0.000000,1.739995,10.688541,93.711161,"Historia, Geografía y Economía",,enrique_meiggs.txt,1.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,10.168675,1.012048,60.096386,15.084337,24.626506,9.204819,0.153777,0.113482,0.229317,0.130330,...,2.650602,15.180723,2.409639,20.192771,118.265060,Comunicación,,seis_personajes_busca.txt,2.0,181.0
181,8.489796,0.593692,56.653061,15.762523,25.751391,6.055659,0.072211,0.051115,0.170490,0.105130,...,0.044527,2.077922,1.128015,9.261596,108.897959,Comunicación,,viaje_centro_tierra.txt,2.0,182.0
182,8.669041,0.941620,74.623352,19.456726,40.274707,5.281258,0.328918,0.254774,0.546726,0.382347,...,0.194465,0.020470,3.531073,13.745599,114.120200,Comunicación,,viajes_gulliver.txt,2.0,183.0
183,4.147928,0.218312,82.958564,22.355150,52.045584,4.191591,0.287313,0.192939,0.375622,0.225790,...,0.480286,2.357770,2.969043,22.922761,102.606645,Comunicación,,vida_lazarillo_tormes.txt,2.0,184.0
