In [34]:
#import required libraries
import textstat # https://pypi.org/project/textstat/
import pandas as pd

In [35]:
def read_file(file_name):
    with open(file_name) as f:
        lines = f.readlines()
    return lines

In [5]:
complex = read_file('wikilarge.train.complex')
simple = read_file('wikilarge.train.simple')

In [3]:
def reading_score_formula(sentence):
    flesch_reading_score = textstat.flesch_reading_ease(sentence)
    difficult_words = textstat.difficult_words(sentence)
    gunning_score = textstat.gunning_fog(sentence)
    readability_index = textstat.automated_readability_index(sentence)
    coleman_index = textstat.coleman_liau_index(sentence)
    linsear_formula = textstat.linsear_write_formula(sentence)
    dale_chall = textstat.dale_chall_readability_score(sentence)
    text_standard = textstat.text_standard(sentence, float_output=False)
    mcalpine_score = textstat.mcalpine_eflaw(sentence)
    reading_time = textstat.reading_time(sentence, ms_per_char=14.69) # Assumes 14.69ms per character.
    result = {"sentence":sentence,
              "flesch_reading_score":flesch_reading_score,
              "difficult_words":difficult_words,
              "gunning_score":gunning_score,
              "readability_index":readability_index,
              "coleman_index":coleman_index,
              "linsear_formula":linsear_formula,
              "dale_chall":dale_chall,
              "text_standard":text_standard,
              "mcalpine_score":mcalpine_score,
              "reading_time":reading_time}
    return result

In [36]:
def aggregates_averages_scores(sentence):
    sylabble_count = textstat.syllable_count(sentence)
    lexicon_count = textstat.lexicon_count(sentence, removepunct=True)
    sentence_count = textstat.sentence_count(sentence)
    char_count = textstat.char_count(sentence, ignore_spaces=True)
    letter_count = textstat.letter_count(sentence, ignore_spaces=True)
    polysyllab_count = textstat.polysyllabcount(sentence)
    monosyllab_count = textstat.monosyllabcount(sentence)
    result = {"sentence":sentence,
              "sylabble_count":sylabble_count,
              "lexicon_count":lexicon_count,
              "sentence_count":sentence_count,
              "char_count":char_count,
              "letter_count":letter_count,
              "polysyllab_count":polysyllab_count,
              "monosyllab_count":monosyllab_count}
    return result

In [37]:
result_list = []
for i in complex:
    result = reading_score_formula(i)
    result_list.append(result)
reading_score_complex = pd.DataFrame.from_dict(result_list)
reading_score_complex

Unnamed: 0,sentence,flesch_reading_score,difficult_words,gunning_score,readability_index,coleman_index,linsear_formula,dale_chall,text_standard,mcalpine_score,reading_time
0,There is manuscript evidence that Austen conti...,58.29,7,20.46,19.0,8.77,26.5,12.59,14th and 15th grade,52.0,2.54
1,"In a remarkable comparative analysis , Mandaea...",33.24,9,17.92,18.7,18.10,16.5,15.96,17th and 18th grade,26.0,1.94
2,"Before Persephone was released to Hermes , who...",47.80,9,20.00,21.6,10.69,27.0,11.15,10th and 11th grade,57.0,2.86
3,Cogeneration plants are commonly found in dist...,22.08,14,20.30,25.2,19.74,25.5,14.60,19th and 20th grade,38.0,3.06
4,"Geneva ( , ; , ; , ; ; ) is the second-most-po...",40.69,5,10.62,19.9,15.66,21.0,9.03,10th and 11th grade,31.0,2.09
...,...,...,...,...,...,...,...,...,...,...,...
296397,These structures form at the high-pressure dep...,69.11,4,9.71,12.6,12.36,12.0,8.73,8th and 9th grade,26.0,1.45
296398,Saint-Maixant is a commune in the Gironde depa...,66.74,3,8.28,13.3,15.30,9.0,11.57,8th and 9th grade,19.0,1.15
296399,The Hollywood Walk of Fame is a series of side...,24.79,12,24.49,25.9,12.84,34.0,12.10,12th and 13th grade,65.0,3.50
296400,Boulogne-Billancourt is the most populous subu...,36.28,7,16.09,14.8,15.31,13.5,12.42,12th and 13th grade,26.0,1.53


In [38]:
result_list = []
for i in complex:
    result = aggregates_averages_scores(i)
    result_list.append(result)
aggregates_averages_scores_complex = pd.DataFrame.from_dict(result_list)
aggregates_averages_scores_complex

Unnamed: 0,sentence,sylabble_count,lexicon_count,sentence_count,char_count,letter_count,polysyllab_count,monosyllab_count
0,There is manuscript evidence that Austen conti...,51,38,1,173,167,5,30
1,"In a remarkable comparative analysis , Mandaea...",37,21,1,132,128,5,11
2,"Before Persephone was released to Hermes , who...",56,40,1,195,189,4,28
3,Cogeneration plants are commonly found in dist...,57,32,1,208,201,6,16
4,"Geneva ( , ; , ; , ; ; ) is the second-most-po...",37,22,1,142,125,3,14
...,...,...,...,...,...,...,...,...
296397,These structures form at the high-pressure dep...,27,19,1,99,97,2,13
296398,Saint-Maixant is a commune in the Gironde depa...,20,13,1,78,75,3,9
296399,The Hollywood Walk of Fame is a series of side...,74,46,1,238,232,9,29
296400,Boulogne-Billancourt is the most populous subu...,33,18,1,104,102,4,11


In [39]:
result_list = []
for i in simple:
    result = reading_score_formula(i)
    result_list.append(result)
reading_score_simple = pd.DataFrame.from_dict(result_list)
reading_score_simple

Unnamed: 0,sentence,flesch_reading_score,difficult_words,gunning_score,readability_index,coleman_index,linsear_formula,dale_chall,text_standard,mcalpine_score,reading_time
0,There is some proof that Austen continued to w...,87.55,5,9.29,8.6,7.94,10.75,9.92,8th and 9th grade,23.0,2.32
1,Mandaean scholar Säve-Söderberg showed that Ma...,72.16,5,8.90,13.9,15.08,8.50,16.27,8th and 9th grade,19.0,1.37
2,When Demeter went to the Underworld to rescue ...,76.05,6,10.15,10.4,8.88,12.50,7.73,7th and 8th grade,30.5,2.76
3,Cogeneration plants are commonly found in dist...,21.06,14,20.47,25.0,18.98,26.00,14.36,18th and 19th grade,40.0,3.08
4,The city 's main newspaper is the Tribune de G...,74.19,3,7.03,6.6,6.78,8.75,12.23,6th and 7th grade,20.0,1.84
...,...,...,...,...,...,...,...,...,...,...,...
296397,These structures form at the high-pressure dep...,69.11,4,9.71,13.1,12.65,12.50,8.73,12th and 13th grade,25.0,1.48
296398,"Saint-Maixant , Gironde is a commune . It is f...",86.20,5,6.10,7.6,9.43,6.00,10.17,5th and 6th grade,16.0,1.56
296399,There are many stars on the sidewalk .\n,106.67,0,2.80,2.9,4.94,3.00,6.24,2nd and 3rd grade,10.0,0.46
296400,"Consequently , Boulogne-Billancourt is one of ...",0.08,6,18.13,18.3,21.97,11.00,13.44,12th and 13th grade,17.0,1.26


In [40]:
result_list = []
for i in simple:
    result = aggregates_averages_scores(i)
    result_list.append(result)
aggregates_averages_scores_simple = pd.DataFrame.from_dict(result_list)
aggregates_averages_scores_simple

Unnamed: 0,sentence,sylabble_count,lexicon_count,sentence_count,char_count,letter_count,polysyllab_count,monosyllab_count
0,There is some proof that Austen continued to w...,43,35,2,158,154,2,29
1,Mandaean scholar Säve-Söderberg showed that Ma...,23,16,1,93,90,1,10
2,When Demeter went to the Underworld to rescue ...,54,41,2,188,185,3,31
3,Cogeneration plants are commonly found in dist...,58,33,1,210,203,6,17
4,The city 's main newspaper is the Tribune de G...,39,28,2,125,119,4,21
...,...,...,...,...,...,...,...,...
296397,These structures form at the high-pressure dep...,27,19,1,101,98,2,13
296398,"Saint-Maixant , Gironde is a commune . It is f...",28,21,2,106,102,2,16
296399,There are many stars on the sidewalk .\n,8,7,1,31,30,0,6
296400,"Consequently , Boulogne-Billancourt is one of ...",27,12,1,86,83,4,6


In [51]:
complex_df = pd.concat([reading_score_complex, aggregates_averages_scores_complex, on="sentence", how="outer")
simple_df = pd.merge(reading_score_simple, aggregates_averages_scores_simple, on="sentence", how="outer")
result = pd.concat([df1, df4], axis=1)

In [53]:
complex_df

Unnamed: 0,sentence,flesch_reading_score,difficult_words,gunning_score,readability_index,coleman_index,linsear_formula,dale_chall,text_standard,mcalpine_score,reading_time,sylabble_count,lexicon_count,sentence_count,char_count,letter_count,polysyllab_count,monosyllab_count
0,There is manuscript evidence that Austen conti...,58.29,7,20.46,19.0,8.77,26.5,12.59,14th and 15th grade,52.0,2.54,51,38,1,173,167,5,30
1,"In a remarkable comparative analysis , Mandaea...",33.24,9,17.92,18.7,18.10,16.5,15.96,17th and 18th grade,26.0,1.94,37,21,1,132,128,5,11
2,"Before Persephone was released to Hermes , who...",47.80,9,20.00,21.6,10.69,27.0,11.15,10th and 11th grade,57.0,2.86,56,40,1,195,189,4,28
3,Cogeneration plants are commonly found in dist...,22.08,14,20.30,25.2,19.74,25.5,14.60,19th and 20th grade,38.0,3.06,57,32,1,208,201,6,16
4,"Geneva ( , ; , ; , ; ; ) is the second-most-po...",40.69,5,10.62,19.9,15.66,21.0,9.03,10th and 11th grade,31.0,2.09,37,22,1,142,125,3,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
616371,"Tijs Michiel Verwest was born in Breda , North...",65.73,5,5.60,10.5,11.13,9.0,13.35,7th and 8th grade,18.0,1.09,21,14,1,74,70,1,8
616372,"Operationally , the cyclone was first classifi...",24.78,7,19.83,16.9,16.13,18.0,9.94,16th and 17th grade,28.0,1.82,39,21,1,124,121,6,12
616373,These structures form at the high-pressure dep...,69.11,4,9.71,12.6,12.36,12.0,8.73,8th and 9th grade,26.0,1.45,27,19,1,99,97,2,13
616374,Boulogne-Billancourt is the most populous subu...,36.28,7,16.09,14.8,15.31,13.5,12.42,12th and 13th grade,26.0,1.53,33,18,1,104,102,4,11


In [32]:
# df1.subtract(df2)
complex_clean = complex_df.drop(['sentence', 'text_standard'], axis=1) df.drop(['B', 'C'], axis=1)
simple_clean = simple_df.drop(["sentence",'text_standard'], axis=1)
complex_clean

Unnamed: 0,flesch_reading_score,difficult_words,gunning_score,readability_index,coleman_index,linsear_formula,dale_chall,mcalpine_score,reading_time,sylabble_count,lexicon_count,sentence_count,char_count,letter_count,polysyllab_count,monosyllab_count
0,58.29,7,20.46,19.0,8.77,26.5,12.59,52.0,2.54,51,38,1,173,167,5,30
1,33.24,9,17.92,18.7,18.10,16.5,15.96,26.0,1.94,37,21,1,132,128,5,11
2,47.80,9,20.00,21.6,10.69,27.0,11.15,57.0,2.86,56,40,1,195,189,4,28
3,22.08,14,20.30,25.2,19.74,25.5,14.60,38.0,3.06,57,32,1,208,201,6,16
4,40.69,5,10.62,19.9,15.66,21.0,9.03,31.0,2.09,37,22,1,142,125,3,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
616371,65.73,5,5.60,10.5,11.13,9.0,13.35,18.0,1.09,21,14,1,74,70,1,8
616372,24.78,7,19.83,16.9,16.13,18.0,9.94,28.0,1.82,39,21,1,124,121,6,12
616373,69.11,4,9.71,12.6,12.36,12.0,8.73,26.0,1.45,27,19,1,99,97,2,13
616374,36.28,7,16.09,14.8,15.31,13.5,12.42,26.0,1.53,33,18,1,104,102,4,11
