In [1]:
import pandas as pd
import textstat
import os
import time

# Calculate Readability Score from clean text

In [2]:
def get_readability_score(df, scoring_func):
    """
    Score every article's clean text with a single readability metric

    df: dataframe of articles; only its 'clean_text' column is read
    scoring_func: callable applied to each 'clean_text' value

    Returns the Series produced by applying scoring_func element-wise
    """

    clean_text = df['clean_text']
    scores = clean_text.apply(scoring_func)
    return scores

In [3]:
def add_readability_score(df):
    """
    Add readability score features

    df: dataframe of articles; the new columns are written onto this
        same dataframe, which is also returned

    One column is added per metric, each computed from 'clean_text'
    via get_readability_score
    """

    # (output column, textstat scorer) pairs; the tuple order determines
    # the order in which the new columns are appended.
    metrics = (
        ('flesch', textstat.flesch_reading_ease),
        ('flesch_kincaid', textstat.flesch_kincaid_grade),
        ('smog_index', textstat.smog_index),
        ('coleman_liau', textstat.coleman_liau_index),
        ('automated_readability', textstat.automated_readability_index),
        ('difficult_words', textstat.difficult_words),
        ('dale_chall', textstat.dale_chall_readability_score),
        ('linsear', textstat.linsear_write_formula),
        ('gunning_fog', textstat.gunning_fog),
    )

    for column, scorer in metrics:
        # .values assigns the raw array positionally (no index alignment)
        df[column] = get_readability_score(df, scorer).values

    return df

In [4]:
# For each dataset variant, compute the readability features and write them
# to a new CSV, unless that output file is already on disk.
datasets = ["(Imbalance)", "(Balance)"]

for dataset in datasets:
    source_path = f'../Data/dataset_text_structure_{dataset}.csv'
    output_path = f'../Data/dataset_with_readability_{dataset}.csv'

    # Check for the output first so the input CSV is not loaded and parsed
    # when there is nothing to compute.
    if os.path.exists(output_path):
        print("CSV file alreday exists")
        continue

    # keep_default_na=False turns off pandas' default NaN markers, so empty
    # cells are read as '' rather than NaN.
    df = pd.read_csv(source_path, keep_default_na=False)

    print(f"Start Calculating Readability Scores for {dataset} ....")
    # The timed span covers scoring and writing the CSV, not the read above.
    start_time = time.time()
    df = add_readability_score(df)
    df.to_csv(output_path, index=False)
    end_time = time.time()

    print("CSV file created")
    elapsed_time = end_time - start_time
    print(f"Calculation time: {elapsed_time/60:.2f} minutes")

CSV file alreday exists
CSV file alreday exists


# Readability Score Explanation

## Flesch Reading Ease Score
- It measures the readability of a text based on sentence length and syllable count
- A higher score indicates easier readability
    - 90-100: Very easy
    - 60-70: Standard
    - 0-30: Very difficult
- It is calculated using the formula:
    $$206.835 - 1.015\left(\frac{\text{total words}}{\text{total sentences}}\right) - 84.6\left(\frac{\text{total syllables}}{\text{total words}}\right)$$

## Flesch-Kincaid Grade Level
- It translates the Flesch Reading Ease Score into a U.S. school grade level
- A higher score indicates that the text is appropriate for higher grade levels
- It is calculated using the formula:
    $$0.39\left(\frac{\text{total words}}{\text{total sentences}}\right) + 11.8\left(\frac{\text{total syllables}}{\text{total words}}\right) - 15.59$$

## SMOG Index
- It estimates the years of education needed to understand a piece of writing
- It focuses on polysyllabic words (words with three or more syllables)
- It is calculated using the formula:
    $$1.0430\sqrt{\text{number of polysyllabic words} * \left(\frac{30}{\text{number of sentences}}\right)} + 3.1291$$

## Coleman-Liau Index
- It relies on characters rather than syllables per word
- It is calculated using the formula:
    $$0.0588L - 0.296S - 15.8$$
    $L$ is the average number of letters per 100 words,
    $S$ is the average number of sentences per 100 words

## Automated Readability Index
- It uses character count, word count, and sentence count to assess readability
- It is calculated using the formula:
    $$4.71 \left(\frac{\text{total characters}}{\text{total words}}\right) + 0.5\left(\frac{\text{total words}}{\text{total sentences}}\right) - 21.43$$

## Difficult Words
- "Difficult words" typically refer to words not found on a list of common words that children in the fourth grade are expected to know
- The calculation involves counting these words

## Dale-Chall Score
- It considers the percentage of difficult words and the average sentence length
- It uses a list of 3,000 common words familiar to 4th-grade students
- It is calculated using the formula:
    $$0.1579\left(\frac{\text{difficult words}}{\text{total words}} * 100\right) + 0.0496\left(\frac{\text{total words}}{\text{total sentences}}\right)$$
- If the percentage of difficult words is above 5%, an adjustment is made:
    $$\text{Adjusted Score} = \text{Raw Score} + 3.6365$$

## Linsear Write Formula
- It is based on the number of easy words (two syllables or fewer) and hard words (three or more syllables)
- The process involves:
    1. Take a 100-word sample
    2. Count the number of easy words and multiply by 1
    3. Count the number of hard words and multiply by 3
    4. Add these two numbers and divide by the number of sentences in the sample
    5. Divide the result by 2
- If the result from step 4 is 20 or less, subtract 1 from the final score

## Gunning-Fog Index
- It estimates the years of formal education a reader needs to understand a text on the first reading
- It focuses on sentence length and complex words (three or more syllables)
- It is calculated using the formula:
    $$0.4\left(\left(\frac{\text{total words}}{\text{total sentences}}\right) + 100\left(\frac{\text{complex words}}{\text{total words}}\right)\right)$$