In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('WikiLarge_Test.csv')

In [3]:
daleChall = pd.read_csv('dale_chall.txt', header=None)

In [5]:
daleChall_words = daleChall[0].values
daleChall_words

array(['a', 'able', 'aboard', ..., 'yourselves', 'youth', "you've"],
      dtype=object)

In [7]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.6.5-py3-none-any.whl (1.5 MB)
     |████████████████████████████████| 1.5 MB 4.1 MB/s            
Collecting regex>=2021.8.3
  Downloading regex-2021.11.10-cp37-cp37m-macosx_10_9_x86_64.whl (288 kB)
     |████████████████████████████████| 288 kB 18.4 MB/s            
Installing collected packages: regex, nltk
Successfully installed nltk-3.6.5 regex-2021.11.10


In [11]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
lst_stopwords = nltk.corpus.stopwords.words("english")
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
  
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()

    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text
data["clean_text"] = data["original_text"].apply(lambda row: \
          preprocess_text(row, flg_stemm=False, flg_lemm=False, lst_stopwords=lst_stopwords))

[nltk_data] Downloading package stopwords to /Users/harry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/harry/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
!pip3 install textstat

Collecting textstat
  Downloading textstat-0.7.2-py3-none-any.whl (101 kB)
     |████████████████████████████████| 101 kB 3.5 MB/s           
[?25hCollecting pyphen
  Downloading pyphen-0.11.0-py3-none-any.whl (2.0 MB)
     |████████████████████████████████| 2.0 MB 12.8 MB/s            
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.11.0 textstat-0.7.2


In [13]:
import textstat

def get_dale_chall_score(text):
  sentences_count = textstat.sentence_count(text)
  words_count = len(text.split())
  avg_sentence_length = float(sentences_count/words_count)
  text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
  words = text.split()
  difficult_words_count = 0
  for word in words:
    if word not in daleChall_words:
      difficult_words_count += 1  
  pdw = float(difficult_words_count/words_count)
  dc = (0.1579 *pdw) + (0.0496 * avg_sentence_length)
  return dc

data['dale_chall_score'] = data['original_text'].apply(lambda text: get_dale_chall_score(text))


In [14]:
data['clean_text'].max()

'þrúðr also name one valkyries serve ale einherjar valhalla lrb grímnismál stanza 36 rrb'

In [15]:
data.to_csv('dale_chall.csv')

In [17]:
# using some other popular readability scores as stated in readabilityformulas.net
count = 0
# Flesch Reading Ease score
def get_flesch_reading_ease_score(text):
  global count
  count +=1
  return textstat.flesch_reading_ease(text)
data['flesch reading ease score'] = data['original_text'].apply(lambda text:get_flesch_reading_ease_score(text) )

In [18]:
count = 0
# Gunning Fog
def get_gunning_fog(text):
  global count
  count +=1
  return textstat.gunning_fog(text)
data['gunning fog'] = data['original_text'].apply(lambda text: get_gunning_fog(text) )

In [19]:
count = 0
# Flesch-Kincaid Grade Level
def get_flesch_kincaid_grade_level(text):
  global count
  count +=1
  return textstat.flesch_kincaid_grade(text)
data['flesch kincaid grade level'] = data['original_text'].apply(lambda text: get_flesch_kincaid_grade_level(text))

In [20]:
count = 0
# The Coleman-Liau Index
def get_coleman_liau_index(text):
  global count
  count +=1
  return textstat.coleman_liau_index(text)
data['coleman liau index'] = data['original_text'].apply(lambda text: get_coleman_liau_index(text))

In [21]:
count = 0
# The SMOG Index
def get_smog_index(text):
  global count
  count +=1
  return textstat.smog_index(text)
data['smog index'] = data['original_text'].apply(lambda text: get_smog_index(text))

In [22]:
count = 0
# Automated Readability Index
def get_automated_readability_index(text):
  global count
  count +=1
  return textstat.automated_readability_index(text)
data['automated readability index'] = data['original_text'].apply(lambda text: get_automated_readability_index(text))

In [23]:
count = 0
# Linsear Write Formula
def get_linsear_write_formula(text):
  global count
  count +=1
  return textstat.linsear_write_formula(text)
data['linsear write formula'] = data['original_text'].apply(lambda text: get_linsear_write_formula(text))

In [25]:
data.to_csv('readability_scores_included_test.csv')

In [24]:
data.head()

Unnamed: 0,id,original_text,label,clean_text,dale_chall_score,flesch reading ease score,gunning fog,flesch kincaid grade level,coleman liau index,smog index,automated readability index,linsear write formula
0,0,-2011.0,,2011,0.2075,121.22,0.4,-3.5,-22.21,0.0,2.6,-0.5
1,1,-2011.0,,2011,0.2075,121.22,0.4,-3.5,-22.21,0.0,2.6,-0.5
2,2,-2000.0,,2000,0.2075,121.22,0.4,-3.5,-22.21,0.0,2.6,-0.5
3,3,-1997.0,,1997,0.2075,121.22,0.4,-3.5,-22.21,0.0,2.6,-0.5
4,4,1.636,,1636,0.2075,121.22,0.4,-3.5,-22.21,0.0,2.6,-0.5
