In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy.matcher import Matcher

import syllapy # https://pypi.org/project/syllapy/
import readability # https://pypi.org/project/readability/

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
dfSheets = pd.read_excel('template_comparison-tss-evs_python.xlsx', sheet_name = None)

In [4]:
# Add syllables
def count_syllables(text):
    text_no_digit = ''.join([i for i in text if not i.isdigit()])
    syllablesCount = syllapy.count(text_no_digit)
    
    return syllablesCount

In [5]:
# Add definite_articles?  (IR5,SR10-11)
def if_definite_articles(text): 
    article_terms = ['a']  
    
    doc = nlp(text.lower()) 
    
    for token in doc:
        if token.text in article_terms:
            return 0  
     
    return 1

In [6]:
# Add no_nominalization? (SR3)
def no_nominalization(text):    
    result = readability.getmeasures(text, lang='en')['word usage']['nominalization']
    return (1 if result == 0 else 0)

In [7]:
# Add no_comparison? (SR8,Rupp09)
# Add clear_comparison? (SR8) - evaluate cells with '0' manually

def no_comparison(text):
    doc = nlp(text) 
    for token in doc:
        if (token.tag_ == "RBR" or token.tag_ == "JJR"): 
            return 0        
    return 1

In [8]:
# Add units? (IR6) - evaluate cells with '0' manually
def correct_units(text):
    for i in text:
        if i.isdigit():
            return 0
        
    return 1

In [9]:
# Add no_vague_terms? (IR7,SR2+12,E106)
def no_vague_terms(text):
    vague_terms_simple = ['some', 'any', 'allowable ', 'several', 'many', 'nearly ', 'about', 'almost',
                          'approximate', 'ancillary','relevant', 'routine', 'common', 'generic', 'significant', 
                          'flexible', 'expandable', 'typical', 'sufficient', 'adequate', 'appropriate', 'efficient', 
                          'effective', 'proficient', 'reasonable', 'customary', 'usually', 'approximately', 
                          'sufficiently', 'typically']  
    vague_terms_complex = ['a lot of', 'a few', 'almost always', 'very nearly',  'close to'] 
    
    doc = nlp(text.lower())     
    for token in doc:
        if token.text in vague_terms_simple:
            return 0  
     
    result = [word for word in vague_terms_complex if(word in text.lower())] 
    
    return (0 if bool(result) else 1)

In [10]:
# Add no_escape_clause? (IR8)
def no_escape_clause(text):
    escape_clause = ['so far as is possible', 'as little as possible', 'where possible', 
                     'as much as possible', 'if it should prove necessary', 'if necessary', 
                     'to the extent necessary', 'as appropriate', 'as required', 
                     'to the extent practical', 'if practicable']  
    
    result = [word for word in escape_clause if(word in text.lower())] 
    
    return (0 if bool(result) else 1)

In [11]:
# Add no_open_end? (IR9)
def no_open_end(text):    
    open_end_terms_simple = ['etc']  
    open_end_terms_complex = ['and so on', 'including but not limited to']  
    
    doc = nlp(text.lower())     
    for token in doc:
        if token.text in open_end_terms_simple:
            return 0  
     
    result = [word for word in open_end_terms_complex if(word in text.lower())] 
    
    return (0 if bool(result) else 1)

In [12]:
# Add no_superfluous_infinitives? (IR10)
def no_superfluous_infinitives(text):
    superfluous_infinitives = ['be designed', 'be able to', 'be capable to', 'be capable of']  
    
    result = [word for word in superfluous_infinitives if(word in text.lower())] 
    
    return (0 if bool(result) else 1)

In [13]:
# Add no_negation? (IR16,E106)
def no_negation(text):
    negation_terms = ['not']  
    
    doc = nlp(text.lower()) 
    
    for token in doc:
        if token.text in negation_terms:
            return 0  
     
    return 1

In [14]:
# Add no_combinators? (IR19,SR9,E106)
def no_combinators(text):
    combinator_terms_simple = ['and', 'or', 'then', 'unless', 'but', 'however', 'also','whether', 'meanwhile',
                               'whereas', 'otherwise']  
    combinator_terms_complex = ['as well as', 'but also', 'on the other hand', ]  
    
    doc = nlp(text.lower())     
    for token in doc:
        if token.text in combinator_terms_simple:
            return 0  
     
    result = [word for word in combinator_terms_complex if(word in text.lower())] 
    
    return (0 if bool(result) else 1)

In [15]:
# Add clear_quantifiers? (IR32+34,SR8+10-11,E106) - evaluate cells with '0' manually
def clear_quantifiers(text):
    quantifiers_terms = ['all', 'any', 'both”']  
    
    doc = nlp(text.lower()) 
    
    for token in doc:
        if token.text in quantifiers_terms:
            print(token.text)
            return 0  
     
    return 1

In [16]:
# Add no_absolutes? (IR26) - evaluate cells with '0' manually
def no_absolutes(text):
    absolutes_terms = ['100%', '100 %', 'all', 'always', 'never']  
    
    doc = nlp(text.lower()) 
    
    for token in doc:
        if token.text in absolutes_terms:
            return 0  
     
    return 1

In [17]:
# Add no_pronouns? (IR24)
def no_pronouns(text):
    doc = nlp(text.lower()) 
    for token in doc:
        if (token.pos_ == "PRON"):
            return 0        
    return 1

In [18]:
# Add fake 1 - evaluate cells with '0' manually
def add_fake(text):    
    return 1

In [19]:
for dfName, df in dfSheets.items():
    df["#syllables"] = df["Text"].apply(count_syllables)
    df["definite_articles?  (IR5,SR10-11)"] = df["Text"].apply(if_definite_articles)
    df["no_nominalization? (SR3)"] = df["Text"].apply(no_nominalization)
    # Add clear_comparison? (SR8) - evaluate cells with '0' manually
    df["no_comparison? (SR8,Rupp09)"] = df["clear_comparison? (SR8)"] = df["Text"].apply(no_comparison)
    df["units? (IR6)"] = df["Text"].apply(correct_units) #  - evaluate cells with '0' manually
    df["no_vague_terms? (IR7,SR2+12,E106)"] = df["Text"].apply(no_vague_terms)
    df["no_escape_clause? (IR8)"] = df["Text"].apply(no_escape_clause)
    df["no_open_end? (IR9)"] = df["Text"].apply(no_open_end)
    df["no_superfluous_infinitives? (IR10)"] = df["Text"].apply(no_superfluous_infinitives)
    df["no_negation? (IR16,E106)"] = df["Text"].apply(no_negation)
    df["no_combinators? (IR19,SR9,E106)"] = df["Text"].apply(no_combinators)
    
    
    df["no_pronouns? (IR24)"] = df["Text"].apply(no_pronouns)
    # evaluate cells with '0' manually - print
    df["no_absolutes? (IR26)"] = df["Text"].apply(no_absolutes)    
    # evaluate cells with '0' manually
    df["clear_quantifiers? (IR32+34,SR8+10-11,E106)"] = df["Text"].apply(clear_quantifiers)
    
    df["contextfree? (IR25,SR6-7+16-18,E106)"] = df["Text"].apply(add_fake)
    df["no_groupnoun? (IR22,SR10-12)"] = df["Text"].apply(add_fake)
    df["explicit_conditions? (IR27,SR11+16-18)"] = df["Text"].apply(add_fake)    
    df["condition_combination_clear? (IR28,SR16-18)"] = df["Text"].apply(add_fake)    
    df["solution_free? (IR31,E106)"] = df["Text"].apply(add_fake)
    df["value_tolerance? (IR33,E106)"] = df["Text"].apply(add_fake)

there
it
there
it
always
all
all
all
all
all
all
all
all
all
all
all
any
all
there
it
there
it
it
it
always
all
all
all
all
all
all
all
all
all
all
all
any
all
there
it
there
it
it
always
always
all
all
all
all
all
all
all
all
all
all
all
any
all
there
there
it
always
always
all
all
all
all
all
all
all
all
all
all
all
any
all
there
there
it
always
always
all
all
all
all
all
all
all
all
all
all
all
any
all
there
there
it
always
always
all
all
all
all
all
all
all
all
all
all
all
any
all
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
there
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
it
always
always
always
always
always
always
always
always
always
always
always
always
always
always
always
always
always
always
always
always
always
always
always
always
always
always
always
always
always
all
all
always
always
always
always
always
always
always
always
always
always
always
always
al

In [20]:
writer = pd.ExcelWriter('output.xlsx', engine='xlsxwriter')

for dfName, df in dfSheets.items():
    df.to_excel(writer, dfName)

writer.save()

In [21]:
text = ('The TSS shall provide assistants and supervisors with the ability to manage contracts. Manage means „CRUD“ (Create, Read, Update, Delete). Update and deletion of contracts is only possible in state PREPARED.')
results = readability.getmeasures(text, lang='en')
print(results['readability grades'])
print("==============================")
print(results['sentence info'])
print("==============================")
print(results['word usage'])
print("==============================")
print(results['sentence beginnings'])

OrderedDict([('Kincaid', 13.629032258064516), ('ARI', 19.443225806451615), ('Coleman-Liau', 14.920093967741941), ('FleschReadingEase', 52.563548387096795), ('GunningFogIndex', 18.851612903225806), ('LIX', 60.03225806451613), ('SMOGIndex', 15.24744871391589), ('RIX', 9.0), ('DaleChallIndex', 13.323777419354839)])
OrderedDict([('characters_per_word', 5.387096774193548), ('syll_per_word', 1.4516129032258065), ('words_per_sentence', 31.0), ('sentences_per_paragraph', 1.0), ('type_token_ratio', 0.9032258064516129), ('characters', 167), ('syllables', 45), ('words', 31), ('wordtypes', 28), ('sentences', 1), ('paragraphs', 1), ('long_words', 9), ('complex_words', 5), ('complex_words_dc', 16)])
OrderedDict([('tobeverb', 1), ('auxverb', 1), ('conjunction', 2), ('pronoun', 0), ('preposition', 4), ('nominalization', 1)])
OrderedDict([('pronoun', 0), ('interrogative', 0), ('article', 1), ('subordination', 0), ('conjunction', 0), ('preposition', 0)])
