In [1]:
import pandas as pd
import numpy as np
import nltk
#nltk.download('all') #or alternative all needed ones that are nor yet installed
from nltk.tokenize import sent_tokenize
import spacy
from spacy.matcher import Matcher
import re
import readability # https://pypi.org/project/readability/
from PassivePySrc import PassivePy # https://pypi.org/project/PassivePy/
import syllapy # https://github.com/mholtzscher/syllapy
import stemming # https://pypi.org/project/stemming/
from quantulum3 import parser # https://github.com/nielstron/quantulum3

In [2]:
nlp = spacy.load("en_core_web_sm")
passivepy = PassivePy.PassivePyAnalyzer(spacy_model = "en_core_web_lg")

In [3]:
dfSheets = pd.read_excel('../Data/TemplateComparisonAnalyticsUsertest.xlsx', 
                         header = 0, 
                         sheet_name = None, 
                         skiprows = 12
                        )

In [4]:
#for debugging
#dfSheets["FLEX free"].head(3)

In [5]:
# F-Score according to Heylighen and Dewaele doi: 10.1023/A:1019661126744
def calc_FScore(text):
    # function to test if something is a noun
    is_noun = lambda pos: pos[:2] == 'NN'
    # function to test if something is a pronoun
    is_pronoun = lambda pos: pos[:3] == 'PRP'
    # function to test if something is an adjective
    is_adj = lambda pos: pos[:2] == 'JJ'
    # function to test if something is a preposition
    is_prep = lambda pos: pos[:2] == 'IN'
    # function to test if something is an article
    is_article = lambda pos: pos[:2] == 'DT'
    # function to test if something is a verb
    is_verb = lambda pos: pos[:2] == 'VB'
    # function to test if something is an adverb
    is_adverb = lambda pos: pos[:2] == 'RB'
    # function to test if something is a interjection
    is_interj = lambda pos: pos[:2] == 'UH'

    # do the nlp stuff
    tokenized = nltk.word_tokenize(text)
    nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)] 
    pronouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_pronoun(pos)] 
    adjectives = [word for (word, pos) in nltk.pos_tag(tokenized) if is_adj(pos)] 
    prepositions = [word for (word, pos) in nltk.pos_tag(tokenized) if is_prep(pos)] 
    articles = [word for (word, pos) in nltk.pos_tag(tokenized) if is_article(pos)] 
    verbs = [word for (word, pos) in nltk.pos_tag(tokenized) if is_verb(pos)] 
    adverbs = [word for (word, pos) in nltk.pos_tag(tokenized) if is_adverb(pos)] 
    interjections = [word for (word, pos) in nltk.pos_tag(tokenized) if is_interj(pos)] 
    
    textLen = len(text)

    FScore = (len(nouns)/textLen*100 + len(adjectives)/textLen*100 + len(prepositions)/textLen*100 + len(articles)/textLen*100 - len(pronouns)/textLen*100 - len(verbs)/textLen*100 - len(adverbs)/textLen*100 - len(interjections)/textLen*100 + 100)/2
      
    return FScore

In [6]:
# syllable count
def count_syllables(word) :
    
    # first try syllapy
    count = syllapy.count(word)
    
    if count != 0 :
        return count
    
    # if syllapy returns 0 follow algorithm from https://eayd.in/?p=232
    word = word.lower()

    # exception_add are words that need extra syllables
    # exception_del are words that need less syllables

    exception_add = ['serious','crucial']
    exception_del = ['fortunately','unfortunately']

    co_one = ['cool','coach','coat','coal','count','coin','coarse','coup','coif','cook','coign','coiffe','coof','court']
    co_two = ['coapt','coed','coinci']

    pre_one = ['preach']

    syls = 0 #added syllable number
    disc = 0 #discarded syllable number

    #1) if letters < 3 : return 1
    if len(word) <= 3 :
        syls = 1
        return syls

    #2) if doesn't end with "ted" or "tes" or "ses" or "ied" or "ies", discard "es" and "ed" at the end.
    # if it has only 1 vowel or 1 set of consecutive vowels, discard. (like "speed", "fled" etc.)

    if word[-2:] == "es" or word[-2:] == "ed" :
        doubleAndtripple_1 = len(re.findall(r'[eaoui][eaoui]',word))
        if doubleAndtripple_1 > 1 or len(re.findall(r'[eaoui][^eaoui]',word)) > 1 :
            if word[-3:] == "ted" or word[-3:] == "tes" or word[-3:] == "ses" or word[-3:] == "ied" or word[-3:] == "ies" :
                pass
            else :
                disc+=1

    #3) discard trailing "e", except where ending is "le"  

    le_except = ['whole','mobile','pole','male','female','hale','pale','tale','sale','aisle','whale','while']

    if word[-1:] == "e" :
        if word[-2:] == "le" and word not in le_except :
            pass

        else :
            disc+=1

    #4) check if consecutive vowels exists, triplets or pairs, count them as one.

    doubleAndtripple = len(re.findall(r'[eaoui][eaoui]',word))
    tripple = len(re.findall(r'[eaoui][eaoui][eaoui]',word))
    disc+=doubleAndtripple + tripple

    #5) count remaining vowels in word.
    numVowels = len(re.findall(r'[eaoui]',word))

    #6) add one if starts with "mc"
    if word[:2] == "mc" :
        syls+=1

    #7) add one if ends with "y" but is not surrouned by vowel
    if word[-1:] == "y" and word[-2] not in "aeoui" :
        syls +=1

    #8) add one if "y" is surrounded by non-vowels and is not in the last word.

    for i,j in enumerate(word) :
        if j == "y" :
            if (i != 0) and (i != len(word)-1) :
                if word[i-1] not in "aeoui" and word[i+1] not in "aeoui" :
                    syls+=1

    #9) if starts with "tri-" or "bi-" and is followed by a vowel, add one.

    if word[:3] == "tri" and word[3] in "aeoui" :
        syls+=1

    if word[:2] == "bi" and word[2] in "aeoui" :
        syls+=1

    #10) if ends with "-ian", should be counted as two syllables, except for "-tian" and "-cian"

    if word[-3:] == "ian" : 
    #and (word[-4:] != "cian" or word[-4:] != "tian") :
        if word[-4:] == "cian" or word[-4:] == "tian" :
            pass
        else :
            syls+=1

    #11) if starts with "co-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly.

    if word[:2] == "co" and word[2] in 'eaoui' :

        if word[:4] in co_two or word[:5] in co_two or word[:6] in co_two :
            syls+=1
        elif word[:4] in co_one or word[:5] in co_one or word[:6] in co_one :
            pass
        else :
            syls+=1

    #12) if starts with "pre-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly.

    if word[:3] == "pre" and word[3] in 'eaoui' :
        if word[:6] in pre_one :
            pass
        else :
            syls+=1

    #13) check for "-n't" and cross match with dictionary to add syllable.

    negative = ["doesn't", "isn't", "shouldn't", "couldn't","wouldn't"]

    if word[-3:] == "n't" :
        if word in negative :
            syls+=1
        else :
            pass   

    #14) Handling the exceptional words.

    if word in exception_del :
        disc+=1

    if word in exception_add :
        syls+=1     

    # calculate the output
    return numVowels - disc + syls

In [7]:
# sentence count (R1)
def sentence_count(text):
    sentences = sent_tokenize(text)
    result = len(sentences)
    return result
# alternative simple Excel formula: =IF(B14="";0;(LEN(B14)-LEN(SUBSTITUTE(B14;". ";"."))+1))
# is less sensitive for bulletpoint lists

In [8]:
# use active voice? (R8)
def active_voice(text):
    result = int(passivepy.match_text(text, full_passive=True, truncated_passive=True)['binary'])
    
    return (1 if result == 0 else 0)

In [9]:
# definite_articles?  (R15)
def if_definite_articles(text): 
    article_terms = ['a', 'an']  
    
    doc = nlp(text.lower()) 
    
    for token in doc:
        if token.text in article_terms:
            return 0  
     
    return 1

In [10]:
# no_nominalization? (R10)
def no_nominalization(text):    
    result = readability.getmeasures(text, lang='en')['word usage']['nominalization']
    return (1 if result == 0 else 0)

In [11]:
# no_comparison? (R13)

def no_comparison(text):
    comparison_symbols = ['>', '<', '<=', '>=', '=', 'equals', '≥', '≤', 'greater', 'smaller', 'lower', 'higher'] 
    doc = nlp(text) 
    for token in doc:
        if (token.tag_ == "RBR" or token.tag_ == "JJR" or token.tag_ == "RBS" or token.tag_ == "JJS"): 
            return 0        
        if token.text in comparison_symbols:
            return 0  
    return 1

In [12]:
# units? (R16) -- overestimates --> crosscheck all 0 values manually for false positives
def correct_units(text):
    quants = parser.parse(text)
    if not quants :
        return 1
    for q in quants:
        if q.unit.name == 'dimensionless' :
            #print(parser.inline_parse(text))
            return 0
    return 1

In [13]:
# value tolerances? (R35) -- overestimates --> crosscheck all 0 values manually for false positives
def value_tolerances(text):
    quants = parser.parse(text)
    if not quants :
        return 1
    for i in range(len(quants)):
        q = quants[i]
        if not q.uncertainty : 
            if (q == quants[-1] or quants[i+1].unit.entity.name != q.unit.entity.name) :
                #print(q)
                return 0
    return 1

In [14]:
# no_vague_terms? (R17)
def no_vague_terms(text):
    # lists based on INCOSE Guide for Writing Requirements, ECSS Drafting Rules, and ECSS-E-ST-10-06C
    # simple terms
    vague_terms_simple = ['some', 'any', 'allowable ', 'several', 'many', 'nearly ', 'about', 'almost',
                          'approximate', 'ancillary','relevant', 'routine', 'common', 'uncommon', 'generic',  
                          'significant', 'insignificant', 'flexible', 'expandable', 'typical', 'untypical', 
                          'sufficient', 'insufficient', 'adequate', 'inadequate', 'adequately', 'inadequately', 
                          'appropriate', 'inappropriate', 'efficient', 'inefficient', 'effective', 'ineffective',
                          'proficient', 'reasonable', 'customary', 'usually', 'usual', 'unusual', 'approximately', 
                          'sufficiently', 'insufficiently', 'typically', 'untypically', 'necessary', 'unnecessary', 
                          'minimal', 'maximal', 'minimize', 'maximize', 'optimize', 'minimise', 'maximise', 'optimise', 
                          'rapid', 'rapidly', 'goal', 'user-friendly', 'userfriendly', 'easy', 'enough', 'suitable',
                          'satisfactory', 'quick', 'great', 'small', 'large', 'state-of-the-art', 'recommended', 
                          'preffered', 'normally', 'important', 'quickly', 'slow', 'slowly', 'early', 'timely',
                          'short', 'light', 'heavy', 'thin', 'thick', 'most', 'appropriately', 'proper', 'properly',
                          'qualified', 'objective', 'aim', 'intention', 'purpose', 'consider', 'attention', 'care', 
                          'careful', 'caution', 'and/or', 'foresee', 'need', 'required', 'could', 'permitted', 'permit',
                          'advisable', 'desirable', 'proposed', 'identical', 'responsible', 'ideally', 'preferably',
                          'generally', 'normally', 'considered', 'possible', 'impossible', 'practical', 'impractical',
                          'difficult', 'clean', 'dirty', 'extreme', 'can', 'could', 'declared', 'nearly', 'forseeable',
                          'arbitrary', 'later', 'acceptable', 'inacceptable']  
    # simple terms with exeptions for use in condition statements
    vague_terms_simple_cond = ['long', 'soon']
    # complex vague expressions
    vague_terms_complex = ['a lot of', 'a few', 'almost always', 'very nearly',  'close to', 'user friendly',
                           'first rate', 'best possible', 'state of the art', 'as specified below', 'as shown in', 
                           'as specified above', 'as required', 'unless otherwise required', 'unless otherwise specified',
                           'into account', 'account for', 'in accordance with', 'in compliance with', 'best practice', 
                           'and/or', 'have to', 'minimum interaction', 'state-of-the-art', 'minimum number'] 
    
    doc = nlp(text.lower())     
    for token in doc:
        # check list of simple terms
        if token.text in vague_terms_simple:
            return 0  
        
        # check list of simple terms in combination with their condition expression exeptions
        if token.i > 0:
            prev = token.nbor(-1).text
        else :
            prev = ''
        if token.i < len(doc) -1 :
            nxt = token.nbor().text
        else :
            nxt = ''
        if (token.text in vague_terms_simple_cond and (prev != 'as' or nxt != 'as')):
            return 0
            
    # check list of complex expressions
    result = [word for word in vague_terms_complex if(word in text.lower())] 
    
    return (0 if bool(result) else 1)

In [15]:
# no_escape_clause? (R18)
def no_escape_clause(text):
    # list based on INCOSE Guide for Writing Requirements, ECSS Drafting Rules, and ECSS-E-ST-10-06C
    escape_clause = ['so far as is possible', 'as little as possible', 'where possible', 
                     'as much as possible', 'if it should prove necessary', 'if necessary', 
                     'to the extent necessary', 'as appropriate', 'as required', 
                     'to the extent practical', 'if practicable', 'if possible', 'almost always', 
                     'best possible', 'if relevant', 'if required', 'if specified', 
                     'unless otherwise required', 'unless otherwise specified',
                     'if appropriate', 'if applicable', 'if needed', 'if feasible', 'when feasible',
                     'if justified', 'no other specification', 'where there is sufficient', 
                     'unless otherwise declared', 'forseeable']  
    
    result = [word for word in escape_clause if(word in text.lower())] 
    
    return (0 if bool(result) else 1)

In [16]:
# no_open_end? (R19)
def no_open_end(text):    
    open_end_terms_simple = ['etc', 'ia', 'i.a.', 'etc.', 'e.g.']  
    open_end_terms_complex = ['and so on', 'not limited to', 'and so far', 'such as', 'like', 'for example', 'or other']  
    
    doc = nlp(text.lower())     
    for token in doc:
        if token.text in open_end_terms_simple:
            return 0  
     
    result = [word for word in open_end_terms_complex if(word in text.lower())] 
    
    return (0 if bool(result) else 1)

In [17]:
# no_superfluous_infinitives? (R20)
def no_superfluous_infinitives(text):
    # list based on INCOSE Guide for Writing Requirements
    superfluous_infinitives = ['be designed', 'be able to', 'be capable to', 'be capable of']  
    
    result = [word for word in superfluous_infinitives if(word in text.lower())] 
    
    return (0 if bool(result) else 1)

In [18]:
# no_negation? (R22)
def no_negation(text):
    negation_terms = ['not', 'never', 'no', "doesn't", "isn't", "shouldn't", "couldn't",
                      "wouldn't", 'prevent', 'avoid', "didn't"]

    
    doc = nlp(text.lower()) 
    
    for token in doc:
        if token.text in negation_terms:
            return 0  
     
    return 1

In [19]:
# no_combinators? (R24)
def no_combinators(text):
    combinator_terms_simple = ['and', 'or', 'then', 'unless', 'but', 'however', 'also','whether', 'meanwhile',
                               'whereas', 'otherwise', 'and/or']  
    combinator_terms_complex = ['as well as', 'but also', 'on the other hand']  
    
    doc = nlp(text.lower())     
    for token in doc:
        if token.text in combinator_terms_simple:
            return 0  
     
    result = [word for word in combinator_terms_complex if(word in text.lower())] 
    
    return (0 if bool(result) else 1)

In [20]:
# clear_quantifiers? (R34)
def clear_quantifiers(text):
    quantifiers_terms = ['all', 'any', 'both', 'some', 'many', 'most', 'minimal']  
    quantifier_terms_complex = ['minimum number']
    
    doc = nlp(text.lower()) 
    
    for token in doc:
        if token.text in quantifiers_terms:
            return 0  
        
    result = [word for word in quantifier_terms_complex if(word in text.lower())] 
     
    return (0 if bool(result) else 1)

In [21]:
# no_absolutes? (R30)
def no_absolutes(text):
    absolutes_terms = ['100%', '100 %', 'all', 'always', 'never', 'any', 'nobody', 'anybody', 'everybody']  
    
    doc = nlp(text.lower()) 
    
    for token in doc:
        if token.text in absolutes_terms:
            return 0  
     
    return 1

In [22]:
# no_pronouns? (R28)
def no_pronouns(text):
    doc = nlp(text.lower()) 
    for token in doc:
        if (token.pos_ == "PRON"):
            return 0        
    return 1

In [23]:
# use modal verb for liability (R5)
def liability(text):
    liability_terms = ['shall', 'should', 'will', 'must', "shouldn't", "mustn't"]
    
    doc = nlp(text.lower())
    
    for token in doc:
        if token.text in liability_terms:
            return 1
    
    return 0

In [24]:
# Excel sheet column names used for calculation
textHeader = "Text"
FScoreHeader = "FScore"
syllablesHeader = "#syllables"
definite_articlesHeader = "definite_articles?  (R15)"
no_nominalizationHeader = "no_nominalization? (R10)"
no_comparisonHeader = "no_comparison? (R13)"
#clear_comparisonHeader = "clear_comparison? (R14)"
unitsHeader = "units? (R16)"
no_vague_termsHeader = "no_vague_terms? (R17)"
no_escape_clauseHeader = "no_escape_clause? (R18)"
no_open_endHeader = "no_open_end? (R19)"
no_superfluous_infinitivesHeader = "no_superfluous_infinitives? (R20)"
no_negationHeader = "no_negation? (R22)"
no_combinatorsHeader = "no_combinators? (R24)"
no_pronounsHeader = "no_pronouns? (R28)"
no_absolutesHeader = "no_absolutes? (R30)"
clear_quantifiersHeader = "clear_quantifiers? (R34)"
liabilityHeader = "liability? (R5)"
active_voiceHeader = "active_voice? (R8)"
toleranceHeader = "value_tolerance? (R35)"
sentencesHeader = "#sentences (R1)"

In [25]:
#calculate results

#Dictonary for complete corpus sorted by template system
corpus = {}

# calculate results for each sheet
for dfName, df in dfSheets.items():
    #all data sheets are before 'Summary' - no further sheets are processed from 'Summary' on
    if dfName == "Summary":
        break   
        
    filtered_df = df[df[textHeader].notnull()]
    
           
    df[syllablesHeader] = filtered_df[textHeader].apply(count_syllables)
    df[definite_articlesHeader] = filtered_df[textHeader].apply(if_definite_articles)
    df[no_nominalizationHeader] = filtered_df[textHeader].apply(no_nominalization)
    df[no_comparisonHeader] = filtered_df[textHeader].apply(no_comparison)
    df[unitsHeader] = filtered_df[textHeader].apply(correct_units) 
    df[no_vague_termsHeader] = filtered_df[textHeader].apply(no_vague_terms)
    df[no_escape_clauseHeader] = filtered_df[textHeader].apply(no_escape_clause)
    df[no_open_endHeader] = filtered_df[textHeader].apply(no_open_end)
    df[no_superfluous_infinitivesHeader] = filtered_df[textHeader].apply(no_superfluous_infinitives)
    df[no_negationHeader] = filtered_df[textHeader].apply(no_negation)
    df[no_combinatorsHeader] = filtered_df[textHeader].apply(no_combinators)
    df[no_pronounsHeader] = filtered_df[textHeader].apply(no_pronouns)
    df[no_absolutesHeader] = filtered_df[textHeader].apply(no_absolutes)    
    df[clear_quantifiersHeader] = filtered_df[textHeader].apply(clear_quantifiers)
    df[liabilityHeader] = filtered_df[textHeader].apply(liability)
    df[active_voiceHeader] = filtered_df[textHeader].apply(active_voice)
    df[toleranceHeader] = filtered_df[textHeader].apply(value_tolerances)
    df[no_open_endHeader] = filtered_df[textHeader].apply(no_open_end)
    df[sentencesHeader] = filtered_df[textHeader].apply(sentence_count)
       
    template = dfName.split()[-1]
       
    if template in corpus:
        corpus[template] = corpus[template] + " " + ' '.join(filtered_df[textHeader])
    else:
        corpus[template] = ' '.join(filtered_df[textHeader])
        
    
    # Calculate F-Score for all requirements on sheet
    sheet_corpus = ' '.join(filtered_df[textHeader])
    FScore = calc_FScore(sheet_corpus) 
    df.insert(2, FScoreHeader, np.nan, 1) 
    df.at[0, FScoreHeader] = FScore
    
    #print name of processed sheets to show progress
    print(dfName + ": processed")

#calculate overall F-Scores
print(" ")
print("Total F-Scores")

ffree = 0
for ts, rt in corpus.items():
    FScore = calc_FScore(rt)
    if ts == "free":
        ffree = FScore
        print(ts + ": " + "%0.3f" % round(FScore, 3)) 
    else:
        raw_effect = FScore - ffree
        print(ts + ": " + "%0.3f" % round(FScore, 3) + " (" + "%0.3f" % round(raw_effect, 3) + ")")

Student free: processed
Student EARS: processed
Student MASTER: processed
Professional free: processed
Professional EARS: processed
Professional MASTER: processed
ReadingSamples: processed
 
Total F-Scores
free: 52.326
EARS: 53.281 (0.955)
MASTER: 53.197 (0.872)
ReadingSamples: 53.722 (1.396)


In [26]:
# write results to auxiliary excel file
writer = pd.ExcelWriter('../Data/TemplateComparison_calculatedMetricsUsertest.xlsx', engine='xlsxwriter')

for dfName, df in dfSheets.items():
    #all data sheets are before 'Summary' - no further sheets are processed from 'Summary' on
    if dfName == "Summary":
        break
        
    df.to_excel(writer, dfName, 
                columns = [textHeader, syllablesHeader, definite_articlesHeader, no_nominalizationHeader, 
                           no_comparisonHeader, unitsHeader, no_vague_termsHeader, no_escape_clauseHeader, 
                           no_superfluous_infinitivesHeader, no_negationHeader, no_combinatorsHeader, 
                           no_pronounsHeader, no_absolutesHeader, clear_quantifiersHeader, FScoreHeader, 
                           liabilityHeader, active_voiceHeader, toleranceHeader, no_open_endHeader, sentencesHeader])

writer.close()