In [1]:
import pandas as pd
import numpy as np
import nltk
import spacy
from spacy.matcher import Matcher
import re
import readability # https://pypi.org/project/readability/

In [2]:
# Load spaCy's small English pipeline once; the metric functions below call
# `nlp` as a module-level global.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Read every worksheet of the workbook into a dict keyed by sheet name
# (sheet_name=None). header=4 takes the column names from the row at
# zero-based index 4.
dfSheets = pd.read_excel('../Data/TemplateComparisonAnalytics.xlsx', 
                         header = 4,
                         sheet_name = None)

In [4]:
# Preview the first five rows of the "FLEX free" sheet.
dfSheets["FLEX free"].head(5)

Unnamed: 0,ID,Text,name_of_template,"liability? (IR1,E106,Rupp09)","structured_sentence? (IR2,SR14-15,E106)","#sentences (IR11+18,SR9,E106)",#words (SR15),#words/sentence,#characters,#syllables,...,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70,Unnamed: 71,Unnamed: 72,Unnamed: 73
0,1.0,The Thermal Control shall fulfil the requireme...,,1.0,1.0,1,18,18,115,32.0,...,,,,,,,,,,
1,2.0,The list of requirements from [ND-E-15] and [N...,,0.0,0.0,1,26,26,141,45.0,...,,,,,,,,,,
2,3.0,The Thermal Control shall provide the thermal ...,,1.0,1.0,1,29,29,159,51.0,...,,,,,,,,,,
3,4.0,The Thermal Control shall ensure temperatures ...,,1.0,1.0,1,22,22,137,44.0,...,,,,,,,,,,
4,5.0,The Thermal Control shall ensure the temperatu...,,1.0,1.0,1,16,16,110,36.0,...,,,,,,,,,,


In [5]:
def calc_FScore(text):
    """Compute the formality score (F-score) of ``text``.

    The score follows the formula

        F = (noun% + adjective% + preposition% + article%
             - pronoun% - verb% - adverb% - interjection% + 100) / 2

    where each percentage is the share of words of that category relative
    to the total number of words in ``text``.

    Categories are derived from Penn Treebank tag prefixes as returned by
    ``nltk.pos_tag``: NN* (nouns), JJ* (adjectives), IN (used here for
    prepositions), DT (used here for articles), PRP* (pronouns),
    VB* (verbs), RB* (adverbs) and UH (interjections).

    Parameters
    ----------
    text : str
        The text (typically a whole corpus of requirements) to score.

    Returns
    -------
    float
        The F-score, or ``nan`` when ``text`` contains no words.
    """
    # Tag prefixes that raise the score and those that lower it.
    formal_prefixes = ('NN', 'JJ', 'IN', 'DT')
    contextual_prefixes = ('PRP', 'VB', 'RB', 'UH')

    # Tokenize and tag exactly once (the tagger is the expensive step).
    tagged = nltk.pos_tag(nltk.word_tokenize(text))

    # Only tokens containing a letter or digit count as words; pure
    # punctuation/bracket tokens must not inflate the word total or be
    # mistaken for a word category.
    word_tags = [pos for (word, pos) in tagged if any(ch.isalnum() for ch in word)]

    # The percentages are relative to the number of WORDS, not to the
    # number of characters in the text.
    word_count = len(word_tags)
    if word_count == 0:
        return float('nan')

    formal = sum(1 for pos in word_tags if pos.startswith(formal_prefixes))
    contextual = sum(1 for pos in word_tags if pos.startswith(contextual_prefixes))

    return ((formal - contextual) / word_count * 100 + 100) / 2

In [6]:
def count_syllables(word) :
    """Estimate the number of syllables in a single English word.

    The estimate is: number of vowel letters (a, e, i, o, u), minus
    corrections for silent endings and vowel clusters, plus corrections for
    syllabic "y", certain prefixes/suffixes and a few listed exception words.
    Words of three letters or fewer are always counted as one syllable.
    """
    vowels = "aeoui"
    word = word.lower()

    # Words that need one more / one less syllable than the rules yield.
    exception_add = ['serious','crucial']
    exception_del = ['fortunately','unfortunately']

    # "co" + vowel stems pronounced as one syllable vs. as two syllables.
    co_one = ['cool','coach','coat','coal','count','coin','coarse','coup','coif','cook','coign','coiffe','coof','court']
    co_two = ['coapt','coed','coinci']

    # "pre" + vowel stems pronounced as one syllable.
    pre_one = ['preach']

    # Words ending in "le" whose final "e" is nevertheless silent.
    le_except = ['whole','mobile','pole','male','female','hale','pale','tale','sale','aisle','whale','while']

    # Contractions in which "n't" forms its own syllable.
    negative = ["doesn't", "isn't", "shouldn't", "couldn't","wouldn't"]

    # 1) three letters or fewer: one syllable.
    if len(word) <= 3 :
        return 1

    vowel_pairs = len(re.findall(r'[eaoui][eaoui]', word))
    vowel_triples = len(re.findall(r'[eaoui][eaoui][eaoui]', word))
    vowel_letters = len(re.findall(r'[eaoui]', word))

    removed = 0  # syllables to subtract from the raw vowel count
    added = 0    # syllables to add to the raw vowel count

    # 2) a final "es"/"ed" is silent unless the ending is ted/tes/ses/ied/ies;
    #    only applies when the word has more than one vowel group
    #    (so "speed", "fled" etc. are left alone).
    if word.endswith(("es", "ed")) :
        vowel_then_consonant = len(re.findall(r'[eaoui][^eaoui]', word))
        if vowel_pairs > 1 or vowel_then_consonant > 1 :
            if not word.endswith(("ted", "tes", "ses", "ied", "ies")) :
                removed += 1

    # 3) a trailing "e" is silent, except in "-le" words not listed in le_except.
    if word.endswith("e") :
        syllabic_le = word.endswith("le") and word not in le_except
        if not syllabic_le :
            removed += 1

    # 4) consecutive vowels (pairs and triplets) count as one.
    removed += vowel_pairs + vowel_triples

    # 6) leading "mc" adds a syllable.
    if word.startswith("mc") :
        added += 1

    # 7) final "y" preceded by a non-vowel adds a syllable.
    if word.endswith("y") and word[-2] not in vowels :
        added += 1

    # 8) an inner "y" with non-vowels on both sides adds a syllable.
    for pos in range(1, len(word) - 1) :
        if word[pos] == "y" and word[pos - 1] not in vowels and word[pos + 1] not in vowels :
            added += 1

    # 9) "tri"/"bi" followed by a vowel adds a syllable.
    if word.startswith("tri") and word[3] in vowels :
        added += 1
    if word.startswith("bi") and word[2] in vowels :
        added += 1

    # 10) "-ian" adds a syllable, except "-cian" and "-tian".
    if word.endswith("ian") and not word.endswith(("cian", "tian")) :
        added += 1

    # 11) "co" + vowel adds a syllable unless the stem is a known
    #     one-syllable stem (and not also a known two-syllable stem).
    if word.startswith("co") and word[2] in vowels :
        stems = (word[:4], word[:5], word[:6])
        two_syllable_stem = any(stem in co_two for stem in stems)
        one_syllable_stem = any(stem in co_one for stem in stems)
        if two_syllable_stem or not one_syllable_stem :
            added += 1

    # 12) "pre" + vowel adds a syllable unless the stem is a known
    #     one-syllable stem.
    if word.startswith("pre") and word[3] in vowels and word[:6] not in pre_one :
        added += 1

    # 13) listed "n't" contractions gain a syllable.
    if word.endswith("n't") and word in negative :
        added += 1

    # 14) explicit exception words.
    if word in exception_del :
        removed += 1
    if word in exception_add :
        added += 1

    return vowel_letters - removed + added

In [7]:
# Add definite_articles?  (IR5,SR10-11)
def if_definite_articles(text):
    """Return 1 if ``text`` contains no indefinite article, otherwise 0.

    The text is lower-cased and tokenized with the module-level spaCy
    pipeline ``nlp``; a token equal to "a" or "an" marks the text as using
    an indefinite article.

    Parameters
    ----------
    text : str
        Requirement text to check.

    Returns
    -------
    int
        0 when an indefinite article was found, 1 otherwise.
    """
    # "an" is the pre-vowel form of "a" and has to be flagged as well.
    indefinite_articles = ('a', 'an')

    doc = nlp(text.lower())
    if any(token.text in indefinite_articles for token in doc):
        return 0
    return 1

In [8]:
# Add no_nominalization? (SR3)
def no_nominalization(text):
    """Return 1 if ``readability`` reports zero nominalizations in ``text``, else 0."""
    measures = readability.getmeasures(text, lang='en')
    nominalization_count = measures['word usage']['nominalization']
    if nominalization_count == 0:
        return 1
    return 0

In [9]:
# Add no_comparison? (SR8,Rupp09)
# Add clear_comparison? (SR8) 

def no_comparison(text):
    """Return 0 if any token of ``text`` is tagged RBR or JJR by ``nlp``, else 1."""
    comparative_tags = ("RBR", "JJR")
    found_comparative = any(token.tag_ in comparative_tags for token in nlp(text))
    return 0 if found_comparative else 1

In [10]:
# Add units? (IR6)
def correct_units(text):
    """Return 0 if ``text`` contains at least one digit character, else 1."""
    contains_digit = any(character.isdigit() for character in text)
    return 0 if contains_digit else 1

In [11]:
# Add no_vague_terms? (IR7,SR2+12,E106)
def no_vague_terms(text):
    """Return 1 if ``text`` contains none of the listed vague terms, else 0.

    Single-word terms are compared against the tokens produced by the
    module-level spaCy pipeline ``nlp`` on the lower-cased text; multi-word
    terms are searched as substrings of the lower-cased text.

    Parameters
    ----------
    text : str
        Requirement text to check.

    Returns
    -------
    int
        0 when a vague term was found, 1 otherwise.
    """
    # Terms are stored without surrounding whitespace: a token never carries
    # a trailing space, so entries like 'allowable ' could never match.
    vague_terms_simple = {'some', 'any', 'allowable', 'several', 'many', 'nearly', 'about', 'almost',
                          'approximate', 'ancillary','relevant', 'routine', 'common', 'generic', 'significant',
                          'flexible', 'expandable', 'typical', 'sufficient', 'adequate', 'appropriate', 'efficient',
                          'effective', 'proficient', 'reasonable', 'customary', 'usually', 'approximately',
                          'sufficiently', 'typically'}
    vague_terms_complex = ['a lot of', 'a few', 'almost always', 'very nearly',  'close to']

    lowered = text.lower()

    if any(token.text in vague_terms_simple for token in nlp(lowered)):
        return 0

    if any(phrase in lowered for phrase in vague_terms_complex):
        return 0

    return 1

In [12]:
# Add no_escape_clause? (IR8)
def no_escape_clause(text):
    """Return 0 if the lower-cased ``text`` contains a listed escape clause, else 1."""
    escape_phrases = ('so far as is possible', 'as little as possible', 'where possible',
                      'as much as possible', 'if it should prove necessary', 'if necessary',
                      'to the extent necessary', 'as appropriate', 'as required',
                      'to the extent practical', 'if practicable')

    lowered = text.lower()
    for phrase in escape_phrases:
        if phrase in lowered:
            return 0
    return 1

In [13]:
# Add no_open_end? (IR9)
def no_open_end(text):
    """Return 0 if ``text`` contains an open-ended marker, else 1.

    A token equal to 'etc' (tokens come from ``nlp`` on the lower-cased
    text) or one of the listed phrases as a substring counts as a marker.
    """
    single_word_markers = ['etc']
    phrase_markers = ['and so on', 'including but not limited to']

    lowered = text.lower()

    if any(token.text in single_word_markers for token in nlp(lowered)):
        return 0

    if any(phrase in lowered for phrase in phrase_markers):
        return 0

    return 1

In [14]:
# Add no_superfluous_infinitives? (IR10)
def no_superfluous_infinitives(text):
    """Return 0 if the lower-cased ``text`` contains a listed superfluous infinitive, else 1."""
    superfluous_phrases = ('be designed', 'be able to', 'be capable to', 'be capable of')

    lowered = text.lower()
    for phrase in superfluous_phrases:
        if phrase in lowered:
            return 0
    return 1

In [15]:
# Add no_negation? (IR16,E106)
def no_negation(text):
    """Return 0 if a token of the lower-cased ``text`` equals 'not', else 1."""
    negation_words = ['not']
    tokens = nlp(text.lower())
    negated = any(token.text in negation_words for token in tokens)
    return 0 if negated else 1

In [16]:
# Add no_combinators? (IR19,SR9,E106)
def no_combinators(text):
    """Return 0 if ``text`` contains a combinator, else 1.

    Single-word combinators are matched against the tokens produced by
    ``nlp`` on the lower-cased text; multi-word combinators are matched as
    substrings of the lower-cased text.
    """
    single_word_combinators = ['and', 'or', 'then', 'unless', 'but', 'however', 'also','whether', 'meanwhile',
                               'whereas', 'otherwise']
    phrase_combinators = ['as well as', 'but also', 'on the other hand']

    lowered = text.lower()

    if any(token.text in single_word_combinators for token in nlp(lowered)):
        return 0

    if any(phrase in lowered for phrase in phrase_combinators):
        return 0

    return 1

In [17]:
# Add clear_quantifiers? (IR32+34,SR8+10-11,E106)
def clear_quantifiers(text):
    """Return 1 if ``text`` contains none of the quantifiers all/any/both, else 0.

    The text is lower-cased and tokenized with the module-level spaCy
    pipeline ``nlp``; each token is compared against the quantifier list.

    Parameters
    ----------
    text : str
        Requirement text to check.

    Returns
    -------
    int
        0 when a listed quantifier was found, 1 otherwise.
    """
    # 'both' previously carried a stray closing curly quote and so could
    # never equal a token.
    quantifiers_terms = ('all', 'any', 'both')

    doc = nlp(text.lower())
    if any(token.text in quantifiers_terms for token in doc):
        return 0
    return 1

In [18]:
# Add no_absolutes? (IR26) - evaluate cells with '0' manually
def no_absolutes(text):
    """Return 1 if ``text`` contains no absolute term, else 0.

    Absolute terms are the words all/always/never (matched against the
    tokens produced by the module-level spaCy pipeline ``nlp`` on the
    lower-cased text) and the quantity "100%" with optional whitespace
    before the percent sign (matched on the raw lower-cased text).

    Parameters
    ----------
    text : str
        Requirement text to check.

    Returns
    -------
    int
        0 when an absolute term was found (to be evaluated manually),
        1 otherwise.
    """
    absolute_words = ('all', 'always', 'never')

    lowered = text.lower()

    # "100%" / "100 %" cannot be found token-wise: a token never contains a
    # space and the percent sign is normally split from the number. Search
    # the text instead; the look-behind avoids hits inside e.g. "1100%".
    if re.search(r'(?<!\d)100\s*%', lowered):
        return 0

    if any(token.text in absolute_words for token in nlp(lowered)):
        return 0

    return 1

In [19]:
# Add no_pronouns? (IR24)
def no_pronouns(text):
    """Return 0 if ``nlp`` assigns the coarse tag PRON to any token of the lower-cased ``text``, else 1."""
    tokens = nlp(text.lower())
    has_pronoun = any(token.pos_ == "PRON" for token in tokens)
    return 0 if has_pronoun else 1

In [20]:
# Excel sheet column names used for calculation
# --- Excel sheet column names used for calculation ---

# Input column holding the requirement text.
textHeader = "Text"

# Numeric measures.
syllablesHeader = "#syllables"
FScoreHeader = "FScore"

# Binary (1/0) rule-check columns, one per metric function.
definite_articlesHeader = "definite_articles?  (IR5,SR10-11)"
no_nominalizationHeader = "no_nominalization? (SR3)"
no_comparisonHeader = "no_comparison? (SR8,Rupp09)"
clear_comparisonHeader = "clear_comparison? (SR8)"
unitsHeader = "units? (IR6)"
no_vague_termsHeader = "no_vague_terms? (IR7,SR2+12,E106)"
no_escape_clauseHeader = "no_escape_clause? (IR8)"
no_open_endHeader = "no_open_end? (IR9)"
no_superfluous_infinitivesHeader = "no_superfluous_infinitives? (IR10)"
no_negationHeader = "no_negation? (IR16,E106)"
no_combinatorsHeader = "no_combinators? (IR19,SR9,E106)"
no_pronounsHeader = "no_pronouns? (IR24)"
no_absolutesHeader = "no_absolutes? (IR26)"
clear_quantifiersHeader = "clear_quantifiers? (IR32+34,SR8+10-11,E106)"

In [21]:
def _count_text_syllables(text):
    """Return the summed syllable estimate of all words in ``text``.

    ``count_syllables`` applies word-level rules (word length, endings,
    prefixes), so it has to be called once per word rather than once on
    the whole requirement text.
    """
    return sum(count_syllables(word) for word in re.findall(r"[A-Za-z']+", text))


# (column header, per-requirement metric function) in the order the
# columns are filled.
metric_columns = [
    (syllablesHeader, _count_text_syllables),
    (definite_articlesHeader, if_definite_articles),
    (no_nominalizationHeader, no_nominalization),
    (no_comparisonHeader, no_comparison),
    (unitsHeader, correct_units),
    (no_vague_termsHeader, no_vague_terms),
    (no_escape_clauseHeader, no_escape_clause),
    (no_open_endHeader, no_open_end),
    (no_superfluous_infinitivesHeader, no_superfluous_infinitives),
    (no_negationHeader, no_negation),
    (no_combinatorsHeader, no_combinators),
    (no_pronounsHeader, no_pronouns),
    (no_absolutesHeader, no_absolutes),
    (clear_quantifiersHeader, clear_quantifiers),
]

for dfName, df in dfSheets.items():
    # Stop at the "Summary" sheet: it and any sheet after it are not processed.
    if dfName == "Summary":
        break

    # Only rows that actually contain a requirement text are evaluated;
    # the remaining rows receive NaN through index alignment.
    filtered_df = df[df[textHeader].notnull()]
    texts = filtered_df[textHeader]

    for header, metric in metric_columns:
        df[header] = texts.apply(metric)

    # clear_comparison is pre-filled with the no_comparison result.
    df[clear_comparisonHeader] = df[no_comparisonHeader]

    # Calculate F-Score over the whole sheet (all requirement texts joined).
    corpus = ' '.join(texts)
    FScore = calc_FScore(corpus) if corpus.strip() else np.nan

    # Keep the cell idempotent: re-running must not add a second FScore
    # column, so reset an existing column instead of inserting a duplicate.
    if FScoreHeader in df.columns:
        df[FScoreHeader] = np.nan
    else:
        df.insert(2, FScoreHeader, np.nan)

    # The sheet-level score is stored in the first row only.
    if len(df.index) > 0:
        df.at[df.index[0], FScoreHeader] = FScore

    print(dfName + ": processed")

FLEX free: processed
FLEX EARS: processed
FLEX MASTER: processed
FLEX advEARS: processed
FLEX boilerplates: processed
FLEX boilerplates (DODT): processed
FLEX SPIDER: processed
ECSS_E60-30 free: processed
ECSS_E60-30 EARS: processed
ECSS_E60-30 MASTER: processed
ECSS_E60-30 advEARS: processed
ECSS_E60-30 boilerplates: processed
ECSS_E60-30 boilerplates (DODT): processed
ECSS_E60-30 SPIDER: processed
CS_E 50 free: processed
CS_E 50 EARS: processed
CS_E 50 MASTER: processed
CS_E 50 advEARS: processed
CS_E 50 boilerplates: processed
CS_E 50 boilerplates (DODT): processed
CS_E 50 SPIDER: processed
TSS free: processed
TSS EARS: processed
TSS MASTER: processed
TSS advEARS: processed
TSS boilerplates: processed
TSS boilerplates (DODT): processed
TSS SPIDER: processed
EVS free: processed
EVS EARS: processed
EVS MASTER: processed
EVS advEARS: processed
EVS boilerplates: processed
EVS boilerplates (DODT): processed
EVS SPIDER: processed


In [22]:
# Columns written to the result workbook, in output order.
# no_open_end is computed for every sheet and is therefore exported too;
# clear_comparison stays out, as before (it is a copy of no_comparison).
export_columns = [textHeader, syllablesHeader, definite_articlesHeader, no_nominalizationHeader,
                  no_comparisonHeader, unitsHeader, no_vague_termsHeader, no_escape_clauseHeader,
                  no_open_endHeader, no_superfluous_infinitivesHeader, no_negationHeader,
                  no_combinatorsHeader, no_pronounsHeader, no_absolutesHeader, clear_quantifiersHeader,
                  FScoreHeader]

# Use the writer as a context manager: the workbook is saved and closed on
# exit (also when an exception occurs). ExcelWriter.save() no longer exists
# in pandas 2.x.
with pd.ExcelWriter('TemplateComparison_calculatedMetrics.xlsx', engine='xlsxwriter') as writer:
    for dfName, df in dfSheets.items():
        # Stop at the "Summary" sheet, mirroring the processing loop.
        if dfName == "Summary":
            break

        df.to_excel(writer, sheet_name=dfName, columns=export_columns)