In [1]:
import spacy, csv 
from textstat.textstat import textstatistics, easy_word_set, legacy_round 
from __future__ import unicode_literals, print_function
import nltk
import re
import io
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from spacy.matcher import PhraseMatcher
import en_core_web_sm
import pandas as pd
import glob
import warnings
warnings.filterwarnings('ignore')

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# Splits the text into sentences, using 
# Spacy's sentence segmentation which can 
# be found at https://spacy.io/usage/spacy-101 
def break_sentences(text): 
    global nlp
    doc = nlp(text) 
    sentences = [sent.string.strip() for sent in doc.sents]
    return sentences

In [3]:
# Returns Number of Words in the text 
def word_count(text): 
    sentences = break_sentences(text) 
    words = 0
    for sentence in sentences: 
        words += len([token for token in sentence]) 
    return words 

In [4]:
# Returns the number of sentences in the text 
def sentence_count(text): 
    sentences = break_sentences(text) 
    return len(sentences)

In [5]:
# Returns average sentence length 
def avg_sentence_length(text): 
    words = word_count(text) 
    sentences = sentence_count(text) 
    average_sentence_length = float(words / sentences) 
    return average_sentence_length

In [6]:
# Textstat is a python package, to calculate statistics from 
# text to determine readability, 
# complexity and grade level of a particular corpus. 
# Package can be found at https://pypi.python.org/pypi/textstat 
def syllables_count(word): 
    return textstatistics().syllable_count(str(word)) 

In [7]:
# Returns the average number of syllables per 
# word in the text 
def avg_syllables_per_word(text): 
    syllable = syllables_count(text) 
    words = word_count(text) 
    ASPW = float(syllable) / float(words) 
    return legacy_round(ASPW, 1) 

In [8]:
# Return total Difficult Words in a text 
def difficult_words(text): 

    # Find all words in the text 
    words = [] 
    sentences = break_sentences(text) 
    for sentence in sentences: 
        words += [token for token in sentence] 

    # difficult words are those with syllables >= 2 
    # easy_word_set is provide by Textstat as 
    # a list of common words 
    diff_words_set = set() 
    
    for word in words: 
        syllable_count = syllables_count(word) 
        if word not in easy_word_set and syllable_count >= 2: 
            diff_words_set.add(word) 

    return len(diff_words_set) 

In [9]:
# A word is polysyllablic if it has more than 3 syllables 
# this functions returns the number of all such words 
# present in the text 
def poly_syllable_count(text): 
    count = 0
    words = [] 
    sentences = break_sentences(text) 
    for sentence in sentences: 
        words += [token for token in sentence] 


    for word in words: 
        syllable_count = syllables_count(word)
        if syllable_count >= 3: 
            count += 1
    return count 

In [10]:
def flesch_reading_ease(text): 
    """ 
        Implements Flesch Formula: 
        Reading Ease score = 206.835 - (1.015 × ASL) - (84.6 × ASW) 
        Here, 
        ASL = average sentence length (number of words 
            divided by number of sentences) 
            ASW = average word length in syllables (number of syllables 
            divided by number of words) 
    """
    FRE = 206.835 - float(1.015 * avg_sentence_length(text)) -float(84.6 * avg_syllables_per_word(text)) 
    return legacy_round(FRE, 2) 

In [11]:
def gunning_fog(text): 
    per_diff_words = (difficult_words(text) / word_count(text) * 100) + 5
    grade = 0.4 * (avg_sentence_length(text) + per_diff_words) 
    return grade 

In [12]:
def smog_index(text): 
    """ 
        Implements SMOG Formula / Grading 
        SMOG grading = 3 + ?polysyllable count. 
        Here, polysyllable count = number of words of more 
        than two syllables in a sample of 30 sentences. 
    """

    if sentence_count(text) >= 3: 
        poly_syllab = poly_syllable_count(text) 
        SMOG = (1.043 * (30*(poly_syllab / sentence_count(text)))**0.5) + 3.1291
        return legacy_round(SMOG, 1) 
    else: 
        return 0

In [13]:
def dale_chall_readability_score(text): 
    """ 
        Implements Dale Challe Formula: 
        Raw score = 0.1579*(PDW) + 0.0496*(ASL) + 3.6365 
        Here, PDW = Percentage of difficult words. ASL = Average sentence length 
    """
    words = word_count(text) 
    # Number of words not termed as difficult words 
    count = words - difficult_words(text) 
    if words > 0: 

        # Percentage of words not on difficult word list 
        per = float(count) / float(words) * 100

    # diff_words stores percentage of difficult words 
    diff_words = 100 - per 

    raw_score = (0.1579 * diff_words) + (0.0496 * avg_sentence_length(text)) 

    # If Percentage of Difficult Words is greater than 5 %, then; 
    # Adjusted Score = Raw Score + 3.6365, 
    # otherwise Adjusted Score = Raw Score 

    if diff_words > 5:

        raw_score += 3.6365

    return legacy_round(raw_score, 2) 

In [14]:
def Stemming_word(text):
    ps = PorterStemmer()
    nltk.download('punkt')
    nlp = spacy.blank('en')
    target = ps.stem('cookies')
    words = word_tokenize(text)
    for w in words:
        if ps.stem(w)==target:
            return (1)
            break 

In [15]:
def pharse_test(nlp,sell_data,not_sell_data,share_data,
                not_share_data,purpose,disclosure,security):

    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('is_sell', None, *sell_data)
    matcher.add('is_not_sell', None, *not_sell_data)
    matcher.add('is_share', None, *share_data)
    matcher.add('is_not_share', None, *not_share_data)
    matcher.add('is_purpose', None, *purpose)
    matcher.add('is_disclosure',None,*disclosure)
    matcher.add('is_security',None,*security)

    list_data = []
    doc = nlp(text)
    matches = matcher(doc)
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[match_id]  
        list_data.append(rule_id)
    

    is_sell = int('is_sell' in list_data)
    is_not_sell = int('is_not_sell' in list_data)
    is_share = int('is_share' in list_data)
    is_not_share = int('is_not_share' in list_data)
    is_purpose = int('is_purpose' in list_data)
    is_disclosure = int('is_disclosure' in list_data)
    is_security = int('is_security' in list_data)

    return(is_sell,is_not_sell,is_share,is_not_share,is_purpose,is_disclosure,is_security)
    

In [16]:
df = pd.DataFrame(columns = ['id','is_minor','is_geo-location','is_email','is_vendor',
                             'is_not_sell','is_sell','is_share','is_not_share','is_purpose',
                             'is_disclosure','is_security','is_cookies','gunning_fog',
                             'smog_index','avg_sentence_length','flesch_reading_ease',
                             'dale_chall_readability_score'])

In [17]:
# capture text from the document
nlp = en_core_web_sm.load()
sell_data = [nlp(text) for text in ('whether they sell personal data')]
not_sell_data = [nlp(text) for text in ('whether they do not sell personal data')]
share_data = [nlp(text) for text in ('they share personal data','protect personal Information')]
not_share_data = [nlp(text) for text in ('whether they do not share personal data')]
purpose = [nlp(text) for text in ('other purposes', 'how they use it')]
disclosure = [nlp(text) for text in ('who is collecting their data', 'they disclose the information collected','other entities are collecting information')]
security = [nlp(text) for text in ('security','secure')]
#

for filename in glob.glob('*.txt'):
    with open(filename) as f:
        text = (" ".join(line.strip() for line in f))
    file_name = f.name
    text = text.lower()

    file_id = int(''.join([n for n in file_name if n.isdigit()]))
    dale_chall = dale_chall_readability_score(text)
    flesch = flesch_reading_ease(text)
    smog = smog_index(text)
    gunning = gunning_fog(text)
    avg_sentence_len = avg_sentence_length(text)

    minor_search = int(bool(re.search('minor', text))) 
    geo_search = int(bool(re.search('geo-location', text)))
    email_search = int(bool(re.search(r'[\w\.-]+@[\w\.-]+', text)))
    vendor_search = int(bool(re.search('vendor', text)))
    
    parser_output = pharse_test(nlp,sell_data,not_sell_data,share_data,
                                 not_share_data,purpose,disclosure,security)
    
    cookies_search = Stemming_word(text)
    if cookies_search == None:
        cookies_search = 0

    dict_value = {'id':[file_id], 
                  'is_minor':[minor_search],
                  'is_geo-location':[geo_search],
                  'is_email':[email_search],
                  'is_vendor':[vendor_search],
                  'is_sell':[parser_output[0]],
                  'is_not_sell':[parser_output[1]],
                  'is_share':[parser_output[2]],
                  'is_not_share':[parser_output[3]],
                  'is_purpose':[parser_output[4]],
                  'is_disclosure':[parser_output[5]],
                  'is_security':[parser_output[6]],
                  'is_cookies':[cookies_search],
                  'gunning_fog':[gunning],
                  'smog_index':[smog],
                  'avg_sentence_length':[avg_sentence_len],
                  'flesch_reading_ease':[flesch],
                  'dale_chall_readability_score':[dale_chall]}

    
    df1 = pd.DataFrame.from_dict(dict_value)
    df = pd.concat([df,df1])
df = df.reset_index()
df = df[['id','is_minor','is_geo-location','is_email','is_vendor',
         'is_not_sell','is_sell','is_share','is_not_share',
         'is_cookies','gunning_fog','smog_index','avg_sentence_length',
         'flesch_reading_ease','dale_chall_readability_score']]

file_name = 'GCSI_Thread_1.csv'
df.to_csv(file_name, index = False)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jayap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jayap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jayap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jayap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jayap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jayap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jayap\AppData\Roaming\