In [128]:
# Import Statements
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from glob import glob
import numpy as np
import os
import pandas as pd
import re as re
import string
import nltk
import json

In [678]:
MAX_ANNOTATION_FEATURES = 100
K_SENTENCES = 3

In [58]:
# Function to load privacy policy from JSON
def loadPolicy(fileString):
    with open(fileString, encoding='utf-8') as f:
        return json.load(f)

In [468]:
# Function to organize privacy policy segments into a dictionary
def loadPolicyIntoDict(policy_json):
    section_dict = {}

    for policy_section in policy_json:
        for policy_chunk in policy_json[policy_section]:
            if not policy_chunk['sectionCategory'] in section_dict:
                section_dict[policy_chunk['sectionCategory']] = ""
                             
            section_dict[policy_chunk['sectionCategory']] += (policy_chunk['sectionText'])
    
    return section_dict
        

In [654]:
# Function to clean a string of unwanted chars
def cleanString(string):
    allowed_chars = ["'", ",", ".", ")", "(", " ", "-", ":", ";", "@", "/", ""]
    
    processed_string = ''.join([c for c in string if c.isalpha() or c.isdigit() or c in allowed_chars])
    processed_string = re.sub(r"\-", " ", string)
    processed_string = re.sub(r"/n", " ", processed_string)
    processed_string = re.sub(r"\s+", " ", processed_string)
    processed_string = re.sub(r" \.", ".", processed_string)
    processed_string = re.sub(r" ,", ",", processed_string)
    processed_string = re.sub(r" : ", ": ", processed_string)
    
    return processed_string

In [616]:
# Function to clean policy sections (Sentence-level cleaning)
def cleanPolicySections(section_dict): 
    for (section_label, section_text) in section_dict.items():
        section_dict[section_label] = cleanString(section_text)

In [62]:
# Function to convert policy sections to lower case
def convertPolicySectionsToLower(section_dict):
    for (section_label, section_text) in section_dict.items():
        section_dict[section_label] = section_text.lower()

In [63]:
# Function to tokenize privacy policy sections at the sentence level
def tokenizeSectionSentences(section_dict):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentence_tokenized_sections = {}
    
    for (section_label, section_text) in section_dict.items():
        sentence_tokenized_sections[section_label] = sent_detector.tokenize(section_text)
        
    return sentence_tokenized_sections

In [94]:
# Function to remove stopwords from a string
def removeStopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_string_array = [word for word in word_tokens if not word in stop_words]
    filtered_string  = ' '.join(filtered_string_array)
    return filtered_string

In [95]:
# Function to remove stopwords from each sentence in each section, retaining sentence structure
def removeStopWordsRetainSentence(sentence_tokenized_sections):
    sentence_tokenized_sections_no_stops = {}
    
    for (section_label, section_text) in sentence_tokenized_sections.items():
        sentence_tokenized_sections_no_stops[section_label] = []
        for sentence in section_text:
            sentence_tokenized_sections_no_stops[section_label].append(removeStopwords(sentence))
    
    return sentence_tokenized_sections_no_stops

In [101]:
# function to remove punctuation from a string
def removePunctuationFromString(text):
    return ''.join(text).translate(str.maketrans('', '', string.punctuation))

In [102]:
# Function to remove punctuation from each sentence in each section
def removePunctuationFromSections(sentence_tokenized_sections):
    sentence_tokenized_sections_no_punct = {}
    
    for (section_label, section_text) in sentence_tokenized_sections.items():
        sentence_tokenized_sections_no_punct[section_label] = []
        for sentence in section_text:
            sentence_tokenized_sections_no_punct[section_label].append(removePunctuationFromString(sentence))
            
    return sentence_tokenized_sections_no_punct

In [114]:
# Fuction to stem all words in a string, returning an array of word stems
def stemString(text):
    ps = PorterStemmer()
    word_tokens = word_tokenize(text)
    return [ps.stem(word) for word in word_tokens]

In [418]:
# Function to stem all words in each sentence for every privacy policy section
def stemSections(sections_dict):
    stemmed_sections = {}
    
    for (section_label, section_text) in sections_dict.items():
        stemmed_sections[section_label] = []
        for sentence in section_text:
            stemmed_sections[section_label].append(' '.join(stemString(sentence)))
            
    return stemmed_sections

In [119]:
# Function to perform a full cleaning and stemming of a string, return an array of cleaned and stemmed words
def stringWash(text):
    clean_text = cleanString(text).lower()
    clean_text = removeStopwords(clean_text)
    clean_text = removePunctuationFromString(clean_text)
    return stemString(clean_text)

In [116]:
# Function to create a dictionary of DataFrames for each privacy policy section
def generateSectionDataFrames(sections_dict):
    section_dfs = {}
    
    for (section_label, section_text_array) in sections_dict.items():
        section_dfs[section_label] = pd.DataFrame(section_text_array, columns=['sentences'])
        
    return section_dfs

In [692]:
# Load twitch privacy policy segments and clean data
twitch_policy_json = loadPolicy('../Data/Privacy-Sections/Twitch.js')
twitch_section_dict = loadPolicyIntoDict(twitch_policy_json)
cleanPolicySections(twitch_section_dict)

# Keep a copy of the twitch privacy policy segments with case
twitch_section_dict_with_case = twitch_section_dict.copy()

# Convert twitch privacy policy segments to lower case
convertPolicySectionsToLower(twitch_section_dict)

# Tokenize twitch policy sections (lower case) at the sentence level
sentence_tokenized_sections_twitch = tokenizeSectionSentences(twitch_section_dict)

# Remove stopwords from each sentence in the twitch privacy policy, retaining sentence-level structuring
sentence_tokenized_sections_twitch_no_stops = removeStopWordsRetainSentence(sentence_tokenized_sections_twitch)

# Remove punctuation from each sentence in twitch privacy policy
sentence_tokenized_sections_twitch_no_stops_no_punct = removePunctuationFromSections(sentence_tokenized_sections_twitch_no_stops)

# Stem the text of each sentence in each section - each sentence is now an array of stemmed words
stemmed_sections = stemSections(sentence_tokenized_sections_twitch_no_stops_no_punct)

# Create a dictionary of DataFrames for each stemmed, cleaned section (columns=['sentence index', 'sentences'])
section_dfs = generateSectionDataFrames(stemmed_sections)

In [14]:
# Function to concatenate all OPP-115 annotation files into a single CSV file
def concatenateAnnotationFiles(list_of_files, file_out):  
    for file in list_of_files:
        appended_annotations = pd.concat([pd.read_csv(file, usecols=[5, 6], names=["Section", "Annotations"]) for file in list_of_files])
        appended_annotations.to_csv(file_out, index=False, encoding="utf-8")
        

In [14]:
# Concatenate all OPP-115 annotation files into a single CSV file
working_dir = '..\\Privacy-Policy-Summary\\Text-Summarizer'
csv_file_path = '..\\Privacy-Policy-Summary\\Data\\OPP-115\\annotations'
output_file ='..\\Privacy-Policy-Summary\\Data\\Collected-Annotations\\collectedAnnotations.csv'

os.chdir(csv_file_path)
file_pattern = ".csv"
list_of_files = os.listdir()

annotations_df = concatenateAnnotationFiles(list_of_files, output_file)

os.chdir(working_dir)

In [15]:
# Generate a single DataFrame for the OPP-115 annotation CSV files
input_file ='..\\Privacy-Policy-Summary\\Data\\Collected-Annotations\\collectedAnnotations.csv'

annotations_df = pd.read_csv(input_file, names=["Section", "Annotations"])

In [16]:
# Organize all annotations into their respective sections
def collectSectionAnnotations(annotations_df):
    section_categories = annotations_df["Section"].unique()[2:]
    section_annotations = {}

    for section in section_categories:
        section_annotations[section] = []
        section_annotations[section].append(annotations_df.loc[annotations_df["Section"] == section])
    
    return section_annotations


In [120]:
# Organize annotations 
section_annotations_dict = collectSectionAnnotations(annotations_df)

In [317]:
# Function to filter, clean, and generate DataFrames of annotations for each section. Returns a dictionary of DataFrames (columns=['Sub Section', 'Selected Text'])
def generateSectionAnnotationDataFrames(section_annotations_dict):
    section_annotation_df_dict = {}

    for section in section_annotations_dict:
        section_annotation_df = pd.DataFrame(columns=['Sub Section', 'Selected Text'])
        section_annotation_df_dict[section] = section_annotation_df
        for annotation_blob in section_annotations[section]:
            for blob in annotation_blob['Annotations']:
                json_blob = json.loads(blob)
                for sub_section, text in json_blob.items():
                    if (('selectedText' in text) and (text['selectedText'] != 'null') and (text['selectedText'] != 'Not selected')):
                        section_annotation_df_dict[section] = section_annotation_df_dict[section].append({'Sub Section': ' '.join(stringWash(sub_section)), 'Selected Text': ' '.join(stringWash(text['selectedText']))}, ignore_index=True)
    
    return section_annotation_df_dict           

In [None]:
# Filter, clean, and generate DataFrames of annotations for each section.
section_annotation_df_dict = generateSectionAnnotationDataFrames(section_annotations_dict)  

In [244]:
# Function to get max n-gram length for each section's sub-sections
def getMaxSubsectionNGramLengths(section_annotation_df_dict):
    section_sub_section_max_lens = {}
    
    for section, df in section_annotation_df_dict.items():
        sub_section_values = df['Sub Section'].unique().tolist()
        max_ngram = 0      
        for value in sub_section_values:
            word_tokens = word_tokenize(value)
            if len(word_tokens) > max_ngram:
                max_ngram = len(word_tokens)
        section_sub_section_max_lens[section] = max_ngram
    
    return section_sub_section_max_lens


In [337]:
# Get max n-gram length for each section's sub-sections
section_sub_section_max_lens = getMaxSubsectionNGramLengths(section_annotation_df_dict)

In [367]:
# Function to create CountVectorizers and TfidfTransformers for each section's sub-sections
def generateSubSectionPipelines(section_annotation_df_dict, section_sub_section_max_lens):
    section_sub_section_pipelines_dict = {}
    
    for section, df in section_annotation_df_dict.items():
        section_sub_section_pipelines_dict[section] = []
    
        if (section_sub_section_max_lens[section] == 3):
            section_sub_section_pipelines_dict[section].append(Pipeline([('count', CountVectorizer(ngram_range=(1,1))),('tfidf', TfidfTransformer())]).fit(df['Sub Section']))
            section_sub_section_pipelines_dict[section].append(Pipeline([('count', CountVectorizer(ngram_range=(2,2))),('tfidf', TfidfTransformer())]).fit(df['Sub Section']))
            section_sub_section_pipelines_dict[section].append(Pipeline([('count', CountVectorizer(ngram_range=(3,3))),('tfidf', TfidfTransformer())]).fit(df['Sub Section']))
        else:
            section_sub_section_pipelines_dict[section].append(Pipeline([('count', CountVectorizer(ngram_range=(1,1))),('tfidf', TfidfTransformer())]).fit(df['Sub Section']))
            section_sub_section_pipelines_dict[section].append(Pipeline([('count', CountVectorizer(ngram_range=(2,2))),('tfidf', TfidfTransformer())]).fit(df['Sub Section']))
  
    return section_sub_section_pipelines_dict


In [681]:
# Create CountVectorizers and TfidfTransformers for each section's sub-sections
section_sub_section_pipelines_dict = generateSubSectionPipelines(section_annotation_df_dict, section_sub_section_max_lens)

In [313]:
# Function to generate tf-idf scores for every sub-section label in each section 
def generateTfidfSubSectionScores(section_sub_section_pipelines_dict, section_sub_section_max_lens, section_annotation_df_dict):
    section_subsection_tfidf = {}
    
    for section in section_sub_section_pipelines_dict:
        section_subsection_tfidf[section] = {}
        
        for sub_section in section_annotation_df_dict[section]['Sub Section'].unique().tolist():
            sub_section_length = len(word_tokenize(sub_section))
        
            if (sub_section_length == 3):
                for idx, feature_name in enumerate(section_sub_section_pipelines_dict[section][2]['count'].get_feature_names()):
                    if ((sub_section == feature_name) and not(sub_section in section_subsection_tfidf[section])):
                        section_subsection_tfidf[section][sub_section] = section_sub_section_pipelines_dict[section][2]['tfidf'].idf_[idx]
            elif ((sub_section_length) == 2 and not(sub_section in section_subsection_tfidf[section])):
                for idx, feature_name in enumerate(section_sub_section_pipelines_dict[section][1]['count'].get_feature_names()):
                    if ((sub_section == feature_name) and not(sub_section in section_subsection_tfidf[section])):
                        section_subsection_tfidf[section][sub_section] = section_sub_section_pipelines_dict[section][1]['tfidf'].idf_[idx]
            else:
                for idx, feature_name in enumerate(section_sub_section_pipelines_dict[section][0]['count'].get_feature_names()):
                    if ((sub_section == feature_name) and not(sub_section in section_subsection_tfidf[section])):
                        section_subsection_tfidf[section][sub_section] = section_sub_section_pipelines_dict[section][0]['tfidf'].idf_[idx]
    
    return section_subsection_tfidf


In [None]:
# Generate tf-idf scores for every sub-section label in each section
section_subsection_tfidf = generateTfidfSubSectionScores(section_sub_section_pipelines_dict, section_sub_section_max_lens, section_annotation_df_dict)

In [330]:
# Function to noramalize TF-IDF sub-section values to be 0.0 - 1.0
def noramlizeSubSectionTfidf(section_subsection_tfidf):
    normalized_section_subsection_tfidf = {}
    
    for section in section_subsection_tfidf:
        section_tfidf = []
        normalized_section_subsection_tfidf[section] = {}
        
        for sub_section in section_subsection_tfidf[section]:
            section_tfidf.append(section_subsection_tfidf[section][sub_section])
        
        if len(section_subsection_tfidf[section]) == 1:
            normalized_section_tfidf = []
            normalized_section_tfidf.append(0.0)
        else:
            normalized_section_tfidf = [float(idx) / max(section_tfidf) for idx in section_tfidf]

        for idx, sub_section in enumerate(section_subsection_tfidf[section]):
            normalized_section_subsection_tfidf[section][sub_section] = normalized_section_tfidf[idx]
            
    return normalized_section_subsection_tfidf


In [375]:
# Noramalize TF-IDF sub-section values to be 0.0 - 1.0
normalized_section_subsection_tfidf = noramlizeSubSectionTfidf(section_subsection_tfidf)

In [403]:
# Function to create CountVectorizers and TfidfTransformers for each annotation section's selected text
def generateSectionTextPipelines(section_annotation_df_dict):
    section_selected_text_pipelines_dict = {}
    
    for section, df in section_annotation_df_dict.items():
        section_selected_text_pipelines_dict[section] = Pipeline([('count', CountVectorizer(ngram_range=(1,3), max_features=MAX_ANNOTATION_FEATURES)),('tfidf', TfidfTransformer())]).fit(df['Selected Text'])
    
    return section_selected_text_pipelines_dict


In [521]:
# Create CountVectorizers and TfidfTransformers for each section's selected text
section_texts_pipelines_dict = generateSectionTextPipelines(section_annotation_df_dict)

In [363]:
# Function to generate stem lists for each section's sub-section
def generateStemListsForSubSections(section_annotation_df_dict):
    section_subsection_stem_list = {}
    
    for section in section_annotation_df_dict:
        section_subsection_stem_list[section] = {}

        for sub_section in section_annotation_df_dict[section]['Sub Section'].unique():
            section_subsection_stem_list[section][sub_section] = set()
            
        for index, row in section_annotation_df_dict[section].iterrows():
            stem_tokens = word_tokenize(row['Selected Text'])
            sub_section = row['Sub Section']
            
            for stem in stem_tokens:
                section_subsection_stem_list[section][sub_section].add(stem)
                    
    return section_subsection_stem_list

In [364]:
# Generate stem lists for each section's sub-section
section_subsection_stem_list = generateStemListsForSubSections(section_annotation_df_dict)

In [423]:
# Function to modify section text tfidf scores based on the normalized sub-section tf-idf scores
def computeFinalSectionTextTfidf(section_texts_pipelines_dict, normalized_section_subsection_tfidf, section_subsection_stem_list):
    for section in normalized_section_subsection_tfidf:  
        for idx, term in enumerate(section_texts_pipelines_dict[section]['count'].get_feature_names()):
            for sub_section in section_subsection_stem_list[section]:
                if term in section_subsection_stem_list[section][sub_section]:
                    section_texts_pipelines_dict[section]['tfidf'].idf_[idx] += normalized_section_subsection_tfidf[section][sub_section]
                    

In [426]:
computeFinalSectionTextTfidf(section_texts_pipelines_dict, normalized_section_subsection_tfidf, section_subsection_stem_list)

In [517]:
# Function to generate a dictionary for each feature name and its corresponding (global) tfidf score for each annotation section
def generateFeatureTfidfDict(section_texts_pipelines_dict):
    annotation_section_feature_tfidf_dict = {}
    
    for section in section_texts_pipelines_dict:
        annotation_section_feature_tfidf_dict[section] = {}
        for idx, feature_name in enumerate(section_texts_pipelines_dict[section]['count'].get_feature_names()):
            annotation_section_feature_tfidf_dict[section][feature_name] = section_texts_pipelines_dict[section]['tfidf'].idf_[idx]
            
    return annotation_section_feature_tfidf_dict

In [520]:
# Generate a dictionary for each feature name and its corresponding (global) tfidf score for each annotation section
annotation_section_feature_tfidf_dict = generateFeatureTfidfDict(section_texts_pipelines_dict)

In [495]:
# Function to generate a document-term matrix for each privacy policy section, using the topics extracted from the OPP-115 annotations
def generatePolicySectionDocumentTermMatrices(section_dfs, section_texts_pipelines_dict):
    policy_sections__text_pipelines_dict = {}
    
    for section in section_texts_pipelines_dict:
        if section in section_dfs:
            pipe = Pipeline([('count', CountVectorizer(vocabulary=section_texts_pipelines_dict[section]['count'].get_feature_names())), ('tfidf', TfidfTransformer())]).fit(section_dfs[section]['sentences'])
            policy_sections__text_pipelines_dict[section] = pipe.transform(section_dfs[section]['sentences']).toarray()
        
    return policy_sections__text_pipelines_dict

In [680]:
# Generate a document-term matrix for each privacy policy section using the topics extracted from the OPP-115 annotations
policy_sections__text_pipelines_dict = generatePolicySectionDocumentTermMatrices(section_dfs, section_texts_pipelines_dict)

In [525]:
# Function to weight policy section text tfidf scores based on tfidf scores of topics calculated from the section annotations
def weightPolicySectionTfidfScores(policy_sections__text_pipelines_dict, annotation_section_feature_tfidf_dict):
    for section in policy_sections__text_pipelines_dict:
        for sentence_tfidfs in policy_sections__text_pipelines_dict[section]:
            for idx, topic in enumerate(section_texts_pipelines_dict[section]['count'].get_feature_names()):
                if sentence_tfidfs[idx] > 0.0:
                    sentence_tfidfs[idx] += annotation_section_feature_tfidf_dict[section][topic]

In [683]:
# Weight policy section text tfidf scores based on tfidf scores of topics calculated from the section annotations
weightPolicySectionTfidfScores(policy_sections__text_pipelines_dict, annotation_section_feature_tfidf_dict)

In [684]:
# Function to get the row indices of the top k sentences in each section
def getTopKSectionRowIndices(policy_sections__text_pipelines_dict):
    section_sentence_scores = {}
    section_sentence_scores_sorted = {}
    
    for section in policy_sections__text_pipelines_dict:
        section_sentence_scores[section] = {}
        section_sentence_scores_sorted[section] = []
        
        for idx, sentence_tfidfs_array in enumerate(policy_sections__text_pipelines_dict[section]):
            section_sentence_scores[section][idx] = np.sum(sentence_tfidfs_array, axis=0)
        section_sentence_scores_sorted[section] = sorted(section_sentence_scores[section].items(), key=lambda x: x[1], reverse=True)
        
        if (len(policy_sections__text_pipelines_dict[section]) >= K_SENTENCES):
            section_sentence_scores_sorted[section] = section_sentence_scores_sorted[section][:K_SENTENCES]
        else:
            section_sentence_scores_sorted[section] = section_sentence_scores_sorted[section][:len(policy_sections__text_pipelines_dict[section])]          
                    
    return section_sentence_scores_sorted

In [685]:
# Get the row indices of the top k sentences in each section
section_sentence_scores_sorted = getTopKSectionRowIndices(policy_sections__text_pipelines_dict)

In [686]:
# Prepare a sections dictionary with tokenized sentences that are clean and ready to be extracted
twitch_section_dict_with_case_sentence_tokenized = tokenizeSectionSentences(twitch_section_dict_with_case)
extractable_section_dfs = generateSectionDataFrames(twitch_section_dict_with_case_sentence_tokenized)

In [690]:
# Function to extract and combine the top k sentences in each section
def extractAndCombineTopKSectionSentences(section_sentence_scores_sorted, cleaned_section_dfs):
    section_summaries_dict = {}
    
    for section in section_sentence_scores_sorted:
        section_summaries_dict[section] = "";
        for sentence_idx_score_tuple in section_sentence_scores_sorted[section]:
            section_summaries_dict[section] += ' ' + cleaned_section_dfs[section].iloc[sentence_idx_score_tuple[0], 0]
            
    return section_summaries_dict

In [707]:
# Extract and combine the top k sentences in each section. Print the results.
section_summaries_dict = extractAndCombineTopKSectionSentences(section_sentence_scores_sorted, extractable_section_dfs)

for section in section_summaries_dict:
    print(section + ':')
    print(section_summaries_dict[section][1:])
    print("\n\n")

First Party Collection/Use:
Examples of how Twitch uses automatically collected information include to: (a) automatically update the Twitch application on your system; (b) remember your information so that you will not have to re enter it during your visit or the next time you access the Twitch Services; (c) provide customized advertisements, content, and information; (d) monitor the effectiveness of marketing campaigns; (e) monitor and store aggregate site usage metrics such as total number of visitors and pages accessed; and (f) track your entries, submissions, and status in any promotions or other activities.  Personal Information Twitch Collects User provided Information: You may provide a variety of information about yourself to us, such as your name, email address, postal mailing address, telephone number, credit card number, and billing information when you register for Twitch Services; upload, purchase, view, or download certain content or products from the Twitch Services; ent

In [712]:
# Print out the original privacy policy segments (Headers still need to be filtered out)
for section in twitch_section_dict_with_case_sentence_tokenized:
    section_text = ""
    
    for sentence in twitch_section_dict_with_case_sentence_tokenized[section]:
        section_text += sentence + ' '
    
    print(section + ':')
    print(section_text[1:])
    print("\n\n")

First Party Collection/Use:
Personal Information Twitch Collects User provided Information: You may provide a variety of information about yourself to us, such as your name, email address, postal mailing address, telephone number, credit card number, and billing information when you register for Twitch Services; upload, purchase, view, or download certain content or products from the Twitch Services; enter contests or sweepstakes; or otherwise use the features and functionality of the Twitch Services. Automatically Collected Information: When you access the Twitch Services or open one of our emails, we may automatically record and store certain information about your system by using cookies and other types of technologies. Cookies are small text files containing a string of alphanumeric characters that are sent to your browser. For information about what cookies are, how they work, how Twitch uses them, and how to remove them, please see our Cookie Policy Examples of such information w