In [9]:
from f_sampling import *
import f_drop
import csv
nlp = spacy.load("en_core_web_sm")
import importlib
import pickle
from random import sample
from collections import defaultdict


### For reloading specific modules after change (necessary with notbooks when modifying imported modules)

In [2]:

importlib.reload(f_sampling)

NameError: name 'f_sampling' is not defined

## Drop rate test 1: based on word choice



In [96]:
def drop_rate_term(data, use_sample = 0, filter_dict = 0):
    """
    Given list of data items with text and summary fields and an optional sample number,\n
    returns a dictionary of statistics for every word present in the data, filtered for words\n
    that appear [filter] or more times in the given dataset. 
    """

    if use_sample:
        data = sample(data, use_sample)

    num_items = len(data)
    text_vocab = defaultdict(int)   # all word occurences in text
    summ_vocab = defaultdict(int)   # all word occurences in summaries
    data_freq = defaultdict(int)    # number of items containing word (text or summary)
    doc_freq = defaultdict(int)     # number of documents containing each word
    summ_freq = defaultdict(int)    # number of summaries containing each word
    doc_term_freq = defaultdict(int)    # sum of term frequencies for each document
    summ_term_freq = defaultdict(int)   # sum of term frequencies for each summary
    term_drop = defaultdict(int)    # all occurences of word being dropped in summary
    drop_freq = defaultdict(int)    # number of documents where word is dropped
    term_add = defaultdict(int)     # all occurences of word being added in summary
    add_freq = defaultdict(int)     # number of documents where word is added   

    for item in data:
        text, summ = nlp(item["text"]), nlp(item["summary_text"])
        
        word_appears = defaultdict(lambda: False)

        # first index for counting occurences, second for knowing if word already seen
        curr_summ_vocab = defaultdict(int)
        # first index for counting occurences, second for word taken
        curr_text_vocab = defaultdict(int)

        summ_total_words = 0
        text_total_words = 0

        # current summary vocab
        for word in summ:
            if not word.is_punct:
                w = word.lemma_
                curr_summ_vocab[w] += 1 # increment current summ vocab
                summ_vocab[w] += 1  # increment global summ vocab
                summ_total_words += 1   
        
        # current document vocab
        for word in text:
            if not word.is_punct:
                w = word.lemma_
                curr_text_vocab[w] += 1 # increment current text vocab
                text_vocab[w] += 1  # increment global text vocab
                text_total_words += 1

        # current document term frequencies
        for key in curr_text_vocab.keys():
            curr_key_freq = curr_text_vocab[key]/text_total_words   # calculate term frequency in text
            doc_term_freq[key] += curr_key_freq # increment global term frequency
            doc_freq[key] += 1 # increment number of documents containing word
            # word was dropped in summary
            if key not in curr_summ_vocab:
                term_drop[key] += curr_text_vocab[key]  # increment drop occurences for term with total occurences in text
                drop_freq[key] += 1 # increment number of documents where word dropped
            if not word_appears[key]:
                data_freq[key] += 1
                word_appears[key] = True

        # current summary term frequencies
        for key in curr_summ_vocab.keys():
            curr_key_freq = curr_summ_vocab[key]/summ_total_words   # calculate term frequency in summary
            summ_term_freq[key] += curr_key_freq    # increment global term frequency
            summ_freq[key] += 1 # increment number of summaries containing word
            # word was added in summary
            if key not in curr_text_vocab:
                term_add[key] += curr_summ_vocab[key]  # increment add occurences for this term with total summary occurences
                add_freq[key] += 1 # increment number of documents where word added
            if not word_appears[key]:
                data_freq[key] += 1
                word_appears[key] = True

 
    res = {}
    final_vocab = text_vocab | summ_vocab
    for word in final_vocab.keys():
        res[word] = {}
        res[word]["corpus frequency"] = data_freq[word]
        res[word]["document frequency"] = doc_freq[word]
        res[word]["relative document frequency"] = doc_freq[word]/num_items
        res[word]["summary frequency"] = summ_freq[word]
        res[word]["relative summary frequency"] = summ_freq[word]/num_items
        res[word]["document term frequency"] = doc_term_freq[word]/num_items
        res[word]["summary term frequency"] = summ_term_freq[word]/num_items
        if text_vocab[word]: 
            res[word]["term drop rate"] = term_drop[word]/text_vocab[word]
        else:
            res[word]["term drop rate"] = "n/a"
        if doc_freq[word]:
            res[word]["drop rate frequency"] = drop_freq[word]/doc_freq[word]
        else:
            res[word]["drop rate frequency"] = "n/a"
        if summ_vocab[word]:
            res[word]["term add rate"] = term_add[word]/summ_vocab[word]
        else:
            res[word]["term add rate"] = "n/a"
        if num_items-doc_freq[word]:
            res[word]["add rate frequency"] = add_freq[word]/(num_items-doc_freq[word])
        else:
            res[word]["add rate frequency"] = "n/a"

    if filter_dict:
        res = dict(filter(lambda elem: elem[1]["corpus frequency"] >= filter_dict, res.items()))

    return res


In [84]:
def drop_print_helper(x, orderby):
    y = x[1][orderby]
    if y != "n/a":
        return y
    else:
        return 0

def drop_print(drop_dict, name, method="pickle", orderby="term drop rate", rev=True, filterd=0):

    if filterd:
        drop_dict = dict(filter(lambda elem: elem[1]["corpus frequency"] >= filterd, drop_dict.items()))

    if method == "pickle":
        with open(f"samples/{name}-f{filterd}", "w") as f:
            pickle.dump(drop_dict, f)

    if method == "csv":
        with open(f"samples/{name}-f{filterd}.csv", "w", encoding="utf-8") as f:
            writer = csv.writer(f)
            ordered_kv = sorted(drop_dict.items(), key=lambda x: drop_print_helper(x, orderby), reverse=rev)
            writer.writerow(["term"] + list(ordered_kv[0][1]))
            for key, value in ordered_kv:
                writer.writerow([key] + list(value.values()))
                
    
        


In [94]:
xsumpresummext = fopen("xsum-presummext-all")
xsumpresummext_res = drop_rate_term(xsumpresummext)

In [95]:
drop_print(xsumpresummext_res, f"drop-xsum-presummext-{len(xsumpresummext)}", method="csv", filterd=20)

## Drop rate test 2: based on sentence position (unfinished, first try did not work)

In [None]:
def drop_rate_position(data, num_sentences = 30):
    sentence_pos = defaultdict(lambda: [0 for _ in range(num_sentences)])
    
    for item in data:
        text, summ = nlp(item["text"]), nlp(item["summary_text"])

        for i,sentence in enumerate(text.sents):
            
            for word in sentence:
                if not word.ispunct:
                    w = word.lemma_
    pass

    return sentence_pos
          
k = drop_rate_position(xsumpeg[:5], num_sentences=10)
