In [61]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
import logging
import os
import shutil
import json
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from collections import Counter
import spacy
import pandas as pd
from spellchecker import SpellChecker
import re


In [62]:
model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"

In [63]:
summaries_train = pd.read_csv('data/summaries_train.csv')
prompts_train = pd.read_csv('data/prompts_train.csv')

summaries_train = summaries_train.merge(prompts_train, on='prompt_id')

In [64]:
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [65]:
spacy.load("en_core_web_sm",)

<spacy.lang.en.English at 0x10c31bd90>

In [66]:
speller = SpellChecker()

In [67]:
random_nr = np.random.randint(0, len(summaries_train))
example1 = summaries_train.loc[random_nr]
example1

student_id                                              dc82762d40b5
prompt_id                                                     ebad26
text               The jungle is a great book which explains diff...
content                                                    -0.627647
wording                                                    -0.125597
prompt_question    Summarize the various ways the factory would u...
prompt_title                                 Excerpt from The Jungle
prompt_text        With one member trimming beef in a cannery, an...
Name: 2816, dtype: object

In [68]:
summary_text = example1["text"]

In [69]:
class Preprocessor:
    def __init__(self, 
                model_name: str,
                ) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.STOP_WORDS = set(stopwords.words('english'))
        
        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = SpellChecker() #Speller(lang='en')
        
    def count_text_length(self, df: pd.DataFrame, col:str) -> pd.Series:
        """ text length """
        tokenizer=self.tokenizer
        return df[col].progress_apply(lambda x: len(tokenizer.encode(x)))

    def word_overlap_count(self, row):
        """ intersection(prompt_text, text) """        
        def check_is_stop_word(word):
            return word in self.STOP_WORDS
        
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = list(filter(check_is_stop_word, prompt_words))
            summary_words = list(filter(check_is_stop_word, summary_words))
        return len(set(prompt_words).intersection(set(summary_words)))
            
    def ngrams(self, token, n):
        # Use the zip function to help us generate n-grams
        # Concatentate the tokens into ngrams and return
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int):
        # Tokenize the original text and summary into words
        original_tokens = row['prompt_tokens']
        summary_tokens = row['summary_tokens']

        # Generate n-grams for the original text and summary
        original_ngrams = set(self.ngrams(original_tokens, n))
        summary_ngrams = set(self.ngrams(summary_tokens, n))

        # Calculate the number of common n-grams
        common_ngrams = original_ngrams.intersection(summary_ngrams)

        # # Optionally, you can get the frequency of common n-grams for a more nuanced analysis
        # original_ngram_freq = Counter(ngrams(original_words, n))
        # summary_ngram_freq = Counter(ngrams(summary_words, n))
        # common_ngram_freq = {ngram: min(original_ngram_freq[ngram], summary_ngram_freq[ngram]) for ngram in common_ngrams}

        return len(common_ngrams)
    
    def ner_overlap_count(self, row, mode:str):
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)
        
        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
        if mode == "train":
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}

    
    def quotes_count(self, row):
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        if len(quotes_from_summary)>0:
            return [quote in text for quote in quotes_from_summary].count(True)
        else:
            return 0

    def spelling(self, text):
        
        wordlist=text.split()
        amount_miss = len(list(self.speller.unknown(wordlist)))

        return amount_miss
    
    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        
        # before merge preprocess
        prompts["prompt_length"] = prompts["prompt_text"].apply(
            lambda x: len(self.tokenizer.encode(x))
        )
        prompts["prompt_tokens"] = prompts["prompt_text"].apply(
            lambda x: self.tokenizer.convert_ids_to_tokens(
                self.tokenizer.encode(x), 
                skip_special_tokens=True
            )
        )

        summaries["summary_length"] = summaries["text"].apply(
            lambda x: len(self.tokenizer.encode(x))
        )
        summaries["summary_tokens"] = summaries["text"].apply(
            lambda x: self.tokenizer.convert_ids_to_tokens(
                self.tokenizer.encode(x), 
                skip_special_tokens=True
            )

        )
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)

        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")

        # after merge preprocess
        input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']
        
        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1 
        )
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        
        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
preprocessor = Preprocessor(model_name=model_name)



In [70]:

speller.unknown(example1['text'].split())

# About the spelling correction, we clearly need to separate the words from the punctuation.
# We can use the nltk tokenizer for this.


{'happened.', 'meats.', 'sauage.'}

In [71]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Speller

In [72]:
wordlist = summary_text.split()

In [73]:
import string

def remove_punctuation(word):
    return ''.join(char for char in word if char not in string.punctuation)
#remove punctuation
print(f"size before: {len(wordlist)}")
wordlist = [remove_punctuation(word) for word in wordlist]
# wordlist = [word for word in wordlist if not word.isalpha()]
print(f"size after: {len(wordlist)}")
wordlist

size before: 39
size after: 39


['The',
 'jungle',
 'is',
 'a',
 'great',
 'book',
 'which',
 'explains',
 'different',
 'processes',
 'by',
 'which',
 'violations',
 'of',
 'the',
 'meat',
 'packing',
 'happened',
 'Meat',
 'factory',
 'workers',
 'prep',
 'rotten',
 'meat',
 'and',
 'with',
 'better',
 'good',
 'meats',
 'A',
 'lot',
 'of',
 'the',
 'meats',
 'are',
 'used',
 'to',
 'make',
 'sauage']

In [74]:
speller.unknown(wordlist)

{'sauage'}

## Ngrams

In [75]:
example1["summary_tokens"] = (tokenizer.convert_ids_to_tokens(tokenizer.encode(example1["text"]), skip_special_tokens=True))
example1["prompt_tokens"] = (tokenizer.convert_ids_to_tokens(tokenizer.encode(example1["prompt_text"]), skip_special_tokens=True))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example1["summary_tokens"] = (tokenizer.convert_ids_to_tokens(tokenizer.encode(example1["text"]), skip_special_tokens=True))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example1["summary_tokens"] = (tokenizer.convert_ids_to_tokens(tokenizer.encode(example1["text"]), skip_special_tokens=True))
Token indices sequence length is longer than the specified maximum sequence length for this model (1137 > 512). Running this sequence through the model will result in indexing errors
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

In [79]:
example1

student_id                                              dc82762d40b5
prompt_id                                                     ebad26
text               The jungle is a great book which explains diff...
content                                                    -0.627647
wording                                                    -0.125597
prompt_question    Summarize the various ways the factory would u...
prompt_title                                 Excerpt from The Jungle
prompt_text        With one member trimming beef in a cannery, an...
summary_tokens     [▁The, ▁jungle, ▁is, ▁a, ▁great, ▁book, ▁which...
prompt_tokens      [▁With, ▁one, ▁member, ▁trimming, ▁beef, ▁in, ...
Name: 2816, dtype: object

In [77]:
def ngrams(token, n):
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[token[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

In [88]:
def ngram_co_occurrence(row, n):
    summary_ngrams = set(ngrams(example1["summary_tokens"], n))
    prompt_ngrams = set(ngrams(example1["prompt_tokens"], n))
    common_ngrams = summary_ngrams.intersection(prompt_ngrams)
    return len(common_ngrams), common_ngrams

In [90]:
for i in range(1, 4):
    print(f"n={i}",ngram_co_occurrence(example1, i))

n=1 (17, {'▁to', '▁used', '▁which', '▁with', '▁The', '.', '▁by', '▁meat', '▁a', '▁the', '▁and', '▁workers', '▁is', '▁of', '▁make', '▁factory', '▁great'})
n=2 (5, {'▁the ▁meat', '▁to ▁make', '▁by ▁which', '▁meat ▁and', '▁of ▁the'})
n=3 (0, set())
