# Linguistic Analysis of the Fine-tuning Dataset

# Preparations

In [None]:
# installations and downloads
!pip install datasets matplotlib transformers spacy numpy pandas eng-spacysentiment
!python -m spacy download en_core_web_sm



Traceback (most recent call last):
  File "c:\Users\line\.conda\envs\myenv1\lib\runpy.py", line 187, in _run_module_as_main
    mod_name, mod_spec, code = _get_module_details(mod_name, _Error)
  File "c:\Users\line\.conda\envs\myenv1\lib\runpy.py", line 146, in _get_module_details
    return _get_module_details(pkg_main_name, error)
  File "c:\Users\line\.conda\envs\myenv1\lib\runpy.py", line 110, in _get_module_details
    __import__(pkg_name)
  File "c:\Users\line\.conda\envs\myenv1\lib\site-packages\spacy\__init__.py", line 15, in <module>
    from .cli.info import info  # noqa: F401
  File "c:\Users\line\.conda\envs\myenv1\lib\site-packages\spacy\cli\__init__.py", line 3, in <module>
    from ._util import app, setup_cli  # noqa: F401
  File "c:\Users\line\.conda\envs\myenv1\lib\site-packages\spacy\cli\_util.py", line 9, in <module>
    import typer
  File "c:\Users\line\.conda\envs\myenv1\lib\site-packages\typer\__init__.py", line 12, in <module>
    from click.termui import get_t

In [130]:
# imports

from datasets import load_dataset
import matplotlib.pyplot as plt
import spacy
import eng_spacysentiment
import numpy as np
import pandas as pd
from collections import Counter
from ast import literal_eval
from scipy.stats import wilcoxon

# Load Dataset

In [2]:
# Load the TL;DR summarisation dataset; the trailing expression displays its splits
dataset = load_dataset(path="CarperAI/openai_summarize_tldr")
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'label'],
        num_rows: 116722
    })
    test: Dataset({
        features: ['prompt', 'label'],
        num_rows: 6553
    })
    valid: Dataset({
        features: ['prompt', 'label'],
        num_rows: 6447
    })
})

In [3]:
# Show one training example: the 'prompt' (reddit category, title, post, backstory, ...)
# together with its 'label' (the summary)
first_train_example = dataset['train'][0]
first_train_example

{'prompt': "SUBREDDIT: r/relationships\nTITLE: I (f/22) have to figure out if I want to still know these girls or not and would hate to sound insulting\nPOST: Not sure if this belongs here but it's worth a try. \n\nBackstory:\nWhen I (f/22) went through my first real breakup 2 years ago because he needed space after a year of dating roand  it effected me more than I thought. It was a horrible time in my life due to living with my mother and finally having the chance to cut her out of my life. I can admit because of it was an emotional wreck and this guy was stable and didn't know how to deal with me. We ended by him avoiding for a month or so after going to a festival with my friends. When I think back I wish he just ended. So after he ended it added my depression I suffered but my friends helped me through it and I got rid of everything from him along with cutting contact. \n\nNow: Its been almost 3 years now and I've gotten better after counselling and mild anti depressants. My mothe

In [4]:
# Convert the train and test splits to DataFrames (columns: 'prompt', 'label').
# Dataset.to_pandas() converts the split directly instead of having pandas
# iterate over the dataset row by row.
train = dataset['train'].to_pandas()
test = dataset['test'].to_pandas()

# only the prompts are analysed below
prompts_train = train['prompt']
prompts_test = test['prompt']

# Load Dataset Cleaner

In [None]:
# Spacy cleaner for datasets
# adapted from https://gist.github.com/omri374/ec1c243a5a94a657dae40078d47977b6

import re
from typing import List

import spacy
from spacy.tokens import Doc
from tqdm import tqdm

# Caption tokens of the prompt template that SpacyPreprocessor drops during cleaning
blacklist = [
    "SUBREDDIT",
    "TITLE",
    "POST",
    "TL;DR",
    "r",  # presumably the "r" of "r/<category>", which ends up as a separate token
]

class SpacyPreprocessor:
    """
    Cleans text with a spaCy pipeline.

    Depending on the flags given at construction time, tokens are filtered by
    PoS tag, number-likeness and stopword status; punctuation, whitespace,
    quotes and brackets are always removed. The remaining tokens (or their
    lemmata) are joined with single spaces, stripped of non-ASCII characters
    and lower-cased.

    Relies on the module-level `blacklist` of caption tokens, which is applied
    together with the stopword filter.
    """

    def __init__(
        self,
        spacy_model=None,
        remove_numbers=False,
        remove_special=False,
        pos_to_remove=None,
        remove_stopwords=True,
        lemmatize=False,
    ):
        """
        Preprocesses text using spaCy
        :param spacy_model: Loaded spaCy pipeline; "en_core_web_sm" is loaded when omitted
        :param remove_numbers: Whether to remove numbers from text
        :param remove_stopwords: Whether to remove stopwords (and the blacklisted caption tokens) from text
        :param remove_special: Whether to remove special characters (including numbers)
        :param pos_to_remove: list of PoS tags to remove
        :param lemmatize:  Whether to apply lemmatization
        """

        self._remove_numbers = remove_numbers
        self._pos_to_remove = pos_to_remove
        self._remove_stopwords = remove_stopwords
        self._remove_special = remove_special
        self._lemmatize = lemmatize

        if not spacy_model:
            self.model = spacy.load("en_core_web_sm")
        else:
            self.model = spacy_model

    @staticmethod
    def download_spacy_model(model="en_core_web_sm"):
        """
        Downloads a spaCy model via spaCy's CLI helper
        :param model: Name of the spaCy model to download
        """
        print(f"Downloading spaCy model {model}")
        spacy.cli.download(model)
        print("Finished downloading model")

    @staticmethod
    def load_model(model="en_core_web_sm"):
        """
        Loads a spaCy model with the "ner" and "parser" components disabled
        :param model: Name of the spaCy model to load
        :return: the loaded spaCy pipeline
        """
        return spacy.load(model, disable=["ner", "parser"])

    def tokenize(self, text) -> List[str]:
        """
        Tokenize text using a spaCy pipeline
        :param text: Text to tokenize
        :return: list of str
        """
        doc = self.model(text)
        return [token.text for token in doc]

    def preprocess_text(self, text) -> str:
        """
        Runs a spaCy pipeline and removes unwanted parts from text
        :param text: text string to clean
        :return: str, clean text
        """
        doc = self.model(text)
        return self.__clean(doc)

    def preprocess_text_list(self, texts: List[str]) -> List[str]:
        """
        Runs a spaCy pipeline and removes unwanted parts from a list of text.
        Leverages spaCy's `pipe` for faster batch processing.
        :param texts: List of texts to clean
        :return: List of clean texts
        """
        # `texts` used to be declared as `texts=List[str]`, i.e. the type was a
        # default value rather than an annotation; it is a required argument now.
        return [self.__clean(doc) for doc in tqdm(self.model.pipe(texts))]

    def __clean(self, doc: "Doc") -> str:
        """
        Applies the configured filters to the tokens of a processed document
        :param doc: document (iterable of spaCy tokens) to clean
        :return: str, the remaining tokens joined by spaces, ASCII-only and lower-cased
        """
        tokens = list(doc)

        # POS Tags removal
        if self._pos_to_remove:
            tokens = [
                token for token in tokens if token.pos_ not in self._pos_to_remove
            ]

        # Remove Numbers
        if self._remove_numbers:
            tokens = [
                token for token in tokens if not (token.like_num or token.is_currency)
            ]

        # Remove Stopwords
        if self._remove_stopwords:
            tokens = [token for token in tokens if not token.is_stop]
            # Own Adaptation: remove captions (TITLE, POST, ...)
            tokens = [token for token in tokens if str(token) not in blacklist]
            # TODO: the reddit category itself is not removed, because
            # 'r / category' is recognized as separate tokens

        # remove unwanted tokens like punctuation
        tokens = [
            token
            for token in tokens
            if not (
                token.is_punct or token.is_space or token.is_quote or token.is_bracket
            )
        ]

        # Remove empty tokens
        tokens = [token for token in tokens if token.text.strip() != ""]

        # Lemmatize
        if self._lemmatize:
            text = " ".join([token.lemma_ for token in tokens])
        else:
            text = " ".join([token.text for token in tokens])

        if self._remove_special:
            # Remove non alphabetic characters (apostrophes are kept)
            text = re.sub(r"[^a-zA-Z\']", " ", text)

        # remove non-ASCII characters
        text = re.sub(r"[^\x00-\x7F]+", "", text)

        # lower case
        text = text.lower()

        return text

In [None]:
# Build the shared cleaner: lemmatizing, numbers kept, on a pipeline loaded via load_model()
spacy_model = SpacyPreprocessor.load_model()
preprocessor = SpacyPreprocessor(
    spacy_model=spacy_model,
    remove_numbers=False,
    lemmatize=True,
)

# Language Analyser

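**Input:** the prompts of one dataset split (train or test).

**Output:** one large DataFrame with one row per sentence, containing the prompt_id, the sentence text, its tokens, lemmata and POS tags, the sentence length, and the sentiment scores (positive, negative, neutral).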

In [None]:
# Function for Language Analysis

# spaCy pipelines used by the analysis function below
# nlp_sentim: sentiment extension, kept in its own variable (its docs expose the scores in .cats)
nlp_sentim = eng_spacysentiment.load()

# nlp: English pipeline
nlp = spacy.load("en_core_web_sm")


# sentence-level linguistic analysis of all prompts in a data set (train, test)
def language_analyser(prompts):
  """
  Analyse every sentence of every prompt and return one row per sentence.

  Uses the module-level `nlp` (sentence segmentation, tokens, lemmata, POS),
  `nlp_sentim` (sentiment categories) and `preprocessor` (sentence cleaning).

  :param prompts: sized iterable of prompt strings (e.g. a pandas Series)
  :return: DataFrame with the columns prompt_id, sentence, tokens, lemmata,
           pos_tags, sent_len, sentiment_pos, sentiment_neg, sentiment_neutr;
           prompt_id is the position of the prompt within `prompts`
  """
  columns = ['prompt_id', 'sentence', 'tokens', 'lemmata', 'pos_tags', 'sent_len', 'sentiment_pos', 'sentiment_neg', 'sentiment_neutr']

  # Collect plain dicts and build the DataFrame once at the end; growing a
  # DataFrame with pd.concat inside the loop is quadratic in the number of prompts.
  records = []

  for num_prompt, p in enumerate(prompts):

    if num_prompt % 100 == 0:
      print("Analyzed prompts:", num_prompt, "of", len(prompts))

    # prompt in spacy doc
    doc = nlp(p)

    # segment sentences in prompt
    for sent in doc.sents:

      # sentiment analysis with spacy extension on each sentence in prompt
      doc_sentim = nlp_sentim(sent.text)

      # clean sentence from stopwords, punctuation, ... and parse the cleaned text again
      # NOTE(review): tokens, lemmata and POS tags are therefore taken from the
      # *cleaned* sentence (whatever `preprocessor` returns), not from the raw one
      cleaned_sent_doc = nlp(preprocessor.preprocess_text(sent.text))

      records.append({
        'prompt_id': num_prompt,
        'sentence': sent.text,
        'tokens': [token.text for token in cleaned_sent_doc],
        'lemmata': [token.lemma_ for token in cleaned_sent_doc],
        'pos_tags': [token.pos_ for token in cleaned_sent_doc],
        # span end minus span start of the uncleaned sentence
        'sent_len': sent.end - sent.start,
        'sentiment_pos': doc_sentim.cats['positive'],
        'sentiment_neg': doc_sentim.cats['negative'],
        'sentiment_neutr': doc_sentim.cats['neutral'],
      })

  print("Analysis completed")

  # passing `columns` keeps the column layout even when no sentence was found
  return pd.DataFrame(records, columns=columns)

In [None]:
# run the sentence-level analysis on the train prompts first, then on the test prompts
analysis_train, analysis_test = (
    language_analyser(split_prompts)
    for split_prompts in (prompts_train, prompts_test)
)

Analyzed prompts: 0 of 116722


KeyboardInterrupt: 

In [None]:
# write both analyses to CSV (without the index); the files are used to choose
# matching and mismatching test prompts (02_choice_match_mismatch.csv)
for analysis_df, csv_name in (
    (analysis_train, 'ling_analysis_train.csv'),
    (analysis_test, 'ling_analysis_test.csv'),
):
    analysis_df.to_csv(csv_name, index=False)