# New Section

In [1]:
# For changing the Python interpreter on Google Colab only - doesn't work anyway
# !update-alternatives --install /usr/bin/python python /usr/bin/python3.9 1
# !update-alternatives --list python
# !update-alternatives --help

In [10]:
# For lambda cloud
# !pip install pandas numpy  --upgrade
# !pip install category-encoders transformers datasets nlpaug ujson wordcloud tensorflow-addons
# !pip install --upgrade numpy pandas tensorflow

In [None]:
import os
import pandas as pd
import numpy as np
from category_encoders import BinaryEncoder, OneHotEncoder
from joblib import dump, load
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, load_metric
from ultils import load_data, reduce_subclasses, keep_top_k_classes, random_undersample, augment, \
    inject_book_reviews, load_annotated_book_reviews

from typing import Tuple, FrozenSet, List

# from datasets import Dataset
import ujson
from tqdm import tqdm
# from wordcloud import WordCloud
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer

import nlpaug.flow as naf
import nlpaug.augmenter.word as naw

# from textaugment import Word2vec
# import gensim.downloader as downloader
import datetime
print(f"Import complete {datetime.datetime.now()}")

In [None]:
def reduce_subclasses(annotated_texts: pd.DataFrame, verbose: int = 0) -> pd.DataFrame:
    """Reduce the number of classes by merging homogenous (non-conflicting) subcategories.

    Labels of the form ``<prefix>.<suffix>`` whose prefix is one of the homogenous main
    categories are replaced by the prefix alone; every other label is kept as is.

    The input frame is left untouched: a new frame carrying the merged labels is returned.

    :param annotated_texts: frame with a string ``label`` column
    :param verbose: set to 1 to print a summary of the merge
    :return: new frame with the merged ``label`` column
    """
    labels = annotated_texts['label']
    n_classes = len(labels.unique())
    data_size = len(annotated_texts)
    # codes whose subcategories are so similar that they can be disregarded
    homogenous_categories = ['601', '602', '606', '607', '201', '416', '608', '103']

    # Prefix is NaN for labels without a "<digits>.<char>" shape, so those never match below
    category_prefixes = labels.str.extract(r'(\d+)\..', expand=False)
    merged_labels = np.where(category_prefixes.isin(homogenous_categories), category_prefixes, labels)
    # assign() builds a new frame instead of writing into the caller's object
    reduced_texts = annotated_texts.assign(label=merged_labels)

    if verbose > 0:
        print(f"Merged subcategories for {homogenous_categories}\n"
              f"Number of classes: {n_classes} -> {len(reduced_texts['label'].unique())}\n"
              f"Data size: {data_size} -> {len(reduced_texts)}")

    return reduced_texts


def keep_top_k_classes(annotated_texts: pd.DataFrame, k: int, plus: List[str] = None, other: str = None,
                       verbose: int = 0) -> pd.DataFrame:
    """
    Keep only top k most frequent classes plus specified classes, the rest are changed to a specific class.

    The input frame is left untouched: a new frame carrying the relabelled column is returned.

    :param annotated_texts: frame with a ``label`` column
    :param k: top k classes to keep (classes listed in ``plus`` do not count towards k)
    :param plus: also keep these classes
    :param other: change the rest into this class
    :param verbose: set to 1 to show more info
    :return: new frame whose ``label`` column only holds the kept classes and ``other``
    """
    labels = annotated_texts['label']
    plus = [] if plus is None else list(plus)

    # value_counts() is sorted by frequency, so the first k non-"plus" entries are the top k classes
    ranked_classes = [class_ for class_ in labels.value_counts().index if class_ not in plus]
    top_classes = ranked_classes[:k] + plus

    is_kept = labels.isin(top_classes)
    relabelled_texts = annotated_texts.assign(label=np.where(is_kept, labels, other))

    if verbose > 0:
        # Report how many distinct classes were actually collapsed, not the total class count
        n_others = len(labels[~is_kept].unique())
        additional = f'Plus {plus}.' if plus else ''
        print(f"Kept top {k} classes: {top_classes}. {additional} Set {n_others} others to {other}")

    return relabelled_texts


def random_undersample(annotated_texts: pd.DataFrame, random_state: int = None, verbose: int = 0) -> pd.DataFrame:
    """Randomly under-sample every class down to the size of the smallest class.

    :param annotated_texts: frame with a ``label`` column
    :param random_state: seed forwarded to the per-class sampling
    :param verbose: set to 1 to print the per-class sample size
    :return: frame holding the same number of rows for every label
    """
    class_sizes = annotated_texts['label'].value_counts()
    smallest_class_size = float(class_sizes.min())
    if verbose > 0:
        print(f"Under-sampling to {smallest_class_size} samples per class.")
    rows_by_label = annotated_texts.groupby('label')
    return rows_by_label.sample(n=int(smallest_class_size), random_state=random_state)


def augment(annotated_texts: pd.DataFrame, batch_size: int = 32, max_length: int = 512, device: str = 'cpu',
            verbose: int = 0, drop_original: bool = False) -> pd.DataFrame:
    """ Performs text augmentation

    Texts are truncated to ``max_length`` characters, rows without usable text are dropped, and the
    remaining texts are passed through an nlpaug pipeline (contextual insert, contextual substitute,
    word split). The input frame is not modified.

    :param annotated_texts: training data with a ``text`` column
    :param batch_size: batch size forwarded to the contextual augmenters
    :param max_length: maximum number of characters kept per text before augmenting
    :param device: 'cpu' or 'cuda'
    :param verbose: verbosity forwarded to the augmenters
    :param drop_original: set to True to return only augmented data
    :return: augmented rows, preceded by the original rows unless ``drop_original`` is set
    """
    # pipe = naf.Sequential([
    #     naw.BackTranslationAug(max_length=max_length, batch_size=batch_size, verbose=verbose,
    #                                             device=device),
    #     # naw.ContextualWordEmbsAug(model_path='bert-base-cased', action="insert", batch_size=batch_size,
    #     #                           verbose=verbose, device='cuda'),
    #     naw.split.SplitAug(aug_p=0.3, min_char=2, verbose=verbose)
    # ])

    # Truncate: slicing leaves shorter texts and missing values unchanged
    augmented_texts = annotated_texts.copy()
    augmented_texts['text'] = augmented_texts['text'].str[:max_length]
    # Keep texts containing at least two consecutive alphanumeric characters.
    # na=False drops missing texts instead of producing a NaN mask that cannot be used for indexing.
    has_content = augmented_texts['text'].str.contains(r'[a-zA-Z0-9]{2,}', na=False)
    # Explicit copy so the column assignment below does not write into a slice
    augmented_texts = augmented_texts[has_content].copy()

    # Augment (skipped when nothing is left, which also avoids building the augmenters for no work)
    if not augmented_texts.empty:
        pipe = naf.Sequential([
            naw.ContextualWordEmbsAug(aug_p=0.3, model_path='bert-base-cased', action="insert",
                                      batch_size=batch_size, verbose=verbose, device=device),
            naw.ContextualWordEmbsAug(aug_p=0.3, model_path='bert-base-cased', action="substitute",
                                      batch_size=batch_size, verbose=verbose, device=device),
            # naw.SynonymAug(aug_p=0.3, verbose=verbose),
            naw.SplitAug(aug_p=0.1, verbose=verbose)
        ])
        pipe.device = device
        augmented_texts['text'] = pipe.augment(augmented_texts['text'].to_list())

    if drop_original:
        return augmented_texts
    return pd.concat([annotated_texts, augmented_texts], axis=0)
    # augmenters = [
    #     naw.ContextualWordEmbsAug(aug_p=0.3, model_path='bert-base-cased', action="insert",
    #                               batch_size=batch_size, verbose=verbose, device='cuda'),
    #     naw.ContextualWordEmbsAug(aug_p=0.3, model_path='bert-base-cased', action="substitution",
    #                               batch_size=batch_size, verbose=verbose, device='cuda'),
    #     naw.split.SplitAug(aug_p=0.3, min_char=2, verbose=verbose)
    # ]
    # results = []
    # for augmenter in augmenters:
    #     result = augmented_texts.copy()
    #     result['text'] = augmenter.augment(result['text'].to_list())
    #     result['text'].str.replace(r"\s'\s", "'", regex=True)
    #     results.append(result)

    # # Merge append augmented data
    # augmented_texts = pd.concat([augmented_texts] + results, ignore_index=True)
    #
    # if verbose >= 1:
    #     i = 0
    #     for pre, post in zip(augmented_texts['text'], results[0]['text']):
    #         print('pre:\n' + pre)
    #         print('post:\n' + post)
    #         i += 1
    #         if i >= 5:
    #             break
    #
    # return augmented_texts

    # augmentation ideas
    # cannot use sentence level augmentations we only have quasi-sentences by themselves
    # contextual embedding substitution, insertion
    # minimal to no random shuffling - it can change the meaning of a sentence
    # decent amount of word splitting - may be a frequent occurrence in scraped text
    # speech style transformations (formal to casual to very casual)
    # insertion of filler words (um, hum, like, i think, yeah, i mean, well, look)
    # abstract summarization - maybe only for examples that are too long
    # use reserved for phrase-to-phrase and phrase-to-word and word-to-phrase replacement -- use websites that do this
    # use augmentation to address class imbalance (augment minority classes first)
    # use an augmentation pipeline


# def augment_book_review_data(reviews: pd.DataFrame, batch_size: int = 32, max_length: int = 512, device: str = 'cpu',
#                              drop_original: bool = False, verbose: int = 0) -> pd.DataFrame:
#     # regularised_substitutes = {
#     #     'book': {'item', 'policy', 'idea', 'notion', 'undertaking', 'trip', 'experience', 'system'},
#     #     'read': {'support', 'change', 'review'},
#     #     'character': {'person', 'candidate', 'case', 'areas'},
#     #     'story': {'proposal', 'plan', 'picture', 'narrative'},
#     #     'author': {'entity', 'writer', 'speaker', 'person', 'company', 'people'},
#     # }
#     # all_reserved_tokens: List[List[str]] = [[k] + list(v) for k, v in regularised_substitutes.items()]
#     # all_reserved_tokens += [[e.capitalize() for e in tokens] for tokens in all_reserved_tokens]
#     # pipe = naf.Sequential([naw.ReservedAug(all_reserved_tokens)])
#     #
#     # augmented_reviews = reviews.copy()
#     # augmented_reviews['text'] = pipe.augment(reviews['text'].to_list())
#     # model = downloader.load('word2vec-google-news-300')
#     # aug_w2v = naw.WordEmbsAug(
#     #     model_type='word2vec',
#     #     # model_path='./GoogleNews-vectors-negative300.bin',
#     #     model=model,
#     #     action="substitute"
#     # )

#     reviews = reviews.copy()
#     reviews['text'] = np.where(reviews['text'].str.len() > max_length, reviews['text'].str[:max_length],
#                                reviews['text'])
#     # keep texts with at least two valid tokens
#     augmented_reviews = reviews[reviews['text'].str.contains(r'[a-zA-Z0-9]{2,}')]

#     aug_contextual = naw.ContextualWordEmbsAug(model_path='distilbert-base-cased', action='substitute', aug_p=0.5,
#                                                batch_size=batch_size, verbose=verbose, device=device)
#     augmented_reviews['text'] = aug_contextual.augment(reviews['text'].to_list())
#     augmented_reviews['text'] = augmented_reviews['text'].str.replace(r"\s'\s", "'", regex=True)

#     # Download Google Word2vec embeddings
#     # model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
#     # t = Word2vec(model=model)
#     # augmented_reviews = reviews.copy()
#     # augmented_reviews['text'] = [t.augment(text) for text in augmented_reviews['text']]
#     # augmented_reviews['text'] = augmented_reviews['text'].str.replace(r"\s'\s", "'", regex=True)

#     if drop_original:
#         return augmented_reviews
#     else:
#         return pd.concat([reviews, augmented_reviews], axis=0)


# def make_word_cloud(texts: str, filename: str = 'word-cloud.png') -> None:
#     cloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue', width=800,
#                       height=400)
#     cloud.generate(texts)
#     cloud.to_image()
#     cloud.to_file(filename)


def load_data(countries: FrozenSet[str] = frozenset({'AU', 'CA', 'IE', 'IL', 'NZ', 'SA', 'UK', 'US'}),
              return_raw=False, data_dir=os.path.join('..', 'datasets', 'MARPOR', 'Annotated text')) -> pd.DataFrame:
    """Load annotated text data from disk and performs basic preprocessing.

    Every regular file found in ``<data_dir>/<country> 2001-2021`` is read as CSV and tagged with
    its country. Countries and files are visited in sorted order so the row order of the result is
    the same on every run (set iteration and ``os.listdir`` give no ordering guarantee).

    :param countries: country codes whose directories are loaded
    :param return_raw: set to True to return the concatenated frame without preprocessing
    :param data_dir: directory containing one ``<country> 2001-2021`` sub-directory per country
    :return: concatenated frame; when preprocessed it has ``label`` instead of ``cmp_code``,
             no ``eu_code`` column, no heading rows and ASCII-only ``text``
    """

    def read_and_tag_csv(path, country):
        df = pd.read_csv(path)
        df['country'] = country
        return df

    # Load annotated text from MARPOR corpus
    annotated_texts_data = []
    for country in sorted(countries):
        directory = os.path.join(data_dir, f'{country} 2001-2021')
        for filename in sorted(os.listdir(directory)):
            full_path = os.path.join(directory, filename)
            if os.path.isfile(full_path):
                annotated_texts_data.append(read_and_tag_csv(full_path, country))
    annotated_texts = pd.concat(annotated_texts_data, axis=0, ignore_index=True)

    if return_raw:
        # Return dataframe without basic preprocessing
        return annotated_texts

    # Basic preprocessing
    annotated_texts = (
        annotated_texts.rename(columns={'cmp_code': 'label'})
        .drop(columns=['eu_code'])
    )
    # drop headings; copy so the column assignments below do not write into a slice
    annotated_texts = annotated_texts[annotated_texts['label'] != 'H'].copy()
    annotated_texts['label'] = (
        annotated_texts['label'].astype(str)
        .str.replace(r'\.0$', '', regex=True)  # remove redundant suffix (only at the end of the code)
        .str.replace(r'^0$', '000', regex=True)  # political statements without clear category
        .str.replace(r'^nan$', 'N/A', regex=True)  # non-political statements
        .fillna('N/A')  # covers the case where astype(str) keeps missing values missing
    )
    annotated_texts['text'] = annotated_texts['text'].str.encode('ascii', 'ignore').str.decode('ascii')

    return annotated_texts


def load_annotated_book_reviews(file_path=os.path.join('..', 'datasets', 'non-political-texts',
                                                       'goodreads_reviews_spoiler.json')) -> pd.DataFrame:
    """Load goodreads spoilers book review data in appropriate format for classifier.

    The file is read as JSON lines (one record per line). Each entry of a record's
    ``review_sentences`` list becomes one output row whose text is the entry's element at index 1.

    :param file_path: path of the JSON-lines review file
    :return: frame with an ASCII-only ``text`` column and a ``label`` column set to 'N/A'
    """
    # Load data: decode explicitly as UTF-8 rather than the platform default, and skip blank
    # lines, which the JSON parser cannot decode
    with open(file_path, 'r', encoding='utf-8') as f:
        reviews = [ujson.loads(line.rstrip()) for line in tqdm(f) if line.strip()]  # one dict per line
    if not reviews:
        # Nothing to transform; return an empty frame with the expected columns
        return pd.DataFrame({'text': [], 'label': []})
    reviews = pd.DataFrame.from_records(reviews)

    # Transform to conform to input format
    reviews = reviews.rename(columns={'review_sentences': 'text'})
    reviews = reviews[['text']].explode('text')  # one row per sentence entry
    reviews['text'] = reviews['text'].str[1]  # keep element 1 of each entry
    reviews['label'] = 'N/A'

    # Basic preprocessing
    reviews = reviews.dropna(subset=['text', 'label'], how='any')
    reviews['text'] = reviews['text'].str.encode('ascii', 'ignore').str.decode('ascii')

    return reviews[['text', 'label']]


def inject_book_reviews(reviews: pd.DataFrame, annotated_texts: pd.DataFrame, multiplier: float = 1.0,
                        random_state: int = None) -> pd.DataFrame:
    """Add book review data as N/A labelled rows.

    The number of injected rows is ``multiplier`` times the number of rows already labelled
    'N/A' in ``annotated_texts``, capped at the number of available reviews.

    :param reviews: pool of review rows to sample from
    :param annotated_texts: frame with a ``label`` column that receives the sampled rows
    :param multiplier: ratio of injected rows to existing 'N/A' rows
    :param random_state: seed for the sampling; None keeps the sampling unseeded
    :return: new frame with the sampled reviews appended and a fresh index
    """
    current_size = int((annotated_texts['label'] == 'N/A').sum())
    # Clamp to [0, len(reviews)] so a negative multiplier cannot request a negative sample size
    injection_size = max(0, min(len(reviews), int(multiplier * current_size)))
    injection_df = reviews.sample(n=injection_size, random_state=random_state)
    return pd.concat([annotated_texts, injection_df], axis=0, ignore_index=True)


# Module-level F1 metric object; compute_metrics() calls its .compute() method.
# NOTE(review): load_metric appears to be deprecated in newer `datasets` releases in favour of the
# separate `evaluate` package - confirm against the installed version.
metric = load_metric("f1")


def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for a batch of evaluation predictions.

    :param eval_pred: pair ``(logits, labels)``; the predicted class is the argmax over the
                      last axis of ``logits``
    :return: whatever the module-level ``metric.compute`` returns
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # The F1 metric defaults to average='binary', which is rejected for multi-class targets;
    # macro-averaging weights every class equally.
    return metric.compute(predictions=predictions, references=labels, average='macro')


def tokenize(examples, tokenizer):
    """Run ``tokenizer`` over the ``text`` entry of ``examples``.

    The tokenizer is asked to pad to its maximum length, truncate, and return 'pt' tensors.
    """
    texts = examples['text']
    tokenizer_options = {'padding': 'max_length', 'truncation': True, 'return_tensors': 'pt'}
    return tokenizer(texts, **tokenizer_options)


def ds_to_tf_ds(dataset: Dataset, shuffle: bool = False, batch_size: int = 32,
                target_name: str = 'label', features=None) -> tf.data.Dataset:
    """Convert huggingFace Dataset into Tensorflow Dataset

    :param dataset: tokenized dataset that still carries its ``text`` column
    :param shuffle: set to True to shuffle with a buffer as large as the dataset
    :param batch_size: number of examples per batch
    :param target_name: name of the column used as the target
    :param features: names of the columns used as model inputs (required)
    :return: batched dataset of ``(features dict, target)`` pairs
    :raises TypeError: if ``features`` is not provided
    """
    if features is None:
        # The default used to fail later with an opaque "'NoneType' object is not iterable"
        raise TypeError("features must be an iterable of column names, e.g. tokenizer.model_input_names")

    # Remove text column which should have already been used by the tokenizer and is now redundant
    dataset = dataset.remove_columns(['text']).with_format('tensorflow')  # can we keep text column?
    feature_columns = {name: dataset[name] for name in features}
    tf_dataset = tf.data.Dataset.from_tensor_slices((feature_columns, dataset[target_name]))
    if shuffle:
        tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
    return tf_dataset.batch(batch_size)


def df_to_dataset(dataframe: pd.DataFrame) -> tf.data.Dataset:
    """Build a tf.data dataset of ``(features dict, label)`` pairs from a Pandas Dataframe.

    Works on a copy, so the ``label`` column is not removed from the caller's frame.
    """
    feature_frame = dataframe.copy()
    targets = feature_frame.pop('label')
    return tf.data.Dataset.from_tensor_slices((dict(feature_frame), targets))


def train_eval(X_train, y_train, X_val, y_val, X_test, y_test, pretrained_model: str, num_classes: int,
               max_length: int = 512):
    """Fine-tune a pretrained sequence classifier and evaluate it on the test split.

    The model, the training log history, the test scores and (when labels had to be encoded)
    the label-to-id mapping are written under ``fine-tuned-models/<pretrained_model>``.

    :param X_train: training features; must contain a ``text`` column
    :param y_train: training targets with a ``label`` column
    :param X_val: validation features
    :param y_val: validation targets
    :param X_test: test features
    :param y_test: test targets
    :param pretrained_model: model name or path passed to ``from_pretrained``
    :param num_classes: number of output classes of the classification head
    :param max_length: maximum number of tokens per example
    :return: scores returned by ``Trainer.evaluate`` on the test split
    :raises ValueError: if the data holds more distinct labels than ``num_classes``
    """
    # Create folders to store results
    model_dir = os.path.join('fine-tuned-models', pretrained_model.replace('/', '-'))
    os.makedirs(model_dir, exist_ok=True)

    frames = {
        'train': pd.concat([X_train, y_train], axis=1),
        'val': pd.concat([X_val, y_val], axis=1),
        'test': pd.concat([X_test, y_test], axis=1),
    }

    # The classifier needs integer class ids; labels such as '601' or 'N/A' are encoded here.
    # Labels that already are integers are passed through unchanged.
    all_labels = pd.concat([frame['label'] for frame in frames.values()])
    label2id = None
    if not pd.api.types.is_integer_dtype(all_labels):
        label2id = {label: idx for idx, label in enumerate(sorted(all_labels.astype(str).unique()))}
        if len(label2id) > num_classes:
            raise ValueError(f"Found {len(label2id)} distinct labels but num_classes is {num_classes}")

    # Load Model and Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
    model = AutoModelForSequenceClassification.from_pretrained(pretrained_model, num_labels=num_classes)

    # Reduce max input token count to save memory at the cost of accuracy
    tokenizer.model_max_length = max_length
    # default to right padding for model with absolute position embeddings
    tokenizer.padding_side = "right"

    # Add special tokens
    special_tokens_dict = {'bos_token': '[BOS]', 'eos_token': '[EOS]', 'pad_token': '[PAD]'}
    tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))
    # Tell the model which id is padding; the tokenizer alone does not update the model config
    model.config.pad_token_id = tokenizer.pad_token_id

    def to_dataset(frame: pd.DataFrame) -> Dataset:
        """Encode labels, convert to a huggingface Dataset, tokenize and shuffle."""
        if label2id is not None:
            frame = frame.assign(label=frame['label'].astype(str).map(label2id))
        dataset = Dataset.from_pandas(frame)
        return dataset.map(lambda batch: tokenize(batch, tokenizer), batched=True).shuffle(42)

    train_ds = to_dataset(frames['train'])
    val_ds = to_dataset(frames['val'])
    test_ds = to_dataset(frames['test'])

    # Convert to Tensorflow Datasets
    # batch_size = 8
    # train_ds = ds_to_tf_ds(train_ds, shuffle=True, batch_size=batch_size, features=tokenizer.model_input_names)
    # val_ds = ds_to_tf_ds(val_ds, batch_size=batch_size, features=tokenizer.model_input_names)
    # test_ds = ds_to_tf_ds(test_ds, batch_size=batch_size, features=tokenizer.model_input_names)

    training_args = TrainingArguments(
        model_dir,
        num_train_epochs=3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        evaluation_strategy="steps",  # "step" is not an accepted value
        fp16=True  # NOTE(review): mixed precision presumably needs a GPU - confirm before running on CPU
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    trainer.save_model()
    # trainer.save_metrics()
    scores = trainer.evaluate(test_ds)

    with open(os.path.join(model_dir, 'train-history.joblib'), 'wb') as logs_file, \
            open(os.path.join(model_dir, 'scores.joblib'), 'wb') as scores_file:
        # The history file used to be created but left empty
        dump(trainer.state.log_history, logs_file)
        dump(scores, scores_file)
    if label2id is not None:
        # Keep the mapping so predicted ids can be translated back to the original labels
        dump(label2id, os.path.join(model_dir, 'label2id.joblib'))

    return scores

In [5]:
# Load the annotated MARPOR texts and keep a pristine copy so the working frame can be restored
# without reading the files again.
annotated_texts = load_data(data_dir=os.path.join('..', 'datasets', 'MARPOR', 'Annotated text'))
print("annotated_texts")
annotated_texts_ = annotated_texts.copy(deep=True)
# annotated_texts = annotated_texts_.copy(deep=True)
# Preview goes last: only the final expression of a cell is displayed
annotated_texts.head()

annotated_texts


In [6]:
# Load the book reviews and keep a pristine copy so the working frame can be restored without
# reading the file again.
reviews = load_annotated_book_reviews(file_path=os.path.join('..', 'datasets', 'non-political-texts', 'goodreads_reviews_spoiler.json'))
print("reviews")
reviews_ = reviews.copy(deep=True)
# reviews = reviews_.copy(deep=True)
# Preview goes last: only the final expression of a cell is displayed
reviews.head()

1378033it [00:29, 47128.31it/s]


reviews


In [18]:
max_length = 512  # set max_length to 512 if gpu has more memory else set to 256
cache_dir = os.path.join('cache')
# Read every cached column as text and disable NaN inference: with the defaults pandas parses the
# 'N/A' label as a missing value and turns numeric-looking codes such as '000' into numbers.
read_options = dict(index_col=None, dtype=str, keep_default_na=False)

try:
    # Load cached data
    X_train = pd.read_csv(os.path.join(cache_dir, 'X_train.csv'), **read_options)
    y_train = pd.read_csv(os.path.join(cache_dir, 'y_train.csv'), **read_options)
    X_val = pd.read_csv(os.path.join(cache_dir, 'X_val.csv'), **read_options)
    y_val = pd.read_csv(os.path.join(cache_dir, 'y_val.csv'), **read_options)
    X_test = pd.read_csv(os.path.join(cache_dir, 'X_test.csv'), **read_options)
    y_test = pd.read_csv(os.path.join(cache_dir, 'y_test.csv'), **read_options)

    num_classes = len(y_train['label'].unique())

except (FileNotFoundError, EOFError):
    # Work under a new name so re-running this cell always starts from the loaded frame
    prepared_texts = annotated_texts.dropna(how='any')
    prepared_texts = inject_book_reviews(reviews, prepared_texts)
    prepared_texts = reduce_subclasses(prepared_texts, verbose=1)
    prepared_texts = keep_top_k_classes(prepared_texts, k=20, plus=['N/A'], other='000', verbose=1)
    prepared_texts = random_undersample(prepared_texts, random_state=1, verbose=1)

    num_classes = len(prepared_texts['label'].unique())

    # Split dataframe into train, validation and test, 6:2:2 (both splits seeded for reproducibility)
    y, X = prepared_texts[['label']], prepared_texts.drop(columns=['label'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=1, test_size=0.2)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=1, test_size=0.25)
    print(len(X_train), 'train examples')
    print(len(X_val), 'validation examples')
    print(len(X_test), 'test examples')

    # Augment text
    train_df = pd.concat([X_train, y_train], axis=1)
    train_df = augment(train_df, batch_size=4096, max_length=max_length, device='cuda', verbose=1)
    y_train, X_train = train_df[['label']], train_df.drop(columns=['label'])
    assert len(X_train) == len(y_train)

    # Cache preprocessed data
    os.makedirs(cache_dir, exist_ok=True)
    X_train.to_csv(os.path.join(cache_dir, 'X_train.csv'), index=False)
    y_train.to_csv(os.path.join(cache_dir, 'y_train.csv'), index=False)
    X_val.to_csv(os.path.join(cache_dir, 'X_val.csv'), index=False)
    y_val.to_csv(os.path.join(cache_dir, 'y_val.csv'), index=False)
    X_test.to_csv(os.path.join(cache_dir, 'X_test.csv'), index=False)
    y_test.to_csv(os.path.join(cache_dir, 'y_test.csv'), index=False)

# Train outside the try/except so the models are also trained when the data came from the cache
# cached: EleutherAI/gpt-neo-1.3B, EleutherAI/gpt-neo-2.7B, gpt2-medium, gpt2-large, bert-base-cased
# 'distilroberta-base', 'roberta-base', 'xlnet-base-cased', 'albert-xlarge-v2',
pretrained_models = ['EleutherAI/gpt-neo-1.3B']
for pretrained_model in pretrained_models:
    train_eval(X_train, y_train, X_val, y_val, X_test, y_test, pretrained_model, num_classes=num_classes,
               max_length=max_length)


In [16]:
# zip_dir(os.path.join('../datasets', 'MARPOR', 'Annotated text'))
# zip_dir(os.path.join('../datasets', 'non-political-texts'))
# unzip_dir('Annotated text.zip')
# unzip_dir('non-political-texts.zip')

In [17]:
# annotated_texts = annotated_texts_.copy(deep=True)
# reviews = reviews_.copy(deep=True)