# New Section

In [2]:
# !update-alternatives --install /usr/bin/python python /usr/bin/python3.9 1
# !update-alternatives --list python
# !update-alternatives --help

'update-alternatives' is not recognized as an internal or external command,
operable program or batch file.


In [1]:
# !pip install tensorflow pandas numpy
# !pip install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio===0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

# !pip install category-encoders transformers 
# !pip install datasets nlpaug

# !pip install pandas numpy  --upgrade
# !pip install transformers --upgrade



In [1]:
import torch
assert torch.cuda.is_available()

import os
import pandas as pd
import numpy as np
from category_encoders import BinaryEncoder, OneHotEncoder
from joblib import dump
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset

from typing import Tuple, FrozenSet

import nlpaug.flow as naf
import nlpaug.augmenter.word as naw
import zipfile as zf
import shutil

In [3]:
def reduce_subclasses(annotated_texts: pd.DataFrame, verbose: int = 0) -> pd.DataFrame:
    """Reduce number of classes by merging homogenous(no conflicting) subcategories of main classes."""
    n_classes = len(annotated_texts['label'].unique())
    data_size = len(annotated_texts)
    # codes whose subcategories are so similar that they can be disregarded
    homogenous_categories = ['601', '602', '606', '607', '201', '416', '608', '103']

    category_prefixes = annotated_texts['label'].str.extract(r'(\d+)\..', expand=False)
    annotated_texts['label'] = np.where(
        category_prefixes.isin(homogenous_categories),
        category_prefixes,
        annotated_texts['label']
    )

    if verbose > 0:
        print(f"Merged subcategories for {homogenous_categories}\n"
              f"Number of classes: {n_classes} -> {len(annotated_texts['label'].unique())}\n"
              f"Data size: {data_size} -> {len(annotated_texts)}")

    return annotated_texts


def keep_top_k_classes(annotated_texts: pd.DataFrame, k: int, verbose: int = 0) -> pd.DataFrame:
    """Keep only top k most frequent classes, the rest are changed to 000."""
    n_classes = len(annotated_texts['label'].unique())
    top_k_classes = annotated_texts['label'].value_counts().index[:k]
    annotated_texts['label'] = np.where(annotated_texts['label'].isin(top_k_classes), annotated_texts['label'], "000")
    if verbose > 0:
        print(f"Kept top {k} classes: {top_k_classes.to_list()}. Set {n_classes} others to 000")
    return annotated_texts


def random_undersample(annotated_texts: pd.DataFrame, random_state: int = None, verbose: int = 0) -> pd.DataFrame:
    """Random under sample all majority classes"""
    distribution = annotated_texts['label'].value_counts().describe()
    if verbose > 0:
        print(f"Under-sampling to {distribution['min']} samples per class.")
    return annotated_texts.groupby('label').sample(n=int(distribution['min']), random_state=random_state)


def augment(annotated_texts: pd.DataFrame, batch_size: int = 32, max_length: int = 512, device: str = 'cpu',
            verbose: int = 0) -> pd.DataFrame:
    """ Performs text augmentation

    :param annotated_texts:
    :param batch_size:
    :param max_length:
    :param device: 'cpu' or 'cuda'
    :param verbose:
    :return:
    """
    # pipe = naf.Sequential([
    #     # naw.back_translation.BackTranslationAug(max_length=max_length, batch_size=batch_size, verbose=verbose,
    #     #                                         device='cuda'),
    #     naw.ContextualWordEmbsAug(model_path='bert-base-cased', action="insert", batch_size=batch_size,
    #                               verbose=verbose, device='cuda'),
    #     naw.split.SplitAug(aug_p=0.3, min_char=2, verbose=verbose)
    # ])

    # Truncate
    annotated_texts = annotated_texts.copy()
    annotated_texts['text'] = np.where(annotated_texts['text'].str.len() > max_length,
                                       annotated_texts['text'].str[:max_length],
                                       annotated_texts['text'])
    # keep texts with at least two valid tokens
    # annotated_texts = annotated_texts[annotated_texts['text'].str.contains(r'[a-zA-Z0-9]{2,}')]

    # Augment
    pipe = naf.Sequential([
        naf.Sometimes([
            naw.ContextualWordEmbsAug(aug_p=0.3, model_path='distilroberta-base', action="insert",
                                      batch_size=batch_size, verbose=verbose),
            naw.ContextualWordEmbsAug(aug_p=0.3, model_path='distilroberta-base', action="substitute",
                                      batch_size=batch_size, verbose=verbose),
        ]),
        # naw.SynonymAug(aug_p=0.3, verbose=verbose),
        naw.SplitAug(aug_p=0.1, verbose=verbose)
    ])
    pipe.device = device
    annotated_texts['text'] = pipe.augment(annotated_texts['text'].to_list())

    return annotated_texts
    # augmenters = [
    #     naw.ContextualWordEmbsAug(aug_p=0.3, model_path='bert-base-cased', action="insert",
    #                               batch_size=batch_size, verbose=verbose, device='cuda'),
    #     naw.ContextualWordEmbsAug(aug_p=0.3, model_path='bert-base-cased', action="substitution",
    #                               batch_size=batch_size, verbose=verbose, device='cuda'),
    #     naw.split.SplitAug(aug_p=0.3, min_char=2, verbose=verbose)
    # ]
    # results = []
    # for augmenter in augmenters:
    #     result = annotated_texts.copy()
    #     result['text'] = augmenter.augment(result['text'].to_list())
    #     result['text'].str.replace(r"\s'\s", "'", regex=True)
    #     results.append(result)

    # # Merge append augmented data
    # augmented_texts = pd.concat([annotated_texts] + results, ignore_index=True)
    #
    # if verbose >= 1:
    #     i = 0
    #     for pre, post in zip(annotated_texts['text'], results[0]['text']):
    #         print('pre:\n' + pre)
    #         print('post:\n' + post)
    #         i += 1
    #         if i >= 5:
    #             break
    #
    # return augmented_texts

    # augmentation ideas
    # cannot use sentence level augmentations we only have quasi-sentences by themselves
    # contextual embedding substitution, insertion
    # minimal to no random shuffling - it can change the meaning of a sentence
    # decent amount of word splitting - may be a frequent occurrence in scraped text
    # speech style transformations (formal to casual to very casual)
    # insertion of filler words (um, hum, like, i think, yeah, i mean, well, look)
    # abstract summarization - maybe only for examples that are too long
    # use reserved for phrase-to-phrase and phrase-to-word and word-to-phrase replacement -- use websites that do this
    # use augmentation to address class imbalance (augment minority classes first)
    # use an augmentation pipeline


def load_data(countries: FrozenSet[str] = frozenset({'AU', 'CA', 'IE', 'IL', 'NZ', 'SA', 'UK', 'US'}),
              return_raw=False, data_dir=None) -> pd.DataFrame:
    """Load annotated text data from disk and performs basic preprocessing."""

    def read_and_tag_csv(path, country):
        df = pd.read_csv(path)
        df['country'] = country
        return df

    # Load annotated text from MARPOR corpus
    data_dir = data_dir or os.path.join('../datasets', 'MARPOR', 'Annotated text')
    country_data_dirs = {country: os.path.join(data_dir, f'{country} 2001-2021')
                         for country in countries}
    annotated_texts_data = [
        read_and_tag_csv(full_path, country)
        for country, directory in country_data_dirs.items()
        for filename in os.listdir(directory)
        if os.path.isfile(full_path := os.path.join(directory, filename))
    ]
    annotated_texts = pd.concat(annotated_texts_data, axis=0, ignore_index=True)

    if return_raw:
        # Return dataframe without basic preprocessing
        return annotated_texts

    # Basic preprocessing
    annotated_texts = (
        annotated_texts.rename(columns={'cmp_code': 'label'})
        .drop(columns=['eu_code'])
    )
    annotated_texts = annotated_texts[annotated_texts['label'] != 'H']  # drop headings
    annotated_texts['label'] = (
        annotated_texts['label'].astype(str)
        .str.replace('.0', '', regex=False)  # remove redundant suffix
        .str.replace(r'^0$', '000', regex=True)  # political statements without clear category
        .str.replace('nan', 'N/A', regex=False)  # non-political statements
    )
    annotated_texts['text'] = annotated_texts['text'].str.encode('ascii', 'ignore').str.decode('ascii')

    return annotated_texts


def tokenize(examples, tokenizer):
    return tokenizer(examples['text'], padding='max_length', truncation=True)


def ds_to_tf_ds(dataset: Dataset, shuffle: bool = False, batch_size: int = 32,
                target_name: str = 'label', features=None) -> tf.data.Dataset:
    """Convert huggingFace Dataset into Tensorflow Dataset"""
    # Remove text column which should have already been used by the tokenizer and is now redundant
    dataset = dataset.remove_columns(['text']).with_format('tensorflow')  # can we keep text column?
    features = {x: dataset[x] for x in features}
    tf_dataset = tf.data.Dataset.from_tensor_slices((features, dataset[target_name]))
    if shuffle:
        tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
    tf_dataset = tf_dataset.batch(batch_size)
    return tf_dataset


def train_eval(pretrained_model: str, annotated_texts: pd.DataFrame = None, max_length: int = 512):
    # Create folders to store results
    model_dir = os.path.join('fine-tuned-models', pretrained_model.replace('/', '-'))
    os.makedirs(model_dir, exist_ok=True)

    # Load data
    annotated_texts = annotated_texts if annotated_texts is not None else load_data()
    annotated_texts = reduce_subclasses(annotated_texts, verbose=1)
    annotated_texts = keep_top_k_classes(annotated_texts, k=22, verbose=1)
    annotated_texts = random_undersample(annotated_texts, random_state=1, verbose=1)

    # Split dataframe into train, validation and test, 6:2:2
    y, X = annotated_texts[['label']], annotated_texts.drop(columns=['label'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=1, test_size=0.2)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25)
    print(len(X_train), 'train examples')
    print(len(X_val), 'validation examples')
    print(len(X_test), 'test examples')
    
    # Augment data
    X_train = augment(X_train, batch_size=32, max_length=max_length, verbose=1)

    # Encode label
    # label_encoder = BinaryEncoder()
    label_encoder = OneHotEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)
    y_val = label_encoder.transform(y_val)
    if not isinstance(y_train, pd.Series) and y_train.shape[1] > 1:
        num_classes = y_train.shape[1]
    else:
        num_classes = len(annotated_texts['label'].unique())

    with open(os.path.join(model_dir, 'label_encoder.joblib'), 'wb') as f:
        dump(label_encoder, f)

    # Load Model and Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
    model = TFAutoModelForSequenceClassification.from_pretrained(pretrained_model, num_labels=num_classes)

    # Reduce max input token count to save memory at the cost of accuracy
    tokenizer.model_max_length = max_length
    # default to right padding for model with absolute position embeddings
    tokenizer.padding_side = "right"

    # Add special tokens
    special_tokens_dict = {'bos_token': '[BOS]', 'eos_token': '[EOS]', 'pad_token': '[PAD]'}
    tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))
    # # fix model padding token id
    # model.config.pad_token_id = tokenizer.pad_token
    # model.config.eos_token_id = tokenizer.eos_token

    # Convert to huggingface Dataset
    # train_ds = Dataset.from_pandas(train_df)
    # val_ds = Dataset.from_pandas(val_df)
    # test_ds = Dataset.from_pandas(test_df)
    # Tokenize data
    # train_ds = train_ds.map(lambda x: tokenize(x, tokenizer), batched=True)
    # val_ds = val_ds.map(lambda x: tokenize(x, tokenizer), batched=True)
    # test_ds = test_ds.map(lambda x: tokenize(x, tokenizer), batched=True)
    X_train = tokenizer(X_train['text'].to_list(), padding='max_length', truncation=True, return_tensors='tf').data
    X_val = tokenizer(X_val['text'].to_list(), padding='max_length', truncation=True, return_tensors='tf').data
    X_test = tokenizer(X_test['text'].to_list(), padding='max_length', truncation=True, return_tensors='tf').data

    # Convert to Tensorflow Datasets
    # batch_size = 8
    # train_ds = ds_to_tf_ds(train_ds, shuffle=True, batch_size=batch_size, features=tokenizer.model_input_names)
    # val_ds = ds_to_tf_ds(val_ds, batch_size=batch_size, features=tokenizer.model_input_names)
    # test_ds = ds_to_tf_ds(test_ds, batch_size=batch_size, features=tokenizer.model_input_names)

    # for feature_batch, label_batch in train_ds.take(1):
    #     print('Every feature:', list(feature_batch.keys()))
    #     print('A batch of texts:', feature_batch['text'])
    #     print('A batch of label:', label_batch)

    # USe mixed precision to save memory
    tf.keras.mixed_precision.set_global_policy('mixed_float16')

    # Train classifier, evaluate and save results
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
        metrics=[tf.metrics.CategoricalAccuracy()],
    )
    # error when using f1_marco
    # NotImplementedError: Cannot convert a symbolic Tensor (tf_roberta_for_sequence_classification/classifier/out_proj/BiasAdd:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported
    model.summary()

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=1, verbose=1,
                                                      restore_best_weights=True)
    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath="training_1/cp.ckpt", save_weights_only=True, verbose=1)
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=8, epochs=5,
                        callbacks=[early_stopping, cp_callback])
    model.save(model_dir)
    # model = tf.keras.models.load_model(os.path.join('fine-tuned-models', 'from-gpu-cloud', 'distilroberta-base',
    #                                                 'tf_model.h5'))
    scores = model.evaluate(X_test, y_test)

    with open(os.path.join(model_dir, 'train-history.joblib'), 'wb') as logs_file, \
            open(os.path.join(model_dir, 'scores.joblib'), 'wb') as scores_file:
        dump(scores, scores_file)
        print(history)
        dump(history.histroy, logs_file)


def main(annotated_texts: pd.DataFrame = None):
    # cached: EleutherAI/gpt-neo-1.3B, EleutherAI/gpt-neo-2.7B, gpt2-medium, gpt2-large, bert-base-cased
    pretrained_models = ['distilroberta-base', 'roberta-base', 'bert-base']
    for model in pretrained_models:
        train_eval(model, annotated_texts=annotated_texts)


def test_augmentation(annotated_texts: pd.DataFrame):
    df = annotated_texts.copy()
    df = annotated_texts[:10]
    df_augmented = augment(df)

    for before, after in zip(df['text'], df_augmented['text']):
        print(f"Before:\n{before}")
        print(f"After:\n{after}")
        print("-----------------------")

    # Does not work as fastai cannot use multiprocessing on Windows


def unzip_dir(filename):
    import zipfile as zf
    files = zf.ZipFile(filename, 'r')
    files.extractall(filename[:-4])
    files.close()


def zip_dir(source_dir):
    import shutil
    shutil.make_archive(source_dir, 'zip', source_dir)


In [None]:
main()

Merged subcategories for ['601', '602', '606', '607', '201', '416', '608', '103']
Number of classes: 88 -> 70
Data size: 155899 -> 155899
Kept top 22 classes: ['504', 'N/A', '411', '501', '506', '503', '303', '701', '403', '410', '402', '605', '301', '416', '107', '305', '601', '706', '201', '104', '502', '605.1']. Set 70 others to 000
Under-sampling to 2452.0 samples per class.
33837 train examples
11279 validation examples
11280 test examples








NameError: name 'naw' is not defined

pandas.core.frame.DataFrame