## Imports, cloning

In [None]:
# !git clone https://github.com/GlebSolovev/Text-Multi-Style-Transfer-Through-Activation-Maximization.git
# %cd Text-Multi-Style-Transfer-Through-Activation-Maximization

In [None]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import datasets

In [None]:
# !pip install huggingface_hub
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
# !git config --global credential.helper store

## Preparing datasets

### Sentiment (Yelp)

In [None]:
# Load the `yelp_review_full` dataset (the local Hugging Face cache is reused when present).
yelp = datasets.load_dataset('yelp_review_full')

Reusing dataset yelp_review_full (/root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def yelp_rating_mapping(d):
    """Collapse a Yelp rating label into a binary sentiment label, in place.

    Labels below 2 become 0 and every other label becomes 1. The example
    dict `d` is modified and the same object is returned.
    """
    is_low_rating = d['label'] < 2
    d['label'] = int(not is_low_rating)
    return d


# Build the binary sentiment dataset: drop examples labelled 2, binarize the
# remaining labels with `yelp_rating_mapping`, then shuffle with a fixed seed.
dataset_sentiment = (
    yelp
    .filter(lambda d: d['label'] != 2)
    .map(yelp_rating_mapping)
    .shuffle(seed=5)
)

Loading cached processed dataset at /root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-fb0576e73135010b.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-460950a9b2d6680e.arrow


  0%|          | 0/520000 [00:00<?, ?ex/s]

  0%|          | 0/40000 [00:00<?, ?ex/s]

In [None]:
len(dataset_sentiment['train'])

520000

In [None]:
dataset_sentiment['train'][5555]

{'label': 0,
 'text': 'Do not waste your money, I waited a long time to see this, and it was disgusting, perverse, if I wanted that I would have attended the peep show before his. He is a musical genius and I thought I would see the costume changes, a PIANO on stage, he is claiming to be representing Liberace, it was a T & A show, lotta \\"f\\" bombs, his lip syncing Boy George was an awful look alike, his rant after this guys performance made no sense. Money can be better spent elsewhere'}

### Gender (rtGender)

In [None]:
# rtgender = datasets.load_dataset('peixian/rtGender', 'posts') # annotations, posts, responses

In [None]:
# %cd ..

In [None]:
# def gender_mapping(d):
#     d['op_gender'] = 0 if d['op_gender'] == 'M' else 1
#     return d


# dataset_gender = rtgender \
#     .remove_columns(['op_gender_visible', 'op_id', 'post_id', 'post_type', 'source', 'subreddit']) \
#     .filter(lambda d: len(d['post_text']) >= 40) \
#     .map(gender_mapping) \
#     .rename_columns({'op_gender': 'label', 'post_text': 'text'}) \
#     .shuffle(seed=5)


In [None]:
# len(dataset_gender['train'])

In [None]:
# dataset_gender['train'][1234]

### Politics ([IEEE](https://ieee-dataport.org/open-access/usa-nov2020-election-20-mil-tweets-sentiment-and-party-name-labels-dataset)) (TODO)

In [None]:
# TODO

# Classifiers

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import numpy as np

In [None]:
# Tokenizer for the `distilbert-base-uncased` checkpoint; used by `tokenize_function`.
distilbert_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
def tokenize_function(dataset):
    """Tokenize the ``"text"`` field of `dataset` with `distilbert_tokenizer`.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; its result is returned unchanged.
    """
    texts = dataset["text"]
    tokenizer_options = {"padding": "max_length", "truncation": True}
    return distilbert_tokenizer(texts, **tokenizer_options)

In [None]:
# Accuracy metric object used by `compute_metrics`.
# NOTE(review): `datasets.load_metric` is reportedly deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the installed version.
accuracy_metric = datasets.load_metric('accuracy')

def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` evaluation pair with `accuracy_metric`.

    The predicted class of each row is the index of its largest logit along
    the last axis. Returns whatever `accuracy_metric.compute` returns.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return accuracy_metric.compute(references=references, predictions=predicted_classes)

## Sentiment

In [None]:
# sentiments_train, sentiments_test = dataset_sentiment.train_test_split(0.1) # -- not necessary
sentiment_name = "22s-dl-sentiment-1"

In [None]:
# tokenized_sentiments = dataset_sentiment.map(tokenize_function, batched=True)

In [None]:
# tok_train_sentiments = tokenized_sentiments["train"].shuffle(seed=42).select(range(20000))
# tok_eval_sentiments = tokenized_sentiments["test"].shuffle(seed=42).select(range(30000))

In [None]:
# model_sentiments = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

In [None]:
# training_args_sentiments = TrainingArguments(
#     output_dir=sentiment_name,
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=5,
#     weight_decay=0.01,
# )

In [None]:
# trainer_sentiments = Trainer(
#     model=model_sentiments,
#     args=training_args_sentiments,
#     train_dataset=tok_train_sentiments,
#     eval_dataset=tok_eval_sentiments,
#     tokenizer=distilbert_tokenizer,
#     data_collator=DataCollatorWithPadding(distilbert_tokenizer),
#     compute_metrics=compute_metrics
# )

In [None]:
# trainer_sentiments.train() 

In [None]:
# trainer_sentiments.evaluate()

In [None]:
# trainer_sentiments.push_to_hub()

# Perplexity, attempt 3

In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import torch, tqdm
import numpy as np

In [None]:
class Perplexer:
    """Computes GPT-2 (`gpt2-large`) perplexity of text with a strided sliding window.

    The model and tokenizer are loaded once in the constructor and reused by
    every `calc_perplexity` call.
    """

    def __init__(self, device: str = 'cuda' if torch.cuda.is_available() else 'cpu'):
        """Load the `gpt2-large` model and tokenizer.

        Args:
            device: torch device string the model is moved to. The default is
                evaluated once, when the class is defined.
        """
        self._device = device
        model_id = "gpt2-large"
        self._model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
        self._tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

    def calc_perplexity(self, sentences) -> float:
        """Return the perplexity of `sentences` scored as one concatenated text.

        The sentences are joined with blank lines, tokenized once, and scored
        in windows that advance by `stride` tokens while keeping up to
        `n_positions` tokens of context.

        Args:
            sentences: iterable of strings to score.

        Returns:
            The perplexity as a Python float.

        Raises:
            ValueError: if tokenizing the joined text yields no tokens.
        """
        encodings = self._tokenizer("\n\n".join(sentences), return_tensors="pt")
        seq_len = encodings.input_ids.size(1)
        if seq_len == 0:
            # Without this guard the loop never runs and the final division /
            # `torch.stack` fail with an unrelated error.
            raise ValueError("cannot compute perplexity: input produced no tokens")

        max_length = self._model.config.n_positions
        stride = 512

        # The notebook imports `tqdm` as a module, so the progress-bar callable
        # is `tqdm.tqdm`; fall back to the name itself if it is already callable.
        progress = getattr(tqdm, "tqdm", tqdm)

        nlls = []
        for i in progress(range(0, seq_len, stride)):
            begin_loc = max(i + stride - max_length, 0)
            end_loc = min(i + stride, seq_len)
            trg_len = end_loc - i  # may be different from stride on last loop
            # Use the device chosen in __init__ (there is no local `device` here).
            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(self._device)
            target_ids = input_ids.clone()
            # Context-only tokens get label -100 so that only the last
            # `trg_len` tokens of the window are scored.
            target_ids[:, :-trg_len] = -100

            with torch.no_grad():
                outputs = self._model(input_ids, labels=target_ids)
                # outputs[0] is the loss returned for this window; scale it by
                # the number of scored tokens before accumulating.
                neg_log_likelihood = outputs[0] * trg_len
            nlls.append(neg_log_likelihood)

        # After the last window `end_loc` equals `seq_len`, so this matches the
        # original normalisation by the total token count.
        ppl = torch.exp(torch.stack(nlls).sum() / seq_len)
        return ppl.item()

In [None]:
# Build the scorer once; the constructor loads the `gpt2-large` model and tokenizer.
ppler = Perplexer()


# test = ['one simple sentence of good quality']
# test = dataset_sentiment['test'].select(range(5000))['text']

# encodings = tokenizer("\n\n".join(test), return_tensors="pt")

In [None]:
ppler.calc_perplexity(dataset_sentiment['test'].select(range(100))['text'])

TypeError: ignored

In [None]:
# ppl

# BLEU

In [None]:
!pip install sacrebleu

In [None]:
# SacreBLEU metric object, loaded by name through `datasets.load_metric`.
bleu_metric = datasets.load_metric('sacrebleu')

In [None]:
bleu_metric.compute(
    predictions=["hello there general kenobi", "foo bar foobar", "ooga booga ooga booga"], 
    references=[["hello there general kenobi"],
                ["foo bar foobar"],
                ["ooga booga ooga booga"]]
)

# Putting it together

In [None]:
from transformers import pipeline

In [None]:
# Pipeline built from the hub model id; the id ends with the same name as `sentiment_name`
# (presumably the classifier pushed by the commented-out training cells — confirm).
sentiment_model = pipeline(model='DLochmelis33/22s-dl-sentiment-1')

In [None]:
sentiment_model(['bananas are cherries'])

In [None]:
# Pipeline built from the hub model id (presumably a formality classifier, judging by its name — confirm).
formality_model = pipeline(model='cointegrated/roberta-base-formality')