In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset

import torch
from transformers import AutoTokenizer, AutoModel, BertTokenizer, AutoModelForSequenceClassification, pipeline, trainer

# Shared notebook state: the 'ernie-1.0' tokenizer, the compute device and a
# sequence-length constant.
tokenizer = AutoTokenizer.from_pretrained('ernie-1.0')
# model = AutoModelForSequenceClassification.from_pretrained('ernie-1.0')

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

max_len = 200

# raw_datasets = load_dataset("imdb")

In [None]:
def tokenize_function(examples):
    """Tokenize ``examples['text']`` into fixed-length PyTorch tensors.

    Every sequence is padded up to and truncated down to the notebook-level
    ``max_len`` so that all rows have the same length.

    Args:
        examples: Mapping with a ``'text'`` entry (a string or a batch of strings).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the given text.
    """
    return tokenizer(
        examples['text'],
        max_length=max_len,
        padding='max_length',
        # Without truncation, texts longer than max_length keep their full
        # length, the batch becomes ragged and tensor conversion fails.
        truncation=True,
        return_tensors='pt',
    )

# tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [None]:
# Load the 'seamew/ChnSentiCorp' dataset. Despite the name, `chn_train` is
# indexed by split name ("train", "test", "validation") in later cells.
chn_train = load_dataset('seamew/ChnSentiCorp')
# BATCH=32
# MAX_SEQLEN=300
# LR=5e-5
# EPOCH=10

In [None]:
def process_data(data):
    """Tokenize ``data["text"]`` to exactly 300 tokens.

    The longest texts run to roughly 306 tokens, so sequences are both padded
    and truncated to a fixed length of 300.
    """
    tokenizer_options = {
        "max_length": 300,
        "padding": 'max_length',
        "truncation": True,
    }
    return tokenizer(data["text"], **tokenizer_options)


In [None]:
# Tokenize every split. `batched=True` passes `process_data` a batch of texts
# per call instead of a single example, which is much faster and yields the
# same columns.
tokenized_datasets = chn_train.map(process_data, batched=True)

In [None]:
# Work on small, reproducibly shuffled subsets (960 train / 120 eval examples).
# Drop the `.select(...)` calls to train on the full set.
shuffled_train_split = tokenized_datasets["train"].shuffle(seed=42)
shuffled_test_split = tokenized_datasets["test"].shuffle(seed=42)
small_train_dataset = shuffled_train_split.select(range(960))
small_eval_dataset = shuffled_test_split.select(range(120))
# test_set = tokenized_datasets["dev"].shuffle(seed=42)

This is the first training part: fine-tuning ERNIE on ChnSentiCorp.

In [None]:
from transformers import Trainer, TrainingArguments

# Start from the original (not yet fine-tuned) 'ernie-1.0' checkpoint.
# `num_labels` must match the number of distinct labels in the data (2 here).
model = AutoModelForSequenceClassification.from_pretrained('ernie-1.0', num_labels=2)

In [None]:
# Evaluate every EVAL_STEPS optimizer steps; raise it (e.g. to 100) when
# training on the full set.
EVAL_STEPS = 10
# `load_best_model_at_end` can only restore a checkpoint that was actually
# written. With the default save interval of 500 steps, the small run
# (960 examples / batch size 8 x the default 3 epochs = about 360 steps on one
# device) ends before the first save, so nothing would be reloaded.
# The save interval must be a round multiple of the eval interval.
SAVE_STEPS = 5 * EVAL_STEPS
training_args = TrainingArguments('finetune_trainer',
                                  evaluation_strategy="steps",
                                  save_strategy="steps",
                                  logging_dir='logs/',
                                  warmup_steps=50,
                                  eval_steps=EVAL_STEPS,
                                  save_steps=SAVE_STEPS,
                                  logging_steps=20,
                                  per_device_train_batch_size=8,
                                  per_device_eval_batch_size=8,
                                  save_total_limit = 3,
                                  load_best_model_at_end=True,
                                  # Matches the "accuracy" key returned by the
                                  # metric callback; Trainer looks it up under
                                  # the "eval_" prefix.
                                  metric_for_best_model="accuracy"
                                  )

In [None]:
from datasets import load_metric
# Accuracy metric object used by the `compute_metrics*` callbacks in this notebook.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair with the module-level ``metric``.

    The predicted class of each row is the index of its largest logit along
    the last axis; those predictions are compared against ``labels``.
    """
    class_scores, gold_labels = eval_pred
    predicted_classes = np.asarray(class_scores).argmax(axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_classes)


In [None]:
# Release cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

In [None]:
# First fine-tuning run: train on the small ChnSentiCorp subset, evaluating on
# the small eval subset, then persist the resulting weights.
model.to(device)

trainer = Trainer(model=model,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=small_train_dataset,
                  eval_dataset=small_eval_dataset)

trainer.train()
# This saved copy is reloaded later in the notebook from "Ernie-finetuned".
model.save_pretrained("Ernie-finetuned")

In [None]:
# Evaluate on the eval dataset given to `trainer`; the returned metrics are the cell output.
trainer.evaluate()

In [None]:
# Hold-out set: 120 shuffled examples from the "validation" split (the "test"
# split is already used as the in-training eval set).
test_set = tokenized_datasets["validation"].shuffle(seed=42).select(range(120))

In [None]:
# Separate Trainer around the same model whose evaluation target is `test_set`.
testor = Trainer(model=model,
                 args=training_args,
                 compute_metrics=compute_metrics,
                 train_dataset=small_train_dataset,
                 eval_dataset=test_set)

In [None]:
# Score the model on `test_set`; the returned metrics are the cell output.
testor.evaluate()

In [None]:
# load from fine-tuned model
from transformers import Trainer, TrainingArguments

# Second configuration, replacing the earlier `training_args` (same output
# directory). Batch sizes are limited by the available GPU memory.
training_args = TrainingArguments(
    'finetune_trainer',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy='steps',
    eval_steps=100,
    metric_for_best_model="accuracy",
)

In [None]:
# Reload a checkpoint written by the Trainer and move it to the target device.
# NOTE(review): the path assumes a run that reached step 500 and saved there —
# adjust it to a checkpoint that actually exists under 'finetune_trainer'.
torch.cuda.empty_cache()
checkpoint_path = "finetune_trainer/checkpoint-500"
new_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
new_model.to(device)

In [None]:
# Wrap the reloaded checkpoint in a Trainer so it can be scored with `evaluate()`.
# FIXME(review): in the visible notebook, `small_train_dataset_danmu` and
# `small_eval_dataset_danmu` are only assigned inside a commented-out cell, so
# on a fresh kernel this cell raises NameError. Define them before this cell,
# or point it at the datasets that are actually built.
evaluator = Trainer(
    model=new_model,
    train_dataset=small_train_dataset_danmu,
    eval_dataset=small_eval_dataset_danmu,
    compute_metrics=compute_metrics,
    args=training_args
)


In [None]:
# Score the reloaded checkpoint; the returned metrics are the cell output.
evaluator.evaluate()


In [None]:
# best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")

In [None]:
# for n, v in best_run.hyperparameters.items():
#     setattr(trainer.args, n, v)
#
# trainer.train()

Ideas to try with other models:
Is it possible to label the text using votes from several different models?
Or only judge whether the emotion is extreme on average?
Or predict hotness from the number of comments with extreme emotion?

Or try to label the danmu through emotion discovery?

In [None]:
def compute_metrics_hotness(eval_pred):
    """Placeholder metric callback for the planned "hotness" experiment.

    For now it scores predictions with the module-level ``metric`` only: the
    predicted class of each row is the index of its largest logit, compared
    against ``labels``. The previously computed ``score`` value was never used
    and raised on empty predictions, so it has been removed.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        The result of ``metric.compute`` for the predictions and labels.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

The following part is about danmu data processing; it still has some problems.

In [None]:
# tokenized_datasets = tokenized_datasets.apply(lambda x: pd.Series([x.attention_mask, x.token_type_ids, x.input_ids], index=['attention_mask', 'token_type_ids', 'input_ids']))

In [None]:
# small_train_dataset_danmu = tokenized_datasets[: 800]
# small_eval_dataset_danmu = tokenized_datasets[800 :]

This part converts any kind of tabular data (here a pandas DataFrame) into a `Dataset`.

In [None]:
# Read at most the first 200 data rows of the danmu CSV; later cells use its
# 'text' and 'label' columns.
danmuku = pd.read_csv('test_dataframe.csv', nrows=200)

In [None]:
# def process_csv(data):
#     temp = tokenizer(data['text'], max_length=300, padding='max_length', truncation=True)
#     return temp.attention_mask, temp.token_type_ids, temp.input_ids
# tokenized_datasets = danmuku.apply(process_data, axis=1)
# Collapse every negative label to 1; all other labels are left unchanged.
# NOTE(review): confirm that mapping negatives to 1 (rather than 0) is intended.
negative_label = danmuku['label'] < 0
danmuku['label'] = danmuku['label'].mask(negative_label, 1)

In [None]:
# Count the rows whose label equals 0 and print the total.
sum_ = int((danmuku['label'] == 0).sum())
print(sum_)

In [None]:
# Keep only the columns needed downstream (['text', 'label']): drop every
# column whose name starts with "Unnamed". Remove any other extra columns too,
# or build a new frame with just those two.
# (The labels were fabricated to be 1 for the whole dataset.)
unnamed_column = danmuku.columns.str.contains('^Unnamed')
danmuku = danmuku.loc[:, ~unnamed_column]
#

In [None]:
# core function that convert pandas to dataset
from datasets import Dataset
# Convert the cleaned pandas frame into a `Dataset` so it can go through the
# same `.map(...)` / Trainer pipeline as ChnSentiCorp.
danmuku_dataset = Dataset.from_pandas(danmuku)

In [None]:
# Show the dataset's repr as the cell output to sanity-check the conversion.
danmuku_dataset

This part trains on the danmu labels, starting from the existing fine-tuned model.

In [None]:
# Tokenize each danmu row with the same `process_data` settings used for ChnSentiCorp.
tokenized_danmuku_dataset = danmuku_dataset.map(process_data)

In [None]:
# Reload the weights saved under "Ernie-finetuned" by the first training part.
# This rebinds `model`, replacing the reference to the earlier model object.
model = AutoModelForSequenceClassification.from_pretrained("Ernie-finetuned", num_labels=2)

In [None]:
# Uses the `training_args` defined above; set a smaller batch size there to see
# the loss at finer step granularity.
# Shuffle once with a fixed seed, then carve out a disjoint 160 / 40 train-eval split.
shuffled_danmuku = tokenized_danmuku_dataset.shuffle(seed=42)
testor = Trainer(model=model,
                 args=training_args,
                 compute_metrics=compute_metrics,
                 train_dataset=shuffled_danmuku.select(range(160)),
                 eval_dataset=shuffled_danmuku.select(range(160, 200)))

In [None]:
# Baseline: score the reloaded model on the danmu eval slice before any danmu training.
testor.evaluate()

In [None]:
# Release cached GPU memory, make sure the model is on the target device, then
# fine-tune on the danmu training slice configured in `testor`.
torch.cuda.empty_cache()
model.to(device)
testor.train()

In [None]:
# Score again on the danmu eval slice after training, for comparison with the baseline.
testor.evaluate()

In [None]:
# Let pandas display up to 500 rows before truncating the output.
pd.set_option('display.max_rows', 500)