## DeBERTa

In [1]:
import torch
import numpy as np
import pandas as pd
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset
import warnings
warnings.filterwarnings('ignore')
# Load the Yahoo train/test CSVs; the files' own header row is replaced by explicit column names.
column_names = ['categories', 'text']
train_data = pd.read_csv("yahoo_train.csv", header=0, names=column_names)
test_data = pd.read_csv("yahoo_test.csv", header=0, names=column_names)

# Shift the 1-based category ids (1..10) to 0-based class indices (0..9).
category_to_index = {category: category - 1 for category in range(1, 11)}
for frame in (train_data, test_data):
    frame['categories'] = frame['categories'].map(category_to_index)
    # Cast every text cell to str so the tokenizer only ever receives strings.
    frame['text'] = frame['text'].values.astype('str')

train_data.head()

Unnamed: 0,categories,text
0,0,Good question.
1,0,for religious or philosophical matter yes. no...
2,0,My stomach growled so loud once in Church that...
3,0,"The same reasons women do. \n\nthe ""game""...i..."
4,0,That is my hope. Unless we examine the mistak...


In [2]:
# Sanity check: show the distinct label values present in each split after the remapping.
print(train_data['categories'].unique())
print(test_data['categories'].unique())

[0 1 2 3 4 5 6 7 8 9]
[0 1 2 3 4 5 6 7 8 9]


In [3]:
# Class balance check: number of training rows per category.
train_data['categories'].value_counts()

0    400
1    400
2    400
3    400
4    400
5    400
6    400
7    400
8    400
9    400
Name: categories, dtype: int64

In [4]:
# Tokenizer for the 'microsoft/deberta-base' checkpoint; shared by every dataset and embedding step below.
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
# Commented-out alternative kept for reference: RoBERTa fast tokenizer.
#from transformers import RobertaTokenizerFast
#tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

In [5]:
def encode_data(dataframe, tokenizer, max_seq_length=64):
    """Tokenize the 'text' column of ``dataframe`` in a single batched call.

    Args:
        dataframe: frame with a 'text' column.
        tokenizer: callable tokenizer; invoked with the list of texts and the
            keyword arguments built below.
        max_seq_length: length every sequence is truncated / padded to.

    Returns:
        Whatever ``tokenizer`` returns for the batch (PyTorch tensors are
        requested via ``return_tensors="pt"``).
    """
    tokenizer_kwargs = dict(
        max_length=max_seq_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    return tokenizer(list(dataframe['text']), **tokenizer_kwargs)

def extract_labels(dataframe):
    """Return the values of the 'categories' column as a plain Python list, in row order."""
    return [label for label in dataframe['categories']]

def model_init():
    """Build a new 10-label DeBERTa sequence classifier from 'microsoft/deberta-base'."""
    return DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=10)

def compute_metrics(eval_pred):
    """Compute accuracy, NMI and ARI for one evaluation pass.

    Args:
        eval_pred: object exposing ``label_ids`` (gold labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with keys 'eval_accuracy', 'eval_NMI' and 'eval_ARI'.
    """
    from sklearn.metrics import accuracy_score, normalized_mutual_info_score, adjusted_rand_score

    gold = eval_pred.label_ids
    predicted = eval_pred.predictions.argmax(-1)
    scorers = {
        'eval_accuracy': accuracy_score,
        'eval_NMI': normalized_mutual_info_score,
        'eval_ARI': adjusted_rand_score,
    }
    return {metric_name: scorer(gold, predicted) for metric_name, scorer in scorers.items()}

In [6]:
# from sklearn.metrics import accuracy_score, precision_recall_fscore_support,log_loss
# accuracy_score([0,1,2,3,0,1,2,3],[0,1,2,3,1,1,1,1])
# log_loss([0,1,2,3,0,1,2,3],[0,1,2,3,1,1,1,1])

In [7]:
class CreateDataset(Dataset):
    """Torch dataset pairing pre-tokenized texts with their labels.

    The whole dataframe is tokenized once in ``__init__``; ``__getitem__``
    only indexes into the stored encodings.
    """

    def __init__(self, dataframe, tokenizer, max_seq_length=64):
        """Tokenize ``dataframe['text']`` and keep ``dataframe['categories']`` as the labels."""
        self.encoded_data = encode_data(dataframe, tokenizer, max_seq_length)
        self.label_list = extract_labels(dataframe)

    def __len__(self):
        """Number of examples, taken from the label list."""
        return len(self.label_list)

    def __getitem__(self, i):
        """Return example ``i`` as a dict with 'input_ids', 'attention_mask' and 'labels'."""
        example = {key: self.encoded_data[key][i] for key in ('input_ids', 'attention_mask')}
        example['labels'] = self.label_list[i]
        return example

In [8]:
# Hold out 20% of train_data for validation: sample 80% with a fixed seed,
# then keep the rows that were not sampled as the validation split.
training_data = train_data.sample(frac=0.8, random_state=8521)
val_data = train_data.drop(index=training_data.index)

# Tokenize each split once and wrap it as a torch Dataset.
train_data_deberta, val_data_deberta, test_data_deberta = (
    CreateDataset(split, tokenizer) for split in (training_data, val_data, test_data)
)
train_data_deberta[0]

{'input_ids': tensor([    1,  8585,    32,   171, 37457,   282, 37457,   282,  8863,   448,
         37457,   282,   250,  7742, 37457,   282, 24699, 18611, 37457,   282,
          7331, 37457,   282,  2336,  3048, 37457,   282,   574,  2796, 44320,
         37457,   282,   565,  3196, 15473,  3813, 37457,   282, 36693,   863,
          2068, 10466, 44128,   282,  6850,  5330, 12743, 37457,   282,  3721,
         12946, 37457,   282,  2118,   100,  2796, 47701, 37457,   282, 21134,
         43896, 26624, 37457,     2]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'labels': 3}

In [9]:
# Fine-tuning configuration for the Trainer: output/log directories, batch sizes,
# epoch count, logging/saving/evaluation cadence and optimizer settings.
training_args = TrainingArguments(
    output_dir="./models_yahoo/",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=10, 
    per_device_eval_batch_size=5,
    num_train_epochs=3,
    logging_steps=100,  # log every 100 steps; adjust to the dataset size
    logging_first_step=True,
    save_steps=960, # save a checkpoint every 960 steps; adjust to the dataset size. With 3200 training rows, batch size 10 and 3 epochs (single device) that is one checkpoint at the final step, which is enough
    evaluation_strategy = "epoch", # evaluate at the end of every epoch
    logging_dir="./logs_yahoo/",
    learning_rate=1e-5, # default learning rate used by the plain trainer.train() run
    weight_decay=0.01,
)

In [12]:
# Build the Trainer. The model is supplied through model_init (a factory) rather than
# as an instance, so the Trainer can create a fresh model itself.
# Passing the tokenizer makes it part of the saved checkpoints.
trainer = Trainer(args = training_args,
                  train_dataset=train_data_deberta,
                  eval_dataset=val_data_deberta,
                  tokenizer=tokenizer,
                  model_init = model_init,
                  compute_metrics = compute_metrics,)

loading configuration file https://huggingface.co/microsoft/deberta-base/resolve/main/config.json from cache at /Users/fengwenxin/.cache/huggingface/transformers/e313266bff73867debdfa78c78a9a4966d5e78281ac4ed7048c178b16a37eba7.fb501413b9cef9cef6babdc543bb4153cbec58d52bce077647efba3e3f14ccf3
Model config DebertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions

In [None]:
from ray import tune
from ray.tune.suggest.bayesopt import BayesOptSearch
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.suggest.basic_variant import BasicVariantGenerator

In [None]:
# Search space: learning rate sampled uniformly from [1e-5, 5e-5].
tune_config = {"learning_rate": tune.uniform(1e-5, 5e-5)}

best_results = trainer.hyperparameter_search(
    hp_space = lambda _: tune_config,
    backend = 'ray',
    # Objective reported for each trial: ARI on the validation split.
    compute_objective = lambda metrics: metrics["eval_ARI"],
    # ARI is better when higher. `direction` defaults to "minimize", which would
    # make the search return the trial with the LOWEST ARI, so set it explicitly.
    direction = 'maximize',
    # Extra keyword arguments are forwarded to ray.tune.run; `mode` only informs Ray.
    mode = 'max',
    search_alg = BasicVariantGenerator(),
    n_trials=3,
)

print(best_results)

In [14]:
# Fine-tune with the settings in training_args, then score the validation split;
# as the cell's last expression, the evaluate() metrics dict is displayed.
trainer.train()
trainer.evaluate()

loading configuration file https://huggingface.co/microsoft/deberta-base/resolve/main/config.json from cache at /Users/fengwenxin/.cache/huggingface/transformers/e313266bff73867debdfa78c78a9a4966d5e78281ac4ed7048c178b16a37eba7.fb501413b9cef9cef6babdc543bb4153cbec58d52bce077647efba3e3f14ccf3
Model config DebertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions

Epoch,Training Loss,Validation Loss,Accuracy,Nmi,Ari
1,1.4395,1.358159,0.57875,0.384189,0.301444
2,1.1914,1.282262,0.58875,0.388927,0.311402
3,0.9985,1.272746,0.5925,0.391903,0.319266


***** Running Evaluation *****
  Num examples = 800
  Batch size = 5
***** Running Evaluation *****
  Num examples = 800
  Batch size = 5
Saving model checkpoint to ./models_yahoo/checkpoint-960
Configuration saved in ./models_yahoo/checkpoint-960/config.json
Model weights saved in ./models_yahoo/checkpoint-960/pytorch_model.bin
tokenizer config file saved in ./models_yahoo/checkpoint-960/tokenizer_config.json
Special tokens file saved in ./models_yahoo/checkpoint-960/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 800
  Batch size = 5


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 800
  Batch size = 5


{'eval_accuracy': 0.5925,
 'eval_NMI': 0.39190288617485936,
 'eval_ARI': 0.31926614615283705,
 'eval_loss': 1.2727457284927368,
 'eval_runtime': 123.566,
 'eval_samples_per_second': 6.474,
 'eval_steps_per_second': 1.295,
 'epoch': 3.0}

#### 2. Clustering with Fine-tuned DeBERTa and pre-trained DeBERTa

In [10]:
# Load the two encoders to compare: the checkpoint written by the fine-tuning run and
# the untouched base checkpoint. output_hidden_states=True makes the forward pass return
# the per-layer hidden states that the embedding step reads.
model_finetune = DebertaForSequenceClassification.from_pretrained("./models_yahoo/checkpoint-960",num_labels=10,output_hidden_states=True)
model_pretrain = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base",num_labels=10,output_hidden_states=True)

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'pooler.den

In [None]:
# inputs = tokenizer("Hello, my dog is cute", max_length=64,padding="max_length",return_tensors="pt")
# outputs = model(**inputs)
# #get the text representation from the last hidden layer
# length = np.array(inputs['attention_mask'][0]).sum()
# encoding = outputs.hidden_states[-1][0].detach().numpy()[:length,:]
# encoding = encoding.mean(axis=0)

In [1]:
from tqdm import tqdm

In [12]:
# Ground-truth class indices of the test split, used to score the clusterings below.
true_label = test_data['categories']

def text_representation(dataframe,model,tokenizer,max_seq_length=64):
    """Embed every row of ``dataframe['text']`` as a mean-pooled last-layer vector.

    Each text is tokenized on its own, truncated / padded to ``max_seq_length``
    (the same treatment ``encode_data`` applies for fine-tuning), passed through
    ``model``, and the last hidden layer is averaged over the non-padding tokens.

    Args:
        dataframe: frame with a 'text' column.
        model: model whose output exposes ``hidden_states`` (i.e. loaded with
            ``output_hidden_states=True``).
        tokenizer: tokenizer matching ``model``.
        max_seq_length: maximum number of tokens kept per text (default 64).

    Returns:
        ``np.ndarray`` with one row per input text.
    """
    representation = []
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for i in tqdm(range(len(dataframe))):
            text = dataframe.iloc[i]['text']
            # truncation=True is required for max_length to actually cap long
            # texts; without it they are encoded at full length.
            inputs = tokenizer(text, max_length=max_seq_length, truncation=True,
                               padding="max_length", return_tensors="pt")
            outputs = model(**inputs)
            # Number of real (non-padding) tokens; padding sits after them, so a
            # prefix slice selects exactly the real tokens.
            length = np.array(inputs['attention_mask'][0]).sum()
            encoding = outputs.hidden_states[-1][0].numpy()[:length,:]
            representation.append(encoding.mean(axis=0))
    return np.array(representation)

# pretrain_train = text_representation(train_data,model_pretrain,tokenizer)
# pretrain_test = text_representation(test_data,model_pretrain,tokenizer)
# finetune_train = text_representation(train_data,model_finetune,tokenizer)
# finetune_test = text_representation(test_data,model_finetune,tokenizer)

In [13]:
# Embed the full training set with the base (not fine-tuned) model.
pretrain_train = text_representation(train_data,model_pretrain,tokenizer)

100%|███████████████████████████████████████| 4000/4000 [30:04<00:00,  2.22it/s]


In [14]:
# Embed the test set with the base (not fine-tuned) model.
pretrain_test = text_representation(test_data,model_pretrain,tokenizer)

100%|███████████████████████████████████████| 1000/1000 [07:02<00:00,  2.37it/s]


In [15]:
# Embed the full training set with the fine-tuned checkpoint.
finetune_train = text_representation(train_data,model_finetune,tokenizer)

100%|███████████████████████████████████████| 4000/4000 [29:37<00:00,  2.25it/s]


In [16]:
# Embed the test set with the fine-tuned checkpoint.
finetune_test = text_representation(test_data,model_finetune,tokenizer)

100%|███████████████████████████████████████| 1000/1000 [07:06<00:00,  2.35it/s]


In [17]:
# Shape check: one embedding row per test example.
pretrain_test.shape

(1000, 768)

In [18]:
def three_metrics(true_label,preds):
    """Score cluster assignments against ground-truth labels.

    Cluster ids are arbitrary, so accuracy is computed after pairing each
    cluster with a distinct true class using the one-to-one assignment that
    maximises the number of agreeing samples (Hungarian algorithm). Samples in
    a cluster left without a partner count as errors. NMI and ARI do not depend
    on how clusters are named and are computed on the ids as given.

    Args:
        true_label: ground-truth labels (any 1-D array-like, e.g. a Series).
        preds: predicted cluster ids, same length as ``true_label``.

    Returns:
        dict with keys 'ACC', 'NMI' and 'ARI', each rounded to 3 decimals.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    from scipy.optimize import linear_sum_assignment
    from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score

    y_true = np.asarray(true_label).ravel()
    y_pred = np.asarray(preds).ravel()
    if y_true.size == 0 or y_true.size != y_pred.size:
        raise ValueError("true_label and preds must be non-empty and of equal length")

    # Re-index both label sets to 0..k-1 so they can address a contingency table.
    _, true_idx = np.unique(y_true, return_inverse=True)
    _, pred_idx = np.unique(y_pred, return_inverse=True)
    overlap = np.zeros((pred_idx.max() + 1, true_idx.max() + 1), dtype=np.int64)
    # overlap[c, t] = number of samples placed in cluster c whose true class is t.
    np.add.at(overlap, (pred_idx, true_idx), 1)

    # Best one-to-one cluster -> class pairing; the matched cells are the hits.
    matched_clusters, matched_classes = linear_sum_assignment(overlap, maximize=True)
    ACC = round(float(overlap[matched_clusters, matched_classes].sum()) / y_true.size, 3)
    NMI = round(normalized_mutual_info_score(y_true, y_pred), 3)
    ARI = round(adjusted_rand_score(y_true, y_pred), 3)
    return {'ACC':ACC,'NMI':NMI,'ARI':ARI}

In [None]:
# K-Means: fit on the training embeddings of each representation, assign the test
# embeddings to the learned centroids, and score the assignments.
from sklearn.cluster import KMeans

clustering_model = KMeans(
    n_clusters=10,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=8521,
)
# fit() returns the estimator itself, so fit and predict can be chained;
# the same estimator object is refit for the second representation.
pretrain_KMeans = clustering_model.fit(pretrain_train).predict(pretrain_test)
finetune_KMeans = clustering_model.fit(finetune_train).predict(finetune_test)

for caption, assignments in (
    ('pretrained text representation:', pretrain_KMeans),
    ('finetuned text representation:', finetune_KMeans),
):
    print(caption, three_metrics(true_label, assignments))

In [None]:
# FCM (fuzzy c-means): same protocol as the other clustering cells — fit on the
# training embeddings, predict cluster ids for the test embeddings, then score them.
# NOTE(review): no seed is passed to FCM here, so results may vary between runs —
# confirm whether the installed fcmeans version accepts a random_state argument.
from fcmeans import FCM
fcm = FCM(n_clusters=10)
fcm.fit(np.array(pretrain_train))
pretrain_FCM = fcm.predict(pretrain_test)
# The same FCM object is refit on the fine-tuned embeddings.
fcm.fit(np.array(finetune_train))
finetune_FCM = fcm.predict(finetune_test)
print('pretrained text representation:',three_metrics(true_label,pretrain_FCM))
print('finetuned text representation:',three_metrics(true_label,finetune_FCM))

In [None]:
# LDA: treat the 10 topics as clusters and assign each test document to its
# highest-weight topic.
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MinMaxScaler

# LDA needs non-negative input, so the embeddings are min-max scaled. The scaler is
# fit on the TRAINING matrix only and then applied to the test matrix, so both splits
# share one scale (fitting a second time on the test set would rescale it with its own
# min/max). Test values outside the training range are clipped to [0, 1] so they
# cannot go negative.
scaler = MinMaxScaler()
pretrain_train2 = scaler.fit_transform(pretrain_train)
pretrain_test2 = np.clip(scaler.transform(pretrain_test), 0, 1)
finetune_train2 = scaler.fit_transform(finetune_train)
finetune_test2 = np.clip(scaler.transform(finetune_test), 0, 1)

lda = LatentDirichletAllocation(n_components=10, random_state=456)

lda.fit(pretrain_train2)
doc_topic_dist_unnormalized = lda.transform(pretrain_test2)
# Row-normalise to a distribution over topics (does not change the arg-max).
doc_topic_dist = doc_topic_dist_unnormalized / doc_topic_dist_unnormalized.sum(axis=1, keepdims=True)
pretrain_LDA = list(doc_topic_dist.argmax(axis=1))

# Refit the same estimator on the fine-tuned embeddings.
lda.fit(finetune_train2)
doc_topic_dist_unnormalized = lda.transform(finetune_test2)
doc_topic_dist = doc_topic_dist_unnormalized / doc_topic_dist_unnormalized.sum(axis=1, keepdims=True)
finetune_LDA = list(doc_topic_dist.argmax(axis=1))

print('pretrained text representation:',three_metrics(true_label,pretrain_LDA))
print('finetuned text representation:',three_metrics(true_label,finetune_LDA))

In [None]:
#model = ElectraForSequenceClassification.from_pretrained("google/electra-base-discriminator", num_labels=4,output_hidden_states=True)

In [None]:
# inputs = tokenizer("Hello, my dog is cute", max_length=10,padding="max_length",return_tensors="pt")
# outputs = model(**inputs)
# outputs.logits