In [1]:
import torch
import numpy as np
import pandas as pd
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_data=pd.read_csv("R2_train.csv",header=0,names=['id','categories','text'])
test_data=pd.read_csv("R2_test.csv",header=0,names=['id','categories','text'])
train_data['categories'] = train_data['categories'].map({'earn':0,'acq':1})  #测试的类型只有两类，完整的数据不只两类，跑HPC前需要修改一下
test_data['categories'] = test_data['categories'].map({'earn':0,'acq':1})  #测试的类型只有两类，完整的数据不只两类，跑HPC前需要修改一下
#train_data.head()

In [3]:
train_data.head()

Unnamed: 0,id,categories,text
0,0,1,COMPUTER TERMINAL SYSTEMS &lt;CPML> COMPLETES ...
1,1,1,NATIONAL AMUSEMENTS AGAIN UPS VIACOM &lt;VIA> ...
2,2,0,ROGERS &lt;ROG> SEES 1ST QTR NET UP SIGNIFICAN...
3,3,0,ISLAND TELEPHONE SHARE SPLIT APPROVED\n &lt;I...
4,4,1,U.K. GROWING IMPATIENT WITH JAPAN - THATCHER\n...


In [4]:
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

In [5]:
def encode_data(dataframe,tokenizer,max_seq_length=64):
    inputs = list(dataframe['text'])
    encoded = tokenizer(inputs,max_length=max_seq_length,truncation=True,padding="max_length",return_tensors="pt")
    return encoded

def extract_labels(dataframe):
    return list(dataframe['categories'])

def model_init():
    model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base")
    return model

def compute_metrics(eval_pred):
    labels = eval_pred.label_ids
    preds = eval_pred.predictions.argmax(-1)
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support,log_loss
    accurcay = accuracy_score(labels,preds)
    loss = log_loss(labels,preds)
    metrics = precision_recall_fscore_support(labels,preds,average='binary')
    precision = metrics[0]
    recall = metrics[1]
    f1 = metrics[2]
    return {'eval_accuracy':accurcay,'eval_precision':precision,
            'eval_recall':recall,'eval_f1':f1,'eval_loss':loss}

In [6]:
class CreateDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_seq_length=64):
        self.encoded_data = encode_data(dataframe,tokenizer,max_seq_length)
        self.label_list = extract_labels(dataframe)

    def __len__(self): 
        return len(self.label_list)

    def __getitem__(self, i):
        item_i = {}
        item_i['input_ids'] = self.encoded_data['input_ids'][i]
        item_i['attention_mask'] = self.encoded_data['attention_mask'][i]
        item_i['labels'] = self.label_list[i]
        
        return item_i

In [7]:
#split train_data into training and val data
training_data = train_data.sample(frac=0.8, random_state=8521)
val_data = train_data.drop(training_data.index)

train_data_deberta = CreateDataset(training_data, tokenizer)
val_data_deberta = CreateDataset(val_data, tokenizer)
test_data_deberta = CreateDataset(test_data, tokenizer)
train_data_deberta[0]

{'input_ids': tensor([    1,   495,  2571,  2620,  3450, 18012,   359,  7984,   131, 25652,
          3450, 15698,   112,  4014,  1209,  6997, 16718,  1105,   226, 17549,
         50118,  1437,  5008,   872,  8403,     6, 39196,  1954,  1963,   365,
             6, 27434, 50118,  1437,  1437,  1437,  1437,  1437,  4706,  8060,
             6, 37932,  1954, 29304,     6, 34067, 50118,  1437,  1437,  1437,
          1437,  1437,  6068,    35,   228,   458,   414,    45,   577,     6,
            25,   138,   439,     2]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'labels': 0}

In [8]:
training_args = TrainingArguments(
    output_dir="./models/",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=10, 
    per_device_eval_batch_size=5,
    num_train_epochs=3,
    logging_steps=2,  #每n步更新一次参数,根据数据量调整
    logging_first_step=True,
    save_steps=24, #每20步储存一次参数,根据数据量调整,存一次就可以了
    evaluation_strategy = "epoch", # evaluate at the end of every epoch
    logging_dir="./logs/",
    learning_rate=1e-5, #config
    weight_decay=0.01,
)

In [9]:
trainer = Trainer(args = training_args,
                  train_dataset=train_data_deberta,
                  eval_dataset=val_data_deberta,
                  tokenizer=tokenizer,
                  model_init = model_init,
                  compute_metrics = compute_metrics,)

loading configuration file https://huggingface.co/microsoft/deberta-base/resolve/main/config.json from cache at /Users/fengwenxin/.cache/huggingface/transformers/e313266bff73867debdfa78c78a9a4966d5e78281ac4ed7048c178b16a37eba7.fb501413b9cef9cef6babdc543bb4153cbec58d52bce077647efba3e3f14ccf3
Model config DebertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.17.0",
  "type_vocab_size": 0,
  "vocab_size": 50265
}

loading weights file https://hu

In [10]:
from ray import tune
from ray.tune.suggest.bayesopt import BayesOptSearch
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.suggest.basic_variant import BasicVariantGenerator

In [11]:
tune_config = {"learning_rate": tune.uniform(1e-5, 5e-5)} 

best_results = trainer.hyperparameter_search(
    hp_space = lambda _:tune_config,
    backend = 'ray',
    compute_objective = lambda metrics: metrics["eval_loss"],
    mode = 'min',
    search_alg = BasicVariantGenerator(),
    n_trials=3, 
)

print(best_results)

No `resources_per_trial` arg was passed into `hyperparameter_search`. Setting it to a default value of 1 CPU for each trial.


KeyboardInterrupt: 

In [12]:
training_args = TrainingArguments(
    output_dir="./models/",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=10, 
    per_device_eval_batch_size=5,
    num_train_epochs=3,
    logging_steps=100,  #每n步更新一次参数,根据数据量调整
    logging_first_step=True,
    save_steps=500, #每20步储存一次参数,根据数据量调整,存一次就可以了
    evaluation_strategy = "epoch", # evaluate at the end of every epoch
    logging_dir="./logs/",
    learning_rate=1e-5, #config
    weight_decay=0.01,
)

trainer = Trainer(args = training_args,
                  train_dataset=train_data_deberta,
                  eval_dataset=val_data_deberta,
                  tokenizer=tokenizer,
                  model_init = model_init,
                  compute_metrics = compute_metrics,)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/deberta-base/resolve/main/config.json from cache at /Users/fengwenxin/.cache/huggingface/transformers/e313266bff73867debdfa78c78a9a4966d5e78281ac4ed7048c178b16a37eba7.fb501413b9cef9cef6babdc543bb4153cbec58d52bce077647efba3e3f14ccf3
Model config DebertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dro

In [13]:
trainer.train()
trainer.evaluate()

loading configuration file https://huggingface.co/microsoft/deberta-base/resolve/main/config.json from cache at /Users/fengwenxin/.cache/huggingface/transformers/e313266bff73867debdfa78c78a9a4966d5e78281ac4ed7048c178b16a37eba7.fb501413b9cef9cef6babdc543bb4153cbec58d52bce077647efba3e3f14ccf3
Model config DebertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.17.0",
  "type_vocab_size": 0,
  "vocab_size": 50265
}

loading weights file https://hu

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1101,0.117901,0.967956,0.923304,0.990506,0.955725
2,0.0587,0.070884,0.98453,0.971875,0.984177,0.977987
3,0.0533,0.070265,0.985635,0.974922,0.984177,0.979528


***** Running Evaluation *****
  Num examples = 905
  Batch size = 5
Saving model checkpoint to ./models/checkpoint-500
Configuration saved in ./models/checkpoint-500/config.json
Model weights saved in ./models/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./models/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./models/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 905
  Batch size = 5
Saving model checkpoint to ./models/checkpoint-1000
Configuration saved in ./models/checkpoint-1000/config.json
Model weights saved in ./models/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./models/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./models/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 905
  Batch size = 5


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num example

{'eval_accuracy': 0.9856353591160221,
 'eval_precision': 0.9749216300940439,
 'eval_recall': 0.9841772151898734,
 'eval_f1': 0.9795275590551181,
 'eval_loss': 0.07026508450508118,
 'eval_runtime': 132.1165,
 'eval_samples_per_second': 6.85,
 'eval_steps_per_second': 1.37,
 'epoch': 3.0}

In [14]:
model_finetune = DebertaForSequenceClassification.from_pretrained("./models/checkpoint-1000",output_hidden_states=True)
model_pretrain = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base",output_hidden_states=True)

loading configuration file ./models/checkpoint-1000/config.json
Model config DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "architectures": [
    "DebertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "torch_dtype": "float32",
  "transformers_version": "4.17.0",
  "type_vocab_size": 0,
  "vocab_size": 50265
}

loading weights file ./models/checkpoint-1000/pytorch_model.bin
All model checkpoint weig

In [23]:
from tqdm import tqdm

In [24]:
# test_data=pd.read_csv("R2_test.csv",header=0,names=['classid','title','desc'], nrows = 10)
true_label = test_data['categories']

def text_representation(dataframe,model,tokenizer):
    representation = []
    for i in tqdm(range(len(dataframe))):
        text = dataframe.iloc[i]['text']
        inputs = tokenizer(text, max_length=64,padding="max_length",return_tensors="pt")
        outputs = model(**inputs)
        length = np.array(inputs['attention_mask'][0]).sum()
        encoding = outputs.hidden_states[-1][0].detach().numpy()[:length,:]
        encoding = list(encoding.mean(axis=0))
        representation.append(encoding)
    return np.array(representation)

In [26]:
pretrain_train = text_representation(train_data,model_pretrain,tokenizer)

100%|█████████████████████████████████████| 4527/4527 [1:02:57<00:00,  1.20it/s]


In [25]:
pretrain_test = text_representation(test_data,model_pretrain,tokenizer)

100%|███████████████████████████████████████| 1806/1806 [23:33<00:00,  1.28it/s]


In [27]:
finetune_train = text_representation(train_data,model_finetune,tokenizer)

100%|█████████████████████████████████████| 4527/4527 [1:04:50<00:00,  1.16it/s]


In [28]:
finetune_test = text_representation(test_data,model_finetune,tokenizer)

100%|███████████████████████████████████████| 1806/1806 [24:06<00:00,  1.25it/s]


In [29]:
pretrain_test.shape

(1806, 768)

In [30]:
def three_metrics(true_label,preds):  
    from sklearn.metrics import accuracy_score, normalized_mutual_info_score, adjusted_rand_score
    ACC = round(accuracy_score(true_label,preds),3)
    if ACC<=1/len(true_label.unique()): #说明：聚类的label，和真实的label没对上（改好了，不需要调整）
        keys = list(pd.value_counts(preds).index)
        values = list(pd.value_counts(true_label).index)
        dic = dict(zip(keys, values))
        preds = pd.Series(preds).map(dic)
    NMI = round(normalized_mutual_info_score(true_label,preds),3)
    ARI = round(adjusted_rand_score(true_label,preds),3)
    ACC = round(accuracy_score(true_label,preds),3)
    return {'ACC':ACC,'NMI':NMI,'ARI':ARI}

In [31]:
#K-Means
from sklearn.cluster import KMeans
clustering_model = KMeans(n_clusters = 2, 
                          init = 'k-means++',
                          max_iter = 300, n_init = 10,random_state=123)
clustering_model.fit(pretrain_train)
pretrain_KMeans = clustering_model.predict(pretrain_test)
clustering_model.fit(finetune_train)
finetune_KMeans = clustering_model.predict(finetune_test)
print('pretrained text representation:',three_metrics(true_label,pretrain_KMeans))
print('finetuned text representation:',three_metrics(true_label,finetune_KMeans))

pretrained text representation: {'ACC': 0.886, 'NMI': 0.487, 'ARI': 0.597}
finetuned text representation: {'ACC': 0.984, 'NMI': 0.878, 'ARI': 0.937}


In [32]:
#FCM
from fcmeans import FCM
fcm = FCM(n_clusters=2)
fcm.fit(np.array(pretrain_train))
pretrain_FCM = fcm.predict(pretrain_test)
fcm.fit(np.array(finetune_train))
finetune_FCM = fcm.predict(finetune_test)
print('pretrained text representation:',three_metrics(true_label,pretrain_FCM))
print('finetuned text representation:',three_metrics(true_label,finetune_FCM))

pretrained text representation: {'ACC': 0.919, 'NMI': 0.624, 'ARI': 0.703}
finetuned text representation: {'ACC': 0.984, 'NMI': 0.878, 'ARI': 0.937}


In [33]:
#LDA
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
pretrain_train2 = scaler.fit_transform(pretrain_train)
pretrain_test2 = scaler.fit_transform(pretrain_test)
finetune_train2 = scaler.fit_transform(finetune_train)
finetune_test2 = scaler.fit_transform(finetune_test)

lda = LatentDirichletAllocation(n_components=2, random_state=456)
lda.fit(pretrain_train2)
doc_topic_dist_unnormalized = np.matrix(lda.transform(pretrain_test2))
doc_topic_dist = doc_topic_dist_unnormalized/doc_topic_dist_unnormalized.sum(axis=1)
pretrain_LDA = list(np.array(doc_topic_dist.argmax(axis=1)).T[0])
lda.fit(finetune_train2)
doc_topic_dist_unnormalized = np.matrix(lda.transform(finetune_test2))
doc_topic_dist = doc_topic_dist_unnormalized/doc_topic_dist_unnormalized.sum(axis=1)
finetune_LDA = list(np.array(doc_topic_dist.argmax(axis=1)).T[0])
print('pretrained text representation:',three_metrics(true_label,pretrain_LDA))
print('finetuned text representation:',three_metrics(true_label,finetune_LDA))

pretrained text representation: {'ACC': 0.914, 'NMI': 0.598, 'ARI': 0.686}
finetuned text representation: {'ACC': 0.984, 'NMI': 0.882, 'ARI': 0.939}
