In [None]:
%config Completer.use_jedi=False

In [1]:
import random

import numpy as np
from sklearn.metrics import accuracy_score, top_k_accuracy_score

import torch
import pandas as pd
import tqdm

from transformers import DebertaTokenizerFast, DebertaForSequenceClassification, Trainer, TrainingArguments

from transformers import AutoTokenizer, AutoModelForSequenceClassification

from deberta_datasettmp import TextClassificationDataset
from torch.utils.data import DataLoader

In [2]:
pretrained_model_name = 'microsoft/deberta-v3-base'

model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name,
    num_labels=199
)

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

In [3]:
import json
with open("CategoryCode.json", "r")as f:
  Code = json.load(f)

In [24]:
train_label[:10]

[126, 25, 141, 10, 171, 116, 161, 27, 69, 66]

In [4]:

tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base', use_fast=False)
valid_ratio = 0.2
batch_size_per_device = 32
n_epochs = 10
warmup_ratio=0.2
max_length=256
num_classes=199

def get_datasets(data, valid_ratio=0.2):
    texts, labels = data['Text'].tolist(), data['MidIdx'].tolist()
    
    shuffeld = list(zip(texts, labels))
    random.shuffle(shuffeld)
  
    texts = [e[0] for e in shuffeld]
    labels = [e[1] for e in shuffeld]
    idx = int(len(texts) * (1 - valid_ratio))
    train_input, train_label = texts[:idx], labels[:idx]
    valid_input, valid_label = texts[idx:], labels[idx:]
    
    return  train_input, train_label, valid_input, valid_label


data = pd.read_csv('./traindata.csv')
train_input, train_label, valid_input, valid_label = get_datasets(data)

train_encodings = tokenizer(train_input, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_input, truncation=True, padding=True, max_length=max_length)
train_labels = torch.tensor(train_label)
valid_labels = torch.tensor(valid_label)

train_set = TextClassificationDataset(train_encodings,train_labels)
test_set = TextClassificationDataset(valid_encodings, valid_labels)

print(
    '|train|', len(train_input),
    '|valid|', len(valid_input),
)

total_batch_size = batch_size_per_device 
n_total_iterations = int(len(train_input) / total_batch_size * n_epochs) 
n_warmup_steps = int(n_total_iterations * warmup_ratio)

print(
    '#total_inters = ', n_total_iterations,
    '#warmup_iters = ', n_warmup_steps,
)



training_args = TrainingArguments(
    output_dir='./Deberta_V3_CheckPoint',
    num_train_epochs=n_epochs,
    per_device_train_batch_size=batch_size_per_device,
    per_device_eval_batch_size=batch_size_per_device,
    warmup_steps=n_warmup_steps,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    logging_steps=n_total_iterations // 100,
    save_steps=n_total_iterations // n_epochs
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    return {
        'accuracy' : top_k_accuracy_score(labels, preds, k=2)
    }


# 이제 CustomTrainer를 사용하여 학습을 수행합니다.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


|train| 185266 |valid| 46317
#total_inters =  57895 #warmup_iters =  11579


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
    compute_metrics=compute_metrics,
)

trainer.train()

  item["labels"] = torch.tensor(self.labels[idx]).long()


Epoch,Training Loss,Validation Loss


In [23]:
collator = TextClassificationCollator(tokenizer, max_length, with_text=False)

# 데이터셋에서 처음 10개 항목 가져오기
samples_to_test = [train_dataset[i] for i in range(10)]

# Collator에 항목 전달 및 결과 가져오기
collated_samples = collator(samples_to_test)

# 결과 출력
for key, value in collated_samples.items():
    print(f"{key}: {value}")

[{'text': 'Condensed matter "Cluster" reactions in LENRsIn this paper we first point out evidence for condensed matter cluster formation based on thin film electrolysis. Next, measurements of superconductivity in condensed matter deuterium "clusters" in dislocation sites loaded deloaded palladium thin films are briefly reviewed, followed by a discussion of techniques under study to increase the number of such sites per unit volume of the electrodes. Estimates for resulting "cluster reaction" rates flow enhanced Pycnonuclear fusion are given. If successful, this approach offers a "Roadmap" for future power unit based on thin films and clusters.', 'label': 40}, {'text': 'Assessement of kanban use on construction sitesDue to the lack of skilled labour and the difficulties in managing material flows during the execution of site works, construction companies (mainly in the northeast of Brazil) are looking for ways to facilitate communication between management teams and workers and gain con

In [24]:
from torch.utils.data import DataLoader

# 데이터셋과 collator 함수를 이용해 DataLoader 생성
dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size_per_device,
    collate_fn=TextClassificationCollator(tokenizer, max_length, with_text=False),
    shuffle=True
)

# 첫 번째 배치 출력
for batch in dataloader:
    print(batch)
    break

[{'text': "How to measure energy efficiency of software: Metrics and measurement resultsIn the field of information and computer technology (ICT), saving energy has its focus set on energy efficient hardware and its operation. Recently, efforts have also been made in the area of computer software. However, the development of energy efficient software requires metrics, which measure the software's energy consumption as well as models to monitor and minimize it. In software and software development processes they hardly exist. In this work we present a generic metric to measure software and a method to apply it in a software engineering process. © 2012 IEEE.", 'label': 100}, {'text': "Eastern Pulp & Paper gains new credit line, $2 million in fundsEastern Pulp & Paper has received a $2 million infusion of new shareholder loans. The paper company is currently in full operation at its two mills in Lincoln and Brewer. The company boasts of improved financial results owing to the acquisition 

In [6]:
nmr_data = pd.read_csv('/home/ubuntu/JIHO/KBSI/3domain_ab_key/NMR_Keybart_Result_check_sample.csv')
shrimp_data = pd.read_csv('/home/ubuntu/JIHO/KBSI/3domain_ab_key/SHRIMP_Keybart_Result_check_sample.csv')
hvem_data = pd.read_csv('/home/ubuntu/JIHO/KBSI/3domain_ab_key/HVEM_Keybart_Result_check_sample.csv')

In [7]:
def MadeInputData(nmr, shrimp, hvem):
    nmr_list = []
    shrimp_list =[]
    hvem_list =[]
    for k in nmr.iloc:
        nmr_list.append(k['abstract'])
    for k in shrimp.iloc:
        shrimp_list.append(k['abstract'])
    for k in hvem.iloc:
        hvem_list.append(k['abstract'])
        
    return nmr_list, shrimp_list, hvem_list

In [8]:
test_nmr, test_shrimp, test_hvem = MadeInputData(nmr_data, shrimp_data, hvem_data)

In [16]:
nmr_data = tokenizer(test_nmr, padding=True)
hvem_data = tokenizer(test_hvem, padding=True)
shrimp_data = tokenizer(test_shrimp, padding=True)

test_dataset1 = pd.DataFrame({"text":nmr_data})
test_dataset2 = pd.DataFrame({"text":hvem_data})
test_dataset3 = pd.DataFrame({"text":shrimp_data})

In [11]:
class SimpleDataset:
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts
    
    def __len__(self):
        return len(self.tokenized_texts["input_ids"])
    
    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.tokenized_texts.items()}

In [17]:
test_dataset1 = SimpleDataset(nmr_data)
test_dataset2 = SimpleDataset(hvem_data)
test_dataset3 = SimpleDataset(shrimp_data)

In [13]:
trainer = Trainer(trainer.model)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [18]:
predictions1 = trainer.predict(test_dataset1)
predictions2 = trainer.predict(test_dataset2)
predictions3 = trainer.predict(test_dataset3)


***** Running Prediction *****
  Num examples = 161
  Batch size = 24
***** Running Prediction *****
  Num examples = 156
  Batch size = 24
***** Running Prediction *****
  Num examples = 170
  Batch size = 24


In [22]:
len(predictions1.predictions)

161

In [20]:
from tqdm import tqdm

In [30]:
a = np.array([1,2,3])

a = a.tolist()
type(a)

list

In [65]:
nmr_pred, hvem_pred, shrimp_pred = [], [], []

for k in tqdm(range(10)):
    arr = predictions1.predictions[k].tolist()
    tmp_arr = arr[:]
    tmp_list = []
    tmp_arr.sort()

    for k in range(5):
        tmp = tmp_arr.pop()
        tmp_list.append(arr.index(tmp))
    nmr_pred.append(tmp_list)
    #hv
    arr = predictions2.predictions[k].tolist()
    tmp_arr = arr[:]
    tmp_list = []
    tmp_arr.sort()

    for k in range(5):
        tmp = tmp_arr.pop()
        tmp_list.append(arr.index(tmp))
    hvem_pred.append(tmp_list) 
    
    #sh 아 같이 한번에 했어야 됐는데;;
    arr = predictions3.predictions[k].tolist()
    tmp_arr = arr[:]
    tmp_list = []
    tmp_arr.sort()

    for k in range(5):
        tmp = tmp_arr.pop()
        tmp_list.append(arr.index(tmp))
    shrimp_pred.append(tmp_list) 

100%|██████████| 10/10 [00:00<00:00, 7461.85it/s]


In [5]:
torch.save(model.state_dict(), "./weight")

In [58]:
type(list(index_to_label.keys())[1])

int

In [51]:
type(nmr_pred[0][0])

int

In [66]:
def ch(arr):
    tmp_list = []
    for i in arr:
        tmp_list.append(index_to_label[i])
        print(index_to_label[i])
    return tmp_list

In [67]:
nmr_lab , hvem_lab, shrimp_lab =[], [], []
for k in range(10):
    #머리 정지
    nmr_lab.append(ch(nmr_pred[k]))
    hvem_lab.append(ch(hvem_pred[k]))
    shrimp_lab.append(ch(shrimp_pred[k]))
    

전기전자부품
에너지·환경기계시스템
천문학
친환경_공정
염색가공
광화학
금속재료
나노화학
반도체장비
나노화학공정기술
지구화학
지질과학
온실가스처리
뇌의약
물리화학
생화학·구조생물학
생화학
융합화학
분류·생태·환경생물학
의생명과학
광화학
금속재료
나노화학
반도체장비
나노화학공정기술
지구화학
지질과학
온실가스처리
뇌의약
물리화학
농화학
원예작물과학
산업바이오
융합화학
유기화학
광화학
금속재료
나노화학
반도체장비
나노화학공정기술
지구화학
지질과학
온실가스처리
뇌의약
물리화학
면역학·생리학
분자세포생물학
의생명과학
생화학
생화학·구조생물학
광화학
금속재료
나노화학
반도체장비
나노화학공정기술
지구화학
지질과학
온실가스처리
뇌의약
물리화학
생화학·구조생물학
융합화학
생화학
정밀화학
원자·분자물리
광화학
금속재료
나노화학
반도체장비
나노화학공정기술
지구화학
지질과학
온실가스처리
뇌의약
물리화학
생화학·구조생물학
융합화학
원자·분자물리
융합바이오
생화학
광화학
금속재료
나노화학
반도체장비
나노화학공정기술
지구화학
지질과학
온실가스처리
뇌의약
물리화학
면역학·생리학
분석·물성평가기술
물리화학
통계물리
고분자공정기술
광화학
금속재료
나노화학
반도체장비
나노화학공정기술
지구화학
지질과학
온실가스처리
뇌의약
물리화학
물리화학
융합바이오
융합화학
분석화학
면역학·생리학
광화학
금속재료
나노화학
반도체장비
나노화학공정기술
지구화학
지질과학
온실가스처리
뇌의약
물리화학
해양생명
물리화학
면역학·생리학
어업생산·이용가공
농화학
광화학
금속재료
나노화학
반도체장비
나노화학공정기술
지구화학
지질과학
온실가스처리
뇌의약
물리화학
물리화학
생화학·구조생물학
융합화학
원자·분자물리
산업바이오
광화학
금속재료
나노화학
반도체장비
나노화학공정기술
지구화학
지질과학
온실가스처리
뇌의약
물리화학


In [71]:
Result1 = pd.DataFrame({"abstarct": test_nmr[:10], "classification": nmr_lab})
Result2 = pd.DataFrame({"abstarct": test_hvem[:10], "classification": hvem_lab})
Result3 = pd.DataFrame({"abstarct": test_shrimp[:10], "classification": shrimp_lab})

In [72]:
writer = pd.ExcelWriter("Classification_debertaV3.xlsx", engine="xlsxwriter")
Result1.to_excel(writer, sheet_name="NMR")
Result2.to_excel(writer, sheet_name="HVEM")
Result3.to_excel(writer, sheet_name="SHRIMP")

In [75]:
writer.save()