## **Import Lib**



In [1]:
%%capture
!pip install datasets

In [2]:
import os
import ast
import json
import pandas as pd
from datasets import load_dataset
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
os.chdir("/content/drive/MyDrive/2_PBL7/")

## **Init**

In [None]:
dataset_names = ['inspec', 'se-2010', 'se-2017', 'kp20k']

## **Load Data**

- input: abstract
- label: danh sách các keyword

Nguồn data:
- Inspec: https://huggingface.co/datasets/midas/inspec?row=0
- SE-2010: https://huggingface.co/datasets/taln-ls2n/semeval-2010-pre?row=0
- SE-2017: https://huggingface.co/datasets/midas/semeval2017

Hoặc sử dụng kp20k: gồm 530809 rows dữ liệu

### KP20K

In [6]:
# !unzip "./Phraserformer/data.zip" -d "./Phraserformer"

In [7]:
# Đường dẫn tới thư mục chứa các file JSON
folder_path = "./Phraserformer/data/dataset/kp20k"

# Khởi tạo một danh sách để lưu trữ dữ liệu từ các file JSON
data_list = []

# Lặp qua tất cả các file trong thư mục
for filename in os.listdir(folder_path):
    # Kiểm tra nếu file có đuôi là ".json"
    if filename.endswith(".json"):
        # Đường dẫn đến file JSON
        file_path = os.path.join(folder_path, filename)
        # Mở file và load nội dung
        with open(file_path, "r") as file:
            # Thêm dữ liệu từ file vào danh sách
            data_list.extend([json.loads(line) for line in file])

In [8]:
print(len(data_list))

570809


In [9]:
print(data_list[0:10])

[{'abstract': 'This paper proposes using virtual reality to enhance the perception of actions by distant users on a shared application. Here, distance may refer either to space ( e.g. in a remote synchronous collaboration) or time ( e.g. during playback of recorded actions). Our approach consists in immersing the application in a virtual inhabited 3D space and mimicking user actions by animating avatars. We illustrate this approach with two applications, the one for remote collaboration on a shared application and the other to playback recorded sequences of user actions. We suggest this could be a low cost enhancement for telepresence.', 'keyword': 'telepresence;animation;avatars;application sharing;collaborative virtual environments', 'title': 'virtually enhancing the perception of user actions'}, {'abstract': 'This paper presents an improved architecture of the multistage multibit sigma-delta modulators (EAMs) for wide-band applications. Our approach is based on two resonator topolog

In [10]:
# Creating DataFrame from list of dictionaries
kp20k_df = pd.DataFrame(data_list)

In [11]:
kp20k_df['keyword'] = kp20k_df['keyword'].apply(lambda x: x.split(";"))

In [12]:
# Displaying the DataFrame
kp20k_df.head(5)

Unnamed: 0,abstract,keyword,title
0,This paper proposes using virtual reality to e...,"[telepresence, animation, avatars, application...",virtually enhancing the perception of user act...
1,This paper presents an improved architecture o...,"[sigma delta modulators, analog-to-digital con...",Dynamic range improvement of multistage multib...
2,"In this paper, we discuss the motivation and t...",[enterprise information integration and intero...,An ontology modelling perspective on business ...
3,An overview of the self-organizing map algorit...,"[self-organizing map, learning vector quantiza...",The self-organizing map
4,The amygdala comprises part of an extended net...,"[social brain, amygdala, behavior, facial expr...",The Amygdala and Development of the Social Brain


### Inspec

In [13]:
inspec_ds = load_dataset("midas/inspec")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/25.4k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.13M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [14]:
print(inspec_ds)

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'doc_bio_tags'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['id', 'document', 'doc_bio_tags'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['id', 'document', 'doc_bio_tags'],
        num_rows: 500
    })
})


In [15]:
train_inspec_df = pd.DataFrame(inspec_ds['train'])
validation_inspec_df = pd.DataFrame(inspec_ds['validation'])
test_inspec_df = pd.DataFrame(inspec_ds['test'])

In [16]:
all_inspec_df = pd.concat([train_inspec_df, validation_inspec_df, test_inspec_df], ignore_index=True)

In [17]:
print(all_inspec_df)

        id                                           document  \
0     1001  [A, conflict, between, language, and, atomisti...   
1     1002  [Selective, representing, and, world-making, W...   
2     1000  [Does, classicism, explain, universality, ?, A...   
3      100  [Separate, accounts, go, mainstream, -LSB-, in...   
4     1012  [Evolving, receptive-field, controllers, for, ...   
...    ...                                                ...   
1995   402  [Fast, frequency, acquisition, phase-frequency...   
1996   392  [Time-varying, properties, of, renal, autoregu...   
1997   384  [Brightness-independent, start-up, routine, fo...   
1998    37  [Design, PID, controllers, for, desired, time-...   
1999   411  [CAD/CAE, software, aids, converter, design, -...   

                                           doc_bio_tags  
0     [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
1     [B, I, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
2     [O, B, O, B, O, O, O, O, O, B, I, I, I,

In [18]:
def keyword_list(x):
  keywords = set()
  i = 0
  while i<len(x['doc_bio_tags']):
    if x['doc_bio_tags'][i]=='B':
      keyword = [x['document'][i]]
      while (i+1<len(x['doc_bio_tags']) and x['doc_bio_tags'][i+1]=='I'):
        keyword.append(x['document'][i+1])
        i+=1
      keyword = ' '.join(keyword)
      keywords.add(keyword)
    i+=1
  return list(keywords)
keyword_list(all_inspec_df.loc[0])

['Content Atomism',
 'philosophy of mind',
 'IBS',
 'cognitive states',
 'LOT',
 'beliefs',
 'desires',
 'Language of Thought']

In [19]:
# Tạo dataframe mới
inspec_df = pd.DataFrame()

# Cột abstract là ghép các phần tử trong document thành chuỗi
inspec_df['abstract'] = all_inspec_df['document'].apply(' '.join)

# Cột keyword lấy các từ có doc_bio_tags = 'B'
inspec_df['keyword'] = all_inspec_df.apply(lambda x: keyword_list(x), axis=1)



In [20]:
print(inspec_df.iloc[0]['abstract'])
print(inspec_df.iloc[0]['keyword'])

A conflict between language and atomistic information Fred Dretske and Jerry Fodor are responsible for popularizing three well-known theses in contemporary philosophy of mind : the thesis of Information-Based Semantics -LRB- IBS -RRB- , the thesis of Content Atomism -LRB- Atomism -RRB- and the thesis of the Language of Thought -LRB- LOT -RRB- . LOT concerns the semantically relevant structure of representations involved in cognitive states such as beliefs and desires . It maintains that all such representations must have syntactic structures mirroring the structure of their contents . IBS is a thesis about the nature of the relations that connect cognitive representations and their parts to their contents -LRB- semantic relations -RRB- . It holds that these relations supervene solely on relations of the kind that support information content , perhaps with some help from logical principles of combination . Atomism is a thesis about the nature of the content of simple symbols . It holds 

In [21]:
inspec_df.head(5)

Unnamed: 0,abstract,keyword
0,A conflict between language and atomistic info...,"[Content Atomism, philosophy of mind, IBS, cog..."
1,Selective representing and world-making We dis...,"[realism, Selective representing, selective re..."
2,Does classicism explain universality ? Argumen...,"[human cognition, connectionist models, univer..."
3,Separate accounts go mainstream -LSB- investme...,"[independent money managers, investment]"
4,Evolving receptive-field controllers for mobil...,"[nonlinear interactions, evolutionary methods,..."


### SE-2010

In [22]:
se2010_ds = load_dataset("taln-ls2n/semeval-2010-pre")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/7.53k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.93k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.1M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [23]:
print(se2010_ds)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'abstract', 'keyphrases', 'prmu', 'lvl-1', 'lvl-2', 'lvl-3', 'lvl-4'],
        num_rows: 144
    })
    test: Dataset({
        features: ['id', 'title', 'abstract', 'keyphrases', 'prmu', 'lvl-1', 'lvl-2', 'lvl-3', 'lvl-4'],
        num_rows: 100
    })
})


In [24]:
train_se2010_df = pd.DataFrame(se2010_ds['train'])
test_se2010_df = pd.DataFrame(se2010_ds['test'])

In [25]:
all_se2010_df = pd.concat([train_se2010_df, test_se2010_df], ignore_index=True)

In [26]:
all_se2010_df.head(5)

Unnamed: 0,id,title,abstract,keyphrases,prmu,lvl-1,lvl-2,lvl-3,lvl-4
0,J-39,The Sequential Auction Problem on eBay: An Emp...,Bidders on eBay have no dominant bidding strat...,"[sequenti auction problem, empir analysi, bid ...","[P, P, P, P, P, P, P, P, U, M, M, U, U, M, U, ...",The Sequential Auction Problem on eBay: An Emp...,The Sequential Auction Problem on eBay: An Emp...,The Sequential Auction Problem on eBay: An Emp...,The Sequential Auction Problem on eBay: An Emp...
1,I-54,Approximate and Online Multi-Issue Negotiation,This paper analyzes bilateral multi-issue nego...,"[approxim, negoti, time constraint, equilibriu...","[P, P, P, P, P, P, M, U, M, U, U, R, U, R]",Approximate and Online Multi-Issue Negotiation...,Approximate and Online Multi-Issue Negotiation...,Approximate and Online Multi-Issue Negotiation...,Approximate and Online Multi-Issue Negotiation...
2,I-68,On Opportunistic Techniques for Solving Decent...,Decentralized Markov Decision Processes (DEC-M...,"[decentr markov decis process, decentr markov ...","[P, P, P, P, P, P, M, M, U, U, U, M, U, M, U, M]",On Opportunistic Techniques for Solving Decent...,On Opportunistic Techniques for Solving Decent...,On Opportunistic Techniques for Solving Decent...,On Opportunistic Techniques for Solving Decent...
3,I-55,Searching for Joint Gains in Automated Negotia...,It is well established by conflict theorists a...,"[autom negoti, negoti, creat valu, claim valu,...","[P, P, P, P, P, U, U, U, U, U, M, U, M, M]",Searching for Joint Gains in Automated Negotia...,Searching for Joint Gains in Automated Negotia...,Searching for Joint Gains in Automated Negotia...,Searching for Joint Gains in Automated Negotia...
4,J-38,Multi-Attribute Coalitional Games,We study coalitional games where the value of ...,"[multi-attribut coalit game, coalit game, coop...","[P, P, P, P, P, P, P, M, U, U, U, M, U, U, M, ...",Multi-Attribute Coalitional Games∗ Samuel Ieon...,Multi-Attribute Coalitional Games * t\nABSTRAC...,Multi-Attribute Coalitional Games * t\nABSTRAC...,Multi-Attribute Coalitional Games * t\nABSTRAC...


In [27]:
se2010_df = all_se2010_df[['title', 'abstract', 'keyphrases']]

In [28]:
se2010_df = se2010_df.rename(columns={'keyphrases': 'keyword'})

In [29]:
se2010_df.head(5)

Unnamed: 0,title,abstract,keyword
0,The Sequential Auction Problem on eBay: An Emp...,Bidders on eBay have no dominant bidding strat...,"[sequenti auction problem, empir analysi, bid ..."
1,Approximate and Online Multi-Issue Negotiation,This paper analyzes bilateral multi-issue nego...,"[approxim, negoti, time constraint, equilibriu..."
2,On Opportunistic Techniques for Solving Decent...,Decentralized Markov Decision Processes (DEC-M...,"[decentr markov decis process, decentr markov ..."
3,Searching for Joint Gains in Automated Negotia...,It is well established by conflict theorists a...,"[autom negoti, negoti, creat valu, claim valu,..."
4,Multi-Attribute Coalitional Games,We study coalitional games where the value of ...,"[multi-attribut coalit game, coalit game, coop..."


In [30]:
print(se2010_df.iloc[0]['title'])
print(se2010_df.iloc[0]['abstract'])
print(se2010_df.iloc[0]['keyword'])

The Sequential Auction Problem on eBay: An Empirical Analysis and a Solution
Bidders on eBay have no dominant bidding strategy when faced with multiple auctions each offering an item of interest. As seen through an analysis of 1,956 auctions on eBay for a Dell E193FP LCD monitor, some bidders win auctions at prices higher than those of other available auctions, while others never win an auction despite placing bids in losing efforts that are greater than the closing prices of other available auctions. These misqueues in strategic behavior hamper the efficiency of the system, and in so doing limit the revenue potential for sellers. This paper proposes a novel options-based extension to eBay's proxy-bidding system that resolves this strategic issue for buyers in commoditized markets. An empirical analysis of eBay provides a basis for computer simulations that investigate the market effects of the options-based scheme, and demonstrates that the options-based scheme provides greater effici

### SE-2017

In [31]:
se2017_ds = load_dataset("midas/semeval2017")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/6.56k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/22.8k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/337k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/175k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [32]:
print(se2017_ds)

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'doc_bio_tags'],
        num_rows: 350
    })
    test: Dataset({
        features: ['id', 'document', 'doc_bio_tags'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['id', 'document', 'doc_bio_tags'],
        num_rows: 50
    })
})


In [33]:
train_se2017_df = pd.DataFrame(se2017_ds['train'])
validation_se2017_df = pd.DataFrame(se2017_ds['validation'])
test_se2017_df = pd.DataFrame(se2017_ds['test'])

In [34]:
all_se2017_df = pd.concat([train_se2017_df, validation_se2017_df, test_se2017_df], ignore_index=True)

In [35]:
all_se2017_df.head(5)

Unnamed: 0,id,document,doc_bio_tags
0,S0370269304007208,"[It, is, well, known, that, one, of, the, long...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, ..."
1,S0032386109006612,"[In, contrast, with, polymers,, which, are, ty...","[O, O, O, B, O, O, O, B, I, I, I, I, B, O, B, ..."
2,S1071581916300854,"[We, have, developed, a, systematic,, quantifi...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,S0997754612001318,"[Many, applications, in, fluid, mechanics, hav...","[O, O, O, B, I, O, O, O, B, I, O, O, O, O, O, ..."
4,S0038092X15001681,"[For, the, reverse, current, analysis,, for, b...","[O, O, B, I, O, O, O, O, O, O, O, O, O, O, O, ..."


In [36]:
# Tạo dataframe mới
se2017_df = pd.DataFrame()

# Cột abstract là ghép các phần tử trong document thành chuỗi
se2017_df['abstract'] = all_se2017_df['document'].apply(' '.join)

# Cột keyword lấy các từ có doc_bio_tags = 'B'
se2017_df['keyword'] = all_se2017_df.apply(lambda x: keyword_list(x), axis=1)

In [37]:
se2017_df.head(5)

Unnamed: 0,abstract,keyword
0,It is well known that one of the long standing...,[quantum theory calculation of the static ener...
1,"In contrast with polymers, which are typically...",[the reaction of a gaseous carbon compound as ...
2,"We have developed a systematic, quantified und...","[designing services,, input devices, bespoke a..."
3,Many applications in fluid mechanics have show...,"[surface suction, numerical and asymptotic app..."
4,"For the reverse current analysis, for both sce...","[standard silicon system, high efficiency syst..."


In [38]:
print(se2017_df.iloc[0]['abstract'])
print(se2017_df.iloc[0]['keyword'])

It is well known that one of the long standing problems in physics is understanding the confinement physics from first principles. Hence the challenge is to develop analytical approaches which provide valuable insight and theoretical guidance. According to this viewpoint, an effective theory in which confining potentials are obtained as a consequence of spontaneous symmetry breaking of scale invariance has been developed [1]. In particular, it was shown that a such theory relies on a scale-invariant Lagrangian of the type [2] (1)L=14w2−12w−FμνaFaμν, where Fμνa=∂μAνa−∂νAμa+gfabcAμbAνc, and w is not a fundamental field but rather is a function of 4-index field strength, that is, (2)w=εμναβ∂μAναβ. The Aναβ equation of motion leads to (3)εμναβ∂βw−−FγδaFaγδ=0, which is then integrated to (4)w=−FμνaFaμν+M. It is easy to verify that the Aaμ equation of motion leads us to (5)∇μFaμν+MFaμν−FαβbFbαβ=0. It is worth stressing at this stage that the above equation can be obtained from the effective 

### Full Data

In [39]:
se2017_df.columns

Index(['abstract', 'keyword'], dtype='object')

In [43]:
# dataset_names
cols = list(se2017_df.columns)
# print(cols)

In [44]:
kp20k_df_cols = kp20k_df[cols]
inspec_df_cols = inspec_df[cols]
se2010_df_cols = se2010_df[cols]
se2017_df_cols = se2017_df[cols]

In [45]:
merged_df = pd.concat([inspec_df_cols, se2010_df_cols, se2017_df_cols], ignore_index=True)

In [46]:
merged_df.head(5)

Unnamed: 0,abstract,keyword
0,A conflict between language and atomistic info...,"[Content Atomism, philosophy of mind, IBS, cog..."
1,Selective representing and world-making We dis...,"[realism, Selective representing, selective re..."
2,Does classicism explain universality ? Argumen...,"[human cognition, connectionist models, univer..."
3,Separate accounts go mainstream -LSB- investme...,"[independent money managers, investment]"
4,Evolving receptive-field controllers for mobil...,"[nonlinear interactions, evolutionary methods,..."


In [47]:
len(merged_df)

2744

In [48]:
print(merged_df.iloc[0]['abstract'])
print(merged_df.iloc[0]['keyword'])

A conflict between language and atomistic information Fred Dretske and Jerry Fodor are responsible for popularizing three well-known theses in contemporary philosophy of mind : the thesis of Information-Based Semantics -LRB- IBS -RRB- , the thesis of Content Atomism -LRB- Atomism -RRB- and the thesis of the Language of Thought -LRB- LOT -RRB- . LOT concerns the semantically relevant structure of representations involved in cognitive states such as beliefs and desires . It maintains that all such representations must have syntactic structures mirroring the structure of their contents . IBS is a thesis about the nature of the relations that connect cognitive representations and their parts to their contents -LRB- semantic relations -RRB- . It holds that these relations supervene solely on relations of the kind that support information content , perhaps with some help from logical principles of combination . Atomism is a thesis about the nature of the content of simple symbols . It holds 

## Prepare Data

### Main

*Nên hay không nên bỏ title vào abstract*

In [49]:
bert_model_name = "google-bert/bert-base-uncased"

In [50]:
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [51]:
merged_df.iloc[0]['abstract']

"A conflict between language and atomistic information Fred Dretske and Jerry Fodor are responsible for popularizing three well-known theses in contemporary philosophy of mind : the thesis of Information-Based Semantics -LRB- IBS -RRB- , the thesis of Content Atomism -LRB- Atomism -RRB- and the thesis of the Language of Thought -LRB- LOT -RRB- . LOT concerns the semantically relevant structure of representations involved in cognitive states such as beliefs and desires . It maintains that all such representations must have syntactic structures mirroring the structure of their contents . IBS is a thesis about the nature of the relations that connect cognitive representations and their parts to their contents -LRB- semantic relations -RRB- . It holds that these relations supervene solely on relations of the kind that support information content , perhaps with some help from logical principles of combination . Atomism is a thesis about the nature of the content of simple symbols . It holds

In [None]:
# encoding = tokenizer(merged_df.iloc[0]['abstract'], return_tensors='pt')
# decoding = [tokenizer.decode(idx) for idx in encoding['input_ids'][0]]
# print(decoding[1:-1])
# print(len(decoding[1:-1]))

In [63]:
def generate_BIO_labels(abstract_tokens, keyword_tokens):
    # print("abstract_tokens", abstract_tokens)
    # print("keyword_tokens", keyword_tokens)
    BIO_labels = []

    # Tạo một danh sách các từ khóa mở đầu từ danh sách keyword_tokens
    start_keywords = [token[0] for token in keyword_tokens]
    # print(start_keywords)


    # Duyệt qua từng từ trong abstract_tokens
    idx_token = 0
    while idx_token<len(abstract_tokens):
        token = abstract_tokens[idx_token]
        # print(1)
        # print("token", token)
        # kiểm tra token có phải là token mở đầu không
        if token in start_keywords:
          # print(2)
          # tìm các vị trí trong keyword list bắt đầu bằng token trên
          # print("start_keywords", start_keywords)
          idx_keywords =[i for i, keyword in enumerate(start_keywords) if keyword==token]
          # print("idx_keywords",idx_keywords)
          # nếu không có keyword nào match thì sẽ gán cho token đó nhãn O
          correct = 0
          # với mỗi vị trí trong danh sách vị trí đã tìm được ở trên
          for idx_keyword in idx_keywords:
            # print(3)
            # lấy keyword match với index đã tìm thấy
            keyword_match = keyword_tokens[idx_keyword]
            # khởi tạo các biến vị trí của keyword_match và abstract_tokens đang xét
            k = 1
            new_idx = idx_token + 1
            if len(keyword_match) == 1 or len(abstract_tokens)==new_idx:
              BIO_labels.append('B')
              # print(BIO_labels)
              correct = 1
              break
            # print(len(abstract_tokens))
            # print(new_idx)
            # print(len(keyword_match))
            # print(k)
            # print(abstract_tokens[new_idx])
            # print(keyword_match[k])
            while abstract_tokens[new_idx]==keyword_match[k]:
              k+=1
              new_idx+=1
              if (k==len(keyword_match) or new_idx==len(abstract_tokens)):
                # print(4)
                BIO_labels.append('B')
                BIO_labels.extend(['I']*(k-1))
                idx_token += k-1
                # print(BIO_labels)
                correct = 1
                break
            else:
              BIO_labels.append('B')
              # print(BIO_labels)
              correct = 1
              break
            if correct==1:
              # print(5)
              # print('correct')
              break
        else:
            BIO_labels.append('O')
            # print(BIO_labels)
        idx_token += 1
        # print(BIO_labels)
        # print(len(BIO_labels))
        # print(idx_token)

    return BIO_labels

# Input
# abstract_token = ['selective', 'representing', 'and', 'world', 'making', 'we', 'discuss', 'the', 'thesis', 'of', 'selective', 'representing', 'the', 'idea', 'that', 'the', 'contents', 'of', 'the', 'mental', 'representations', 'had', 'by', 'organisms', 'are', 'highly', 'constrained', 'by', 'the', 'biological', 'niches', 'within', 'which', 'the', 'organisms', 'evolved', '.', 'while', 'such', 'a', 'thesis', 'has', 'been', 'defended', 'by', 'several', 'authors', 'elsewhere', ',', 'our', 'primary', 'concern', 'here', 'is', 'to', 'take', 'up', 'the', 'issue', 'of', 'the', 'compatibility', 'of', 'selective', 'representing', 'and', 'realism', '.', 'we', 'hope', 'to', 'show', 'three', 'things', '.', 'first', ',', 'that', 'the', 'notion', 'of', 'selective', 'representing', 'is', 'fully', 'consistent', 'with', 'the', 'realist', 'idea', 'of', 'a', 'mind', 'independent', 'world', '.', 'second', ',', 'that', 'not', 'only', 'are', 'these', 'two', 'consistent', ',', 'but', 'that', 'the', 'latter', 'lrb', 'the', 'realist', 'conception', 'of', 'a', 'mind', 'independent', 'world', 'rrb', 'provides', 'the', 'most', 'powerful', 'perspective', 'from', 'which', 'to', 'motivate', 'and', 'understand', 'the', 'differing', 'perceptual', 'and', 'cognitive', 'profiles', 'themselves', '.', 'third', ',', 'that', 'the', 'lrb', 'genuine', 'and', 'important', 'rrb', 'sense', 'in', 'which', 'organism', 'and', 'environment', 'may', 'together', 'constitute', 'an', 'integrated', 'system', 'of', 'scientific', 'interest', 'poses', 'no', 'additional', 'threat', 'to', 'the', 'realist', 'conception']
# keyword_tokens = [['mental', 'representations'], ['selective', 'representing'], ['selective', 'representing'], ['organisms'], ['cognitive', 'profiles'], ['realism']]

# abstract_token = ['temp', 'it', 'chief', 'rallies', 'troops', 'lsb', 'mori', 'rsb', 'the', 'appointment', 'of', 'a', 'highly', 'qualified', 'interim', 'it', 'manager', 'enabled', 'market', 'research', 'company', 'mori', 'to', 'rapidly', 'restructure', 'its', 'it', 'department', '.', 'now', 'the', 'resulting', 'improvements', 'are', 'allowing', 'it', 'to', 'support', 'an', 'increasing', 'role', 'for', 'technology', 'in', 'the', 'assimilation', 'and', 'analysis', 'of', 'market', 'research']
# keyword_tokens = [['mori'], ['interim', 'it', 'manager'], ['market', 'research', 'company']]
# Tạo BIO labels
# BIO_labels = generate_BIO_labels(abstract_token, keyword_tokens)

# In kết quả
# print(BIO_labels)

In [64]:
def token_process(token):
  if (token[-1]=='.' and len(token)!=1 and '.' not in token[:-1]):
    # print('after', [token[:-1], '.'])
    return token_process(token[:-1])+['.']
  elif token[-1] in [',', ':', '!', '?', ';', '"', "'", ')', ']', '}'] and token[0] in ["'", '"', '(', '[', '{'] and len(token)>2:
    # print('after', [token[0], token[1:-1], token[-1]])
    return [token[0], token[1:-1], token[-1]]
  elif token[-1] in [',', ':', '!', '?', ';', '"', "'", ')', ']', '}'] and len(token)!=1:
    # print('after', [token[:-1], token[-1]])
    return [token[:-1], token[-1]]
  elif token[0] in ["'", '"', '(', '[', '{'] and len(token)!=1:
    # print('after', [token[0], token[1:]])
    return [token[0], token[1:]]
  return [token]


def mySplit(text):
  # chia text thành danh sách các từ bằng dấu cách ' '
  text = text.lower()
  text = text.replace('-', ' ')
  tokens = text.split(' ')
  # loại bỏ các token rỗng và strip token
  tokens = [token.strip() for token in tokens if token!=""]
  # nếu từ đó kết thúc bằng dấu chấm '.' và bên trong từ đó không còn dấu chấm nào thì sẽ tách ra thành 2 từ đó và dấu chấm
  dot_tokens = []
  for token in tokens:
    # print('before', token)
    # print(token_process(token))
    dot_tokens.extend(token_process(token))
  return dot_tokens

def tokenizerSplit(text):
  encoding = tokenizer(text, return_tensors='pt')
  decoding = [tokenizer.decode(idx) for idx in encoding['input_ids'][0]]
  return decoding[1:-1]

def convInputOutput(abstract, keyword):
  # # chia abstract thành danh sách các từ bằng dấu cách ' '
  # abstract = abstract.replace('-', ' ')
  # tokens = abstract.split(' ')
  # # loại bỏ các token rỗng và strip token
  # tokens = [token.strip() for token in tokens if token!=""]
  # # nếu từ đó kết thúc bằng dấu chấm '.' và bên trong từ đó không còn dấu chấm nào thì sẽ tách ra thành 2 từ đó và dấu chấm
  # dot_tokens = []
  # for token in tokens:
  #   # print('before', token)
  #   # print(token_process(token))
  #   dot_tokens.extend(token_process(token))

  abstract_tokens = tokenizerSplit(abstract)
  keyword_tokens = [tokenizerSplit(keyword_iter) for keyword_iter in keyword]
  BIO_labels = generate_BIO_labels(abstract_tokens, keyword_tokens)
  return abstract_tokens, BIO_labels

kq1 = list(convInputOutput(merged_df.iloc[0]['abstract'], merged_df.iloc[0]['keyword']))
print(len(kq1[0]), len(kq1[1]))
kq2 = list(convInputOutput(merged_df.iloc[1]['abstract'], merged_df.iloc[1]['keyword']))
print(len(kq2[0]), len(kq2[1]))

265 265
196 196


In [65]:
merged_df.iloc[0]['abstract']

"A conflict between language and atomistic information Fred Dretske and Jerry Fodor are responsible for popularizing three well-known theses in contemporary philosophy of mind : the thesis of Information-Based Semantics -LRB- IBS -RRB- , the thesis of Content Atomism -LRB- Atomism -RRB- and the thesis of the Language of Thought -LRB- LOT -RRB- . LOT concerns the semantically relevant structure of representations involved in cognitive states such as beliefs and desires . It maintains that all such representations must have syntactic structures mirroring the structure of their contents . IBS is a thesis about the nature of the relations that connect cognitive representations and their parts to their contents -LRB- semantic relations -RRB- . It holds that these relations supervene solely on relations of the kind that support information content , perhaps with some help from logical principles of combination . Atomism is a thesis about the nature of the content of simple symbols . It holds

Từ df có 2 cột là abstract (abstract của bài báo KH) và keyword (danh sách các keyword nằm trong bài báo đó).

Chuyển thành df mới có 2 cột lần lượt là input (từng chữ trong abstract sau khi được tách riêng ra bằng dấu cách hoặc dấu chấm) và output (ứng với mỗi từ trong abstract đó sẽ có một nhãn thuộc BIO).

In [66]:
merged_df

Unnamed: 0,abstract,keyword,Abstract_inputs,BIO_labels
0,A conflict between language and atomistic info...,"[Content Atomism, philosophy of mind, IBS, cog...","[a, conflict, between, language, and, atom, ##...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,Selective representing and world-making We dis...,"[realism, Selective representing, selective re...","[selective, representing, and, world, -, makin...","[B, I, O, O, O, O, O, O, O, O, O, B, I, O, O, ..."
2,Does classicism explain universality ? Argumen...,"[human cognition, connectionist models, univer...","[does, classic, ##ism, explain, universal, ##i...","[O, B, I, O, B, I, O, O, O, O, O, B, I, I, I, ..."
3,Separate accounts go mainstream -LSB- investme...,"[independent money managers, investment]","[separate, accounts, go, mainstream, -, l, ##s...","[O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, ..."
4,Evolving receptive-field controllers for mobil...,"[nonlinear interactions, evolutionary methods,...","[evolving, rec, ##eptive, -, field, controller...","[O, O, O, O, O, O, O, B, I, O, O, O, B, I, O, ..."
...,...,...,...,...
2739,Similar numerical oscillations to those descri...,"[Lagrangian description, IBM kernels, kernel i...","[similar, numerical, os, ##ci, ##llation, ##s,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, ..."
2740,It is interesting to quantify the effects of t...,"[gas exchange, Schmidt, chemical reaction, pra...","[it, is, interesting, to, quan, ##tify, the, e...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O, B, ..."
2741,Numerical simulation of the gas flow through s...,"[(DSMC), direct simulation Monte Carlo method,...","[numerical, simulation, of, the, gas, flow, th...","[B, I, O, O, B, I, O, O, O, O, O, O, O, O, O, ..."
2742,After all micro elements reach a relaxed stead...,"[velocity, polynomials, Irving–Kirkwood relati...","[after, all, micro, elements, reach, a, relaxe...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [56]:
# def new_function(a, b):
#   print(a)
#   print(b)
#   return a, b

In [67]:
merged_df['Abstract_inputs'] = merged_df.apply(lambda row: convInputOutput(row['abstract'], row['keyword'])[0], axis=1)
merged_df['BIO_labels'] = merged_df.apply(lambda row: convInputOutput(row['abstract'], row['keyword'])[1], axis=1)

In [68]:
final_df = merged_df[['abstract', 'Abstract_inputs', 'BIO_labels']]

In [69]:
final_df

Unnamed: 0,abstract,Abstract_inputs,BIO_labels
0,A conflict between language and atomistic info...,"[a, conflict, between, language, and, atom, ##...","[O, O, O, B, O, O, O, O, O, O, O, O, O, O, O, ..."
1,Selective representing and world-making We dis...,"[selective, representing, and, world, -, makin...","[B, I, O, O, O, O, O, O, O, O, O, B, I, O, O, ..."
2,Does classicism explain universality ? Argumen...,"[does, classic, ##ism, explain, universal, ##i...","[O, B, I, O, B, I, O, O, O, O, O, B, I, I, I, ..."
3,Separate accounts go mainstream -LSB- investme...,"[separate, accounts, go, mainstream, -, l, ##s...","[O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, ..."
4,Evolving receptive-field controllers for mobil...,"[evolving, rec, ##eptive, -, field, controller...","[O, O, O, O, O, O, O, B, I, O, O, O, B, I, O, ..."
...,...,...,...
2739,Similar numerical oscillations to those descri...,"[similar, numerical, os, ##ci, ##llation, ##s,...","[O, O, B, O, O, O, O, O, O, O, O, O, O, O, B, ..."
2740,It is interesting to quantify the effects of t...,"[it, is, interesting, to, quan, ##tify, the, e...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O, B, ..."
2741,Numerical simulation of the gas flow through s...,"[numerical, simulation, of, the, gas, flow, th...","[B, I, O, B, B, I, O, O, O, O, O, O, O, O, O, ..."
2742,After all micro elements reach a relaxed stead...,"[after, all, micro, elements, reach, a, relaxe...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [70]:
print(merged_df.iloc[0]['abstract'])
print(merged_df.iloc[0]['Abstract_inputs'])
print(merged_df.iloc[0]['BIO_labels'])

A conflict between language and atomistic information Fred Dretske and Jerry Fodor are responsible for popularizing three well-known theses in contemporary philosophy of mind : the thesis of Information-Based Semantics -LRB- IBS -RRB- , the thesis of Content Atomism -LRB- Atomism -RRB- and the thesis of the Language of Thought -LRB- LOT -RRB- . LOT concerns the semantically relevant structure of representations involved in cognitive states such as beliefs and desires . It maintains that all such representations must have syntactic structures mirroring the structure of their contents . IBS is a thesis about the nature of the relations that connect cognitive representations and their parts to their contents -LRB- semantic relations -RRB- . It holds that these relations supervene solely on relations of the kind that support information content , perhaps with some help from logical principles of combination . Atomism is a thesis about the nature of the content of simple symbols . It holds 

In [71]:
final_df.to_csv('SaveData/final_df_new.csv', index=False)

### Nháp

In [None]:
# import nltk
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize

# # Download stopwords list (only need to do this once)
# nltk.download('stopwords')
# nltk.download('punkt')

# # Sample text
# text = "This paper proposes using virtual reality to enhance the perception of actions by distant users on a shared application. Here, distance may refer either to space ( e.g. in a remote synchronous collaboration) or time ( e.g. during playback of recorded actions). Our approach consists in immersing the application in a virtual inhabited 3D space and mimicking user actions by animating avatars. We illustrate this approach with two applications, the one for remote collaboration on a shared application and the other to playback recorded sequences of user actions. We suggest this could be a low cost enhancement for telepresence."

# # Tokenize the text
# words = word_tokenize(text)

# # Remove stopwords
# stop_words = set(stopwords.words('english'))
# filtered_words = [word for word in words if word.lower() not in stop_words]

# # Join the words back into a single string
# filtered_text = ' '.join(filtered_words)

# print(filtered_text)

## Load and Preprocess Data

In [72]:
final_df = pd.read_csv('SaveData/final_df_new.csv')

In [73]:
final_df.head(5)

Unnamed: 0,abstract,Abstract_inputs,BIO_labels
0,A conflict between language and atomistic info...,"['a', 'conflict', 'between', 'language', 'and'...","['O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', ..."
1,Selective representing and world-making We dis...,"['selective', 'representing', 'and', 'world', ...","['B', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Does classicism explain universality ? Argumen...,"['does', 'classic', '##ism', 'explain', 'unive...","['O', 'B', 'I', 'O', 'B', 'I', 'O', 'O', 'O', ..."
3,Separate accounts go mainstream -LSB- investme...,"['separate', 'accounts', 'go', 'mainstream', '...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', ..."
4,Evolving receptive-field controllers for mobil...,"['evolving', 'rec', '##eptive', '-', 'field', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', ..."


In [74]:
def load_imdb_data(data_file):
    df = pd.read_csv(data_file)
    abstracts = df['abstract'].tolist()
    texts = df['Abstract_inputs'].apply(lambda x: ast.literal_eval(x)).tolist()
    labels = df['BIO_labels'].apply(lambda x: ast.literal_eval(x)).tolist()
    return abstracts, texts, labels

In [75]:
data_file = 'SaveData/final_df.csv'
abstracts, texts, labels = load_imdb_data(data_file)

In [76]:
print(len(abstracts))
print(len(texts))
print(len(labels))

2744
2744
2744


In [77]:
print(abstracts[0:10])
print(texts[0:10])
print(labels[0:10])

["A conflict between language and atomistic information Fred Dretske and Jerry Fodor are responsible for popularizing three well-known theses in contemporary philosophy of mind : the thesis of Information-Based Semantics -LRB- IBS -RRB- , the thesis of Content Atomism -LRB- Atomism -RRB- and the thesis of the Language of Thought -LRB- LOT -RRB- . LOT concerns the semantically relevant structure of representations involved in cognitive states such as beliefs and desires . It maintains that all such representations must have syntactic structures mirroring the structure of their contents . IBS is a thesis about the nature of the relations that connect cognitive representations and their parts to their contents -LRB- semantic relations -RRB- . It holds that these relations supervene solely on relations of the kind that support information content , perhaps with some help from logical principles of combination . Atomism is a thesis about the nature of the content of simple symbols . It hold

In [78]:
print(len(abstracts[0]))
print(len(texts[0]))
print(len(labels[0]))

1289
265
265


In [79]:
import numpy as np
label_to_index = {'O': 0, 'B': 1, 'I': 2, 'P': 3}
def one_hot_labels(labels):
  one_hot_labels = np.zeros((len(labels), len(label_to_index)), dtype=int)
  for i, label in enumerate(labels):
      one_hot_labels[i, label_to_index[label]] = 1
  return one_hot_labels
print(one_hot_labels(['O', 'O', 'O', 'B', 'I', 'I', 'O']))

[[1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 1 0]
 [1 0 0 0]]


In [80]:
labels = [one_hot_labels(label) for label in labels]

In [81]:
print(len(texts[0]))
print(len(labels[0]))

265
265


In [82]:
for i in range(len(texts)):
  if len(texts[i])!=len(labels[i]):
    print(i)
    break

In [None]:
# encoding = tokenizer(' '.join(texts[0]), return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)

In [None]:
# print(encoding)

In [83]:
class TextClassificationDataset(Dataset):
    def __init__(self, abstracts, texts, labels, tokenizer, max_length):
        self.abstracts = abstracts
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        abstract = self.abstracts[idx]
        text = self.texts[idx]
        label = self.labels[idx]
        # print('label before', label)
        # label = np.concatenate((label, np.array(['D'] * (512-len(label)))))
        # print('label after', label)
        encoding = self.tokenizer(abstract, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        # decoding = [self.tokenizer.decode(idx) for idx in encoding['input_ids'][0]]
        return {'abstract': abstract, 'abstract_tokens': text, 'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'BIO_labels': torch.tensor(label)}
        # return {'abstract_tokens': encoding['input_ids'][0], 'BIO_labels': label}

## Model

### BERT Model

In [None]:
# input: abstract dạng mảng các phần tử là các token
# output: semantic embedding của từng phần tử trong abstract
# yêu cầu: độ dài của mảng các embedding bằng với độ dài của mảng các token.

In [84]:
class BERTEncoder(nn.Module):
  def __init__(self, bert_model_name):
    super(BERTEncoder, self).__init__()
    self.tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
    self.bert = BertModel.from_pretrained(bert_model_name)
    for param in self.bert.parameters():
      param.requires_grad = True

  def forward(self, input_ids, attention_mask):
    # abstract_tokens = [tokenizer.decode(idx) for idx in abstract_tokens]
    # abstracts = " ".join(abstract_tokens)
    # inputs = self.tokenizer(abstracts, return_tensors="pt")
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # inputs = self.tokenizer("Hello world!", return_tensors="pt").to('cuda')
    # output_test = self.bert(**inputs)
    # print(output_test.last_hidden_state)
    return outputs.last_hidden_state[0]

  def get_output_shape(self):
    return self.bert.config.hidden_size

### Model Feed Forward

In [None]:
# class BERTClassifier(nn.Module):
#     def __init__(self, bert_model_name, num_classes):
#         super(BERTClassifier, self).__init__()
#         self.bert = BertModel.from_pretrained(bert_model_name)
#         self.dropout = nn.Dropout(0.1)
#         self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

#     def forward(self, input_ids, attention_mask):
#         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#         pooled_output = outputs.pooler_output
#         x = self.dropout(pooled_output)
#         logits = self.fc(x)
#         return logits

In [None]:
class FFClassifier(nn.Module):
    # input_shape: bert output shape, num_classes = 3 (BIO)
    def __init__(self, input_shape, hidden_size, num_classes):
        super(FFClassifier, self).__init__()
        self.fc1 = nn.Linear(input_shape, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, token_embeds):
        x = torch.relu(self.fc1(token_embeds))
        x = self.fc2(x)
        x = self.softmax(x)
        return x

### Phraseformer Model

In [None]:
# input: abstract_tokens là mảng các token của abstract
# output: label của từng token là mảng BIO_labels có độ dài bằng với mảng abstract_tokens

In [None]:
class Phraseformer(nn.Module):
  def __init__(self, bert_model_name, is_train_bert, is_graph_embedding):
    super(Phraseformer, self).__init__()
    # self.bert_model_name = bert_model_name
    self.bertEmbed = BERTEncoder(bert_model_name)
    if is_train_bert:
      print("Có transfer learning bert")
    if is_graph_embedding:
      print("Có kết hợp graph embedding")
    self.ffclassifier = FFClassifier(self.bertEmbed.get_output_shape(), 100, 3)

  def forward(self, abstract_tokens, input_ids, attention_mask):
    bertEmbedding = self.bertEmbed(input_ids, attention_mask)
    labels = self.ffclassifier(bertEmbedding)
    return labels

## Training

### Initialize

In [None]:
# init hyperparameter of model
is_train_bert = False
is_graph_embedding = False
bert_model_name = "google-bert/bert-base-uncased"
max_length = 512
hidden_size = 100
num_classes = 3
batch_size = 1
num_epochs = 4
learning_rate = 2e-5

#### Cur

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    # print(0)
    for batch in data_loader:
        # print(1.1)
        optimizer.zero_grad()
        abstract_tokens = batch['abstract_tokens']
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['BIO_labels'].to(device)
        # print(len(abstract_tokens))
        # print(len(input_ids[0]))
        # print(input_ids[0])
        # print(len(attention_mask[0]))
        # print(attention_mask[0])
        # print(len(labels[0]))
        # print(2)
        outputs = model(abstract_tokens=abstract_tokens, input_ids=input_ids, attention_mask=attention_mask)
        # print(3)
        # print('outputs', outputs)
        # print(len(outputs))
        labels = labels.float()
        # print('labels', labels)
        # print(len(labels[0]))
        labels = labels.view(-1, labels.size(-1)).argmax(dim=1)
        # print('after labels', labels)
        outputs_split = outputs[1:len(labels)+1]

        # old loss
        # loss = nn.CrossEntropyLoss()(outputs_split, labels)
        class_weights = torch.tensor([1, 1, 1], device=device, dtype=torch.float)
        loss_fn = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fn(outputs_split, labels)
        print(loss)

        # new loss
        # Tạo mask cho các vị trí trong labels bằng 0 và không phải 0
        # mask_zero = (labels == 0).float()
        # mask_non_zero = (labels != 0).float()

        # Tính toán loss cho các vị trí labels bằng 0
        # loss_fn = nn.CrossEntropyLoss(reduction='mean')
        # loss1 = loss_fn(outputs_split * mask_zero.unsqueeze(1), torch.zeros_like(labels, dtype=torch.long))
        # print("loss1:", outputs_split * mask_zero.unsqueeze(1))
        # print("loss1:", torch.zeros_like(labels, dtype=torch.long))
        # # Tính toán loss cho các vị trí labels khác 0
        # loss2 = loss_fn(outputs_split * mask_non_zero.unsqueeze(1), labels)
        # print("loss2:", outputs_split * mask_non_zero.unsqueeze(1))
        # print("loss2:", labels)

        # print("Loss1:", loss1)
        # print("Loss2:", loss2)

        # loss = loss1*0.01 + loss2*0.99

        # loss = nn.CrossEntropyLoss()(outputs[:len(labels[0])], labels[0].float())
        # print(4)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [None]:
# import torch
# import torch.nn as nn

# # Đầu ra dự đoán của mô hình
# outputs = torch.tensor([[0.9924, 0.0044, 0.0032],
#                         [0.9934, 0.0038, 0.0028],
#                         [0.9936, 0.0036, 0.0028]])
# #### eval
# _, preds = torch.max(outputs, dim=1)
# print(preds)
# # # Nhãn được biểu diễn dưới dạng one-hot encoding
# # labels = torch.tensor([[[1, 0, 0],
# #                         [1, 0, 0],
# #                         [0, 1, 0]]], dtype=torch.float32)

# # # Định dạng lại nhãn để phù hợp với đầu ra
# # labels = labels.view(-1, labels.size(-1)).argmax(dim=1)
# # print(labels)
# # # Tính toán hàm mất mát
# # loss_fn = nn.CrossEntropyLoss()
# # loss = loss_fn(outputs, labels)

# # print(loss)


In [None]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            abstract_tokens = batch['abstract_tokens']
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['BIO_labels'].to(device)
            outputs = model(abstract_tokens=abstract_tokens, input_ids=input_ids, attention_mask=attention_mask)
            labels = labels.float()
            labels = labels.view(-1, labels.size(-1)).argmax(dim=1)
            outputs = outputs[1:len(labels)+1]
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

In [None]:
# max_length_abs = 512
# Loại bỏ các mảng dữ liệu có len(texts) >= 510
abstracts_filtered = []
texts_filtered = []
labels_filtered = []

for abstract, text, label in zip(abstracts, texts, labels):
    if len(text) < 510:  # Chỉ giữ lại các mảng có độ dài nhỏ hơn 510
        abstracts_filtered.append(abstract)
        texts_filtered.append(text)
        labels_filtered.append(label)

In [None]:
print(len(texts_filtered))

2739


In [None]:
# train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

combined_data = list(zip(abstracts_filtered, texts_filtered))
# Chia dữ liệu kết hợp thành tập huấn luyện và tập kiểm tra
train_data, val_data, train_labels, val_labels = train_test_split(combined_data, labels_filtered, test_size=0.2, random_state=42)
# Tách dữ liệu của "abstracts" và "texts" sau khi chia
train_abstracts, train_texts = zip(*train_data)
val_abstracts, val_texts = zip(*val_data)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_abstracts, train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_abstracts, val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = Phraseformer(bert_model_name, is_train_bert, is_graph_embedding).to(device)

cuda


In [None]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



#### Cur

In [None]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4
tensor(1.0803, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0766, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0804, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0769, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0724, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0770, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0769, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0779, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0650, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0691, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0529, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0762, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0663, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0584, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0550, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0606, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0489, device='cuda:0'

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.8630
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     85102
           1       0.00      0.00      0.00      4599
           2       0.00      0.00      0.00      8907

    accuracy                           0.86     98608
   macro avg       0.29      0.33      0.31     98608
weighted avg       0.74      0.86      0.80     98608

Epoch 2/4
tensor(0.7702, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6927, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6558, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7882, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6358, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8058, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6158, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6786, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7193, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7539, device='cuda:0', grad_fn=<

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.8630
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     85102
           1       0.00      0.00      0.00      4599
           2       0.00      0.00      0.00      8907

    accuracy                           0.86     98608
   macro avg       0.29      0.33      0.31     98608
weighted avg       0.74      0.86      0.80     98608

Epoch 3/4
tensor(0.5681, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7162, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6757, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.5520, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6148, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6748, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7041, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6590, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7169, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7926, device='cuda:0', grad_fn=<

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.8630
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     85102
           1       0.00      0.00      0.00      4599
           2       0.00      0.00      0.00      8907

    accuracy                           0.86     98608
   macro avg       0.29      0.33      0.31     98608
weighted avg       0.74      0.86      0.80     98608

Epoch 4/4
tensor(0.7549, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6320, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8146, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6760, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7419, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6300, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7098, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6598, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7374, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7076, device='cuda:0', grad_fn=<

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Save Model

In [None]:
# model BERT + 199
# model_path = "./SaveModel/NO_BERT_199.pth"
# model BERT + 155
model_path = "./SaveModel/NO_BERT_111.pth"

In [None]:
torch.save(model.state_dict(), model_path)

### Load Model

In [None]:
model.load_state_dict(torch.load(model_path, map_location=torch.device(device)))

<All keys matched successfully>

## Inference

Validation

In [None]:
def extract_keywords_id(abstract_tokens, preds):
    keywords = []
    current_keyword = []
    for token, pred in zip(abstract_tokens, preds):
        if pred == 1:  # Nhãn thể hiện token bắt đầu một keyword
            if current_keyword != []:
                keywords.append(current_keyword)
            current_keyword = [token]  # Token đầu tiên của keyword
        elif pred == 2:  # Nhãn thể hiện token bên trong keyword
            current_keyword.append(token)  # Thêm token vào keyword
    if current_keyword != []:
        keywords.append(current_keyword)
    list_keyword = []
    for one_keyword in keywords:
        decoded_sequence = tokenizer.decode(one_keyword)
        list_keyword.append(decoded_sequence)
    return list_keyword

In [None]:
def extract_keywords(abstract_tokens, preds):
    keywords = []
    current_keyword = ""
    for token, pred in zip(abstract_tokens, preds):
        if pred == 1:  # Nhãn thể hiện token bắt đầu một keyword
            if current_keyword != "":
                keywords.append(current_keyword)
            current_keyword = token[0]  # Token đầu tiên của keyword
        elif pred == 2:  # Nhãn thể hiện token bên trong keyword
            current_keyword += " " + token[0]  # Thêm token vào keyword
    if current_keyword != "":
        keywords.append(current_keyword)
    return keywords

# Sử dụng hàm extract_keywords
abstract_tokens = [('learning',), ('non',), ('##re',), ('##gul',), ('##ar',), ('languages',), (':',), ('a',), ('comparison',), ('of',), ('simple',), ('rec',), ('##urrent',), ('networks',), ('and',), ('l',), ('##st',), ('##m',), ('rodriguez',), ('-',), ('l',), ('##rb',), ('-',), ('2001',), ('-',), ('rr',), ('##b',), ('-',), ('examined',), ('the',), ('learning',), ('ability',), ('of',), ('simple',), ('rec',), ('##urrent',), ('nets',), ('-',), ('l',), ('##rb',), ('-',), ('sr',), ('##ns',), ('-',), ('rr',), ('##b',), ('-',), ('-',), ('l',), ('##rb',), ('-',), ('elm',), ('##an',), (',',), ('1990',), ('-',), ('rr',), ('##b',), ('-',), ('on',), ('simple',), ('context',), ('-',), ('sensitive',), ('and',), ('context',), ('-',), ('free',), ('languages',), ('.',), ('in',), ('response',), ('to',), ('rodriguez',), ("'",), ('s',), ('-',), ('l',), ('##rb',), ('-',), ('2001',), ('-',), ('rr',), ('##b',), ('-',), ('article',), (',',), ('we',), ('compare',), ('the',), ('performance',), ('of',), ('simple',), ('rec',), ('##urrent',), ('nets',), ('and',), ('long',), ('short',), ('-',), ('term',), ('memory',), ('rec',), ('##urrent',), ('nets',), ('on',), ('context',), ('-',), ('free',), ('and',), ('context',), ('-',), ('sensitive',), ('languages',)]
preds = torch.tensor([1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 1, 2, 2, 2, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 2, 0, 1, 0, 2, 2, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2,
        0, 1, 1, 0, 2, 1, 2, 2, 2, 0, 1, 0, 2, 0, 1, 0, 2, 2]).tolist()

keywords = extract_keywords(abstract_tokens, preds)
print(keywords)


['learning', 'non ##re ##gul ##ar languages', 'simple rec ##urrent networks', 'l ##st ##m', 'learning ability', 'simple rec ##urrent nets ##rb ##ns', 'elm ##an', 'simple', 'context sensitive', 'context free languages', 'rodriguez ##rb', 'simple rec ##urrent nets', 'long', 'short term', 'memory rec ##urrent nets', 'context free', 'context sensitive languages']


In [None]:
def evaluate_batch(model, batch, device):
    with torch.no_grad():
        abstract_tokens = batch['abstract_tokens']
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['BIO_labels'].to(device)
        outputs = model(abstract_tokens=abstract_tokens, input_ids=input_ids, attention_mask=attention_mask)
        labels = labels.float()
        labels = labels.view(-1, labels.size(-1)).argmax(dim=1)
        outputs = outputs[1:len(labels)+1]
        _, preds = torch.max(outputs, dim=1)
        # print(abstract_tokens)
        # print(input_ids)
        # print(input_ids[0][1:len(labels)+1])
        # print(preds)
        # print(labels)
        predict_keyword = extract_keywords(abstract_tokens, preds)
        labels_keyword = extract_keywords(abstract_tokens, labels)
        # print(predict_keyword)
        # print(labels_keyword)
        preds_Id = extract_keywords_id(input_ids[0][1:len(labels)+1], preds)
        labels_Id = extract_keywords_id(input_ids[0][1:len(labels)+1], labels)
        # print(preds_Id)
        # print(labels_Id)
    return preds_Id, labels_Id

### Run

In [None]:
val_iterator = iter(val_dataloader)
batch = next(val_iterator)
preds_Id, labels_Id = evaluate_batch(model, batch, device)
print(preds_Id)
print(labels_Id)
batch = next(val_iterator)
preds_Id, labels_Id = evaluate_batch(model, batch, device)
print(preds_Id)
print(labels_Id)
batch = next(val_iterator)
preds_Id, labels_Id = evaluate_batch(model, batch, device)
print(preds_Id)
print(labels_Id)

[]
['lstm', 'performance']
[]
['variable fractional delay allpass filters', 'variable fractional delay allpass filters', 'fractional delay parameter', 'weighted equation error', 'cost function', 'weighted equation error', 'optimal polynomial coefficients', 'linear simultaneous equations']
[]
['even unimodular integral lattices', 'gaussian integers', 'automorphisms', 'niemeier lattices']


In [None]:
# tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

In [None]:
# abs = final_df.iloc[1]['abstract']

## Evaluate

In [None]:
def F1_score(preds_Id, labels_Id):
    TP = len(set(preds_Id) & set(labels_Id))
    FP = len(set(preds_Id) - set(labels_Id))
    FN = len(set(labels_Id) - set(preds_Id))

    try:
        P = TP / (TP + FP)
        R = TP / (TP + FN)
    except:
        # F1 = None
        return None

    if (P!=0 and R!=0):
        F1 = 2 * (P * R) / (P + R)
    else:
        F1 = None
    return F1

In [None]:
# preds_Id = ['learning', 'nonregular languages', 'simple recurrent networks', 'lstm', 'learning ability', 'simple recurrent netsrbns', 'elman', 'simple', 'context sensitive', 'context free languages', 'rodriguezrb', 'simple recurrent nets', 'long', 'short term', 'memory recurrent nets', 'context free', 'context sensitive languages']
# labels_Id = ['lstm', 'performance']
# preds_Id = ['design', 'variable', 'fractional delay allpass filters', 'weighted least squares method', 'weighted least method', 'variable fractional delay allpass filters coefficient', 'variable allpass filter polynomial', 'fractional delay parameter', 'nonlinear phase error', 'weighted equation error', 'cost functionratic form', 'weighted equation error', 'optimal', 'polynomial coefficients', 'linear simultaneous equations']
# labels_Id = ['variable fractional delay allpass filters', 'variable fractional delay allpass filters', 'fractional delay parameter', 'weighted equation error', 'cost function', 'weighted equation error', 'optimal polynomial coefficients', 'linear simultaneous equations']
preds_Id = ['even', 'unimodular', 'gaussian lattices', 'unimodular', 'gaussian lattices', 'unimodular integral lattices', 'gaussian integers classification', 'automorphisms', 'tau automorphism groups', 'niemeier lattices', 'unimodular', 'real', 'integral lattices', 'even', 'unimodular gaussian lattices', 'equivalence']
labels_Id = ['even unimodular integral lattices', 'gaussian integers', 'automorphisms', 'niemeier lattices']

TP = len(set(preds_Id) & set(labels_Id))
FP = len(set(preds_Id) - set(labels_Id))
FN = len(set(labels_Id) - set(preds_Id))
try:
    P = TP / (TP + FP)
    R = TP / (TP + FN)
except:
    F1 = None

if (P!=0 and R!=0):
    F1 = 2 * (P * R) / (P + R)
else:
    F1 = None

print("Precision:", P)
print("Recall:", R)
print("F1 score:", F1)

Precision: 0.16666666666666666
Recall: 0.5
F1 score: 0.25


In [None]:
print("F1: ", F1_score(preds_Id, labels_Id))

F1:  0.25


In [None]:
def F1_batch(model, batch, device):
    with torch.no_grad():
        abstract_tokens = batch['abstract_tokens']
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['BIO_labels'].to(device)
        outputs = model(abstract_tokens=abstract_tokens, input_ids=input_ids, attention_mask=attention_mask)
        labels = labels.float()
        labels = labels.view(-1, labels.size(-1)).argmax(dim=1)
        outputs = outputs[1:len(labels)+1]
        _, preds = torch.max(outputs, dim=1)
        # print(abstract_tokens)
        # print(input_ids)
        # print(input_ids[0][1:len(labels)+1])
        # print(preds)
        # print(labels)
        predict_keyword = extract_keywords(abstract_tokens, preds)
        labels_keyword = extract_keywords(abstract_tokens, labels)
        # print(predict_keyword)
        # print(labels_keyword)
        preds_Id = extract_keywords_id(input_ids[0][1:len(labels)+1], preds)
        labels_Id = extract_keywords_id(input_ids[0][1:len(labels)+1], labels)
        # print(preds_Id)
        # print(labels_Id)
    return F1_score(preds_Id, labels_Id)

### Run

In [None]:
print(len(val_dataloader))
F1_list = []
for batch in val_dataloader:
    F1 = F1_batch(model, batch, device)
    if (F1!=None):
        F1_list.append(F1)

548


In [None]:
print(F1_list)

[]


In [None]:
sum = 0
for F1 in F1_list:
  sum += F1
print(sum)
print(sum/len(F1_list))

0


ZeroDivisionError: division by zero