In [20]:
import pandas as pd
import numpy as np
from tqdm import trange
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import torch
import torch.nn as nn
import torch.functional as F
import torchcrf as CRF
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_transformers import BertPreTrainedModel, BertModel, BertTokenizer, BertConfig, AdamW

import transformers
from transformers import get_linear_schedule_with_warmup
from seqeval.metrics import f1_score, accuracy_score, classification_report

torch.cuda.set_device(0)

In [2]:
# 参数设置
# seed = 2021
# max_len = 75
# batch_size = 32
# learning_rate = 3e-5
# epochs = 20
# max_grad_norm = 1.0
# test_size = 0.1
# earlystopping_patience = 3
model_name = "bert-base-cased"
checkpoint_path = "add-lstm-crf-checkpoint.cpt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def build_corpus1(data_dir):
    """获得句子及标签信息"""
    word_lists = []
    tag_lists = []
    words = []
    tags = []
    data = pd.read_csv(data_dir, header=None, encoding='utf-8')
    word = data.iloc[:, 0]
    tag = data.iloc[:, 2]
    for i in range(len(word)):
        if word[i] != " ":
            words.append(word[i])
            tags.append(tag[i])
        elif word[i] == " ":
            word_lists.append(words)
            tag_lists.append(tags)
            words = []
            tags = []
    return word_lists, tag_lists

def build_corpus2(data_dir):
    word_lists = []
    tag_lists = []
    with open(data_dir, "r", encoding="utf-8") as f:
        words = []
        tags = []
        for line in f:
            if line != "\n":
                word = line.split(" ")[0]
                tag = line.split(" ")[1].strip("\n")
                words.append(word)
                tags.append(tag)
            else:
                word_lists.append(words)
                tag_lists.append(tags)
                words = []
                tags = []
    
    return word_lists, tag_lists

# df = pd.read_csv("./data/DA对比实验数据/top_1k_augmented_sentences.csv", header=None, encoding="utf-8")
# sentences, labels = build_corpus1("./data/DA对比实验数据/top_1k_augmented_sentences.csv")
# sentences, labels = build_corpus2("./mat_data.txt")
df = pd.read_csv("./data_aug.csv", header=None, encoding="utf-8")
sentences, labels = build_corpus1("./data_aug.csv")

In [4]:
# Ours 数据集
tag_values = list(set(df.iloc[:, 2].values))
tag_values.append("PAD")
tag_values.remove(" ")
tag2idx = {t: i for i, t in enumerate(tag_values)}
print(tag_values)
print(tag2idx)

# ceder 数据集
# tag_values.append("PAD")
# tag2idx = {t: i for i, t in enumerate(tag_values)}
# print(tag_values)
# print(tag2idx)

['B-Composition', 'I-Property', 'I-Characterization', 'B-Characterization', 'B-Processing', 'B-Structure', 'B-Application', 'I-Composition', 'I-Structure', 'I-Application', 'B-Property', 'I-Processing', 'I-Condition', 'B-Feature', 'B-Condition', 'O', 'I-Feature', 'PAD']
{'B-Composition': 0, 'I-Property': 1, 'I-Characterization': 2, 'B-Characterization': 3, 'B-Processing': 4, 'B-Structure': 5, 'B-Application': 6, 'I-Composition': 7, 'I-Structure': 8, 'I-Application': 9, 'B-Property': 10, 'I-Processing': 11, 'I-Condition': 12, 'B-Feature': 13, 'B-Condition': 14, 'O': 15, 'I-Feature': 16, 'PAD': 17}


In [5]:
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=False)

In [6]:
class BERT_BiLSTM_CRF(BertPreTrainedModel):
    def __init__(self, config, need_birnn=False, rnn_dim=128):
        super(BERT_BiLSTM_CRF, self).__init__(config)
        
        self.num_tags = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        out_dim = config.hidden_size
        self.need_birnn = need_birnn

        # 如果为False，则不要BiLSTM层
        if need_birnn:
            self.birnn = nn.LSTM(config.hidden_size, rnn_dim, num_layers=1, bidirectional=True, batch_first=True)
            out_dim = rnn_dim*2
        
        self.hidden2tag = nn.Linear(out_dim, config.num_labels)
        self.crf = CRF(config.num_labels, batch_first=True)
    

    def forward(self, input_ids, tags=None, token_type_ids=None, input_mask=None):
        outputs = self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask)
        sequence_output = outputs[0]
        if self.need_birnn:
            sequence_output, _ = self.birnn(sequence_output)

        sequence_output = self.dropout(sequence_output)
        emissions = self.hidden2tag(sequence_output)
        
        outputs = (emissions,) + outputs[2:]
        if tags is not None:
            loss = -1*self.crf(emissions, tags, mask=input_mask.byte())
            outputs = (loss,) + outputs
            
        return outputs

In [7]:
torch.cuda.empty_cache()
model = torch.load(checkpoint_path)



# 预测

In [8]:
import pandas as pd
import os
import nltk
import re
from nltk import sent_tokenize, word_tokenize

In [9]:
# 获取所有txt格式文献
dir_all = "./result_literatures/"
all_text_name_list = []
for home, dirs, files in os.walk(dir_all):
    for filename in files:
        fullname = os.path.join(home, filename)
        all_text_name_list.append(fullname)
all_text_name_list = all_text_name_list[:20]
all_text_name_list

['./result_literatures/100.txt',
 './result_literatures/110.txt',
 './result_literatures/120.txt',
 './result_literatures/132.txt',
 './result_literatures/137.txt',
 './result_literatures/304.txt',
 './result_literatures/46.txt',
 './result_literatures/67.txt',
 './result_literatures/82.txt']

In [10]:
# 
sigle_sent = []
remove_xkh = r"[(](.*)[)]" 
remove_zkh = r"[\[](.*)[\]]"
remove_dkh = r"[\{](.*)[\}]"
remove_ex1 = r"[\d\)]"

In [11]:
# 处理文本得到单个句子列表
count = 0
for file in all_text_name_list:
    try:
        with open(file, "r", encoding="utf-8") as f:
            raw_sent_lines = f.readlines()
            sent_size = len(raw_sent_lines)
            index = 0
            cur_sent = ""
            while index < sent_size:
                cur = raw_sent_lines[index].strip()

                if cur != "":
                    cur_sent += (cur + " ")
                    index += 1

                elif cur == "" and cur_sent != "":
                    cur_sent_list = sent_tokenize(cur_sent.strip())
                    sigle_sent.extend(cur_sent_list)
                    cur_sent = ""
                    index += 1

                else:
                    index += 1
            count += 1
            print(file + " ======>已完成!")
    except:
        pass

print(count)

result_sent = []
for sent in sigle_sent:
    if len(sent.split(" ")) <= 8:
        continue
    new_sent1 = re.sub(remove_xkh, '', sent)
    new_sent2 = re.sub(remove_zkh, '', new_sent1)
    new_sent3 = re.sub(remove_dkh, '', new_sent2)
    new_sent4 = re.sub(remove_ex1, '', new_sent3)
    result_sent.append(new_sent4)

9


In [12]:
len(result_sent)

358

In [13]:
knowledge = {"Composition": {}, "Structure": {}, "Property": {}, "Processing": {}, 
             "Feature": {}, "Application": {}, "Characterization": {}, "Condition": {}}

In [14]:
def add_new_sent_to_knowledge(sent):
    try:
        tokenized_sentence = tokenizer.encode(sent)
        input_ids = torch.LongTensor([tokenized_sentence])
        input_ids = input_ids.to(device)
        with torch.no_grad():
            output = model(input_ids, tags=None)
        label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
        # join bpe split tokens
        tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
        new_tokens, new_labels = [], []
        for token, label_idx in zip(tokens, label_indices[0]):
            if token.startswith("##"):
                new_tokens[-1] = new_tokens[-1] + token[2:]
            else:
                new_labels.append(tag_values[label_idx])
                new_tokens.append(token)
    

        tmp_test = [new_tokens, new_labels]
        now_sentence = " ".join(tmp_test[0])
        descriptor_index = []
        for i in range(len(tmp_test[0])):
            if tmp_test[1][i][0] == "B":
                descriptor_index.append(i)
        descriptor_index.append(len(tmp_test[0]))
        descriptor = []
        descriptor_type = []
        for i in range(len(descriptor_index)-1):
            type_ = tmp_test[1][descriptor_index[i]].split("-")[-1]
            descriptor_type.append(type_)
            descriptor_word = []
            for j in range(descriptor_index[i], descriptor_index[i+1]):
                if tmp_test[1][j][0] != "O":
                    descriptor_word.append(tmp_test[0][j])            
            descriptor.append(descriptor_word)
        descriptors = [" ".join(d) for d in descriptor]
        for i in range(len(descriptors)):
            if descriptors[i] not in knowledge[descriptor_type[i]]:
                knowledge[descriptor_type[i]][descriptors[i]] = [now_sentence]
            else:
                knowledge[descriptor_type[i]][descriptors[i]].append(now_sentence)
    except:
        pass

In [16]:
for sent in result_sent:
    add_new_sent_to_knowledge(sent)

KeyError: 0

In [18]:
knowledge['Composition']

{'Na': ['There are two types of concerted migration mechanisms : two Na + ions located at the adjacent Na and Na sites can migrate either in the same direction or at an angle .',
  'Both mechanisms exhibit relatively low migration barriers owing to the potential energy conversion during the Na + ions migration process .',
  'Redistribution of Na + ions from the most stable Na sites to Na on increasing Na + total content further facilitates the concerted migration and promotes the Na + ion mobility .',
  'argued that the Na – Na channel is the only choice for Na + ion diffusion and limited by two triangular bottlenecks in the Na – Na channel .',
  'Additionally , we reveal concerted migration mechanisms of Na + ions in this system via an improved correlated jump analysis .',
  'Furthermore , we elucidate the relationship between the Na + content , its distribution , multiple Na + ions concerted migration , and overall Na + diffusion properties .',
  'We then outline a rational optimizat

In [19]:
save_result = []

for k in knowledge.keys():
    for key, value in knowledge[k].items():
        for i in range(len(value)):
            save_result.append([key + " (" + k + ")", value[i]])
            
df_new_base = pd.DataFrame(save_result)
df_new_base.to_excel("./lyz_knowledge_base_with_tag.xlsx", encoding='utf-8', index=False, header=False)