In [None]:
import torch
from torch import nn
import random
import os
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Subset
from MatSciBERT.normalize_text import normalize
from transformers import AutoModel, AutoTokenizer, AutoConfig


def setup_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator, and PyTorch
    (the default generator plus all CUDA devices) with the same value.

    Args:
        seed: Integer seed passed unchanged to each library.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
## Set the random seed
setup_seed(42)

# Read the encoder configuration from the local ./MatSciBERT directory and set
# its position-embedding limit to 900 (inference below tokenizes to 900 tokens).
config = AutoConfig.from_pretrained('./MatSciBERT')
config.max_position_embeddings = 900
# ignore_mismatched_sizes=True lets loading continue when a checkpoint tensor
# has a different shape than the configured model.
# NOTE(review): presumably this is needed because the stored position-embedding
# table does not have 900 rows — confirm against the checkpoint.
bert_model = AutoModel.from_pretrained('./MatSciBERT', config=config, ignore_mismatched_sizes=True)


class BertClassifier(nn.Module):
    """Encoder followed by dropout, a 768 -> 3 linear layer, and a ReLU.

    The encoder is the module-level ``bert_model`` object, so every instance
    of this class shares that same encoder.

    NOTE(review): 768 is assumed to be the width of the encoder's pooled
    output — confirm against the loaded configuration.

    Args:
        dropout: Drop probability for the dropout layer applied to the
            pooled encoder output.
    """

    def __init__(self, dropout=0.5):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 3)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Run the encoder and the classification head.

        Args:
            input_id: Token ids, forwarded to the encoder as ``input_ids``.
            mask: Attention mask, forwarded as ``attention_mask``.

        Returns:
            A tuple ``(scores, attentions)``: the ReLU-activated output of the
            linear layer and the attentions returned by the encoder.
        """
        encoded = self.bert(
            input_ids=input_id,
            attention_mask=mask,
            return_dict=True,
            output_attentions=True,
        )
        scores = self.linear(self.dropout(encoded.pooler_output))
        return self.relu(scores), encoded.attentions


## Data access
# Tokenizer loaded from the same local ./MatSciBERT directory as the encoder.
tokenizer = AutoTokenizer.from_pretrained('./MatSciBERT')
def find_text(composition):
    """Return the full contents of the description file for a composition.

    The file is looked up as ``../description/<composition>.txt`` relative to
    the current working directory.

    Args:
        composition: Composition name used as the file stem.

    Returns:
        The file's text as a single string.
    """
    description_path = os.path.join('../description/', composition + '.txt')
    with open(description_path, 'r') as handle:
        return handle.read()



# Run on the GPU when CUDA is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

## Load the model
# NOTE(review): `load` is not referenced anywhere in the visible code.
from torch.serialization import load
model_path = 'MgBERT.pth'
# NOTE(security): torch.load relies on pickle unless weights-only loading is in
# effect; only load checkpoint files from a trusted source.
model_data = torch.load(model_path, map_location=device)
model = BertClassifier()
model.to(device)
# model_data is consumed as a state dict for the whole classifier.
model.load_state_dict(model_data)
# Evaluation mode: disables the Dropout layer inside BertClassifier.
model.eval()

In [None]:
def inference(comp, max_length=900):
    """Predict the class index for one composition.

    Reads the composition's description with ``find_text``, normalizes and
    tokenizes it (padded/truncated to ``max_length`` tokens), runs ``model``
    on ``device`` and takes the argmax of the model output along dim 1.

    Args:
        comp: Composition name, passed to ``find_text``.
        max_length: Token length the text is padded/truncated to. Defaults to
            900, the value set as ``config.max_position_embeddings`` when the
            encoder was built; larger values would exceed that limit.

    Returns:
        The tensor produced by ``output.argmax(dim=1)``.
    """
    input_text = find_text(comp)
    inputs = tokenizer(
        normalize(input_text),
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    ).to(device)
    # Prediction never needs gradients; without no_grad() every call records
    # an autograd graph for the whole encoder, wasting memory and time.
    with torch.no_grad():
        output, _ = model(inputs['input_ids'], inputs['attention_mask'])
    return output.argmax(dim=1)

In [None]:
# Load the composition data
import pandas as pd

df = pd.read_csv('../unique_compositions.csv')
composition_list = df['composition'].tolist()
# Integer class index used for each glass-forming category
labels = dict(BMG=0, Ribbon=1, NR=2)
label_list = [labels[category] for category in df['glass_forming_category']]
length = len(composition_list)

In [None]:
import numpy as np
# 3x3 all-zero accumulator for (label, prediction) counts over the three classes.
zero_matrix = np.zeros((3, 3))

In [None]:
# Build the confusion matrix: zero_matrix[true_label, predicted_label] counts.
# Start from a fresh matrix so that re-running this cell does not double-count.
zero_matrix = np.zeros((3, 3))
for comp, true_label in zip(composition_list, label_list):
    # inference() returns a one-element tensor; .item() gives a plain int index.
    pred = inference(comp).item()
    # Correct and incorrect predictions land in the same cell formula, so no
    # branching is needed: the diagonal is simply true_label == pred.
    zero_matrix[true_label, pred] += 1
print(zero_matrix)
# zero_matrix has true labels on rows and predictions on columns, but the sheet
# labels its rows 'p_*' and its columns 'r_*'. Transpose so the exported values
# line up with those labels.
# NOTE(review): assumes 'r_' means real and 'p_' means predicted — confirm.
df_zero_matrix = pd.DataFrame(zero_matrix.T, columns=['r_BMG', 'r_Ribbon', 'r_NR'], index=['p_BMG', 'p_Ribbon', 'p_NR'])
df_zero_matrix.to_excel('zero_matrix.xlsx')


# Other task: count correct BMG and Ribbon predictions on train_dataset.csv

In [None]:
def inference(comp, max_length=900):
    """Predict the class index for one composition.

    Reads the composition's description with ``find_text``, normalizes and
    tokenizes it (padded/truncated to ``max_length`` tokens), runs ``model``
    on ``device`` and takes the argmax of the model output along dim 1.

    Args:
        comp: Composition name, passed to ``find_text``.
        max_length: Token length the text is padded/truncated to. Defaults to
            900, the value set as ``config.max_position_embeddings`` when the
            encoder was built; larger values would exceed that limit.

    Returns:
        The tensor produced by ``output.argmax(dim=1)``.
    """
    input_text = find_text(comp)
    inputs = tokenizer(
        normalize(input_text),
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    ).to(device)
    # Prediction never needs gradients; without no_grad() every call records
    # an autograd graph for the whole encoder, wasting memory and time.
    with torch.no_grad():
        output, _ = model(inputs['input_ids'], inputs['attention_mask'])
    return output.argmax(dim=1)

In [None]:
# Load the composition data
import pandas as pd

df = pd.read_csv('train_dataset.csv')
composition_list = df['composition'].tolist()
# Integer class index used for each glass-forming category
labels = dict(BMG=0, Ribbon=1, NR=2)
label_list = [labels[category] for category in df['glass_forming_category']]
length = len(composition_list)

In [None]:
# Number of samples labelled BMG (class index 0).
# NOTE(review): the message says "test dataset" but df was read from
# 'train_dataset.csv' — confirm which is intended.
count_BMG = label_list.count(0)
print('BMG in test dataset: ', count_BMG)

In [None]:
# Number of samples labelled Ribbon (class index 1).
# NOTE(review): the message says "test dataset" but df was read from
# 'train_dataset.csv' — confirm which is intended.
count_Ribbon = label_list.count(1)
print('Ribbon in test dataset: ', count_Ribbon)

In [None]:
# Total number of compositions read from the CSV.
print(length)

In [None]:
# Count correct predictions, restricted to samples whose label is
# BMG (0) or Ribbon (1); correctly predicted NR (2) samples are not counted.
cnt = 0
for comp, label in zip(composition_list, label_list):
    pred = inference(comp)
    if pred == label and label in (0, 1):
        cnt += 1
print(cnt)

# MatSciBERT test (prompt_baseline.pth checkpoint, 512-token inputs)

In [None]:
import torch
from torch import nn
import random
import os
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Subset
from MatSciBERT.normalize_text import normalize
from transformers import AutoModel, AutoTokenizer, AutoConfig


def setup_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator, and PyTorch
    (the default generator plus all CUDA devices) with the same value.

    Args:
        seed: Integer seed passed unchanged to each library.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
## Set the random seed
setup_seed(42)

# Read the encoder configuration from the local ./MatSciBERT directory; unlike
# the MgBERT section, the configuration is used as stored (no field overrides).
config = AutoConfig.from_pretrained('./MatSciBERT')
# ignore_mismatched_sizes=True lets loading continue when a checkpoint tensor
# has a different shape than the configured model.
bert_model = AutoModel.from_pretrained('./MatSciBERT', config=config, ignore_mismatched_sizes=True)


class BertClassifier(nn.Module):
    """Encoder followed by dropout, a 768 -> 3 linear layer, and a ReLU.

    The encoder is the module-level ``bert_model`` object, so every instance
    of this class shares that same encoder.

    NOTE(review): 768 is assumed to be the width of the encoder's pooled
    output — confirm against the loaded configuration.

    Args:
        dropout: Drop probability for the dropout layer applied to the
            pooled encoder output.
    """

    def __init__(self, dropout=0.5):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 3)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Run the encoder and the classification head.

        Args:
            input_id: Token ids, forwarded to the encoder as ``input_ids``.
            mask: Attention mask, forwarded as ``attention_mask``.

        Returns:
            A tuple ``(scores, attentions)``: the ReLU-activated output of the
            linear layer and the attentions returned by the encoder.
        """
        encoded = self.bert(
            input_ids=input_id,
            attention_mask=mask,
            return_dict=True,
            output_attentions=True,
        )
        scores = self.linear(self.dropout(encoded.pooler_output))
        return self.relu(scores), encoded.attentions


## Data access
# Tokenizer loaded from the same local ./MatSciBERT directory as the encoder.
tokenizer = AutoTokenizer.from_pretrained('./MatSciBERT')
def find_text(composition):
    """Return the full contents of the description file for a composition.

    The file is looked up as ``../description/<composition>.txt`` relative to
    the current working directory.

    Args:
        composition: Composition name used as the file stem.

    Returns:
        The file's text as a single string.
    """
    description_path = os.path.join('../description/', composition + '.txt')
    with open(description_path, 'r') as handle:
        return handle.read()



# Run on the GPU when CUDA is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

## Load the model
# NOTE(review): `load` is not referenced anywhere in the visible code.
from torch.serialization import load
model_path = './prompt_baseline.pth'
# NOTE(security): torch.load relies on pickle unless weights-only loading is in
# effect; only load checkpoint files from a trusted source.
model_data = torch.load(model_path, map_location=device)
model = BertClassifier()
model.to(device)
# model_data is consumed as a state dict for the whole classifier.
model.load_state_dict(model_data)
# Evaluation mode: disables the Dropout layer inside BertClassifier.
model.eval()

In [None]:
def inference(comp, max_length=512):
    """Predict the class index for one composition.

    Reads the composition's description with ``find_text``, normalizes and
    tokenizes it (padded/truncated to ``max_length`` tokens), runs ``model``
    on ``device`` and takes the argmax of the model output along dim 1.

    Args:
        comp: Composition name, passed to ``find_text``.
        max_length: Token length the text is padded/truncated to. Defaults to
            512, the value this section originally hard-coded.
            NOTE(review): presumably must not exceed the encoder's
            position-embedding limit — confirm against the loaded config.

    Returns:
        The tensor produced by ``output.argmax(dim=1)``.
    """
    input_text = find_text(comp)
    inputs = tokenizer(
        normalize(input_text),
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    ).to(device)
    # Prediction never needs gradients; without no_grad() every call records
    # an autograd graph for the whole encoder, wasting memory and time.
    with torch.no_grad():
        output, _ = model(inputs['input_ids'], inputs['attention_mask'])
    return output.argmax(dim=1)

In [None]:
# Load the composition data
import pandas as pd

df = pd.read_csv('train_dataset.csv')
composition_list = df['composition'].tolist()
# Integer class index used for each glass-forming category
labels = dict(BMG=0, Ribbon=1, NR=2)
label_list = [labels[category] for category in df['glass_forming_category']]
length = len(composition_list)

In [None]:
# Total number of compositions read from the CSV.
print(length)

In [None]:
# Number of samples labelled BMG (class index 0).
# NOTE(review): the message says "test dataset" but df was read from
# 'train_dataset.csv' — confirm which is intended.
count_BMG = label_list.count(0)
print('BMG in test dataset: ', count_BMG)

In [None]:
# Number of samples labelled Ribbon (class index 1).
# NOTE(review): the message says "test dataset" but df was read from
# 'train_dataset.csv' — confirm which is intended.
count_Ribbon = label_list.count(1)
print('Ribbon in test dataset: ', count_Ribbon)

In [None]:
# Count how many predictions match their label, over all three classes.
cnt = 0
for comp, label in zip(composition_list, label_list):
    pred = inference(comp)
    cnt += 1 if pred == label else 0
print(cnt)