In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import BertModel
from tokenizers import BertWordPieceTokenizer

import torch
import torch.nn.utils.prune as prune
from torch.utils.data import DataLoader, Dataset

import pandas as pd
from tqdm import tqdm
from sklearn.metrics import classification_report



In [2]:
device = None

import torch
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print (device)
else:
    print ("MPS device not found.")

mps


In [8]:
# 모델 및 토크나이저 로드
model = AutoModelForSequenceClassification.from_pretrained("fabriceyhc/bert-base-uncased-yahoo_answers_topics")
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [9]:
# test 데이터 가져오기
test_data = "./yahoo_answers_csv/test.csv"
test_df = pd.read_csv(test_data)

sentence = test_df.iloc[0, 1] + " " + test_df.iloc[0, 2] + " " + test_df.iloc[0, 3]
true_label = test_df.iloc[0, 0]
print(f"sentence: {sentence}")

inputs = tokenizer(sentence, return_tensors="pt").to(device)

token_length = inputs.input_ids.shape[1]
print(f"token length: {token_length}\n")



# 모델에 입력값 넣기
outputs = model(**inputs)

predictions = outputs.logits.argmax(dim=-1)
print(f"pred output: {predictions.item() + 1}")
print(f"true label: {true_label}")
print(outputs.logits)



"""
class 정보

1: Society & Culture
2: Science & Mathematics
3: Health
4: Education & Reference
5: Computers & Internet
6: Sports
7: Business & Finance
8: Entertainment & Music
9: Family & Relationships
10: Politics & Government

"""


sentence: Why does Zebras have stripes? What is the purpose or those stripes? Who do they serve the Zebras in the wild life? this provides camouflage - predator vision is such that it is usually difficult for them to see complex patterns
token length: 48

pred output: 2
true label: 2
tensor([[ 0.7546,  3.1281, -0.6842,  2.3181, -1.2996, -0.6507,  1.6057, -1.0207,
         -1.7158, -0.7543]], device='mps:0', grad_fn=<LinearBackward0>)


'\nclass 정보\n\n1: Society & Culture\n2: Science & Mathematics\n3: Health\n4: Education & Reference\n5: Computers & Internet\n6: Sports\n7: Business & Finance\n8: Entertainment & Music\n9: Family & Relationships\n10: Politics & Government\n\n'

In [10]:
class TestDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.sentences = df.iloc[:, 1:].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1).values
        self.labels = df.iloc[:, 0].values
        self.tokenizer = tokenizer
  
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        inputs = self.tokenizer(sentence, truncation=True, max_length=512, padding='max_length', return_tensors="pt")
        label = torch.tensor(self.labels[idx]) # label은 1부터 시작하기 때문에, 나중에 inference할 때에 예측값에 1을 더해줘야 합니다.
        
        return inputs, label

In [11]:
test_data = "./yahoo_answers_csv/test.csv"
test_df = pd.read_csv(test_data, header=None)

test_dataset = TestDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [12]:
# 1번째 레이어 1번 헤드 끄기

print(model.bert.encoder.layer[0].attention.self.value.weight.data)
model.bert.encoder.layer[0].attention.self.value.weight.data[:, :64] = 0
print(model.bert.encoder.layer[0].attention.self.value.weight.data)

tensor([[ 0.0105,  0.0022, -0.0111,  ...,  0.0232, -0.0273,  0.0001],
        [-0.0270, -0.0122, -0.0419,  ...,  0.0602, -0.0102, -0.0236],
        [ 0.0198, -0.0234,  0.0299,  ...,  0.0042, -0.0383, -0.0255],
        ...,
        [-0.0298, -0.0197,  0.0297,  ..., -0.0387, -0.0517, -0.0057],
        [-0.0062,  0.0438,  0.0488,  ...,  0.0253,  0.0162, -0.0087],
        [-0.0015, -0.0242,  0.0047,  ..., -0.0479, -0.0592, -0.0355]],
       device='mps:0')
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0232, -0.0273,  0.0001],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0602, -0.0102, -0.0236],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0042, -0.0383, -0.0255],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ..., -0.0387, -0.0517, -0.0057],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0253,  0.0162, -0.0087],
        [ 0.0000,  0.0000,  0.0000,  ..., -0.0479, -0.0592, -0.0355]],
       device='mps:0')


In [147]:
preds = []
true_labels = []

for batch in tqdm(test_loader, desc="Evaluating"):
    inputs, labels = batch
    inputs = {k: v.squeeze(1).to(device) for k, v in inputs.items()} 
    labels = labels.to(device)

    with torch.no_grad():
        outputs = model(**inputs)
    prediction = outputs.logits.argmax(dim=-1) + 1
    
    preds.extend(prediction.tolist())
    true_labels.extend(labels.tolist())
    

print(classification_report(true_labels, preds))

Evaluating:   2%|▏         | 16/938 [00:26<25:24,  1.65s/it]


KeyboardInterrupt: 

In [None]:
preds = []
true_labels = []

for i in range(len(test_df)):
    # 문장과 정답 레이블 가져오기
    sentence = str(test_df.iloc[i, 1]) + " " + str(test_df.iloc[i, 2]) + " " + str(test_df.iloc[i, 3])
    true_label = test_df.iloc[i, 0]
    
    # 문장을 모델의 입력 형식으로 변환
    inputs = tokenizer(sentence, truncation=True, max_length=512, padding='max_length', return_tensors="pt").to(device)

    # 모델에 입력값 넣기
    outputs = model(**inputs)

    # 모델의 예측 결과를 가져오기
    prediction = outputs.logits.argmax(dim=-1).item() + 1 # prediction은 0부터 시작하기 때문에 1을 더해줍니다.
    
    # 예측 결과와 정답 레이블을 리스트에 추가
    preds.append(prediction)
    true_labels.append(true_label)

    if (i+1) % 100 == 0:
        print(f"{i+1}th done.")

# 각 클래스별 정확도 계산 및 출력
print(classification_report(true_labels, preds))