In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m95.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m83.8 MB/s[0m eta [36m0:00:0

In [3]:
import pandas as pd
import numpy as np
import json
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
from collections import OrderedDict
import torch.nn.functional as F
from  torch.utils.data import DataLoader ,TensorDataset,Dataset
from sklearn.metrics import  accuracy_score ,f1_score ,recall_score
from tqdm import tqdm
import os

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
path="drive/MyDrive/NLP_2023/HW3"
checkpoint_path="drive/MyDrive/NLP_2023/HW3/checkpoints"

In [6]:
os.makedirs(checkpoint_path,exist_ok=True)

In [7]:
train_df=pd.read_json(f"{path}/data/train.jsonl", lines=True)
test_df=pd.read_json(f"{path}/data/test.jsonl", lines=True)

In [8]:
#1
def read_map_data(json_file):
    with open(json_file) as json_file:
        file = json.load(json_file)
        dict_map=OrderedDict(file)
        return dict_map


relation=read_map_data(f"{path}/data/relations2id.json")
num_classes=len(relation)


In [9]:
#2
def prepare(df):
    tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
    df["subject_start_idx"]=df.relations.apply(lambda x: x[0]["subject"]["start_idx"] )
    df["subject_end_idx"]=df.relations.apply(lambda x: x[0]["subject"]["end_idx"] )
    df["subject_entity_type"]=df.relations.apply(lambda x: x[0]["subject"]["entity_type"] )
    df["subject_text"]=df.relations.apply(lambda x: x[0]["subject"]["text"] )

    df["object_start_idx"]=df.relations.apply(lambda x: x[0]["object"]["start_idx"] )
    df["object_end_idx"]=df.relations.apply(lambda x: x[0]["object"]["end_idx"] )
    df["object_entity_type"]=df.relations.apply(lambda x: x[0]["object"]["entity_type"] )
    df["object_text"]=df.relations.apply(lambda x: x[0]["object"]["text"] )

    df["relation"]=df.relations.apply(lambda x: x[0]["relation"])
    df["lbl"]=df["relation"].apply(lambda x : relation[x])
    df["text"]=df["tokens"].apply(lambda x :tokenizer.convert_tokens_to_string(x))
    return df




train_df=prepare(train_df)
test_df=prepare(test_df)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
#3
mx_len=max(train_df.tokens.apply(lambda x :len(x)).max(),test_df.tokens.apply(lambda x :len(x)).max())
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')

class dataset(Dataset):
    def __init__(self,dataframe):
        super().__init__()
        self.df=dataframe

    def __len__(self):
        return len(self.df)

    def __getitem__(self,index):
        tkn,lbl,ss,se,os,oe=self.df.iloc[index][["tokens","lbl","subject_start_idx","subject_end_idx",
                                                 "object_start_idx","object_end_idx"]]
        lbl1=[0]*len(tkn)+(mx_len-len(tkn))*[100]
        lbl1=np.array(lbl1)
        lbl1[ss:se]=1
        lbl1[os:oe]=1
        lbl1=torch.tensor(lbl1).long()
        lbl=torch.tensor(lbl).long()
        tkn=tokenizer.convert_tokens_to_ids(tkn)
        tkn=tkn+(mx_len-len(tkn))*[0]
        tkn=torch.tensor(tkn,requires_grad=False)
        return tkn,lbl,lbl1




In [11]:
#4
class relation_model(nn.Module):
    def __init__(self,output_size):
        super(relation_model,self).__init__()
        self.bert=BertModel.from_pretrained("bert-base-uncased")
        self.fc1=nn.Linear(768,256)
        self.fc2=nn.Linear(256,output_size)
        self.drop=nn.Dropout(0.3)

        self.fc_clh1=nn.Linear(768,256)
        self.fc_clh2=nn.Linear(256,2)


    def forward(self,inputs):
        outputs=self.bert(inputs)
        clh_output=outputs[0]
        pooler_output=outputs[-1]

        clh_output=self.drop(clh_output)
        clh_output=F.relu(self.fc_clh1(clh_output))
        clh_output=torch.softmax(self.fc_clh2(clh_output),2)
        clh_output=clh_output.permute(0,2,1)

        pooler_output=self.drop(pooler_output)
        pooler_output=F.relu(self.fc1(pooler_output))
        pooler_output=torch.softmax(self.fc2(pooler_output),1)


        return(pooler_output,clh_output)

In [12]:
#5
test_dataset=dataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=True)
train_dataset=dataset(train_df)
train_loader=DataLoader(train_dataset,batch_size=32)

In [None]:

#6
model=relation_model(num_classes)
criterion=nn.CrossEntropyLoss()
criterion1=nn.CrossEntropyLoss(ignore_index=100)
optimizer=torch.optim.Adam(model.parameters(),lr=1e-3)

n_epochs=10
for epoch in range(n_epochs):
    for step,(tkns,lbls,lbls1) in enumerate(train_loader):
        pred,pred1=model(tkns)
        loss=criterion(pred,lbls)
        loss1=criterion1(pred1,lbls1)
        t_loss=loss+loss1
        t_loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        r_pred=torch.argmax(pred,1).reshape(-1)


        acc=accuracy_score(lbls,r_pred)
        f1=f1_score(lbls,r_pred,average='weighted')
        print (f"epoch: {epoch} , step: {step} , loss : {loss.item()} , accuracy: {round(acc,2)} , f1_score: {f1}")
    torch.save(model,f"./{checkpoint_path}/model{epoch}.pth")




Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


epoch: 0 , step: 0 , loss : 3.220263719558716 , accuracy: 0.03 , f1_score: 0.004166666666666667
epoch: 0 , step: 1 , loss : 3.212775945663452 , accuracy: 0.25 , f1_score: 0.25
epoch: 0 , step: 2 , loss : 3.17401123046875 , accuracy: 0.5 , f1_score: 0.3333333333333333
epoch: 0 , step: 3 , loss : 3.085477590560913 , accuracy: 0.56 , f1_score: 0.40499999999999997
epoch: 0 , step: 4 , loss : 2.884129047393799 , accuracy: 0.53 , f1_score: 0.36862244897959184
epoch: 0 , step: 5 , loss : 2.885345697402954 , accuracy: 0.41 , f1_score: 0.2347222222222222
epoch: 0 , step: 6 , loss : 2.942272663116455 , accuracy: 0.34 , f1_score: 0.17587209302325582
epoch: 0 , step: 7 , loss : 2.848005771636963 , accuracy: 0.44 , f1_score: 0.266304347826087
epoch: 0 , step: 8 , loss : 2.941617250442505 , accuracy: 0.34 , f1_score: 0.17587209302325582
epoch: 0 , step: 9 , loss : 2.7228586673736572 , accuracy: 0.56 , f1_score: 0.40499999999999997
epoch: 0 , step: 10 , loss : 2.7228503227233887 , accuracy: 0.56 , f1

In [None]:
torch.save(model,f"./{checkpoint_path}/model{epoch}.pth")


In [None]:
#7
t_pred=np.array([])
t_lbl=np.array([])
for test_input,test_lbl,_ in tqdm(test_loader):

    test_pred,_=model(test_input)
    test_pred=torch.argmax(test_pred,1)


    t_pred=np.hstack((t_pred,test_pred.detach().numpy()))
    t_lbl=np.hstack((t_lbl,test_lbl.detach().numpy()))



test_acc=accuracy_score(t_lbl,t_pred)
test_recall=recall_score(t_lbl,t_pred,average='weighted')
test_f1=f1_score(t_lbl,t_pred,average='weighted')


In [None]:
#8
print(f"test accuracy: {test_acc}")
print(f"test recall: {test_recall}")
print(f"test f1_score: {test_f1}")