# NLP Final COPA Project: DeBERTa

In [None]:
import os
import json
import torch
import transformers

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from DeBERTa import deberta
from datetime import datetime
from torch import nn, optim, cuda
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AdamW, DebertaV2Config, DebertaV2Tokenizer, DebertaV2Model, DebertaV2ForMaskedLM, DebertaV2ForSequenceClassification


In [None]:
# Thread-count settings exposed to numexpr through the process environment.
# NOTE(review): numexpr presumably reads these when it is first imported, and
# this cell runs after the pandas import above - confirm the values take effect.
os.environ.update({
    'NUMEXPR_MAX_THREADS': '16',
    'NUMEXPR_NUM_THREADS': '12',
})

In [None]:
# Use the CUDA device when PyTorch reports one as available, else the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Load the tokenizer first, then the sequence-classification model, both from
# the same pretrained checkpoint name.
tokenizer, model = (
    loader.from_pretrained('microsoft/deberta-v2-xlarge')
    for loader in (DebertaV2Tokenizer, DebertaV2ForSequenceClassification)
)

In [None]:
def padding(token_ids, max_length=20, pad_id=0):
    """Return ``token_ids`` as a fixed-length id list plus its attention mask.

    Sequences shorter than ``max_length`` are right-padded with ``pad_id``;
    longer ones are truncated so the result always has exactly ``max_length``
    entries. (Without truncation an over-long sequence would yield a longer
    list, and two rows of different length cannot be stacked into one tensor.)

    Args:
        token_ids: Sequence of integer token ids.
        max_length: Length of both returned lists. Defaults to 20.
        pad_id: Id written into the padded positions. Defaults to 0
            (presumably the tokenizer's pad token id - confirm against
            ``tokenizer.pad_token_id``).

    Returns:
        A ``(padded_ids, attention_mask)`` tuple of lists. The mask holds 1
        for every kept token and 0 for every padded position.
    """
    kept = list(token_ids[:max_length])
    pad_count = max_length - len(kept)

    padding_token_ids = kept + [pad_id] * pad_count
    padding_attention_mask = [1] * len(kept) + [0] * pad_count

    return padding_token_ids, padding_attention_mask
    

In [None]:
def data_process(tokenizer, premise, choice1, choice2, question, label, padding):
    """Encode one premise / two-choice example as a two-row model batch.

    Row 0 pairs the premise with ``choice1`` and row 1 pairs it with
    ``choice2``. Each text is tokenized without special tokens, padded with
    ``padding``, and then joined by the tokenizer, which inserts the special
    tokens exactly once.

    Args:
        tokenizer: Tokenizer providing ``__call__``,
            ``build_inputs_with_special_tokens`` and
            ``create_token_type_ids_from_sequences``.
        premise: Premise sentence.
        choice1: First alternative.
        choice2: Second alternative.
        question: Either ``'effect'`` or ``'cause'``.
        label: Dataset label; only ``label == 1`` versus anything else is
            distinguished.
        padding: Callable mapping a list of token ids to a
            ``(padded_ids, attention_mask)`` pair of fixed-length lists.

    Returns:
        ``(input_ids, token_type_ids, attention_mask, labels)`` tensors on the
        module-level ``device``. The first three have two rows; ``labels`` has
        two entries, one class per row.

    Raises:
        ValueError: If ``question`` is neither ``'effect'`` nor ``'cause'``.
    """
    # Per-row class targets. For 'effect', label 1 maps to [0, 1]; the
    # mapping is inverted for 'cause'.
    if question == 'effect':
        labels = [0, 1] if label == 1 else [1, 0]
    elif question == 'cause':
        labels = [1, 0] if label == 1 else [0, 1]
    else:
        # Previously this fell through and failed later with an unbound local.
        raise ValueError(f"question must be 'effect' or 'cause', got {question!r}")

    def encode(text):
        # add_special_tokens=False: the special tokens are added once, below,
        # by build_inputs_with_special_tokens. Letting the tokenizer add them
        # here too would embed a second set inside every segment.
        token_ids = tokenizer(text, add_special_tokens=False)['input_ids']
        return padding(token_ids)

    premise_ids, premise_mask = encode(premise)

    input_rows, type_rows, mask_rows = [], [], []
    for choice in (choice1, choice2):
        choice_ids, choice_mask = encode(choice)
        input_rows.append(
            tokenizer.build_inputs_with_special_tokens(premise_ids, choice_ids)
        )
        type_rows.append(
            tokenizer.create_token_type_ids_from_sequences(premise_ids, choice_ids)
        )
        # The three literal 1s cover the special tokens around the segments;
        # assumes a [CLS] A [SEP] B [SEP] layout - confirm for other tokenizers.
        mask_rows.append([1] + premise_mask + [1] + choice_mask + [1])

    input_ids = torch.tensor(input_rows).to(device)
    token_type_ids = torch.tensor(type_rows).to(device)
    attention_mask = torch.tensor(mask_rows).to(device)
    labels = torch.tensor(labels).to(device)

    return input_ids, token_type_ids, attention_mask, labels


In [None]:
def training_model(filename, tokenizer, model, epochs, padding):
    """Fine-tune ``model`` on a JSON-lines file, one example per optimizer step.

    The model is moved to the module-level ``device`` and put in training
    mode. Every record is encoded with ``data_process`` into a two-row batch,
    and the loss returned by the model is back-propagated immediately.

    Args:
        filename: Path to a JSON-lines file whose records provide the keys
            ``premise``, ``choice1``, ``choice2``, ``question`` and ``label``.
        tokenizer: Tokenizer forwarded to ``data_process``.
        model: Model called with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels``; its output must expose ``'loss'``.
        epochs: Number of passes over the file.
        padding: Padding callable forwarded to ``data_process``.
    """
    model.to(device)
    model.train()

    optimizer = AdamW(model.parameters(), lr=5e-5)

    with open(filename, 'r', encoding='utf8') as handle:
        examples = pd.read_json(handle, lines=True)

    # Positional order expected by data_process after the tokenizer argument.
    field_names = ('premise', 'choice1', 'choice2', 'question', 'label')

    for epoch_number in range(1, epochs + 1):
        print('Epochs:', epoch_number)
        print('Training...')

        for _, example in examples.iterrows():
            optimizer.zero_grad()
            fields = [example[name] for name in field_names]

            input_ids, token_type_ids, attention_mask, labels = data_process(
                tokenizer, *fields, padding
            )
            batch_output = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=labels,
            )
            batch_output['loss'].backward()
            optimizer.step()

        print('Finsh')


In [None]:
# Fine-tune for a single epoch on the training split.
training_model(
    filename="train.jsonl",
    tokenizer=tokenizer,
    model=model,
    epochs=1,
    padding=padding,
)

In [None]:
def evaluating_model(filename, tokenizer, model, padding):
    """Return the model's accuracy on a JSON-lines file.

    Every record is encoded with ``data_process`` into a two-row batch. The
    two candidate targets ``[1, 0]`` and ``[0, 1]`` are scored from the
    per-row class probabilities, and the higher-scoring one is compared with
    the target ``data_process`` derives from the record.

    Args:
        filename: Path to a JSON-lines file whose records provide the keys
            ``premise``, ``choice1``, ``choice2``, ``question`` and ``label``.
        tokenizer: Tokenizer forwarded to ``data_process``.
        model: Model whose output exposes ``"logits"`` with one row per choice
            and two classes per row.
        padding: Padding callable forwarded to ``data_process``.

    Returns:
        Fraction of records predicted correctly, or ``0.0`` when the file
        contains no records.
    """
    # Keep the model on the same device as the tensors data_process creates,
    # so evaluation also works when training_model was not run first.
    model.to(device)
    model.eval()

    with open(filename, 'r', encoding='utf8') as file:
        raw_data = pd.read_json(file, lines=True)

    total_num = len(raw_data)
    if total_num == 0:
        # Avoid a ZeroDivisionError on an empty file.
        return 0.0

    correct_num = 0

    # No gradients are needed for scoring; skipping them saves memory.
    with torch.no_grad():
        for _, row in raw_data.iterrows():
            input_ids, token_type_ids, attention_mask, labels = data_process(
                tokenizer,
                row["premise"],
                row["choice1"],
                row["choice2"],
                row["question"],
                row["label"],
                padding,
            )
            # The loss is not used here, so labels are not passed to the model.
            output = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )
            probs = torch.softmax(output["logits"], dim=1).tolist()

            # Score each candidate target by the probability its rows assign
            # to it: [1, 0] means row 0 is class 1 and row 1 is class 0.
            # Summing a single class column across both rows instead would
            # leave a perfectly separating model with a 0.5 / 0.5 tie.
            score_10 = (probs[0][1] + probs[1][0]) / 2
            score_01 = (probs[0][0] + probs[1][1]) / 2
            result = [1, 0] if score_10 > score_01 else [0, 1]

            if result == labels.tolist():
                correct_num += 1

    return correct_num / total_num


In [None]:
# Score the fine-tuned model on the test split.
accuarcy = evaluating_model(
    filename="test.jsonl",
    tokenizer=tokenizer,
    model=model,
    padding=padding,
)

In [None]:
# Bare expression: shows the accuracy value assigned by the evaluation call.
accuarcy

### Example

In [None]:
# Example: a single training-mode forward pass on one premise / two-choice pair.
model.train()

premise = "The man turned on the faucet."
choice1 = "The toilet filled with water."
choice2 = "Water flowed from the spout."

# Encode through data_process instead of a hand-copied duplicate of its steps,
# so this example always feeds the model the same input layout as training.
# ("effect", 0) is the question/label pair that data_process maps to the
# [1, 0] target this example uses.
input_ids, token_type_ids, attention_mask, labels = data_process(
    tokenizer, premise, choice1, choice2, "effect", 0, padding
)

# Call the module itself rather than .forward() so registered hooks run.
outputs = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
    labels=labels,
)

outputs

In [None]:
# Example: predict the target for a single premise / two-choice pair.
# Keep the model on the same device as the tensors data_process creates.
model.to(device)
model.eval()

premise = "The runner wore shorts."
choice1 = "The forecast predicted high temperatures."
choice2 = "She planned to run along the beach."
question = "cause"
label = 0

# data_process builds the two-row batch and moves every tensor to `device`;
# building the tensors by hand here left them on the CPU.
input_ids, token_type_ids, attention_mask, labels = data_process(
    tokenizer, premise, choice1, choice2, question, label, padding
)

# Inference only: no gradients are needed.
with torch.no_grad():
    output = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        labels=labels,
    )

probs = torch.softmax(output["logits"], dim=1).tolist()

# Score each candidate target by the probability its rows assign to it:
# [1, 0] means row 0 is class 1 and row 1 is class 0. Summing one class column
# across both rows instead would tie at 0.5 for a perfectly separating model.
score_10 = (probs[0][1] + probs[1][0]) / 2
score_01 = (probs[0][0] + probs[1][1]) / 2
result = [1, 0] if score_10 > score_01 else [0, 1]

result