In [None]:
#https://www.kaggle.com/competitions/feedback-prize-effectiveness
#https://www.kaggle.com/datasets/hiromoon166/deberta-v3-large
#https://www.kaggle.com/datasets/kpriyanshu256/debertalarge
#训练完成权重

### 句子分类
deberta_v3_large_v3.py训练保存的权重

In [1]:
%%writefile deberta_v3.py
# import manipulation
import numpy as np
import pandas as pd

# import Pytorch
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.utils.checkpoint import checkpoint
from torch.autograd import Variable

# import Transformer model
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, AdamW
from transformers import DataCollatorWithPadding
from transformers.models.deberta_v2.modeling_deberta_v2 import StableDropout, ContextPooler

# import SKLearn
from sklearn.model_selection import  KFold, GroupKFold, StratifiedKFold, StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss


# import ...
import string
import random
import os
import joblib
import gc
import copy
import time


# other
from tqdm import tqdm
from collections import defaultdict

# Turn off parallelism inside the `tokenizers` library and keep transformers
# logging at error level only.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
transformers.logging.set_verbosity_error()


class CFG:
    """Static settings for the DeBERTa-v3-large sentence-classification run."""

    seed = 2022
    max_length = 512  # truncation limit handed to the tokenizer for every sample
    epoch = 4
    train_batch_size = 8
    valid_batch_size = 8  # batch size used by the test DataLoader
    model_name = "../input/deberta-v3-large/deberta-v3-large"
    token_name = "../input/deberta-v3-large/deberta-v3-large"
    num_classes = 3  # width of the classification head
    n_fold = 5  # number of fold checkpoints averaged at inference time
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


# The tokenizer is attached to CFG so the dataset and collate helpers share one
# instance. (The bare `CFG.tokenizer.is_fast` expression that used to follow was
# a notebook display leftover with no effect in a script, so it was removed.)
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.token_name, use_fast=True)
CFG.tokenizer.model_max_length = CFG.max_length
def softmax(z):
    """Row-wise softmax of a 2-D array.

    The maximum of each row is subtracted before exponentiating, which keeps
    ``np.exp`` from overflowing on large values without changing the result.
    """
    assert len(z.shape) == 2
    row_max = np.max(z, axis=1, keepdims=True)  # keepdims lets it broadcast per row
    exps = np.exp(z - row_max)
    return exps / np.sum(exps, axis=1, keepdims=True)
class FeedbackDataset(Dataset):
    """Map-style dataset that tokenizes one (discourse, essay) pair per row.

    Args:
        df: frame with ``discourse_type``, ``discourse_text`` and
            ``essay_text`` columns.
        max_length: truncation limit handed to the tokenizer.
        tokenizer: object exposing ``encode_plus``.
        training: accepted for call-site compatibility; not used by this class.
    """

    def __init__(self, df, max_length, tokenizer, training=True):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        # Keep the columns as arrays for positional lookup in __getitem__.
        self.discourse_type = df['discourse_type'].values
        self.discourse_text = df['discourse_text'].values
        self.essays = df['essay_text'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # First segment: the discourse type followed by the discourse itself;
        # the whole essay is passed as the second segment.
        first_segment = self.discourse_type[index] + ' ' + self.discourse_text[index]

        encoded = self.tokenizer.encode_plus(
            first_segment,
            self.essays[index],
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=True,
            max_length=self.max_len,
        )

        # Only the ids and the attention mask are handed to the collate step.
        return {key: encoded[key] for key in ('input_ids', 'attention_mask')}


# Dynamic Padding (Collate)
class Collate:
    """Collate callable that pads each batch to its own longest sequence.

    Args:
        tokenizer: supplies ``padding_side`` and ``pad_token_id``.
        isTrain: when true, every sample must also carry a ``target`` entry,
            which is returned as a long tensor under ``"target"``.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def __call__(self, batch):
        id_seqs = [sample["input_ids"] for sample in batch]
        mask_seqs = [sample["attention_mask"] for sample in batch]
        if self.isTrain:
            targets = [sample["target"] for sample in batch]

        # Longest sequence in this batch decides the padded width.
        longest = max(len(seq) for seq in id_seqs)
        pad_on_right = self.tokenizer.padding_side == "right"
        pad_id = self.tokenizer.pad_token_id

        def pad(seq, fill):
            filler = (longest - len(seq)) * [fill]
            return seq + filler if pad_on_right else filler + seq

        output = dict()
        output["input_ids"] = torch.tensor(
            [pad(seq, pad_id) for seq in id_seqs], dtype=torch.long
        )
        # Padded positions get mask value 0.
        output["attention_mask"] = torch.tensor(
            [pad(seq, 0) for seq in mask_seqs], dtype=torch.long
        )
        if self.isTrain:
            output["target"] = torch.tensor(targets, dtype=torch.long)

        return output

# collate_fn = DataCollatorWithPadding(tokenizer=CFG.tokenizer)


class FeedbackModel(nn.Module):
    """Transformer backbone followed by a linear layer over the first token.

    Args:
        model_name: path or identifier passed to ``AutoConfig`` and
            ``AutoModel``.
    """

    def __init__(self, model_name):
        super().__init__()

        config = AutoConfig.from_pretrained(model_name)
        # forward() reads the hidden states, so the backbone must return them.
        config.update({"output_hidden_states": True, 'return_dict': True})
        self.config = config
        self.backbone = AutoModel.from_pretrained(model_name, config=config)
        self.fc = nn.Linear(config.hidden_size, CFG.num_classes)

    def forward(self, ids, mask):
        """Apply ``fc`` to position 0 of the last entry of the hidden states."""
        outputs = self.backbone(input_ids=ids, attention_mask=mask)
        first_token = outputs['hidden_states'][-1][:, 0, :]
        return self.fc(first_token)

# Locations of the competition inputs. The earlier extra read of test.csv
# (followed by a bare `.head()`) was removed: its result was overwritten before
# use, because test.csv is loaded again from TEST_CSV further down.
INPUT_DIR = "/kaggle/input/feedback-prize-effectiveness"
TEST_DIR = os.path.join(INPUT_DIR, "test")
TEST_CSV = os.path.join(INPUT_DIR, "test.csv")

def get_essay_test(essay_id):
    """Return the contents of ``<essay_id>.txt`` located in ``TEST_DIR``.

    Args:
        essay_id: file stem of the essay to read.

    Returns:
        The whole file as one string.
    """
    path = os.path.join(TEST_DIR, f'{essay_id}.txt')
    # Context manager closes the handle right away; this function is applied
    # once per row, so leaked handles would otherwise pile up.
    with open(path, 'r') as essay_file:
        return essay_file.read()

test_df = pd.read_csv(TEST_CSV)
test_df['essay_text'] = test_df['essay_id'].apply(get_essay_test)

# Sort rows by the character count of "essay + ' ' + discourse" — presumably so
# that neighbouring samples have similar length and per-batch padding stays small.
test_df['length'] = (test_df['essay_text'] + ' ' + test_df['discourse_text']).map(len)
test_df = test_df.sort_values(['length']).reset_index(drop=True)
del test_df['length']
gc.collect()

# Inference-only collate: no "target" entry is expected in the samples.
collate_fn = Collate(tokenizer=CFG.tokenizer, isTrain=False)



def prepare_test_loader(test_df):
    """Wrap ``test_df`` in a ``FeedbackDataset`` and return its DataLoader.

    The loader keeps the row order (no shuffling) and keeps the last partial
    batch, so its output lines up with ``test_df`` row by row.
    """
    dataset = FeedbackDataset(
        test_df,
        tokenizer=CFG.tokenizer,
        max_length=CFG.max_length,
        training=False,
    )
    return DataLoader(
        dataset,
        batch_size=CFG.valid_batch_size,
        collate_fn=collate_fn,
        num_workers=2,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )


test_loader = prepare_test_loader(test_df)

@torch.no_grad()
def inference(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader``.

    Args:
        test_loader: iterable of dicts with ``input_ids`` and
            ``attention_mask`` tensors.
        model: callable as ``model(ids, mask)``; switched to eval mode and
            moved to ``device`` here.
        device: device the model and the batches are moved to.

    Returns:
        Array of the softmaxed model outputs, concatenated in loader order.
    """
    model.eval()
    model.to(device)

    batch_probs = []
    for batch in tqdm(test_loader, total=len(test_loader)):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)

        raw_output = model(ids, mask)
        # Move to CPU before converting to numpy for the softmax helper.
        batch_probs.append(softmax(raw_output.to('cpu').numpy()))

    return np.concatenate(batch_probs)

# Predict with every fold checkpoint and average the class probabilities.
deberta_predictions = []

for fold in range(CFG.n_fold):
    print("Fold {}".format(fold))

    model = FeedbackModel(CFG.model_name)
    # Deserialize onto the CPU: this works on hosts without a GPU even if the
    # checkpoint holds CUDA tensors, and avoids a second copy of the weights on
    # the GPU. inference() moves the model to CFG.device afterwards.
    state = torch.load(
        f'../input/feedback-prize-effectiveness-v2-models/deberta-V3-large-v3_f{fold}.bin',
        map_location='cpu',
    )

    model.load_state_dict(state)

    prediction = inference(test_loader, model, CFG.device)
    deberta_predictions.append(prediction)
    # Release this fold's weights before the next checkpoint is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

predictions = np.mean(deberta_predictions, axis=0)

INPUT_DIR = "../input/feedback-prize-effectiveness/"
submission = pd.read_csv(os.path.join(INPUT_DIR, 'sample_submission.csv'))
# test_df was re-ordered by length, so the ids are taken from it to keep each
# row paired with its prediction.
submission['discourse_id'] = test_df['discourse_id']
submission['Ineffective'] = predictions[:, 0]
submission['Adequate'] = predictions[:, 1]
submission['Effective'] = predictions[:, 2]

submission.to_csv('sub1.csv', index=False)


In [2]:
!python deberta_v3.py

### 句子分类
deberta_large_v3训练保存的权重

In [3]:
%%writefile deberta.py
# import manipulation
import numpy as np
import pandas as pd

# import Pytorch
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.utils.checkpoint import checkpoint
from torch.autograd import Variable

# import Transformer model
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, AdamW
from transformers import DataCollatorWithPadding
from transformers.models.deberta_v2.modeling_deberta_v2 import StableDropout, ContextPooler

# import SKLearn
from sklearn.model_selection import  KFold, GroupKFold, StratifiedKFold, StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss


# import ...
import string
import random
import os
import joblib
import gc
import copy
import time


# other
from tqdm import tqdm
from collections import defaultdict

# Turn off parallelism inside the `tokenizers` library and keep transformers
# logging at error level only.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
transformers.logging.set_verbosity_error()


class CFG:
    """Static settings for the DeBERTa-large sentence-classification run."""

    seed = 2022
    max_length = 512  # truncation limit handed to the tokenizer for every sample
    epoch = 4
    train_batch_size = 8
    valid_batch_size = 8  # batch size used by the test DataLoader
    model_name = "../input/debertalarge"
    token_name = "../input/debertalarge"
    num_classes = 3  # width of the classification head
    n_fold = 5  # number of fold checkpoints averaged at inference time
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


# The tokenizer is attached to CFG so the dataset and collate helpers share one
# instance. (The bare `CFG.tokenizer.is_fast` expression that used to follow was
# a notebook display leftover with no effect in a script, so it was removed.)
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.token_name, use_fast=True)
CFG.tokenizer.model_max_length = CFG.max_length
def softmax(z):
    """Row-wise softmax of a 2-D array.

    The maximum of each row is subtracted before exponentiating, which keeps
    ``np.exp`` from overflowing on large values without changing the result.
    """
    assert len(z.shape) == 2
    row_max = np.max(z, axis=1, keepdims=True)  # keepdims lets it broadcast per row
    exps = np.exp(z - row_max)
    return exps / np.sum(exps, axis=1, keepdims=True)
class FeedbackDataset(Dataset):
    """Map-style dataset that tokenizes one (discourse, essay) pair per row.

    Args:
        df: frame with ``discourse_type``, ``discourse_text`` and
            ``essay_text`` columns.
        max_length: truncation limit handed to the tokenizer.
        tokenizer: object exposing ``encode_plus``.
        training: accepted for call-site compatibility; not used by this class.
    """

    def __init__(self, df, max_length, tokenizer, training=True):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        # Keep the columns as arrays for positional lookup in __getitem__.
        self.discourse_type = df['discourse_type'].values
        self.discourse_text = df['discourse_text'].values
        self.essays = df['essay_text'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # First segment: the discourse type followed by the discourse itself;
        # the whole essay is passed as the second segment.
        first_segment = self.discourse_type[index] + ' ' + self.discourse_text[index]

        encoded = self.tokenizer.encode_plus(
            first_segment,
            self.essays[index],
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=True,
            max_length=self.max_len,
        )

        # Only the ids and the attention mask are handed to the collate step.
        return {key: encoded[key] for key in ('input_ids', 'attention_mask')}


# Dynamic Padding (Collate)
class Collate:
    """Collate callable that pads each batch to its own longest sequence.

    Args:
        tokenizer: supplies ``padding_side`` and ``pad_token_id``.
        isTrain: when true, every sample must also carry a ``target`` entry,
            which is returned as a long tensor under ``"target"``.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def __call__(self, batch):
        id_seqs = [sample["input_ids"] for sample in batch]
        mask_seqs = [sample["attention_mask"] for sample in batch]
        if self.isTrain:
            targets = [sample["target"] for sample in batch]

        # Longest sequence in this batch decides the padded width.
        longest = max(len(seq) for seq in id_seqs)
        pad_on_right = self.tokenizer.padding_side == "right"
        pad_id = self.tokenizer.pad_token_id

        def pad(seq, fill):
            filler = (longest - len(seq)) * [fill]
            return seq + filler if pad_on_right else filler + seq

        output = dict()
        output["input_ids"] = torch.tensor(
            [pad(seq, pad_id) for seq in id_seqs], dtype=torch.long
        )
        # Padded positions get mask value 0.
        output["attention_mask"] = torch.tensor(
            [pad(seq, 0) for seq in mask_seqs], dtype=torch.long
        )
        if self.isTrain:
            output["target"] = torch.tensor(targets, dtype=torch.long)

        return output

# collate_fn = DataCollatorWithPadding(tokenizer=CFG.tokenizer)


class FeedbackModel(nn.Module):
    """Transformer backbone followed by a linear layer over the first token.

    Args:
        model_name: path or identifier passed to ``AutoConfig`` and
            ``AutoModel``.
    """

    def __init__(self, model_name):
        super().__init__()

        config = AutoConfig.from_pretrained(model_name)
        # forward() reads the hidden states, so the backbone must return them.
        config.update({"output_hidden_states": True, 'return_dict': True})
        self.config = config
        self.deberta = AutoModel.from_pretrained(model_name, config=config)
        self.fc = nn.Linear(config.hidden_size, CFG.num_classes)

    def forward(self, ids, mask):
        """Apply ``fc`` to position 0 of the last entry of the hidden states."""
        outputs = self.deberta(input_ids=ids, attention_mask=mask)
        first_token = outputs['hidden_states'][-1][:, 0, :]
        return self.fc(first_token)

# Locations of the competition inputs. The earlier extra read of test.csv
# (followed by a bare `.head()`) was removed: its result was overwritten before
# use, because test.csv is loaded again from TEST_CSV further down.
INPUT_DIR = "/kaggle/input/feedback-prize-effectiveness"
TEST_DIR = os.path.join(INPUT_DIR, "test")
TEST_CSV = os.path.join(INPUT_DIR, "test.csv")

def get_essay_test(essay_id):
    """Return the contents of ``<essay_id>.txt`` located in ``TEST_DIR``.

    Args:
        essay_id: file stem of the essay to read.

    Returns:
        The whole file as one string.
    """
    path = os.path.join(TEST_DIR, f'{essay_id}.txt')
    # Context manager closes the handle right away; this function is applied
    # once per row, so leaked handles would otherwise pile up.
    with open(path, 'r') as essay_file:
        return essay_file.read()

test_df = pd.read_csv(TEST_CSV)
test_df['essay_text'] = test_df['essay_id'].apply(get_essay_test)

# Sort rows by the character count of "essay + ' ' + discourse" — presumably so
# that neighbouring samples have similar length and per-batch padding stays small.
test_df['length'] = (test_df['essay_text'] + ' ' + test_df['discourse_text']).map(len)
test_df = test_df.sort_values(['length']).reset_index(drop=True)
del test_df['length']
gc.collect()

# Inference-only collate: no "target" entry is expected in the samples.
collate_fn = Collate(tokenizer=CFG.tokenizer, isTrain=False)

def prepare_test_loader(test_df):
    """Wrap ``test_df`` in a ``FeedbackDataset`` and return its DataLoader.

    The loader keeps the row order (no shuffling) and keeps the last partial
    batch, so its output lines up with ``test_df`` row by row.
    """
    dataset = FeedbackDataset(
        test_df,
        tokenizer=CFG.tokenizer,
        max_length=CFG.max_length,
        training=False,
    )
    return DataLoader(
        dataset,
        batch_size=CFG.valid_batch_size,
        collate_fn=collate_fn,
        num_workers=2,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )


test_loader = prepare_test_loader(test_df)

@torch.no_grad()
def inference(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader``.

    Args:
        test_loader: iterable of dicts with ``input_ids`` and
            ``attention_mask`` tensors.
        model: callable as ``model(ids, mask)``; switched to eval mode and
            moved to ``device`` here.
        device: device the model and the batches are moved to.

    Returns:
        Array of the softmaxed model outputs, concatenated in loader order.
    """
    model.eval()
    model.to(device)

    batch_probs = []
    for batch in tqdm(test_loader, total=len(test_loader)):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)

        raw_output = model(ids, mask)
        # Move to CPU before converting to numpy for the softmax helper.
        batch_probs.append(softmax(raw_output.to('cpu').numpy()))

    return np.concatenate(batch_probs)

# Predict with every fold checkpoint and average the class probabilities.
deberta_predictions = []

for fold in range(CFG.n_fold):
    print("Fold {}".format(fold))

    model = FeedbackModel(CFG.model_name)
    # Deserialize onto the CPU: this works on hosts without a GPU even if the
    # checkpoint holds CUDA tensors, and avoids a second copy of the weights on
    # the GPU. inference() moves the model to CFG.device afterwards.
    state = torch.load(
        f'../input/feedback-prize-effectiveness-v2-models/deberta-large-v3_f{fold}.bin',
        map_location='cpu',
    )

    model.load_state_dict(state)

    prediction = inference(test_loader, model, CFG.device)
    deberta_predictions.append(prediction)
    # Release this fold's weights before the next checkpoint is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

predictions = np.mean(deberta_predictions, axis=0)

INPUT_DIR = "../input/feedback-prize-effectiveness/"
submission = pd.read_csv(os.path.join(INPUT_DIR, 'sample_submission.csv'))
# test_df was re-ordered by length, so the ids are taken from it to keep each
# row paired with its prediction.
submission['discourse_id'] = test_df['discourse_id']
submission['Ineffective'] = predictions[:, 0]
submission['Adequate'] = predictions[:, 1]
submission['Effective'] = predictions[:, 2]

submission.to_csv('sub2.csv', index=False)



In [4]:
!python deberta.py

### token分类
模型配置文件和权重文件都是由deberta_large_train_on_oldweights.py训练生成保存的

In [5]:
%%writefile token1.py
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import gc
import random
import warnings
import torch
from transformers import Trainer, TrainingArguments, AutoModelForTokenClassification, DataCollatorForTokenClassification, AutoTokenizer, AutoConfig
from itertools import chain
from text_unidecode import unidecode
from typing import Tuple
import codecs
import re
from functools import partial
import datasets


# Suppress Python warnings for the rest of the script.
warnings.filterwarnings("ignore")
gc.collect()

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Competition data, and the dataset the tokenizer, model config and fold
# weights are loaded from.
INPUT_DIR = "../input/feedback-prize-effectiveness/"
model_path = '../input/feedback-deberta-large-token-cls-bs4/'


class CFG:
    """Run settings; this script itself reads ``max_len``, ``n_fold`` and ``trn_fold``."""

    model = "deberta-large"
    max_len = 2048  # truncation limit used when an essay is tokenized
    batch_size = 2
    epochs = 4
    n_fold = 5
    trn_fold = list(range(5))  # folds whose checkpoints are used for prediction
    lr = 1e-5
    weight_decay = 1e-2

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Encode-error handler: emit the failing slice as UTF-8 and resume after it."""
    offending = error.object[error.start : error.end]
    return offending.encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Decode-error handler: read the failing bytes as cp1252 and resume after them."""
    offending = error.object[error.start : error.end]
    return offending.decode("cp1252"), error.end


# Register both handlers under their own names so they can be selected through
# the ``errors=`` argument of ``str.encode`` / ``bytes.decode``.
for _handler in (replace_encoding_with_utf8, replace_decoding_with_cp1252):
    codecs.register_error(_handler.__name__, _handler)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed utf-8 / cp1252 text, then pass it through ``unidecode``.

    Each step falls back to the custom error handlers registered with
    ``codecs`` instead of raising on bytes or characters it cannot convert.
    """
    escaped = text.encode("raw_unicode_escape")
    first_pass = escaped.decode("utf-8", errors="replace_decoding_with_cp1252")
    as_cp1252 = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = as_cp1252.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)


def get_essay_text(sample, data_dir):
    """Attach the normalized essay text to ``sample`` under ``"essay_text"``.

    Args:
        sample: mapping with an ``"essay_id"`` entry; it is updated in place
            and returned.
        data_dir: directory that contains the ``test`` folder of
            ``<essay_id>.txt`` files. A trailing separator is optional.

    Returns:
        The same ``sample`` with ``"essay_text"`` set.
    """
    # os.path.join yields the same path as the former string concatenation when
    # data_dir ends with "/", and a correct one when it does not.
    essay_path = os.path.join(data_dir, "test", f"{sample['essay_id']}.txt")
    with open(essay_path, "r") as fp:
        sample["essay_text"] = resolve_encodings_and_normalize(fp.read())
    return sample

tokenizer = AutoTokenizer.from_pretrained(model_path)

disc_types = [
    "Claim",
    "Concluding Statement",
    "Counterclaim",
    "Evidence",
    "Lead",
    "Position",
    "Rebuttal",
]

# Opening / closing marker strings per discourse type, e.g. "[CLS_LEAD]" and
# "[END_LEAD]".
cls_tokens_map = {label: "[CLS_%s]" % label.upper() for label in disc_types}
end_tokens_map = {label: "[END_%s]" % label.upper() for label in disc_types}

# Class name -> output column of the token classifier.
label2id = {
    name: idx for idx, name in enumerate(("Adequate", "Effective", "Ineffective"))
}

# Token id of every opening marker. Index 1 of the encoding is used —
# presumably index 0 is a special token the tokenizer prepends; TODO confirm.
cls_id_map = {
    label: tokenizer.encode(marker)[1] for label, marker in cls_tokens_map.items()
}

id_cls_map = dict((token_id, label) for label, token_id in cls_id_map.items())

def find_positions(sample):
    """Locate every discourse text inside its essay.

    Args:
        sample: mapping whose ``"essay_text"`` is a sequence (only element 0
            is read) and whose ``"discourse_text"`` is an iterable of strings.

    Returns:
        One entry per discourse, in input order: ``[start, end]`` character
        offsets of the chosen occurrence, or ``[-1]`` when the stripped
        discourse does not occur in the essay.
    """
    essay = sample["essay_text"][0]

    # Start offset of the most recently located discourse.
    floor = 0
    spans = []

    for discourse in sample["discourse_text"]:
        # Surrounding whitespace is stripped before searching.
        found = list(re.finditer(re.escape(discourse.strip()), essay))

        if not found:
            spans.append([-1])  # filtered out later by the caller
            continue

        # Take the first occurrence starting at or after `floor`; if there is
        # none, the last occurrence is used.
        chosen = next((hit for hit in found if hit.start() >= floor), found[-1])

        spans.append([chosen.start(), chosen.end()])
        floor = chosen.start()

    return spans

def tokenize(sample):
    """Wrap each located discourse in its type markers and tokenize the result.

    The located spans are stored on ``sample["idxs"]``. Text lying between
    the previous discourse and the current one is kept as a separate chunk;
    discourses that were not found are skipped. Chunks are joined with single
    spaces before tokenization.

    Returns:
        The tokenizer output for the marked-up text, truncated to
        ``CFG.max_len`` and left unpadded.
    """
    sample["idxs"] = find_positions(sample)
    essay = sample["essay_text"][0]

    pieces = []
    cursor = 0  # end offset of the last discourse that was emitted

    for span, disc_type in zip(sample["idxs"], sample["discourse_type"]):
        # discourse_text was not found in the essay
        if span == [-1]:
            continue

        start, end = span

        # Text sitting between the previous discourse and this one.
        if start != cursor:
            pieces.append(essay[cursor:start])

        pieces += [
            cls_tokens_map[disc_type],
            essay[start:end],
            end_tokens_map[disc_type],
        ]
        cursor = end

    return tokenizer(
        " ".join(pieces),
        padding=False,
        truncation=True,
        max_length=CFG.max_len,
        add_special_tokens=True,
    )

test_df = pd.read_csv(INPUT_DIR + "test.csv")

# Read each essay file once (keyed by unique essay_id) and merge the text back
# onto the discourse rows afterwards.
essay_text_ds = datasets.Dataset.from_dict({"essay_id": test_df.essay_id.unique()})
essay_text_ds = essay_text_ds.map(
    partial(get_essay_text, data_dir=INPUT_DIR),
    num_proc=1,
    batched=False,
    desc="Loading text files",
)
essay_text_df = essay_text_ds.to_pandas()

# Apply the same normalization to the discourse texts as to the essays so the
# substring search below compares like with like.
test_df["discourse_text"] = test_df["discourse_text"].apply(resolve_encodings_and_normalize)
test_df = test_df.merge(essay_text_df, on="essay_id", how="left")
del essay_text_df

# Flag, per csv row, whether the stripped discourse occurs verbatim in its essay.
test_df['match'] = [
    1 if discourse.strip() in essay else 0
    for discourse, essay in zip(
        test_df['discourse_text'].values, test_df['essay_text'].values
    )
]

# One row per essay; every other column becomes a list of that essay's values.
grouped_df = test_df.groupby(["essay_id"]).agg(list)

ds = datasets.Dataset.from_pandas(grouped_df)
ds = ds.map(
    tokenize,
    batched=False,
    num_proc=1,
    desc="Tokenizing",
)

# Sanity check: collect essays whose number of opening-marker tokens differs
# from their number of discourses (presumably a discourse that was not located
# or a marker lost to truncation).
cls_ids = set(cls_id_map.values())
bad_matches = [
    (id_, ids, dt)
    for id_, ids, dt in zip(ds["essay_id"], ds["input_ids"], ds["discourse_id"])
    if sum(token in cls_ids for token in ids) != len(dt)
]

print("Num bad matches:", len(bad_matches))
print()

collator_fn = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    pad_to_multiple_of=8,
)

model_config = AutoConfig.from_pretrained(model_path)

# The prediction dataset is the same for every fold, so it is built once
# instead of once per loop iteration.
keep_cols = {"input_ids", "attention_mask"}
test_dataset = ds.remove_columns([c for c in ds.column_names if c not in keep_cols])

# Per-fold prediction arrays, in fold order.
preds = []

for fold in range(CFG.n_fold):
    if fold not in CFG.trn_fold:
        continue
    print(f"fold = {fold}")

    checkpoint = f'../input/feedback-deberta-large-token-cls-bs4/pytorch_model_f{fold}.bin'
    model = AutoModelForTokenClassification.from_pretrained(checkpoint, config=model_config)

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=collator_fn,
    )

    pred = trainer.predict(test_dataset)
    preds.append(pred[0])  # element 0 of the prediction output

    # The Trainer keeps its own reference to the model, so both names must be
    # dropped for the weights to become collectable before the next fold.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()

# Turn the per-fold outputs into probabilities and average over folds.
# The maximum along the class axis is subtracted first so np.exp cannot
# overflow; the softmax value itself is unchanged. (The name `logits` is kept
# although it holds probabilities.)
preds = np.array(preds)
exp_shifted = np.exp(preds - np.max(preds, axis=-1, keepdims=True))
logits = exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True)

model_preds = np.mean(logits, axis=0)

# Extract the predicted probabilities at every opening-marker token.
# The input_ids column is materialised once rather than once per essay.
all_input_ids = ds['input_ids']
head_preds = [
    [sample[j] for j, tk_id in enumerate(sample_ids) if tk_id in cls_ids]
    for sample_ids, sample in zip(all_input_ids, model_preds)
]

# Assign a prediction to every discourse_id. `ds` was built from `grouped_df`,
# so head_preds follows the row order of grouped_df.
# Markers are only inserted for discourses that were located in the essay, so
# the k-th marker belongs to the k-th *matched* discourse: a separate cursor is
# advanced on matches only. Indexing by the discourse's position instead would
# shift every prediction that follows an unmatched discourse.
pred_by_discourse = {}
for essay_pred, discourse_ids, essay_match in zip(
    head_preds, grouped_df['discourse_id'].values, grouped_df['match'].values
):
    cursor = 0
    for discourse_id, discourse_match in zip(discourse_ids, essay_match):
        if discourse_match == 1 and cursor < len(essay_pred):
            pred_by_discourse[discourse_id] = essay_pred[cursor]
            cursor += 1
        else:
            # Discourse not found in the essay, or no marker left for it
            # (e.g. cut off by truncation): all-zero placeholder row.
            pred_by_discourse[discourse_id] = [0., 0., 0.]

# Emit the predictions in the row order of the csv file.
final_preds = np.array(
    [pred_by_discourse[discourse_id] for discourse_id in test_df['discourse_id'].values]
)

# Column order follows label2id: 0 = Adequate, 1 = Effective, 2 = Ineffective.
preds_Ineffective = final_preds[:, 2]
preds_Adequate = final_preds[:, 0]
preds_Effective = final_preds[:, 1]

sample = pd.read_csv(INPUT_DIR + 'sample_submission.csv')

# Take the ids from test_df so each row is explicitly paired with the
# prediction computed for it.
sample['discourse_id'] = test_df['discourse_id'].values
sample['Ineffective'] = preds_Ineffective
sample['Adequate'] = preds_Adequate
sample['Effective'] = preds_Effective


sample.to_csv('sub3.csv', index=False)

In [7]:
!python token1.py

### token分类
模型配置文件和权重文件都是由deberta_v3_large_train_on_oldweights.py训练生成保存的

In [6]:
%%writefile token2.py
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import gc
import random
import warnings
import torch
from transformers import Trainer, TrainingArguments, AutoModelForTokenClassification, DataCollatorForTokenClassification, AutoTokenizer, AutoConfig
from itertools import chain
from text_unidecode import unidecode
from typing import Tuple
import codecs
import re
from functools import partial
import datasets


# Suppress Python warnings for the rest of the script.
warnings.filterwarnings("ignore")
gc.collect()

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Competition data, and the dataset the tokenizer, model config and fold
# weights are loaded from.
INPUT_DIR = "../input/feedback-prize-effectiveness/"
model_path = '../input/feedback-debertav3large-token-cls-models/'


class CFG:
    """Run settings; this script itself reads ``max_len``, ``n_fold`` and ``trn_fold``."""

    model = "deberta-large"
    max_len = 2048  # truncation limit used when an essay is tokenized
    batch_size = 2
    epochs = 4
    n_fold = 5
    trn_fold = list(range(5))  # folds whose checkpoints are used for prediction
    lr = 1e-5
    weight_decay = 1e-2

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Encode-error handler: emit the failing slice as UTF-8 and resume after it."""
    offending = error.object[error.start : error.end]
    return offending.encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Decode-error handler: read the failing bytes as cp1252 and resume after them."""
    offending = error.object[error.start : error.end]
    return offending.decode("cp1252"), error.end


# Register both handlers under their own names so they can be selected through
# the ``errors=`` argument of ``str.encode`` / ``bytes.decode``.
for _handler in (replace_encoding_with_utf8, replace_decoding_with_cp1252):
    codecs.register_error(_handler.__name__, _handler)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed utf-8 / cp1252 text, then pass it through ``unidecode``.

    Each step falls back to the custom error handlers registered with
    ``codecs`` instead of raising on bytes or characters it cannot convert.
    """
    escaped = text.encode("raw_unicode_escape")
    first_pass = escaped.decode("utf-8", errors="replace_decoding_with_cp1252")
    as_cp1252 = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = as_cp1252.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)


def get_essay_text(sample, data_dir):
    """Attach the normalized essay text to ``sample`` under ``"essay_text"``.

    Args:
        sample: mapping with an ``"essay_id"`` entry; it is updated in place
            and returned.
        data_dir: directory that contains the ``test`` folder of
            ``<essay_id>.txt`` files. A trailing separator is optional.

    Returns:
        The same ``sample`` with ``"essay_text"`` set.
    """
    # os.path.join yields the same path as the former string concatenation when
    # data_dir ends with "/", and a correct one when it does not.
    essay_path = os.path.join(data_dir, "test", f"{sample['essay_id']}.txt")
    with open(essay_path, "r") as fp:
        sample["essay_text"] = resolve_encodings_and_normalize(fp.read())
    return sample

tokenizer = AutoTokenizer.from_pretrained(model_path)

disc_types = [
    "Claim",
    "Concluding Statement",
    "Counterclaim",
    "Evidence",
    "Lead",
    "Position",
    "Rebuttal",
]

# Opening / closing marker strings per discourse type, e.g. "[CLS_LEAD]" and
# "[END_LEAD]".
cls_tokens_map = {label: "[CLS_%s]" % label.upper() for label in disc_types}
end_tokens_map = {label: "[END_%s]" % label.upper() for label in disc_types}

# Class name -> output column of the token classifier.
label2id = {
    name: idx for idx, name in enumerate(("Adequate", "Effective", "Ineffective"))
}

# Token id of every opening marker. Index 1 of the encoding is used —
# presumably index 0 is a special token the tokenizer prepends; TODO confirm.
cls_id_map = {
    label: tokenizer.encode(marker)[1] for label, marker in cls_tokens_map.items()
}

id_cls_map = dict((token_id, label) for label, token_id in cls_id_map.items())

def find_positions(sample):
    """Locate every discourse text inside its essay.

    Args:
        sample: mapping whose ``"essay_text"`` is a sequence (only element 0
            is read) and whose ``"discourse_text"`` is an iterable of strings.

    Returns:
        One entry per discourse, in input order: ``[start, end]`` character
        offsets of the chosen occurrence, or ``[-1]`` when the stripped
        discourse does not occur in the essay.
    """
    essay = sample["essay_text"][0]

    # Start offset of the most recently located discourse.
    floor = 0
    spans = []

    for discourse in sample["discourse_text"]:
        # Surrounding whitespace is stripped before searching.
        found = list(re.finditer(re.escape(discourse.strip()), essay))

        if not found:
            spans.append([-1])  # filtered out later by the caller
            continue

        # Take the first occurrence starting at or after `floor`; if there is
        # none, the last occurrence is used.
        chosen = next((hit for hit in found if hit.start() >= floor), found[-1])

        spans.append([chosen.start(), chosen.end()])
        floor = chosen.start()

    return spans

def tokenize(sample):
    """Wrap each located discourse in its type markers and tokenize the result.

    The located spans are stored on ``sample["idxs"]``. Text lying between
    the previous discourse and the current one is kept as a separate chunk;
    discourses that were not found are skipped. Chunks are joined with single
    spaces before tokenization.

    Returns:
        The tokenizer output for the marked-up text, truncated to
        ``CFG.max_len`` and left unpadded.
    """
    sample["idxs"] = find_positions(sample)
    essay = sample["essay_text"][0]

    pieces = []
    cursor = 0  # end offset of the last discourse that was emitted

    for span, disc_type in zip(sample["idxs"], sample["discourse_type"]):
        # discourse_text was not found in the essay
        if span == [-1]:
            continue

        start, end = span

        # Text sitting between the previous discourse and this one.
        if start != cursor:
            pieces.append(essay[cursor:start])

        pieces += [
            cls_tokens_map[disc_type],
            essay[start:end],
            end_tokens_map[disc_type],
        ]
        cursor = end

    return tokenizer(
        " ".join(pieces),
        padding=False,
        truncation=True,
        max_length=CFG.max_len,
        add_special_tokens=True,
    )

test_df = pd.read_csv(INPUT_DIR + "test.csv")

# Read each essay file once (keyed by unique essay_id) and merge the text back
# onto the discourse rows afterwards.
essay_text_ds = datasets.Dataset.from_dict({"essay_id": test_df.essay_id.unique()})
essay_text_ds = essay_text_ds.map(
    partial(get_essay_text, data_dir=INPUT_DIR),
    num_proc=1,
    batched=False,
    desc="Loading text files",
)
essay_text_df = essay_text_ds.to_pandas()

# Apply the same normalization to the discourse texts as to the essays so the
# substring search below compares like with like.
test_df["discourse_text"] = test_df["discourse_text"].apply(resolve_encodings_and_normalize)
test_df = test_df.merge(essay_text_df, on="essay_id", how="left")
del essay_text_df

# Flag, per csv row, whether the stripped discourse occurs verbatim in its essay.
test_df['match'] = [
    1 if discourse.strip() in essay else 0
    for discourse, essay in zip(
        test_df['discourse_text'].values, test_df['essay_text'].values
    )
]

# One row per essay; every other column becomes a list of that essay's values.
grouped_df = test_df.groupby(["essay_id"]).agg(list)

ds = datasets.Dataset.from_pandas(grouped_df)
ds = ds.map(
    tokenize,
    batched=False,
    num_proc=1,
    desc="Tokenizing",
)

# Sanity check: collect essays whose number of opening-marker tokens differs
# from their number of discourses (presumably a discourse that was not located
# or a marker lost to truncation).
cls_ids = set(cls_id_map.values())
bad_matches = [
    (id_, ids, dt)
    for id_, ids, dt in zip(ds["essay_id"], ds["input_ids"], ds["discourse_id"])
    if sum(token in cls_ids for token in ids) != len(dt)
]

print("Num bad matches:", len(bad_matches))
print()

collator_fn = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    pad_to_multiple_of=8,
)

model_config = AutoConfig.from_pretrained(model_path)

# The prediction dataset is the same for every fold, so it is built once
# instead of once per loop iteration.
keep_cols = {"input_ids", "attention_mask"}
test_dataset = ds.remove_columns([c for c in ds.column_names if c not in keep_cols])

# Per-fold prediction arrays, in fold order.
preds = []

for fold in range(CFG.n_fold):
    if fold not in CFG.trn_fold:
        continue
    print(f"fold = {fold}")

    checkpoint = f'../input/feedback-debertav3large-token-cls-models/pytorch_model_2048_fold{fold}.bin'
    model = AutoModelForTokenClassification.from_pretrained(checkpoint, config=model_config)

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=collator_fn,
    )

    pred = trainer.predict(test_dataset)
    preds.append(pred[0])  # element 0 of the prediction output

    # The Trainer keeps its own reference to the model, so both names must be
    # dropped for the weights to become collectable before the next fold.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()

# Turn the per-fold outputs into probabilities and average over folds.
# The maximum along the class axis is subtracted first so np.exp cannot
# overflow; the softmax value itself is unchanged. (The name `logits` is kept
# although it holds probabilities.)
preds = np.array(preds)
exp_shifted = np.exp(preds - np.max(preds, axis=-1, keepdims=True))
logits = exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True)

model_preds = np.mean(logits, axis=0)

# Extract the predicted probabilities at every opening-marker token.
# The input_ids column is materialised once rather than once per essay.
all_input_ids = ds['input_ids']
head_preds = [
    [sample[j] for j, tk_id in enumerate(sample_ids) if tk_id in cls_ids]
    for sample_ids, sample in zip(all_input_ids, model_preds)
]

# Assign a prediction to every discourse_id. `ds` was built from `grouped_df`,
# so head_preds follows the row order of grouped_df.
# Markers are only inserted for discourses that were located in the essay, so
# the k-th marker belongs to the k-th *matched* discourse: a separate cursor is
# advanced on matches only. Indexing by the discourse's position instead would
# shift every prediction that follows an unmatched discourse.
pred_by_discourse = {}
for essay_pred, discourse_ids, essay_match in zip(
    head_preds, grouped_df['discourse_id'].values, grouped_df['match'].values
):
    cursor = 0
    for discourse_id, discourse_match in zip(discourse_ids, essay_match):
        if discourse_match == 1 and cursor < len(essay_pred):
            pred_by_discourse[discourse_id] = essay_pred[cursor]
            cursor += 1
        else:
            # Discourse not found in the essay, or no marker left for it
            # (e.g. cut off by truncation): all-zero placeholder row.
            pred_by_discourse[discourse_id] = [0., 0., 0.]

# Emit the predictions in the row order of the csv file.
final_preds = np.array(
    [pred_by_discourse[discourse_id] for discourse_id in test_df['discourse_id'].values]
)

# Column order follows label2id: 0 = Adequate, 1 = Effective, 2 = Ineffective.
preds_Ineffective = final_preds[:, 2]
preds_Adequate = final_preds[:, 0]
preds_Effective = final_preds[:, 1]

sample = pd.read_csv(INPUT_DIR + 'sample_submission.csv')

# Take the ids from test_df so each row is explicitly paired with the
# prediction computed for it.
sample['discourse_id'] = test_df['discourse_id'].values
sample['Ineffective'] = preds_Ineffective
sample['Adequate'] = preds_Adequate
sample['Effective'] = preds_Effective


sample.to_csv('sub4.csv', index=False)



In [8]:
!python token2.py

### 集成

In [None]:
import pandas as pd
# Per-class blend weights, in the order sub1..sub4.
# NOTE(review): 'Ineffective' uses equal weights while the other two classes
# use 0.25 / 0.2 / 0.3 / 0.25 — values are kept as they were; confirm that this
# asymmetry is intentional.
BLEND_WEIGHTS = {
    'Ineffective': (0.25, 0.25, 0.25, 0.25),
    'Adequate': (0.25, 0.2, 0.3, 0.25),
    'Effective': (0.25, 0.2, 0.3, 0.25),
}

# Bring every model's predictions into the same row order.
subs = [
    pd.read_csv(f'sub{i}.csv').sort_values(['discourse_id']).reset_index(drop=True)
    for i in range(1, 5)
]
sub1, sub2, sub3, sub4 = subs

# The blend below is positional, so fail loudly if any file covers a different
# set of discourse ids instead of silently mixing unrelated rows.
for position, other in enumerate(subs[1:], start=2):
    assert sub1['discourse_id'].equals(other['discourse_id']), (
        f"sub{position}.csv is not aligned with sub1.csv on discourse_id"
    )

for column, weights in BLEND_WEIGHTS.items():
    sub1[column] = sum(weight * sub[column] for weight, sub in zip(weights, subs))

sub1.to_csv('submission.csv', index=False)