In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import os
import gc
import sys
import copy
import time
import random
import string
import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from functools import partial
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold,StratifiedGroupKFold
from transformers import AutoTokenizer, AutoModel, AutoConfig, AdamW
from transformers import DataCollatorWithPadding,DataCollatorForTokenClassification
import warnings
from sklearn.metrics import log_loss
import torch.nn.functional as F
warnings.filterwarnings("ignore")
from datasets import Dataset, load_from_disk
import re
from text_unidecode import unidecode
from typing import Dict , List,Tuple
import codecs
from pathlib import Path
import pickle
class CFG:
    """Static hyper-parameter namespace for the notebook.

    The attributes are plain class-level constants; nothing here is
    instantiated. Values are read elsewhere as ``CFG.<name>``.
    """
    wandb = False
    apex = True
    model = 'microsoft/deberta-v3-large'
    seed = 42
    max_len = 1024
    dropout = 0.2
    target_size = 3
    n_accumulate = 1
    print_freq = 100
    min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    scheduler = 'CosineAnnealingLR'
    batch_size = 1
    num_workers = 8
    lr = 1e-5
    weight_decay = 0.01
    epochs = 4
    n_fold = 5
    # One entry per fold index: [0, 1, ..., n_fold - 1].
    trn_fold = list(range(n_fold))
    train = True
    device = torch.device("cuda:0")
    num_warmup_steps = 0
    num_cycles = 0.5
    freezing = True
    debug = True
    # Plain int. The original `T_max= 500,` carried a trailing comma, which
    # silently made this the one-element tuple (500,).
    T_max = 500
    debug_ver2 = False
    gradient_checkpoint = False
    load_from_disk = None
# Path prefix used (by plain string concatenation) for the log file name and
# for the saved tokenized dataset / pickle further down in this file.
OUTPUT_DIR = 'out'
# Checkpoint name passed to AutoTokenizer.from_pretrained below.
model_path = 'microsoft/deberta-v3-large'


# In[2]:


# Directory holding the per-essay "<essay_id>.txt" files read by get_essay.
INPUT_DIR = "train"


# In[3]:


def get_logger(filename=OUTPUT_DIR+'train'):
    """Return this module's logger, writing to the console and to a log file.

    Args:
        filename: Log file path without extension; ".log" is appended. The
            default is the plain concatenation ``OUTPUT_DIR + 'train'``.

    Returns:
        The ``logging.Logger`` named after this module, set to INFO, with
        exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running the cell
    # would otherwise stack extra handlers and emit each message several times.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Module-level logger built with get_logger's default file name.
LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to every generator. Defaults to CFG.seed.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Disable the cuDNN autotuner as well, matching set_seed in this file;
    # with benchmarking on, the chosen kernels can differ between runs.
    torch.backends.cudnn.benchmark = False
# Seed all RNGs once at start-up (literal 42, same value as CFG.seed).
seed_everything(seed=42)    


# In[ ]:





# In[4]:


def replace_encoding_with_utf8(error:UnicodeError) -> Tuple[bytes,int]:
    """Codec error handler: emit the failing characters as UTF-8 bytes.

    Returns the replacement bytes and the position at which encoding resumes.
    """
    resume_at = error.end
    failing_text = error.object[error.start:resume_at]
    return failing_text.encode('utf-8'), resume_at
def replace_decoding_with_cp1252(error:UnicodeError) -> Tuple[str,int]:
    """Codec error handler: decode the failing bytes as cp1252 instead.

    Returns the replacement text and the position at which decoding resumes.
    """
    resume_at = error.end
    failing_bytes = error.object[error.start:resume_at]
    return failing_bytes.decode('cp1252'), resume_at
# Register both handlers under the names that resolve_encodings_and_normalize
# passes as `errors=` to str.encode / bytes.decode.
codecs.register_error('replace_encoding_with_utf8',replace_encoding_with_utf8)
codecs.register_error('replace_decoding_with_cp1252',replace_decoding_with_cp1252)
def resolve_encodings_and_normalize(text:str) -> str:
    """Repair mixed UTF-8 / cp1252 text, then transliterate with unidecode.

    The text is round-tripped through raw_unicode_escape -> utf-8 -> cp1252
    -> utf-8, using the custom error handlers registered in this file for any
    span that does not fit the codec in use.
    """
    escaped_bytes = text.encode('raw_unicode_escape')
    first_pass = escaped_bytes.decode('utf-8',errors = 'replace_decoding_with_cp1252')
    cp1252_bytes = first_pass.encode('cp1252',errors = 'replace_encoding_with_utf8')
    repaired = cp1252_bytes.decode('utf-8',errors = 'replace_decoding_with_cp1252')
    return unidecode(repaired)


# In[5]:


def get_score(outputs, labels):
    """Compute the log-loss of softmaxed logits against the labels.

    Args:
        outputs: Raw logits accepted by ``torch.tensor``; the softmax is
            taken over the last axis.
        labels: Ground-truth labels, passed to ``log_loss`` as ``y_true``.

    Returns:
        The log-loss rounded to 5 decimal places.
    """
    # dim is given explicitly: calling softmax without it relies on a
    # deprecated implicit choice. For 1-D and 2-D input the last axis is the
    # same axis the implicit rule picked.
    probabilities = F.softmax(torch.tensor(outputs), dim=-1).numpy()
    score = log_loss(labels, probabilities)
    return round(score, 5)


# In[6]:


def get_essay(essay_id, is_train=True):
    """Read the full text of one essay from disk.

    Args:
        essay_id: Essay identifier; the file read is "<essay_id>.txt".
        is_train: When true, read from INPUT_DIR; otherwise from
            ``INPUT_DIR + 'test'``.

    Returns:
        The file contents as a string.
    """
    # NOTE(review): the non-train path is a plain string concatenation with no
    # separator (e.g. "train" + "test"); confirm that is the intended folder.
    parent_path = INPUT_DIR if is_train else INPUT_DIR + 'test'
    essay_path = os.path.join(parent_path, f"{essay_id}.txt")
    # The context manager closes the handle; the original left it open.
    with open(essay_path, 'r') as essay_file:
        return essay_file.read()

def criterion_val(outputs, labels):
    """Validation loss: cross-entropy between logits and class indices."""
    loss_fn = nn.CrossEntropyLoss()
    return loss_fn(outputs, labels)
def criterion_train(outputs, labels):
    """Training loss: cross-entropy between logits and class indices."""
    loss_fn = nn.CrossEntropyLoss()
    return loss_fn(outputs, labels)


# In[7]:


# Training table; later cells read its essay_id, discourse_text and
# discourse_effectiveness columns.
df = pd.read_csv("train.csv")


# In[ ]:





# In[ ]:





# In[ ]:





# In[8]:



# Row label -> leading characters to put back in front of that row's
# discourse_text.
_DISCOURSE_PREFIX_FIXES = {
    293: 'Cl',
    790: 'T',
    879: 'I',
    2828: 'w',
    4793: 'i',
    8093: 'I',
    9202: 'l',
    9790: 'I',
    14054: 'i',
    14387: 's',
    15188: 'i',
    15678: 'I',
    16065: 'f',
    16084: 'I',
    16255: 'T',
    17096: 'I',
    17261: 't',
    18691: 'I',
    19967: 't',
    20186: 'b',
    20264: 'I',
    20421: 'i',
    20870: 'h',
    22064: 't',
    22793: 'I',
    22962: 'W',
    23990: 'f',
    24085: 'w',
    25330: 'a',
    25446: 'i',
    25667: 'S',
    25869: 'I',
    26172: 'i',
    26284: 'I',
    26289: 't',
    26322: 't',
    26511: 't',
    27763: 'I',
    28262: 'P',
    29164: 'bu',
    29519: 'e',
    29532: 't',
    29571: 'A',
    29621: 't',
    30791: 'E',
    30799: 'T',
    31519: 't',
    31597: 't',
    31992: 'T',
    32086: 'I',
    32204: 'c',
    32341: 'becaus',
    33246: 'A',
    33819: 'W',
    34023: 'i',
    35467: 'b',
    35902: 'i',
}

# Write through df.loc in one step. The original `df[col][row] = ...` is a
# chained assignment: pandas may apply it to a temporary copy, and under
# copy-on-write it never reaches `df`.
# NOTE: like the original, re-running this cell prepends the prefixes again.
for _row_label, _missing_prefix in _DISCOURSE_PREFIX_FIXES.items():
    df.loc[_row_label, 'discourse_text'] = (
        _missing_prefix + df.loc[_row_label, 'discourse_text']
    )
# Load every essay body, then run both text columns through the encoding
# repair / transliteration helper.
df['essay_text'] = (
    df['essay_id']
    .apply(get_essay)
    .apply(resolve_encodings_and_normalize)
)
df['discourse_text'] = df['discourse_text'].apply(resolve_encodings_and_normalize)
    
# Split the data into folds with KFold
# gkf = GroupKFold(n_splits=CONFIG['n_fold'])

# for fold, (_, val_) in enumerate(gkf.split(X=df, groups=df.essay_id)):
#     df.loc[val_, "kfold"] = int(fold)

# df["kfold"] = df["kfold"].astype(int)
# df.groupby('kfold')['discourse_effectiveness'].value_counts()

# # Encode the three classes: Ineffective, Adequate, Effective
# encoder = LabelEncoder()
# df['discourse_effectiveness'] = encoder.fit_transform(df['discourse_effectiveness'])


# In[9]:


def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Error handler for ``str.encode``: fall back to UTF-8 for the bad span.

    Returns a ``(replacement_bytes, resume_index)`` pair.
    """
    bad_span = slice(error.start, error.end)
    replacement = error.object[bad_span].encode("utf-8")
    return replacement, error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Error handler for ``bytes.decode``: fall back to cp1252 for the bad span.

    Returns a ``(replacement_text, resume_index)`` pair.
    """
    bad_span = slice(error.start, error.end)
    replacement = error.object[bad_span].decode("cp1252")
    return replacement, error.end


# In[10]:


# Registers the same two handler names a second time (they are also registered
# earlier in this file), now bound to the redefined functions above.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)


# In[11]:


def resolve_encodings_and_normalize(text: str) -> str:
    """Fix mis-encoded characters in ``text`` and transliterate the result.

    Each codec step falls back to the custom error handlers registered in
    this file instead of raising; ``unidecode`` is applied last.
    """
    as_bytes = text.encode("raw_unicode_escape")
    as_text = as_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    as_bytes = as_text.encode("cp1252", errors="replace_encoding_with_utf8")
    as_text = as_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(as_text)


# In[12]:


def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.

    Args:
        seed: Integer seed applied to Python's `random`, NumPy and PyTorch.
    '''
    # Python's own RNG was previously left unseeded, so anything drawing from
    # the `random` module still varied between runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)


# In[13]:


# Class name -> integer id used as the training target.
label2id = {
    "Adequate": 0,
    "Effective": 1,
    "Ineffective": 2,
}

# Fail loudly, as the original per-row dict lookup did, on any value that is
# neither a known class name nor an id that was already encoded.
_unexpected_labels = (
    set(df['discourse_effectiveness'].unique())
    - set(label2id)
    - set(label2id.values())
)
if _unexpected_labels:
    raise KeyError(
        f"Unexpected discourse_effectiveness values: {sorted(map(str, _unexpected_labels))}"
    )

# One whole-column assignment replaces the row-by-row chained
# `df[col][i] = ...` writes, which pandas may apply to a temporary copy.
# Already-encoded ids pass through unchanged, so re-running the cell is safe.
df['discourse_effectiveness'] = (
    df['discourse_effectiveness']
    .map(lambda value: label2id.get(value, value))
    .astype(int)
)


# In[14]:


# encoder = LabelEncoder()
# df['discourse_effectiveness'] = encoder.fit_transform(df['discourse_effectiveness'])


# In[15]:


def read_text_files(example, data_dir):
    """Attach the normalized essay text to ``example`` under the "text" key.

    The essay is read from ``data_dir / "<essay_id>.txt"`` and passed through
    resolve_encodings_and_normalize. ``example`` is updated in place and also
    returned.
    """
    essay_path = data_dir / f"{example['essay_id']}.txt"
    with open(essay_path, "r") as essay_file:
        raw_text = essay_file.read()
    example["text"] = resolve_encodings_and_normalize(raw_text)
    return example

# Re-seed with the configured seed before the dataset is built.
set_seed(CFG.seed)


# In[16]:


# Path of a previously saved tokenized dataset to reuse; None means the
# dataset is built and tokenized from scratch in the cells below.
cfg={'load_from_disk':None} 


# In[17]:


# Folder containing the per-essay .txt files consumed by read_text_files.
data_dir = Path('train')
if cfg["load_from_disk"]:
    # Reuse a cached run: normalise the path so it ends with ".dataset".
    if not cfg["load_from_disk"].endswith(".dataset"):
        cfg["load_from_disk"] += ".dataset"
    ds = load_from_disk(cfg['load_from_disk'])
    
    # The grouped frame is stored next to the dataset as "<prefix>_pkl".
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point this at caches you created yourself.
    pkl_file = f"{cfg['load_from_disk'][:-len('.dataset')]}_pkl"
    with open(pkl_file, "rb") as fp:
        grouped = pickle.load(fp)
    
    
    print("Loading from saved files")
else:
    # Fresh build. Note that train_df is only defined on this branch.
    train_df = df

    # One row per unique essay, so each text file is read once.
    text_ds = Dataset.from_dict({"essay_id": df.essay_id.unique()})

    text_ds = text_ds.map(
        partial(read_text_files, data_dir=data_dir),
        num_proc=2,
        batched=False,
        desc="Loading text files",
    )


    text_df = text_ds.to_pandas()
    
    # train_df["discourse_text"] = [
    #     resolve_encodings_and_normalize(x) for x in train_df["discourse_text"]
    # ]

    # Left-join the essay text ("text" column) onto every discourse row.
    train_df = train_df.merge(text_df, on="essay_id", how="left")
# Discourse element types; each gets its own opening and closing marker token.
disc_types = [
    "Claim", "Concluding Statement", "Counterclaim", "Evidence",
    "Lead", "Position", "Rebuttal",
]
# type name -> marker inserted before a discourse span of that type
cls_tokens_map = dict(
    zip(disc_types, (f"[CLS_{label.upper()}]" for label in disc_types))
)
# type name -> marker inserted after a discourse span of that type
end_tokens_map = dict(
    zip(disc_types, (f"[END_{label.upper()}]" for label in disc_types))
)


# Load the tokenizer and register every [CLS_*] / [END_*] marker as an
# additional special token.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.add_special_tokens(
    {"additional_special_tokens": list(cls_tokens_map.values())+list(end_tokens_map.values())}
)
# discourse type -> token id of its [CLS_*] marker. Index [1] takes the second
# id of the encoding — presumably index 0 is a special token the tokenizer
# prepends; TODO confirm for this tokenizer.
cls_id_map = {
    label: tokenizer.encode(tkn)[1]
    for label, tkn in cls_tokens_map.items()
}

#targets = ds['discourse_effectiveness']
#targets =  encoder.fit_transform(targets)


# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[18]:


# def find_positions(example):

#     text = example["text"][0]
    
#     # keeps track of what has already
#     # been located
#     min_idx = 0
#     bad = []
#     # stores start and end indexes of discourse_texts
#     idxs = []
#     c = []
#     for dt in example["discourse_text"]:
#         # calling strip is essential
#         matches = list(re.finditer(re.escape(dt.strip()), text))
        
#         # If there are multiple matches, take the first one
#         # that is past the previous discourse texts.
#         if len(matches) > 1:
#             for m in matches:
#                 if m.start() >= min_idx:
#                     break
#         # If no matches are found
#         elif len(matches) == 0:
#             idxs.append([0,10])
#             continue  
#         # If one match is found
#         else:
#             m = matches[0]
            
#         idxs.append([m.start(), m.end()])

#         min_idx = m.start()
#     return idxs
    
# def tokenize(example):
#     example["idxs"] = find_positions(example)

#     text = example["text"][0]
#     chunks = []
#     labels = []
#     prev = 0

#     zipped = zip(
#         example["idxs"],
#         example["discourse_type"],
#         example["discourse_effectiveness"],
#     )
#     for idxs, disc_type, disc_effect in zipped:
#         # when the discourse_text wasn't found
#         if idxs == [-1]:
#             continue
#         s, e = idxs

#         # if the start of the current discourse_text is not 
#         # at the end of the previous one.
#         # (text in between discourse_texts)
#         if s != prev:
#             chunks.append(text[prev:s])
#             prev = s

#         # if the start of the current discourse_text is 
#         # the same as the end of the previous discourse_text
#         if s == prev:
#             chunks.append(cls_tokens_map[disc_type])
#             chunks.append(text[s:e])
#             chunks.append(end_tokens_map[disc_type])
        
#         prev = e

#         labels.append(label2id[disc_effect])

#     tokenized = tokenizer(
#         " ".join(chunks),
#         padding=False,
#         truncation=True,
#         max_length=CFG.max_len,
#         add_special_tokens=True,
#     )
    
#     # at this point, labels is not the same shape as input_ids.
#     # The following loop will add -100 so that the loss function
#     # ignores all tokens except CLS tokens

#     # idx for labels list
#     idx = 0
#     final_labels = []
#     for id_ in tokenized["input_ids"]:
#         # if this id belongs to a CLS token
#         if id_ in cls_id_map.values():
#             final_labels.append(1)
#             idx += 1
#         else:
#             # -100 will be ignored by loss function
#             final_labels.append(-100)
    
#     tokenized["labels"] = final_labels

#     return tokenized


# In[ ]:





# In[19]:



def find_positions(example):
    """Locate each discourse text inside its essay.

    Args:
        example: Mapping with "text" (a sequence whose first item is the
            essay) and "discourse_text" (the discourse strings, in order).

    Returns:
        One ``[start, end]`` character span per discourse string. A string
        that is not found gets the placeholder span ``[0, 10]``.
    """
    essay = example["text"][0]

    # Start offset of the most recently located discourse.
    search_floor = 0
    spans = []

    for discourse in example["discourse_text"]:
        # Stripping matters: the essay is searched for the bare string.
        pattern = re.escape(discourse.strip())
        hits = [(hit.start(), hit.end()) for hit in re.finditer(pattern, essay)]

        if not hits:
            spans.append([0, 10])  # placeholder for "not found"
            continue

        # Prefer the first occurrence at or after the previous discourse;
        # when none qualifies, fall back to the last occurrence. With a
        # single hit both choices are that hit.
        start, end = next(
            (hit for hit in hits if hit[0] >= search_floor),
            hits[-1],
        )
        spans.append([start, end])
        search_floor = start

    return spans

def tokenize(example):
    """Wrap each discourse span in marker tokens and tokenize the essay.

    The essay is rebuilt as: text between spans, then for every span its
    [CLS_<TYPE>] marker, the span text and its [END_<TYPE>] marker. The pieces
    are joined with single spaces and tokenized with truncation at
    CFG.max_len.

    Side effect: stores the located spans on ``example["idxs"]``.

    Returns:
        The tokenizer output with an extra "labels" list aligned to
        "input_ids": 1 at every [CLS_*] marker token, -100 everywhere else.
    """
    spans = find_positions(example)
    example["idxs"] = spans

    essay = example["text"][0]
    pieces = []
    cursor = 0  # end offset of the last span emitted

    per_discourse = zip(
        spans,
        example["discourse_type"],
        example["discourse_effectiveness"],
    )
    for span, disc_type, _effectiveness in per_discourse:
        # Guard for a [-1] "not found" sentinel. NOTE(review): find_positions
        # in this file emits [0, 10] for misses, so this never fires here.
        if span == [-1]:
            continue

        start, end = span

        # Emit any essay text lying between the previous span and this one.
        if start != cursor:
            pieces.append(essay[cursor:start])

        pieces.extend(
            [cls_tokens_map[disc_type], essay[start:end], end_tokens_map[disc_type]]
        )
        cursor = end

    tokenized = tokenizer(
        " ".join(pieces),
        padding=False,
        truncation=True,
        max_length=CFG.max_len,
        add_special_tokens=True,
    )

    # Mark marker positions with 1; -100 is the value the loss ignores.
    marker_ids = set(cls_id_map.values())
    tokenized["labels"] = [
        1 if token_id in marker_ids else -100
        for token_id in tokenized["input_ids"]
    ]

    return tokenized


# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[20]:



#I frequently restart my notebook, so to reduce time
# you can set this to just load the tokenized dataset from disk.
# It gets loaded in the 3rd code cell, but a check is done here
# to skip tokenizing
# Only tokenize when no cached dataset was requested (see cfg above).
if cfg["load_from_disk"] is None:

    # make lists of discourse_text, discourse_effectiveness
    # for each essay
    # sort=False keeps the groups in order of first appearance.
    grouped = train_df.groupby(["essay_id"],sort = False).agg(list)
    
    # One dataset row per essay; tokenize adds input_ids / labels / idxs.
    ds = Dataset.from_pandas(grouped)
    ds = ds.map(
        tokenize,
        batched=False,
        num_proc=1,
        desc="Tokenizing",
    )
   
    # Cache both artefacts using OUTPUT_DIR as a file-name prefix:
    # "<OUTPUT_DIR>.dataset" and "<OUTPUT_DIR>_pkl".
    save_dir = f"{OUTPUT_DIR}"
    #ds.to_csv('out.csv')
    ds.save_to_disk(f"{save_dir}.dataset")
    #ds.to_csv('out.csv')
    with open(f"{save_dir}_pkl", "wb") as fp:
        pickle.dump(grouped, fp)
    print("Saving dataset to disk:", OUTPUT_DIR)
#encoder = LabelEncoder()
#df['discourse_effectiveness'] = encoder.fit_transform(df['discourse_effectiveness'])


# In[21]:



# In[ ]:





# In[ ]:





# In[23]:


# gkf = GroupKFold(n_splits=cfg.n_fold)

# for fold, (_, val_) in enumerate(gkf.split(X=df, groups=df.essay_id)):
#     df.loc[val_, "kfold"] = int(fold)

# df["kfold"] = df["kfold"].astype(int)
# df.groupby('kfold')['discourse_effectiveness'].value_counts()

# # Encode the three classes: Ineffective, Adequate, Effective
# encoder = LabelEncoder()
# df['discourse_effectiveness'] = encoder.fit_transform(df['discourse_effectiveness'])


# In[24]:


from itertools import chain


# In[ ]:





# In[25]:


# def get_folds(ds, k_folds=5):
    
#     return [
#         val_idx
#         for _, val_idx in  enumerate(gkf.split(X=ds, groups=ds['labels']))
#     ]

# fold_idxs = get_folds(ds, 5)


# In[26]:


# #ds = pd.read_csv('out.csv')

# gkf = GroupKFold(n_splits=5)
# for fold, (_, val_) in enumerate(gkf.split(X=ds, groups=ds['labels'])):
#     ds.loc[val_, "kfold"] = int(fold)

# ds["kfold"] = ds["kfold"].astype(int)

# #print(ds)


# In[27]:



# def prepare_loaders(fold):
#     df_train = df[df.kfold != fold].reset_index(drop=True)
#     df_valid = df[df.kfold == fold].reset_index(drop=True)

#     train_dataset = FeedBackDataset(df_train, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])
#     valid_dataset = FeedBackDataset(df_valid, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])

#     train_loader = DataLoader(train_dataset, batch_size=CONFIG['train_batch_size'], collate_fn=collate_fn,
#                               num_workers=8, shuffle=True, pin_memory=False, drop_last=True)
#     valid_loader = DataLoader(valid_dataset, batch_size=CONFIG['valid_batch_size'], collate_fn=collate_fn,
#                               num_workers=8, shuffle=False, pin_memory=False)

#     return train_loader, valid_loader


# In[28]:


# for fold in range(5):
#     keep_cols = {"input_ids", "attention_mask", "labels",'discourse_effectiveness'}
#     #train_idxs =  list(chain(*[i for f, i in enumerate(fold_idxs) if f != fold]))
#     train_dataset = ds.drop([c for c in ds.columns if c not in keep_cols])
#     eval_dataset = ds.drop([c for c in ds.columns if c not in keep_cols])
# # train_dataset.to_csv('1.csv')


# In[ ]:





# In[29]:



# def prepare_loaders(train_dataset,eval_dataset):
#     train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size,collate_fn = collator,
#                               num_workers=8, shuffle=True, pin_memory=False, drop_last=True)
#     valid_loader = DataLoader(eval_dataset, batch_size=CFG.batch_size, collate_fn = collator,
#                              num_workers=8, shuffle=False, pin_memory=False)

#     return train_loader, valid_loader


# In[30]:


# for fold, (_, val_) in enumerate(gkf.split(X=df, groups=df.essay_id)):
#     df.loc[val_, "kfold"] = int(fold)


# In[31]:


# def get_folds(df, k_folds=5):

#     sgkf = GroupKFold(n_splits=k_folds)
#     return [
#         val_idx
#         for _, val_idx in sgkf.split(X=df, groups=df['essay_id'])
#     ]

# fold_idxs = get_folds(ds, 5)


# In[ ]:





# In[32]:


# targets = data['labels'].to(device, dtype=torch.long)
# batch_size = data['labels'][i].size(0)
# drop = nn.Dropout(p=0.2)
# output = model(data)
#         pooler = WeightedLayerPooling(24,layer_start = 4,layer_weights=None)
#         fc = nn.Linear(model_config.hidden_size, 3)
#         fc.to(device)
#         hidden_states  = output.hidden_states
#         all_hidden_states = torch.stack(hidden_states).cuda()
#         all_hidden_states.cuda()
#         out = pooler(all_hidden_states)
#         out = out[:, 0].cuda()
#         out = drop(out).cuda()
#         outputs = fc(out).cuda()
#         #outputs(data)
#         loss = criterion(outputs, targets)
#         loss = loss / CONFIG['n_accumulate']
#         loss.backward()
 


# In[33]:


class WeightedLayerPooling(nn.Module):
    """Weighted average over a stack of per-layer hidden states.

    Args:
        num_hidden_layers: Number of transformer layers; the stack is expected
            to hold ``num_hidden_layers + 1`` entries along its first axis.
        layer_start: Index of the first stacked entry included in the average.
        layer_weights: Optional 1-D tensor with one weight per included entry.
            When omitted, a learnable parameter of ones is created.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            layer_weights = nn.Parameter(
                torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)
            )
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        """Return the weighted mean of ``all_hidden_states[layer_start:]``.

        Args:
            all_hidden_states: 4-D tensor indexed by layer on axis 0.

        Returns:
            Tensor with the layer axis reduced away.
        """
        all_layer_embedding = all_hidden_states[self.layer_start:, :, :, :]
        # Follow the input's device rather than the global CFG.device, so the
        # module works wherever its input lives. The original `.to(...)` calls
        # on the embeddings and on the result discarded their return value.
        layer_weights = self.layer_weights.to(all_layer_embedding.device)
        # (n_layers,) -> (n_layers, 1, 1, 1) so it broadcasts over the rest.
        weight_factor = layer_weights.reshape(-1, 1, 1, 1)
        weighted_average = (weight_factor * all_layer_embedding).sum(dim=0) / layer_weights.sum()
        return weighted_average


# In[34]:


class MeanPooling(nn.Module):
    """Attention-mask-aware mean over axis 1 of the hidden states."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average ``last_hidden_state`` over positions where the mask is set.

        The mask is expanded to the hidden-state shape; the divisor is clamped
        to at least 1e-9 so a fully masked row does not divide by zero.
        """
        token_mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * token_mask).sum(dim=1)
        token_count = token_mask.sum(dim=1).clamp(min=1e-9)
        return masked_total / token_count


# In[35]:


from transformers import Trainer, TrainingArguments, AutoConfig, AutoModelForTokenClassification, DataCollatorForTokenClassification,BertForTokenClassification,DebertaV2ForTokenClassification


# In[36]:


# collator = DataCollatorForTokenClassification(
#     tokenizer=tokenizer, pad_to_multiple_of=16, padding=True
# )

# output = args.output_dir
# for fold in range(CFG.n_fold):
    
#     args.output_dir = f"{output}-fold{fold}"
    
#     model_config = AutoConfig.from_pretrained(
#             cfg["model_name_or_path"],
#         )
#     model_config.update(
#         {
#             "num_labels": 3,
#             "cls_tokens": list(cls_id_map.values()),
#             "label2id": label2id,
#             "id2label": {v:k for k, v in label2id.items()},
#         }
#     )
    
#     model = AutoModelForTokenClassification.from_pretrained(cfg["model_name_or_path"], config=model_config)
    
#     # Because tokens were added, it is important to resize the embeddings
#     model.resize_token_embeddings(len(tokenizer)) 

#     # split dataset to train and eval
#     keep_cols = {"input_ids", "attention_mask", "labels"}
#     train_idxs =  list(chain(*[i for f, i in enumerate(fold_idxs) if f != fold]))
#     train_dataset = ds.select(train_idxs).remove_columns([c for c in ds.column_names if c not in keep_cols])
#     eval_dataset = ds.select(fold_idxs[fold]).remove_columns([c for c in ds.column_names if c not in keep_cols])


# In[37]:




class FeedBackModel(nn.Module):
    """DeBERTa backbone + weighted layer pooling + 3-way linear head.

    Args:
        model_name: Checkpoint name or path used for both the config and the
            backbone weights.

    Note:
        The pooler is now a registered sub-module, so ``pooler.layer_weights``
        appears in ``parameters()`` and ``state_dict()``. Checkpoints saved
        before this change lack that key and need ``strict=False`` to load.
    """

    def __init__(self,model_name):
        super(FeedBackModel, self).__init__()
        # Build the config from the same checkpoint as the weights; the
        # original hard-coded "microsoft/deberta-v3-large" here regardless of
        # `model_name`.
        self.model_config = AutoConfig.from_pretrained(model_name)
        self.model_config.update(
            {
                "cls_tokens": list(cls_id_map.values()),
                # forward() needs every layer's hidden states for the pooler.
                'output_hidden_states': True,
            }
        )
        self.mo = DebertaV2ForTokenClassification.from_pretrained(model_name, config=self.model_config)
        # NOTE(review): the token embeddings are not resized after the marker
        # tokens were added to the tokenizer — confirm the checkpoint's
        # embedding table already covers the new ids.
        self.drop = nn.Dropout(p=0.2)  # kept for compatibility; unused in forward()
        # Created once here so its layer weights are trained and saved. The
        # original rebuilt the pooler on every forward pass, resetting the
        # weights to ones each time.
        self.pooler = WeightedLayerPooling(
            self.model_config.num_hidden_layers, layer_start=4, layer_weights=None
        )
        self.fc = nn.Linear(self.model_config.hidden_size, 3)
        self.fc.to(CFG.device)

    def forward(self,ids,mask):
        """Return per-token logits over the 3 effectiveness classes.

        Args:
            ids: Input token ids, passed to the backbone as ``input_ids``.
            mask: Attention mask, passed as ``attention_mask``.

        Returns:
            Output of the linear head applied to the layer-pooled hidden
            states (last axis of size 3).
        """
        output = self.mo(input_ids=ids, attention_mask=mask)
        # Stack the per-layer hidden states and place them on the head's
        # device, instead of the hard-coded `.cuda()` calls.
        all_hidden_states = torch.stack(output.hidden_states).to(self.fc.weight.device)
        out = self.pooler(all_hidden_states)
        outputs = self.fc(out)
        return outputs


# In[ ]:





# In[38]:


# model = FeedBackModel(model_path)
# model.train()
# outputs = model


# In[39]:


# args={'do_train': True,
#         "do_eval": True,
#         "per_device_train_batch_size": 2,
#         "per_device_eval_batch_size": 4,
#         "learning_rate": 9e-6,
#         "weight_decay": 0.01,
#         "num_train_epochs": 3,
#         "warmup_ratio": 0.1,
#         "optim": 'adamw_torch',
#         "logging_steps": 50,
#         "save_strategy": "epoch",
#         "evaluation_strategy": "epoch",
#         "report_to": "none",
#         "group_by_length": True,
#         "save_total_limit": 1,
#         "metric_for_best_model": "loss",
#         "greater_is_better": False,
#         "seed": 18}


# In[40]:

class AWP:
    """Adversarial weight perturbation helper.

    Temporarily nudges selected parameters along their gradient, runs one more
    forward/backward pass on the perturbed weights, then restores the original
    values.

    Args:
        model: Module to perturb. It is called as
            ``model(input_ids=..., attention_mask=..., labels=...)`` and must
            return a ``(loss, logits)`` pair.
        optimizer: Optimizer whose gradients are zeroed before the adversarial
            backward pass.
        adv_param: Substring a parameter name must contain to be perturbed.
        adv_lr: Step size of the perturbation; 0 disables the attack.
        adv_eps: Relative bound on how far a parameter may move from its
            saved value.
        start_epoch: The attack is skipped while ``epoch < start_epoch``.
        adv_step: Number of perturb/backward iterations per call.
        scaler: Optional gradient scaler (an object with ``scale(loss)``).
            When ``None`` the loss is back-propagated unscaled.
    """

    def __init__(
            self,
            model,
            optimizer,
            adv_param="weight",
            adv_lr=1,
            adv_eps=0.2,
            start_epoch=0,
            adv_step=1,
            scaler=None
    ):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.start_epoch = start_epoch
        self.adv_step = adv_step
        self.backup = {}      # name -> original parameter values
        self.backup_eps = {}  # name -> (lower bound, upper bound)
        self.scaler = scaler

    def attack_backward(self, x, y, attention_mask, epoch):
        """Run the adversarial pass; returns None in every case.

        Does nothing when ``adv_lr`` is 0 or ``epoch < start_epoch``.
        """
        if (self.adv_lr == 0) or (epoch < self.start_epoch):
            return None

        self._save()
        for i in range(self.adv_step):
            self._attack_step()
            with torch.cuda.amp.autocast():
                adv_loss, tr_logits = self.model(input_ids=x, attention_mask=attention_mask, labels=y)

            self.optimizer.zero_grad()
            # `scaler` defaults to None; the original dereferenced it
            # unconditionally and raised AttributeError in that case.
            if self.scaler is not None:
                self.scaler.scale(adv_loss).backward()
            else:
                adv_loss.backward()

        self._restore()

    def _attack_step(self):
        """Move each selected parameter along its gradient, within bounds."""
        e = 1e-6  # keeps the norm ratio finite
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    # Step scaled by (parameter norm / gradient norm).
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    # Clip to the [lower, upper] band recorded by _save().
                    param.data = torch.min(
                        torch.max(param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )
                # param.data.clamp_(*self.backup_eps[name])

    def _save(self):
        """Snapshot the selected parameters and compute their clip bounds."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                # Save the original parameters
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def _restore(self, ):
        """Put the saved parameter values back and clear the snapshots."""
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}



import gc
import torch
from transformers import Trainer, TrainingArguments, AutoConfig, AutoModelForTokenClassification, DataCollatorForTokenClassification

# args = args
# # If using longformer, you will want to pad to a multiple of 512
# # For most others, you'll want to pad to a multiple of 8
# collator = DataCollatorForTokenClassification(
#     tokenizer=tokenizer, pad_to_multiple_of=cfg["pad_multiple"], padding=True
# )

# output = '/out/'
# for fold in range(cfg["k_folds"]):

#     trainer = Trainer(
#         model=model,
#         args=args,
#         train_dataset=train_dataset,
#         eval_dataset=eval_dataset,
#         tokenizer=tokenizer,
#         data_collator=collator,
#     )
    
#     trainer.train()
#     optimizer = AdamW(model.parameters(), lr=CFG.lr, eps=CFG.eps, betas=CFG.betas)
#     num_train_steps = int(len(train_data) / CFG.batch_size * CFG.epochs)
#     scheduler = get_scheduler(CFG, optimizer, num_train_steps)
#     del model
#     gc.collect()
#     torch.cuda.empty_cache()


# In[41]:


# trainer = Trainer(
#         model=FeedBackModel,
#         args=args,
#         train_dataset=train_dataset,
#         eval_dataset=eval_dataset,
#         tokenizer=tokenizer,
#         data_collator=collator,
#     )


# In[42]:


# class FeedBackDataset(Dataset):
#     def __init__(self, ds):
#         self.ds = ds
#         self.input_ids = ds['input_ids'].values
#         self.attention_mask = ds['attention_mask'].values
#         self.labels = ds['labels'].values
#         self.discourse_effectiveness = ds['discourse_effectiveness'].values

#     def __len__(self):
#         return len(self.ds)

#     def __getitem__(self, index):
#         input_ids = self.input_ids[index]
#         attention_mask = self.attention_mask[index]
#         labels = self.labels[index]
#         discourse_effectiveness = self.discourse_effectiveness[index]
        
#         return {
#             'input_ids': input_ids,
#             'attention_mask': attention_mask,
#             'discourse_effectiveness':discourse_effectiveness,
#             'labels' :labels
#         }


# In[ ]:





# In[43]:


def criterion(outputs, labels):
    """Cross-entropy loss between ``outputs`` and ``labels``.

    Delegates to ``nn.CrossEntropyLoss`` with its default settings, so the
    result is a scalar tensor averaged over the examples passed in.
    """
    loss_fn = nn.CrossEntropyLoss()
    return loss_fn(outputs, labels)

# Training routine (one epoch)

def train_one_epoch(swa_start, model, optimizer, scheduler, dataloader, device, epoch):
    """Run one training pass over ``dataloader``.

    Only element 0 of every batch tensor is scored.  Token positions whose
    ``labels`` value equals 1 are collected; the model output at those
    positions is compared, in order, with the leading entries of
    ``discourse_effectiveness`` using ``criterion``.

    Args:
        swa_start: while ``epoch <= swa_start`` the regular ``scheduler`` is
            stepped; for later epochs the SWA copy is updated and the SWA
            scheduler is stepped instead.
        model: network called as ``model(input_ids, attention_mask)``.
            Assumes ``out[0]`` is indexable by token position and yields
            per-class logits -- TODO confirm against the model definition.
        optimizer: optimizer stepped every ``CFG.n_accumulate`` batches.
        scheduler: per-step LR scheduler, or ``None``.
        dataloader: yields dicts with the keys ``input_ids``,
            ``attention_mask``, ``labels`` and ``discourse_effectiveness``.
        device: device the batch tensors are moved to.
        epoch: current epoch number (drives the SWA switch and the progress bar).

    Returns:
        ``(epoch_loss, swa_model)``: the running mean loss over the scored
        batches (NaN if none was scored) and the ``AveragedModel`` built here.
    """
    model.train()
    # NOTE(review): the AveragedModel and SWALR are rebuilt on every call, so
    # the average only spans the optimizer steps of this single epoch.
    swa_model = AveragedModel(model)
    swa_scheduler = SWALR(optimizer, swa_lr=1e-6)
    start_epoch = int(len(dataloader) / CFG.batch_size * CFG.epochs)
    awp = AWP(model,
              optimizer,
              adv_lr=0.0000,
              adv_eps=0.001,
              start_epoch=start_epoch,
              scaler=None
              )

    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty loader (or one with no scorable batch)
    # returns NaN instead of raising UnboundLocalError.
    epoch_loss = float("nan")
    idx = 0
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        idx += 1
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        label = data['labels'].to(device, dtype=torch.long)
        targets = data['discourse_effectiveness'].to(device, dtype=torch.long)

        # Positions flagged with label 1, found in one tensor op instead of a
        # Python loop calling .item() on every token.
        positions = torch.nonzero(label[0] == 1, as_tuple=True)[0]
        num_positions = positions.numel()
        if num_positions == 0:
            # Nothing to score in this document; the former code divided by
            # zero here.
            continue

        out = model(ids, mask)
        batch_size = ids.size(0)

        # Mean cross-entropy over the flagged positions; the i-th flagged
        # position is paired with the i-th target, as before.
        # NOTE(review): equals the former per-position average provided no
        # target equals CrossEntropyLoss's ignore_index (-100) -- confirm.
        loss = criterion(out[0][positions], targets[0][:num_positions])

        loss.backward()
        if loss < 0.66:
            awp.attack_backward(ids, label, mask, idx)
        if (step + 1) % CFG.n_accumulate == 0:
            optimizer.step()
            optimizer.zero_grad()
            if epoch > swa_start:
                swa_model.update_parameters(model)
                swa_scheduler.step()
            elif scheduler is not None:
                scheduler.step()

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])

    # Use the loader this function was given rather than a module-level
    # `train_loader`.
    torch.optim.swa_utils.update_bn(dataloader, swa_model)
    gc.collect()

    return epoch_loss, swa_model


# def train_one_epoch_swa(model, optimizer, scheduler, dataloader, device, epoch):
#     model.train()
#     dataset_size = 0
#     running_loss = 0.0
#     pos = []
#     count = []
#     loss = 0.0
#     sum_loss = []
#     retain = []
#     a = 0
#     start_epoch = int(len(dataloader) / CFG.batch_size * CFG.epochs)
#     awp = AWP(model,
#               optimizer,
#               adv_lr=0.0000,
#               adv_eps=0.001,
#               start_epoch=start_epoch,
#               scaler=None
#               )
#     idx = 0
#     bar = tqdm(enumerate(dataloader), total=len(dataloader))
#     for step, data in bar:
#         idx += 1
#         ids = data['input_ids'].to(device, dtype=torch.long)
#         #ids = data['input_ids']
#         mask = data['attention_mask'].to(device, dtype=torch.long)
#         label  = data['labels'].to(device, dtype=torch.long)
#         targets = data['discourse_effectiveness'].to(device, dtype=torch.long)
#         for i,item in enumerate(label[0]):
#             if item.item() == 1:
#                 pos.append(i)
#                 #count.append(i)
#             elif item.item() == -100:
#                  continue
#         #print(pos)
#         out = model(ids, mask)
#         #out = fc(out)
#         #sen_size = targets.size(0)
#         batch_size = ids.size(0)
#         #print(targets)
#         #print(out.size())
#         for i,item in enumerate(pos):
#             a = out[0][item]
#             a = a.unsqueeze(0)
#             #print(a.size())
#            # b = out[0][i+1]
#             tar = targets[0][i].unsqueeze(0)
#             #b=torch.unsqueeze(b,0)
#             lo =criterion(a,tar)
#             #loss = lo+retain
#             retain.append(lo)
#             #lo2=criterion(b, targets[0][int((2))].unsqueeze(0))
#         #loss = (lo1+lo2)/2
#         loss = sum(retain)
#         loss = loss/len(pos)
#         pos = []
#         retain = []
#         #loss = loss / len(targets[0])
#         #print(loss)
#         loss.backward()
#         if loss < 0.7:
#             awp.attack_backward(ids, label, mask, idx)
#         if (step + 1) % CFG.n_accumulate == 0:
#             optimizer.step()

#             # zero the parameter gradients
#             optimizer.zero_grad()

#             if scheduler is not None:
#                 scheduler.step()
#         running_loss += (loss.item() * batch_size)
#         dataset_size += batch_size

#         epoch_loss = running_loss / dataset_size

#         bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
#                         LR=optimizer.param_groups[0]['lr'])
#     gc.collect()

#     return epoch_loss


# Validation routine (one epoch)

@torch.no_grad()
def valid_one_epoch(swa_start, model, dataloader, device, epoch, swa_model):
    """Compute the mean validation loss over ``dataloader`` without gradients.

    Restored from the commented-out definition: the decorator had been left
    active above commented code (a syntax error), and ``run_training`` calls
    this function with exactly this signature.

    Only element 0 of every batch tensor is scored.  Token positions whose
    ``labels`` value equals 1 are collected; the model output at those
    positions is compared, in order, with the leading entries of
    ``discourse_effectiveness`` using ``criterion``.

    Args:
        swa_start: for ``epoch > swa_start`` the SWA copy is evaluated,
            otherwise ``model`` is.
        model: the network being trained.
        dataloader: yields dicts with the keys ``input_ids``,
            ``attention_mask``, ``labels`` and ``discourse_effectiveness``.
        device: device the batch tensors are moved to.
        epoch: current epoch number.
        swa_model: averaged copy of ``model``.

    Returns:
        Running mean loss over the scored batches (NaN if none was scored).
    """
    eval_model = swa_model if epoch > swa_start else model
    eval_model.eval()

    dataset_size = 0
    running_loss = 0.0
    epoch_loss = float("nan")
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for _, data in bar:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['discourse_effectiveness'].to(device, dtype=torch.long)
        label = data['labels'].to(device, dtype=torch.long)

        # Positions flagged with label 1.
        positions = torch.nonzero(label[0] == 1, as_tuple=True)[0]
        num_positions = positions.numel()
        if num_positions == 0:
            # No scorable position in this document; avoid dividing by zero.
            continue

        out = eval_model(ids, mask)
        batch_size = ids.size(0)

        # Mean cross-entropy over the flagged positions; the i-th flagged
        # position is paired with the i-th target.
        loss = criterion(out[0][positions], targets[0][:num_positions])

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size

        # The learning rate is not shown here: this function receives no
        # optimizer, and the commented-out version read a module-level one.
        bar.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss)

    gc.collect()

    return epoch_loss

from torch.optim.swa_utils import SWALR,AveragedModel

def fetch_scheduler(optimizer):
    """Build the learning-rate scheduler selected by ``CFG.scheduler``.

    Args:
        optimizer: optimizer the scheduler is attached to.

    Returns:
        A ``CosineAnnealingLR`` or ``CosineAnnealingWarmRestarts`` instance,
        or ``None`` when ``CFG.scheduler`` is ``None``.

    Raises:
        ValueError: if ``CFG.scheduler`` names an unsupported scheduler
            (previously this surfaced as an UnboundLocalError).
    """
    if CFG.scheduler is None:
        return None
    if CFG.scheduler == 'CosineAnnealingLR':
        # T_max is fixed at 500 here rather than read from CFG.
        # NOTE(review): the CFG class at the top of the file declares
        # `T_max= 500,` (a 1-tuple because of the trailing comma), so it
        # cannot be passed through as-is.
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=500,
                                              eta_min=CFG.min_lr)
    if CFG.scheduler == 'CosineAnnealingWarmRestarts':
        # NOTE(review): the CFG class at the top of the file has no `T_0`
        # attribute -- confirm it is defined before selecting this scheduler.
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0,
                                                        eta_min=CFG.min_lr)
    raise ValueError(f"Unsupported CFG.scheduler: {CFG.scheduler!r}")

# Training driver: runs all epochs for one fold
def run_training(model, optimizer,device, num_epochs, fold):
    """Train ``model`` for ``num_epochs`` epochs and checkpoint it after each one.

    Reads the module-level ``train_loader`` and ``valid_loader``.  After every
    epoch the current ``model.state_dict()`` is written to the fold's
    checkpoint path, overwriting the previous epoch's file.

    Args:
        model: network to train (updated in place).
        optimizer: optimizer driving the updates.
        device: device passed on to the train / validation loops.
        num_epochs: number of epochs to run.
        fold: fold index, used only in the checkpoint file name.

    Returns:
        ``(model, history)`` where ``history`` maps ``'Train Loss'`` and
        ``'Valid Loss'`` to per-epoch lists.  The returned model carries the
        weights of the last epoch.
    """
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_epoch_loss = np.inf
    history = defaultdict(list)
    swa_start = 2
    for epoch in range(1, num_epochs + 1):
        gc.collect()
        # NOTE(review): a new scheduler is created at the start of every
        # epoch, so the cosine schedule does not continue across epochs --
        # confirm this is intended.
        scheduler = fetch_scheduler(optimizer)
        train_epoch_loss, swa_model = train_one_epoch(swa_start, model, optimizer, scheduler,
                                                      dataloader=train_loader,
                                                      device=device, epoch=epoch)
        history['Train Loss'].append(train_epoch_loss)

        val_epoch_loss = valid_one_epoch(swa_start, model, valid_loader, device=device,
                                         epoch=epoch, swa_model=swa_model)
        history['Valid Loss'].append(val_epoch_loss)

        # Track the best validation loss so the summary below reports a real
        # value instead of the initial `inf`.
        if val_epoch_loss <= best_epoch_loss:
            best_epoch_loss = val_epoch_loss

        # The checkpoint is written unconditionally every epoch.
        # NOTE(review): this saves `model`, not `swa_model`, although
        # `swa_model` is also handed to validation -- confirm which weights
        # should be kept.
        PATH = f"/root/autodl-tmp/Loss-Fold-{fold}.bin"
        torch.save(model.state_dict(), PATH)
        print(f"Model Saved")

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    # The former per-epoch deepcopy of the state dict followed by a final
    # load_state_dict always reloaded the weights the model already held, so
    # both were dropped; the model is returned as-is.
    return model, history

# Build the data loaders from the K-fold split

# def swa_train(fold):
#     best_epoch_loss = np.inf
#     num_epochs_swa = 1
#     history = defaultdict(list)
#     for epoch in range(1, num_epochs_swa + 1):
#             model = FeedBackModel('microsoft/deberta-v3-large').to(CFG.device)# init your model class, build the graph shape
#             state_dict = torch.load(f"/root/autodl-tmp/Loss-Fold-{fold}.bin")
#             model.load_state_dict(state_dict)
#             swa_model = AveragedModel(model).to(CFG.device)
#             #model = AveragedModel(model,device=device)
#             swa_scheduler = SWALR(optimizer, swa_lr=3e-6)
#             epoch_loss = train_one_epoch_swa(swa_model, optimizer, swa_scheduler,
#                                                dataloader=train_loader,
#                                                device=CFG.device, epoch=epoch)
#             history['Train Loss'].append(epoch_loss)
#             swa_model.update_parameters(swa_model)
#             swa_scheduler.step()
#             torch.optim.swa_utils.update_bn(train_loader, swa_model, device=CFG.device)
#             val_epoch_loss = valid_one_epoch(swa_model, valid_loader, device=CFG.device,
#                                         epoch=epoch)       
#             if val_epoch_loss <= best_epoch_loss:
#                 print(f"Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
#                 best_epoch_loss = val_epoch_loss
#                 torch.save(swa_model.state_dict(), f"/root/autodl-tmp/last-{fold}.pt")
def prepare_loaders(fold):
    """Build the train, validation and full-dataset loaders for one fold.

    Reads the module-level ``ds``, ``fold_idxs`` and ``tokenizer``.  Rows
    listed in ``fold_idxs[fold]`` form the validation set and the rows of all
    other folds form the training set.  Only the columns the training loop
    consumes are kept.

    Args:
        fold: index into ``fold_idxs`` of the held-out fold.

    Returns:
        ``(train_loader, valid_loader, total_loader)`` -- the three loaders
        the fold loop unpacks.  ``total_loader`` iterates the whole dataset
        in order.
    """
    keep_cols = {"input_ids", "attention_mask", "labels", 'discourse_effectiveness'}
    drop_cols = [c for c in ds.column_names if c not in keep_cols]

    # Flatten the indices of every fold except the held-out one.
    train_idxs = [i for f, idxs in enumerate(fold_idxs) if f != fold for i in idxs]

    collator = DataCollatorForTokenClassification(
        tokenizer=tokenizer, pad_to_multiple_of=8, padding=True
    )

    # Previously `train_idxs` was computed but never applied, so the training
    # loader also contained the held-out fold.
    train_dataset = ds.select(train_idxs).remove_columns(drop_cols)
    eval_dataset = ds.select(fold_idxs[fold]).remove_columns(drop_cols)
    total_dataset = ds.remove_columns(drop_cols)

    train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, collate_fn=collator,
                              num_workers=8, shuffle=True, pin_memory=False, drop_last=True)
    valid_loader = DataLoader(eval_dataset, batch_size=CFG.batch_size, collate_fn=collator,
                              num_workers=8, shuffle=False, pin_memory=False)
    total_loader = DataLoader(total_dataset, batch_size=CFG.batch_size, collate_fn=collator,
                              num_workers=8, shuffle=False, pin_memory=False)
    return train_loader, valid_loader, total_loader


# Run the cross-validation training loop

# Train one model per fold.  `train_loader` and `valid_loader` must keep these
# module-level names because run_training reads them as globals.
for fold in range(0, 5):
    print(f"====== Fold: {fold} ======")

    # Create the loaders for the current fold (the former code always passed 4,
    # regardless of the loop variable).
    train_loader, valid_loader, total_loader = prepare_loaders(fold)

    Fmodel = FeedBackModel(model_path)
    Fmodel.to(CFG.device)

    # Two parameter groups: everything except the `fc` head uses CFG.lr,
    # the `fc` head uses its own learning rate of 3e-5.
    ignored_params = list(map(id, Fmodel.fc.parameters()))
    base = filter(lambda p: id(p) not in ignored_params, Fmodel.parameters())
    optimizer = AdamW([{'params': base}, {'params': Fmodel.fc.parameters(), 'lr': 3e-5}], lr=CFG.lr,weight_decay=CFG.weight_decay)

    Fmodel, history = run_training(Fmodel, optimizer,
                                  device=CFG.device,
                                  num_epochs=CFG.epochs,
                                  fold=fold)

    # Drop every per-fold object, including the optimizer state and the third
    # loader, then release cached GPU memory before the next fold starts.
    del Fmodel, history, optimizer, train_loader, valid_loader, total_loader
    _ = gc.collect()
    torch.cuda.empty_cache()
    print()

    

Loading text files #0:   0%|          | 0/2096 [00:00<?, ?ex/s]

Loading text files #1:   0%|          | 0/2095 [00:00<?, ?ex/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenizing:   0%|          | 0/4191 [00:00<?, ?ex/s]

Saving dataset to disk: out


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForTokenClassification: ['mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a B

[INFO] Using GPU: NVIDIA A40



100%|██████████| 3353/3353 [15:09<00:00,  3.69it/s, Epoch=1, LR=2.79e-6, Train_Loss=0.714]
100%|██████████| 838/838 [01:21<00:00, 10.30it/s, Epoch=1, LR=2.79e-6, Valid_Loss=0.624]


Validation Loss Improved (inf ---> 0.6239318072669172)
Model Saved



100%|██████████| 3353/3353 [15:06<00:00,  3.70it/s, Epoch=2, LR=2.79e-6, Train_Loss=0.599]
100%|██████████| 838/838 [01:25<00:00,  9.85it/s, Epoch=2, LR=2.79e-6, Valid_Loss=0.605]


Validation Loss Improved (0.6239318072669172 ---> 0.6046331555567024)
Model Saved



100%|██████████| 3353/3353 [18:44<00:00,  2.98it/s, Epoch=3, LR=1e-6, Train_Loss=0.476]
100%|██████████| 838/838 [01:30<00:00,  9.23it/s, Epoch=3, LR=1e-6, Valid_Loss=0.597]


Validation Loss Improved (0.6046331555567024 ---> 0.5972174988500552)
Model Saved



100%|██████████| 3353/3353 [22:19<00:00,  2.50it/s, Epoch=4, LR=1e-6, Train_Loss=0.417]
100%|██████████| 838/838 [01:36<00:00,  8.65it/s, Epoch=4, LR=1e-6, Valid_Loss=0.615]



Training complete in 1h 17m 41s
Best Loss: 0.5972



Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForTokenClassification: ['mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a B

[INFO] Using GPU: NVIDIA A40



100%|██████████| 3353/3353 [17:49<00:00,  3.13it/s, Epoch=1, LR=2.79e-6, Train_Loss=0.715]
100%|██████████| 838/838 [01:25<00:00,  9.77it/s, Epoch=1, LR=2.79e-6, Valid_Loss=0.735]


Validation Loss Improved (inf ---> 0.7349020663064266)
Model Saved



100%|██████████| 3353/3353 [15:27<00:00,  3.61it/s, Epoch=2, LR=2.79e-6, Train_Loss=0.631]
100%|██████████| 838/838 [01:28<00:00,  9.48it/s, Epoch=2, LR=2.79e-6, Valid_Loss=0.642]


Validation Loss Improved (0.7349020663064266 ---> 0.6415135208725694)
Model Saved



 97%|█████████▋| 3265/3353 [18:42<00:31,  2.77it/s, Epoch=3, LR=1e-6, Train_Loss=0.521]

In [None]:
! pip install text_unidecode

In [None]:
# NOTE(review): in this part of the notebook `swa_train` exists only as
# commented-out code, and that version takes a `fold` argument.  Unless the
# function is defined elsewhere, this call fails on a fresh kernel.
swa_train()

In [None]:
# best_epoch_loss = np.inf
# num_epochs_swa = 2
# history = defaultdict(list)
# for epoch in range(1, num_epochs_swa + 1):
#             #state = torch.load("Loss-Fold-1.bin")
#             #model.load_state_dict(state)
#             model = FeedBackModel('microsoft/deberta-v3-large').to(CFG.device)# init your model class, build the graph shape
#             state_dict = torch.load("Loss-Fold-1.bin")
#             model.load_state_dict(state_dict)
#             swa_model = AveragedModel(model).to(CFG.device)
#             #model = AveragedModel(model,device=device)
#             swa_scheduler = SWALR(optimizer, swa_lr=3e-6)
#             epoch_loss = train_one_epoch_swa(swa_model, optimizer, swa_scheduler,
#                                                dataloader=train_loader,
#                                                device=CFG.device, epoch=epoch)
#             history['Train Loss'].append(epoch_loss)
#             swa_model.update_parameters(swa_model)
#             swa_scheduler.step()
#             torch.optim.swa_utils.update_bn(train_loader, swa_model, device=CFG.device)
#             if epoch_loss <= best_epoch_loss:
#                 print(f"Validation Loss Improved ({best_epoch_loss} ---> {epoch_loss})")
#                 best_epoch_loss = epoch_loss
#             # run.summary["Best Loss"] = best_epoch_loss
#                 torch.save(swa_model.state_dict(), "last.pt")

In [None]:
import pandas as pd

# Re-export out.csv to the persistent data disk.  `to_csv` is called with its
# default settings, so the row index is written as an extra leading column.
a = pd.read_csv(filepath_or_buffer='out.csv')
a.to_csv(path_or_buf='/root/autodl-tmp/1.csv')