In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import random
import torch.nn as nn
import os
from torch.utils.data import Dataset,DataLoader
from transformers import AutoTokenizer,AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator plus all CUDA devices), exports ``PYTHONHASHSEED`` and
    configures cuDNN for deterministic execution.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # Seed the torch CPU generator.
    torch.manual_seed(seed)
    # Seed the CUDA generators on every device; this call is a silent no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different kernels from run
    # to run, which undermines the deterministic setting above, so it is
    # switched off.
    torch.backends.cudnn.benchmark = False

# Fix the global seed once, before any data splitting or model construction.
SEED = 42
seed_everything(seed=SEED)

In [None]:
# Root folder of the competition files (trailing slash is relied on below).
data_path = "/kaggle/input/commonlit-evaluate-student-summaries/"

# Read the raw competition tables.
prompts_train = pd.read_csv(f"{data_path}prompts_train.csv")
prompts_test = pd.read_csv(f"{data_path}prompts_test.csv")
summaries_train = pd.read_csv(f"{data_path}summaries_train.csv")
summaries_test = pd.read_csv(f"{data_path}summaries_test.csv")
sample_submission = pd.read_csv(f"{data_path}sample_submission.csv")

# Join each summary row with its prompt row through the shared prompt_id key.
train = pd.merge(summaries_train, prompts_train, on="prompt_id")
test = pd.merge(summaries_test, prompts_test, on="prompt_id")

train

In [None]:
# The tokenizer and the pretrained network are both read from the same local folder.
pretrained_dir = "/kaggle/input/debertav3base"

tokenizer = AutoTokenizer.from_pretrained(pretrained_dir)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_dir)

In [None]:
class CommonLitDataset(Dataset):
    """Map-style dataset serving pre-tokenized texts and optional targets.

    Every entry of ``data["text"]`` is tokenized once at construction time.
    When ``data`` has a ``content`` column, items also carry the ``content``
    and ``wording`` values of the row; otherwise only the token tensors are
    returned (inference mode).

    Args:
        data: Frame with a ``text`` column and, optionally, ``content`` and
            ``wording`` columns.
        tokenizer: Object exposing ``batch_encode_plus``; its result must be
            indexable by ``'input_ids'`` and ``'attention_mask'``, each of
            which is indexable by row position.
    """

    def __init__(self,data:pd.DataFrame,tokenizer):
        super(CommonLitDataset,self).__init__()
        self.data = data
        self.tokenizer = tokenizer

        self.text = self.get_token(self.data["text"].tolist())

        # Materialise the target columns once. Converting a whole column to a
        # list inside __getitem__ would repeat that O(n) work for every
        # sample that is fetched.
        self.has_targets = 'content' in self.data.columns
        if self.has_targets:
            self.content = self.data["content"].tolist()
            self.wording = self.data["wording"].tolist()

    def __getitem__(self,index):
        """Return the item at positional ``index`` as a dict.

        Always contains ``input_ids`` and ``attention_mask``; contains
        ``content`` and ``wording`` as well when targets are available.
        """
        item = {'input_ids':self.text['input_ids'][index],
                'attention_mask':self.text['attention_mask'][index]}

        if self.has_targets:
            item['content'] = self.content[index]
            item['wording'] = self.wording[index]

        return item

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)

    def get_token(self,text):
        """Tokenize a list of strings in a single batch.

        Sequences are padded to the longest one in the batch and truncated
        to at most 512 tokens; the result is requested as PyTorch tensors.
        """
        return self.tokenizer.batch_encode_plus(text,
                                         padding=True,
                                         truncation=True,
                                         max_length=512,
                                         return_tensors="pt")

In [None]:
batch_size = 12

# The two regression targets predicted jointly.
target = ['content','wording']

data = train.loc[:,'text']
label = train.loc[:,target]

# 80/20 hold-out split; the fixed random_state keeps the validation fold stable across runs.
train_data,val_data,train_label,val_label = train_test_split(data,label,test_size=0.2,random_state=42)

# Re-attach the targets to their texts (concat aligns on the shared row index).
train_data = pd.concat([train_data,train_label],axis=1)
val_data = pd.concat([val_data,val_label],axis=1)

train_dataset = CommonLitDataset(train_data,tokenizer)
# Reshuffle the training samples every epoch so batches are not presented in
# the same fixed order for all epochs.
train_loader = DataLoader(train_dataset,shuffle=True,batch_size=batch_size)

val_dataset = CommonLitDataset(val_data,tokenizer)
# Validation order does not affect the averaged loss, so it stays fixed.
val_loader = DataLoader(val_dataset,shuffle=False,batch_size=batch_size)

In [None]:
class Deberta(nn.Module):
    """Backbone wrapper that feeds the backbone's first output through an MLP.

    The head's first linear layer takes 2 input features, so the first
    element of the backbone output must have 2 features per sample. The head
    returns 2 values per sample.

    Args:
        deberta: Callable module accepting ``input_ids`` and
            ``attention_mask`` keyword arguments and returning an indexable
            output whose element 0 is a tensor.
    """

    def __init__(self,deberta):
        super().__init__()
        self.deberta = deberta
        # Layers are created in this exact order so the Sequential indices
        # (and therefore the state_dict keys) are 1, 3 and 5 for the linears.
        head_layers = [
            nn.Dropout(p=0.1),
            nn.Linear(in_features=2, out_features=768),
            nn.ReLU(),
            nn.Linear(in_features=768, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=2),
        ]
        self.model = nn.Sequential(*head_layers)

    def forward(self,input_ids,attention_mask):
        """Run the backbone, cast its first output to float32, apply the head."""
        backbone_out = self.deberta(input_ids=input_ids,attention_mask=attention_mask)
        first_output = backbone_out[0].to(torch.float32)
        return self.model(first_output)

In [None]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# `model` is rebound from the raw pretrained network to the wrapper. The guard
# keeps this cell idempotent: re-running it no longer nests a Deberta inside
# another Deberta.
if not isinstance(model, Deberta):
    model = Deberta(model)
model = model.to(device)
optim = torch.optim.Adam(model.parameters(),lr=1.5e-5)
criterion = nn.MSELoss()

In [None]:
epochs = 30

model.train()

# Each epoch: one optimisation pass over train_loader, then a gradient-free
# pass over val_loader. The reported loss is MSE(content) + MSE(wording).
for epoch in range(epochs):
    running_loss = 0.0
    for step, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        content = batch['content'].type(torch.float32).to(device)
        wording = batch['wording'].type(torch.float32).to(device)

        optim.zero_grad()
        outputs = model(input_ids,attention_mask)
        # Column 0 is trained against `content`, column 1 against `wording`.
        loss = criterion(outputs[:,0],content) + criterion(outputs[:,1],wording)
        loss.backward()
        optim.step()

        batch_loss = loss.item()
        if step % 500 == 0:
            print("Epoch {}, Step {}, Loss: {}".format(epoch+1, step, batch_loss))

        running_loss += batch_loss

    print(f"Epoch {epoch+1} Loss: {running_loss / len(train_loader)}")

    # Evaluation mode disables dropout for the validation pass.
    model.eval()
    with torch.no_grad():
        val_loss = 0.0
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            content = batch['content'].type(torch.float32).to(device)
            wording = batch['wording'].type(torch.float32).to(device)

            outputs = model(input_ids,attention_mask)
            batch_val_loss = criterion(outputs[:,0],content)+criterion(outputs[:,1],wording)
            # Accumulate a Python float, not a tensor, so the running sum
            # stays off the device and prints as a plain number.
            val_loss += batch_val_loss.item()

        print(f"Validation Loss: {val_loss / len(val_loader)}")
    # Back to training mode for the next epoch.
    model.train()

In [None]:
# Inference over the test frame. The loader is not shuffled, so the collected
# rows stay in the same order as the rows of `test`.
test_dataset = CommonLitDataset(test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

predict = []
model.eval()

with torch.no_grad():
    for data in test_loader:
        attention_mask = data['attention_mask'].to(device)
        input_ids = data['input_ids'].to(device)

        outputs = model(input_ids, attention_mask)
        # One NumPy row per sample is appended to the running list.
        predict += list(outputs.cpu().numpy())

In [None]:
# Start from the ids, then attach the two predicted columns in submission order.
submission = pd.DataFrame({'student_id': test['student_id']})
submission['content'] = [scores[0] for scores in predict]
submission['wording'] = [scores[1] for scores in predict]

# Written without the index column.
submission.to_csv('submission.csv', index=False)

In [None]:
# Display the assembled submission frame as a visual sanity check.
submission

In [None]:
# New summary to score.
nuevo_resumen = "Este es el nuevo resumen que deseas predecir."

# Tokenize and encode the new summary.
nuevo_resumen_encoded = tokenizer.encode_plus(
    nuevo_resumen,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

# The training cell leaves the model in train mode; switch to eval mode here so
# dropout is disabled no matter which cell ran last.
model.eval()

# Run the prediction without tracking gradients.
with torch.no_grad():
    input_ids = nuevo_resumen_encoded['input_ids'].to(device)
    attention_mask = nuevo_resumen_encoded['attention_mask'].to(device)

    outputs = model(input_ids, attention_mask)
    content_prediction = outputs[0][0].item()  # Prediction for "content"
    wording_prediction = outputs[0][1].item()  # Prediction for "wording"

# Print the predictions.
print(f'Predicción para "content": {content_prediction}')
print(f'Predicción para "wording": {wording_prediction}')

In [None]:
# `model` is assumed to be the trained PyTorch model at this point.
# Only its state_dict (parameters and buffers) is written, not the Python
# class, so the same architecture must be rebuilt before loading this file.
torch.save(model.state_dict(), '/kaggle/working/deberta_nlp.pth')

In [None]:
# Sample summary to score. The inner quotation marks are escaped: writing them
# as doubled quotes ("") inside a double-quoted literal is implicit string
# concatenation in Python, which silently drops them from the text.
nuevo_resumen = (
    "During the 1967 Third Wave experiment at Cubberley High School, students experienced a rapid transformation in their behavior and beliefs. "
    "The experiment aimed to demonstrate how easily people can be swayed to support authoritarian ideologies. "
    "Under the leadership of their teacher, Ron Jones, students adopted a strict code of conduct and authoritarian structure in a matter of days. "
    "They were highly engaged in this \"movement,\" even to the point of shutting out non-members and eagerly awaiting a televised announcement by a supposed presidential candidate from the movement. "
    "The experiment ended when Ron Jones felt it was spiraling out of control, illustrating the potential danger of such movements."
)
# Tokenize and encode the summary.
nuevo_resumen_encoded = tokenizer.encode_plus(
    nuevo_resumen,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

# The training cell leaves the model in train mode; switch to eval mode here so
# dropout is disabled no matter which cell ran last.
model.eval()

# Run the prediction without tracking gradients.
with torch.no_grad():
    input_ids = nuevo_resumen_encoded['input_ids'].to(device)
    attention_mask = nuevo_resumen_encoded['attention_mask'].to(device)

    outputs = model(input_ids, attention_mask)
    content_prediction = outputs[0][0].item()  # Prediction for "content"
    wording_prediction = outputs[0][1].item()  # Prediction for "wording"

# Print the predictions.
print(f'Predicción para "content": {content_prediction}')
print(f'Predicción para "wording": {wording_prediction}')

In [None]:
# Preview only the first few prediction rows; printing the whole list floods
# the cell output once the test set is large.
print(predict[:5])

In [62]:
import os
import torch

# Directory that will hold the custom model artefacts; created if missing.
output_model_dir = "/kaggle/working/deberta_final"
os.makedirs(output_model_dir, exist_ok=True)

# Write the model weights into that directory.
weights_path = os.path.join(output_model_dir, "model_state_dict.pth")
torch.save(model.state_dict(), weights_path)

# Any other model-related files (for example custom configuration) would be
# saved here as well.

# Optional: store the tokenizer next to the weights.
tokenizer.save_pretrained(output_model_dir)


('/kaggle/working/deberta_final/tokenizer_config.json',
 '/kaggle/working/deberta_final/special_tokens_map.json',
 '/kaggle/working/deberta_final/spm.model',
 '/kaggle/working/deberta_final/added_tokens.json',
 '/kaggle/working/deberta_final/tokenizer.json')