<a href="https://www.kaggle.com/code/jvthunder/pytorch-simple-baseline-bert-model?scriptVersionId=143061923" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
import torch

In [None]:
# Load the prompt and summary tables for the train and test splits, in that order.
(
    df_prompts_train,
    df_summaries_train,
    df_prompts_test,
    df_summaries_test,
) = map(
    pd.read_csv,
    (
        "/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv",
        "/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv",
        "/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv",
        "/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv",
    ),
)

In [None]:
from functools import reduce
# Attach every summary to its prompt by joining on prompt_id.
# The outer join keeps rows from either side that have no match.
df = df_prompts_train.merge(df_summaries_train, on=['prompt_id'], how='outer')
df_test = df_prompts_test.merge(df_summaries_test, on=['prompt_id'], how='outer')

In [None]:
# Preview the first rows of the merged training frame.
df.head()

In [None]:
# df["final_text"] = df["prompt_question"] + " [SEP] " + df["text"]
# df.head()["final_text"][0]

In [None]:
# Preview the merged training frame again (the cell above is commented out, so it is unchanged).
df.head()

In [None]:
# Preview the first rows of the merged test frame.
df_test.head()

In [None]:
# Shuffle all rows with a fixed seed, then hold out the last 10% for validation.
shuffled = df.sample(frac=1, random_state=42)
train_size = int(0.9 * len(shuffled))

# Both halves get a fresh 0..n-1 index; the shuffled original index is discarded.
df_train = shuffled.iloc[:train_size].reset_index(drop=True)
df_valid = shuffled.iloc[train_size:].reset_index(drop=True)

In [None]:
# Preview the first rows of the training split.
df_train.head()

In [None]:
# Preview the first rows of the validation split.
df_valid.head()

In [None]:
# Row counts of the training and validation splits, one per line.
print(len(df_train), len(df_valid), sep="\n")

In [None]:
# Preview the first rows of the merged test frame.
df_test.head()

In [None]:
import transformers
from transformers import BertModel, BertTokenizer
# Directory holding the bert-base-uncased files; the model class below loads from it too.
MODEL_DIR = '/kaggle/input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased'
# Tokenizer loaded from that same directory.
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)

In [None]:
# Create Dataset
from torch.utils.data import Dataset

class CommonLitDataset(Dataset):
    """Tokenised CommonLit summaries served as NumPy arrays.

    Every item pairs the token ids and attention mask of one row with either
    its regression targets (``mode="train"``) or its ``student_id``
    (``mode="test"``). All rows are tokenised once, at construction time.

    Args:
        df: Frame with ``prompt_title``, ``prompt_question`` and ``text``
            columns, plus ``content``/``wording`` (train mode) or
            ``student_id`` (test mode). A ``final_text`` column is added to
            this frame in place.
        tokenizer: Object exposing ``batch_encode_plus`` whose result can be
            indexed with ``"input_ids"`` and ``"attention_mask"``.
        mode: ``"train"`` or ``"test"``.
        max_length: Length every sequence is truncated/padded to.

    Raises:
        ValueError: If ``mode`` is not one of ``VALID_MODES``.
    """

    VALID_MODES = ("train", "test")

    def __init__(self, df, tokenizer, mode="train", max_length=512):
        # Fail at construction instead of silently yielding None per item.
        if mode not in self.VALID_MODES:
            raise ValueError(
                f"mode must be one of {self.VALID_MODES}, got {mode!r}"
            )
        self.mode = mode
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.preprocess()

    def get_tokens(self, texts):
        """Tokenise ``texts``, truncating and padding each to ``max_length``."""
        return self.tokenizer.batch_encode_plus(
            # A plain list keeps the tokenizer independent of the pandas index.
            list(texts),
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
        )

    def preprocess(self):
        """Build the lower-cased ``final_text`` column and tokenise it."""
        self.df["final_text"] = (
            self.df["prompt_title"].str.lower()
            + " [SEP] "
            + self.df["prompt_question"].str.lower()
            + " [SEP] "
            + self.df["text"].str.lower()
        )
        self.tokens = self.get_tokens(self.df["final_text"])

    def precompute(self, df):
        """Tokenise only the raw ``text`` column of ``df``."""
        return self.get_tokens(df["text"])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, target_or_student_id)``."""
        # Tokens are stored positionally, so the row lookup must be positional
        # too; label-based .loc breaks when the index is not 0..n-1.
        row = self.df.iloc[idx]
        input_token = np.array(self.tokens["input_ids"][idx])
        mask = np.array(self.tokens["attention_mask"][idx])
        if self.mode == "train":
            target = np.array([row["content"], row["wording"]], dtype=np.float32)
            return input_token, mask, target
        return input_token, mask, row["student_id"]

In [None]:
# Tokenise each split once up front. The validation split is built in
# "train" mode as well, so its items carry targets.
train_set, valid_set = (
    CommonLitDataset(frame, tokenizer, mode="train")
    for frame in (df_train, df_valid)
)
test_set = CommonLitDataset(df_test, tokenizer, mode="test")

In [None]:
# Number of items in each dataset, one line per split.
print(
    f"Size of Train set : {len(train_set)}",
    f"Size of Valid set : {len(valid_set)}",
    f"Size of Test set : {len(test_set)}",
    sep="\n",
)

In [None]:
from torch.utils.data import DataLoader
# Mini-batch loaders for the three splits.
BATCH_SIZE = 16
# Only the training split is shuffled (a fresh order every epoch).
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation loaders keep a fixed order: shuffling validation data changes
# which rows land in the partial last batch, which makes the
# mean-of-batch-means validation loss vary from run to run.
valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Number of mini-batches each loader yields per pass.
print(f"Total no. of batches in train_loader: {len(train_loader)}")
print(f"Total no. of batches in valid_loader: {len(valid_loader)}")
# This line previously reported the test loader's count under the "train_loader" label.
print(f"Total no. of batches in test_loader: {len(test_loader)}")

In [None]:
# Check dataset
# Pull one batch from the training loader and show what it contains.
input_token, mask, target = next(iter(train_loader))
print(
    f'train input_token: {input_token.shape}',
    f'train mask: {mask.shape}',
    f'train target: {target.shape}',
    sep="\n",
)

# Same check for the test loader, whose third element is the student ids.
input_token, mask, student_id = next(iter(test_loader))
print(
    f'test input_token: {input_token.shape}',
    f'test mask: {mask.shape}',
    f'test student_id: {student_id}',
    sep="\n",
)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

In [None]:
# Make the transformer model
import torch.nn as nn

class BERTModel(nn.Module):
    """BERT encoder followed by a two-layer head that emits two values per input.

    The encoder is loaded from ``MODEL_DIR``. Its token embeddings are
    mean-pooled over the non-padding positions only, then passed through
    dropout, a 128-unit hidden layer with ReLU, and a final 2-unit layer.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained(MODEL_DIR)
        self.dropout = nn.Dropout(0.1)
        # Take the width from the checkpoint's config (768 for bert-base) so
        # the head also fits other encoder sizes.
        self.linear1 = nn.Linear(self.bert.config.hidden_size, 128)
        self.linear2 = nn.Linear(128, 2)

    def forward(self, input_ids, attention_mask):
        """Return one 2-value prediction per input sequence.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Same layout; 1 marks a real token, 0 marks padding.

        Returns:
            Tensor with two columns, one row per input sequence.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embedding = outputs.last_hidden_state

        # Inputs are padded to a fixed length, so a plain mean over the
        # sequence axis would be dominated by padding positions. Weight each
        # token by its mask value and divide by the number of real tokens.
        weights = attention_mask.unsqueeze(-1).to(embedding.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1.0)  # guard an all-padding row
        x = (embedding * weights).sum(dim=1) / token_counts

        x = self.dropout(x)
        x = self.linear1(x)
        x = torch.relu(x)
        x = self.linear2(x)
        return x

# Build the model and move its parameters to the selected device.
model = BERTModel().to(device)

In [None]:
# Training
from tqdm import tqdm 

def train(model, optimizer, loss_fn, loader):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Each batch is unpacked as (token ids, attention mask, target) and moved to
    the module-level ``device`` before the forward pass.
    """
    model.train()
    running_loss = 0
    for batch in tqdm(loader):
        ids, attn, labels = (tensor.to(device) for tensor in batch)

        loss = loss_fn(model(ids, attn), labels)

        # Clear stale gradients, back-propagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.detach().item()

    return running_loss / len(loader)

In [None]:
# Settings for the fine-tuning run.
EPOCHS = 40
mse_loss = nn.MSELoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

# One full pass over the training loader per epoch, reporting its mean loss.
for i in range(EPOCHS):
    train_loss = train(
        model=model,
        optimizer=optimizer,
        loss_fn=mse_loss,
        loader=train_loader,
    )
    print(f"Epoch {i+1} MSE loss: {train_loss}")

In [None]:
# Validation
from tqdm import tqdm 

@torch.no_grad()
def validate(model, loss_fn, loader):
    """Return the mean batch loss of ``model`` over ``loader`` without training.

    The model is switched to eval mode and gradient tracking is disabled for
    the whole call. Batches are moved to the module-level ``device``.
    """
    model.eval()
    batch_losses = []
    for batch in tqdm(loader):
        ids, attn, labels = (tensor.to(device) for tensor in batch)
        loss = loss_fn(model(ids, attn), labels)
        batch_losses.append(loss.detach().item())

    return sum(batch_losses) / len(loader)

In [None]:
# Score the fine-tuned model on the held-out validation loader.
valid_loss = validate(model=model, loss_fn=mse_loss, loader=valid_loader)
print(f"Valid loss: {valid_loss}")

In [None]:
# Predict the two scores for every test row and collect them per student_id.
# Switch to eval mode explicitly so dropout is off even when this cell runs
# without the validation cell having been executed first.
model.eval()

student_preds = {}
with torch.no_grad():
    for input_token, mask, student_id in test_loader:
        input_token = input_token.to(device)
        mask = mask.to(device)

        preds = model(input_token, mask).tolist()
        # Append to a per-student list; concatenating arrays row by row
        # would re-copy the accumulated predictions on every step.
        for sid, pred in zip(student_id, preds):
            student_preds.setdefault(sid, []).append(pred)

# Rows sharing a student_id are summed column-wise, leaving one
# two-value array per student.
student_scores = {
    sid: np.array(rows).sum(axis=0) for sid, rows in student_preds.items()
}
print(student_scores)

In [None]:
# One row per student: the dict keys become a column, followed by the two scores.
df_submission = (
    pd.DataFrame
    .from_dict(student_scores, orient='index')
    .reset_index()
)
df_submission.columns = ['student_id', 'content', 'wording']

# Write the file without the positional index, then show the first rows.
df_submission.to_csv(path_or_buf='submission.csv', index=False)
df_submission.head()