In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
import nltk
import transformers
from datasets import Dataset, DatasetDict
from sklearn.metrics import cohen_kappa_score
import tqdm
from torch.utils.data import DataLoader
# Pick the compute device once for the whole notebook: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
# Load the training essays and attach a 6-slot one-hot `label` column derived
# from the 1-based `score` (score k sets slot k - 1).
df = pd.read_csv('data/train.csv')
df['label'] = pd.Series(
    [[int(slot == score - 1) for slot in range(6)] for score in df['score']],
    index=df.index,
)
df.head()


Unnamed: 0,essay_id,full_text,score,label
0,000d118,Many people have car where they live. The thin...,3,"[0, 0, 1, 0, 0, 0]"
1,000fe60,I am a scientist at NASA that is discussing th...,3,"[0, 0, 1, 0, 0, 0]"
2,001ab80,People always wish they had the same technolog...,4,"[0, 0, 0, 1, 0, 0]"
3,001bdc0,"We all heard about Venus, the planet without a...",4,"[0, 0, 0, 1, 0, 0]"
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,"[0, 0, 1, 0, 0, 0]"


In [10]:
# Hold out 20% of the rows (fixed seed) and wrap both pandas splits as
# `datasets.Dataset` objects in one pass.
train, test = (
    Dataset.from_pandas(part)
    for part in train_test_split(df, test_size=0.2, random_state=42)
)

In [11]:
# Display the held-out split (its feature names and row count).
test

Dataset({
    features: ['essay_id', 'full_text', 'score', 'label', '__index_level_0__'],
    num_rows: 3462
})

In [12]:
import torch
from transformers import AutoTokenizer, AutoModel

# Checkpoint name shared by the tokenizer here and by any code that loads the encoder.
PRE_TRAINED_MODEL_NAME = 'xlm-roberta-base'

# Tokenizer matching the checkpoint above; `tokenize` reads it as a module-level global.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

def tokenize(batch):
    """Tokenize the ``full_text`` entries of ``batch`` with the module-level tokenizer.

    Padding and truncation are both enabled; whatever the tokenizer returns
    is handed back unchanged.
    """
    essays = batch['full_text']
    encoded = tokenizer(essays, truncation=True, padding=True)
    return encoded
# Tokenize each split as one single batch (batch_size == split length), so the
# padding requested inside `tokenize` is applied across the whole split at once.
tokenized_train, tokenized_test = (
    split.map(tokenize, batched=True, batch_size=len(split))
    for split in (train, test)
)

# Bundle both splits and expose only the model inputs and targets as torch tensors.
tokenized_dict = DatasetDict({'train': tokenized_train, 'test': tokenized_test})
tokenized_dict.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'score', 'label'],
)

Map: 100%|██████████| 13845/13845 [00:08<00:00, 1680.19 examples/s]
Map: 100%|██████████| 3462/3462 [00:01<00:00, 1978.83 examples/s]


In [13]:
class model_v0(nn.Module):
    """Pretrained encoder with a linear 6-way classification head.

    The head reads the hidden state of the first token of every sequence.
    ``forward`` returns raw logits: ``torch.nn.CrossEntropyLoss`` applies
    log-softmax itself, so feeding it already-normalised probabilities
    (as the previous version did) squashes the loss and its gradients.
    Use ``self.softmax(logits)`` when probabilities are needed; ``argmax``
    over logits and over probabilities gives the same class.
    """

    def __init__(self):
        super(model_v0, self).__init__()
        self.model = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME, torch_dtype=torch.float32)
        # Size the head from the encoder's config instead of hard-coding 768,
        # so switching PRE_TRAINED_MODEL_NAME to another width keeps working.
        self.linear = torch.nn.Linear(self.model.config.hidden_size, 6)
        # Not used by forward(); kept for converting logits to probabilities
        # and for code that accesses the attribute.
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input):
        """Return class logits of shape ``(batch, 6)``.

        Args:
            input: mapping with ``'input_ids'`` and ``'attention_mask'``
                tensors; other keys are ignored.
        """
        # Follow the module's own placement rather than a notebook global, so
        # the model keeps working after being moved with .to(...).
        target_device = self.linear.weight.device
        encoded = self.model(
            input_ids=input['input_ids'].to(target_device),
            attention_mask=input['attention_mask'].to(target_device),
        )
        first_token_state = encoded.last_hidden_state[:, 0, :].to(torch.float32)
        return self.linear(first_token_state)

In [14]:
# Shuffle only the training batches. The evaluation loader keeps a fixed order
# so validation runs see the same batches every time and are reproducible.
train_loader = DataLoader(tokenized_dict['train'], batch_size=8, shuffle=True)
test_loader = DataLoader(tokenized_dict['test'], batch_size=8, shuffle=False)

In [15]:
def train(model, loss_fn, optimizer, train_loader, val_loader, epochs=3,testing=False):
    """Run the training loop and, optionally, evaluate after every epoch.

    Args:
        model: module called as ``model(batch)``; must return a
            ``(batch, num_classes)`` tensor.
        loss_fn: callable ``loss_fn(output, target)`` returning a scalar tensor.
        optimizer: optimizer over ``model``'s parameters.
        train_loader: sized iterable of batches; each batch has a one-hot
            ``'label'`` entry.
        val_loader: sized iterable of batches, only read when ``testing`` is true.
        epochs: number of passes over ``train_loader``.
        testing: when true, also report validation loss and quadratic
            weighted Cohen's kappa after each epoch.

    Prints one summary line per epoch and returns ``None``.
    """
    for epoch in range(1, epochs + 1):
        model.train()
        loss_train = 0.0
        for batch in tqdm.tqdm(train_loader, desc=f'Epoch {epoch} of {epochs}', total=len(train_loader)):
            optimizer.zero_grad()
            output = model(batch)
            # Place the target next to the model output instead of relying on
            # a notebook-level device variable.
            target = batch['label'].to(output.device, dtype=torch.float32)
            loss = loss_fn(output, target)
            loss.backward()
            optimizer.step()
            loss_train += loss.item()
        # An empty loader yields NaN rather than a ZeroDivisionError.
        loss_train = loss_train / len(train_loader) if len(train_loader) else float('nan')
        #evaluation
        if testing:
            model.eval()
            loss_val = 0.0
            all_preds = []
            all_targets = []
            num_classes = None
            with torch.no_grad():
                for batch in tqdm.tqdm(val_loader, desc=f'Testing: epoch {epoch} of {epochs}', total=len(val_loader)):
                    output = model(batch)
                    target = batch['label'].to(output.device, dtype=torch.float32)
                    loss_val += loss_fn(output, target).item()
                    num_classes = output.shape[1]
                    all_preds.append(torch.argmax(output, dim=1).cpu())
                    all_targets.append(torch.argmax(target, dim=1).cpu())
            loss_val = loss_val / len(val_loader) if len(val_loader) else float('nan')
            if all_preds:
                # Kappa is not additive over batches: averaging per-batch values
                # is not the dataset-level kappa and is NaN whenever a batch holds
                # a single class. Score all predictions at once, and pass the
                # full label range so the quadratic weights use true class
                # distances even if some class never appears.
                score = cohen_kappa_score(
                    torch.cat(all_preds).numpy(),
                    torch.cat(all_targets).numpy(),
                    labels=list(range(num_classes)),
                    weights='quadratic',
                )
            else:
                score = float('nan')
            print(f'Epoch: {epoch}, Training Loss: {loss_train}, Validation Loss: {loss_val} | Cohen Kappa Score: {score}')
        else:
            print(f'Epoch: {epoch}, Training Loss: {loss_train}')

In [18]:
# Ten-example slice of the training split for quick end-to-end smoke tests.
subset = tokenized_dict['train'].select(indices=range(10))
subset_loader = DataLoader(
    dataset=subset,
    batch_size=8,
    shuffle=True,
)

In [19]:
# Smoke test: one epoch over the ten-example subset. `testing` keeps its
# default (False), so `test_loader` is passed but not iterated here.
model = model_v0().to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-6)
loss = torch.nn.CrossEntropyLoss()
train(
    model=model,
    loss_fn=loss,
    optimizer=optimizer,
    train_loader=subset_loader,
    val_loader=test_loader,
    epochs=1,
)

Epoch 1 of 1: 100%|██████████| 2/2 [04:00<00:00, 120.50s/it]

Epoch: 1, Training Loss: 1.7902097702026367





In [17]:
# Debug cell: run exactly one optimisation step on the first training batch.
# Requires `model` and `optimizer` from the cell that builds them; run that first.
loss_fn = torch.nn.CrossEntropyLoss()
model.train()
loss_train = 0
# Take a single batch explicitly instead of looping and breaking immediately.
batch = next(iter(train_loader))
# float32 matches the target dtype used inside train(); float16 was inconsistent.
target = batch['label'].to(device).to(torch.float32)
optimizer.zero_grad()
output = model(batch)
# NOTE: this rebinds the global `loss` (the criterion object created alongside
# the model) to a tensor; re-run that cell before calling train(...) again.
loss = loss_fn(output, target)
loss.backward()
optimizer.step()
loss_train += loss.item()

NameError: name 'model' is not defined

In [31]:
# Quadratic weighted kappa for the single batch produced by the debug cell above.
pred = torch.argmax(output, dim=1).cpu().numpy()
# Derive the class indices from the batch's one-hot labels rather than from
# `target` itself, so re-running this cell does not argmax an already-reduced array.
target = torch.argmax(batch['label'], dim=1).cpu().numpy()
# Pass the full label range: otherwise the weights are indexed by the classes
# that happen to appear in this batch, which distorts the quadratic distances.
score = cohen_kappa_score(pred, target, labels=list(range(output.shape[1])), weights='quadratic')