In [1]:
!pip install transformers
!pip install torch


[0m

In [2]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler




In [3]:
data = pd.read_csv('/kaggle/input/cleaneddataset/cleaned_data_v2.csv')
data = data[:1000]


In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class EssayDataset(Dataset):
    def __init__(self, essays, scores, tokenizer, max_length):
        self.essays = essays
        self.scores = scores
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.essays)

    def __getitem__(self, idx):
        essay = self.essays[idx]
        score = self.scores[idx]
        encoding = self.tokenizer.encode_plus(
            essay,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'score': torch.tensor(score, dtype=torch.long),
               'score': torch.tensor(score - 1, dtype=torch.long),
        }


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
!pip install tqdm


[0m

In [6]:
import time
from tqdm import tqdm

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)
train_dataset = EssayDataset(data['essay'], data['scores'], tokenizer, max_length=512)
train_loader = DataLoader(train_dataset, batch_size=8, sampler=RandomSampler(train_dataset))
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 3)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

for epoch in range(20):
    print(f"Epoch {epoch + 1}/{20}")
    start_time = time.time()
    model.train()
    total_loss = 0

    progress_bar = tqdm(train_loader)
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        scores = batch['score'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=scores)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        progress_bar.set_description(f"Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(train_loader)
    end_time = time.time()
    epoch_time = end_time - start_time
    print(f"Average Loss: {avg_loss:.4f} | Time: {epoch_time:.2f} seconds")


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/20


Loss: 0.6468: 100%|██████████| 125/125 [01:44<00:00,  1.19it/s]


Average Loss: 0.9581 | Time: 104.70 seconds
Epoch 2/20


Loss: 0.4653: 100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


Average Loss: 0.6762 | Time: 105.86 seconds
Epoch 3/20


Loss: 0.5161: 100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


Average Loss: 0.5716 | Time: 105.65 seconds
Epoch 4/20


Loss: 0.4232: 100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


Average Loss: 0.5257 | Time: 105.63 seconds
Epoch 5/20


Loss: 0.2914: 100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


Average Loss: 0.5277 | Time: 105.71 seconds
Epoch 6/20


Loss: 0.3739: 100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


Average Loss: 0.5262 | Time: 105.80 seconds
Epoch 7/20


Loss: 0.5887: 100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


Average Loss: 0.5257 | Time: 105.75 seconds
Epoch 8/20


Loss: 0.2467: 100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


Average Loss: 0.5324 | Time: 105.88 seconds
Epoch 9/20


Loss: 0.5870: 100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


Average Loss: 0.5268 | Time: 105.73 seconds
Epoch 10/20


Loss: 0.4322: 100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


Average Loss: 0.5242 | Time: 105.78 seconds
Epoch 11/20


Loss: 0.3921: 100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


Average Loss: 0.5250 | Time: 105.82 seconds
Epoch 12/20


Loss: 0.2818: 100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


Average Loss: 0.5339 | Time: 105.80 seconds
Epoch 13/20


Loss: 0.6199: 100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


Average Loss: 0.5326 | Time: 105.67 seconds
Epoch 14/20


Loss: 0.4513: 100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


Average Loss: 0.5266 | Time: 105.91 seconds
Epoch 15/20


Loss: 0.5814: 100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


Average Loss: 0.5283 | Time: 105.97 seconds
Epoch 16/20


Loss: 0.3496: 100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


Average Loss: 0.5253 | Time: 105.81 seconds
Epoch 17/20


Loss: 0.3979: 100%|██████████| 125/125 [01:46<00:00,  1.18it/s]


Average Loss: 0.5378 | Time: 106.19 seconds
Epoch 18/20


Loss: 0.5866: 100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


Average Loss: 0.5291 | Time: 105.96 seconds
Epoch 19/20


Loss: 0.8008: 100%|██████████| 125/125 [01:46<00:00,  1.18it/s]


Average Loss: 0.5245 | Time: 106.32 seconds
Epoch 20/20


Loss: 0.7709: 100%|██████████| 125/125 [01:46<00:00,  1.17it/s]

Average Loss: 0.5282 | Time: 106.47 seconds





In [7]:
def predict_score(model, essay, tokenizer):
    model.eval()
    encoding = tokenizer.encode_plus(
        essay,
        add_special_tokens=True,
        max_length=512,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs[0]
    score = torch.argmax(logits, dim=1).item() + 1
    return score

new_essay = "Dear @LOCATION1, I think that computers have a negative affect on us! How many people have acess to a camputer daily in america.. @NUM1 and how many people go on at least an hour a @NUM2. That means that @NUM3 people cant exercise are wasting many are have the posibility of physical @CAPS1 that sound good to you? Think of everything you done when you write a letter. I got up and got all the materials and sit back down. After In done writing have to put all the materials away and then put the letter in the mailbox and walk all the way back. Well this is what I had to do. Now think how you write an email sitdown and move your fingers. Do you see the difference? @CAPS2 instead of getting a good walk to your friends house to talk to you just in him/er. Did you know that you can literally but from on your computer. Instead of around with this that can add anything you your computer gives away information they information lead to that and then everthing you have wouldbe gone. All this so you can go online. Believe it or not, you can phisically get hurt for being on the computer, to long. @NUM4 thing is that when you at a computer @CAPS2 what happened to my grandpa is that when you are in a wood chair all day you can get ardthritis in your muscle @CAPS3 thing th."
predicted_score = predict_score(model, new_essay, tokenizer)
print("Predicted Score:", predicted_score)

Predicted Score: 4
