In [2]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_csv("train.csv")

In [4]:
df.shape

(17307, 3)

In [5]:
np.unique(df['score'])

array([1, 2, 3, 4, 5, 6])

In [6]:
df.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [7]:
import re
def clean_essay(text: str) -> str:
    # Convert to lowercase
    text = text.lower()
    
    # Replace inverted/fancy quotes with normal or remove
    text = text.replace("“", "").replace("”", "")
    text = text.replace('"', "").replace("'", "")
    
    # Remove multiple spaces, tabs, and newlines
    text = re.sub(r"\s+", " ", text)  
    
    # Trim leading/trailing spaces
    text = text.strip()
    
    return text

# Apply cleaning
df["full_text"] = df["full_text"].astype(str).apply(clean_essay)

print(df[["full_text"]].head())

                                           full_text
0  many people have car where they live. the thin...
1  i am a scientist at nasa that is discussing th...
2  people always wish they had the same technolog...
3  we all heard about venus, the planet without a...
4  dear, state senator this is a letter to argue ...


In [8]:
np.unique(df['full_text'])

array(['(bennging)luke was brave and went out to sea.he knew he might not come back but he still did it.he wanted to be a seagoing cowboy then he had to out into the sea. his friend helped him out alot so now he is a brave soilder like his friend. (middle)luke didnt want to go out at sea at first but now he does because he is not scared because he in brave and did his job.he got to where he was going the day after the world war ll.luke was lucky that he made it alive because the boomed cloesed to where he was going to. (middle) luke was also brave because he wanted to go out at sea even if his friends and family didnt like it.he was very brave and now he is in the army protecting the world from danger and harm. his friend made a good choice and i would the samething if i was lukes friend. (middle) my middle is that you can always have a friend but is that friend your true friend.if it is then you will know and they will be there to help you anything that you need help with.just like lu

In [9]:
df["word_count"] = df['full_text'].astype(str).apply(lambda x: len(x.split()))
total_words = df["word_count"].sum()
print(total_words)

6370476


In [10]:
print(df[["word_count"]].describe())

        word_count
count  17307.00000
mean     368.08667
std      150.31376
min      150.00000
25%      253.00000
50%      344.00000
75%      452.00000
max     1656.00000


In [11]:
df.head()

Unnamed: 0,essay_id,full_text,score,word_count
0,000d118,many people have car where they live. the thin...,3,498
1,000fe60,i am a scientist at nasa that is discussing th...,3,332
2,001ab80,people always wish they had the same technolog...,4,550
3,001bdc0,"we all heard about venus, the planet without a...",4,451
4,002ba53,"dear, state senator this is a letter to argue ...",3,373


In [12]:
import nltk
from nltk.corpus import stopwords

# Download stopwords (only once)
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Regex patterns
URL_RE = re.compile(r"(https?://\S+|www\.\S+)")
HTML_RE = re.compile(r"<.*?>")  # keep only words and spaces

def clean_essay(text: str) -> str:
    text = text.lower()  # lowercase
    
    # Remove URLs and HTML
    text = URL_RE.sub(" ", text)
    text = HTML_RE.sub(" ", text)
    
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    
    # Remove stopwords
    words = [w for w in text.split() if w not in stop_words]
    return " ".join(words)

# Apply cleaning
df["full_text"] = df["full_text"].astype(str).apply(clean_essay)


print(df[["full_text"]].head())

[nltk_data] Downloading package stopwords to C:\Users\Jinal
[nltk_data]     Vasita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                           full_text
0  many people car live. thing dont know use car ...
1  scientist nasa discussing face mars. explainin...
2  people always wish technology seen movies, bes...
3  heard venus, planet without almost oxygen eart...
4  dear, state senator letter argue favor keeping...


In [13]:
df["word_count"] = df['full_text'].astype(str).apply(lambda x: len(x.split()))
total_words = df["word_count"].sum()
print(total_words)

3323346


In [14]:
df.head()

Unnamed: 0,essay_id,full_text,score,word_count
0,000d118,many people car live. thing dont know use car ...,3,253
1,000fe60,scientist nasa discussing face mars. explainin...,3,144
2,001ab80,"people always wish technology seen movies, bes...",4,278
3,001bdc0,"heard venus, planet without almost oxygen eart...",4,257
4,002ba53,"dear, state senator letter argue favor keeping...",3,193


In [15]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
df.drop('essay_id',axis=1)

Unnamed: 0,full_text,score,word_count
0,many people car live. thing dont know use car ...,3,253
1,scientist nasa discussing face mars. explainin...,3,144
2,"people always wish technology seen movies, bes...",4,278
3,"heard venus, planet without almost oxygen eart...",4,257
4,"dear, state senator letter argue favor keeping...",3,193
...,...,...,...
17302,story challenge exploing venus informative pie...,2,77
17303,technology changed lot ways live today. nowada...,4,313
17304,dont like sitting around day great opportunity...,2,96
17305,"challenge exporing venus, author suggests stud...",1,133


In [17]:
df.drop('word_count',axis=1)

Unnamed: 0,essay_id,full_text,score
0,000d118,many people car live. thing dont know use car ...,3
1,000fe60,scientist nasa discussing face mars. explainin...,3
2,001ab80,"people always wish technology seen movies, bes...",4
3,001bdc0,"heard venus, planet without almost oxygen eart...",4
4,002ba53,"dear, state senator letter argue favor keeping...",3
...,...,...,...
17302,ffd378d,story challenge exploing venus informative pie...,2
17303,ffddf1f,technology changed lot ways live today. nowada...,4
17304,fff016d,dont like sitting around day great opportunity...,2
17305,fffb49b,"challenge exporing venus, author suggests stud...",1


In [18]:
text_col = 'full_text'
score_col = 'score'

In [19]:
min_score = df[score_col].min()
max_score = df[score_col].max()
df['normalized_score'] = (df[score_col] - min_score) / (max_score - min_score)

In [None]:
token=""
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',token=token)

In [21]:
def tokenize_data(texts, max_len=256):
    """
    Tokenizes a list of texts and returns tensors for the model.
    """
    input_ids, attention_masks = [], []
    for text in texts:
        # Encode the text using the tokenizer
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
            max_length=max_len,           # Pad/truncate to this length
            padding='max_length',         # Pad to max_length
            truncation=True,              # Truncate to max_length
            return_attention_mask=True,   # Return attention mask
            return_tensors='pt'           # Return PyTorch tensors
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

In [22]:
# Tokenize the essays
input_ids, attention_masks = tokenize_data(df[text_col].tolist())
labels = torch.tensor(df['normalized_score'].values, dtype=torch.float32)

# Create a TensorDataset from the tokenized data and labels
dataset = TensorDataset(input_ids, attention_masks, labels)

In [23]:
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")

Training set size: 13845
Validation set size: 3462


In [24]:
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [26]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=1,  # The number of output labels
    token=token,
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [27]:
import torch.optim as optim
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
loss_fn = torch.nn.MSELoss()

In [28]:
epochs = 5
for epoch in range(epochs):
    print(f"\n======== Epoch {epoch + 1} / {epochs} ========")
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients
        model.zero_grad()

        # Forward pass
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # Calculate loss and perform backpropagation
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()

        # Update model parameters
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"  Average training loss: {avg_train_loss:.4f}")


  Average training loss: 0.0231

  Average training loss: 0.0169

  Average training loss: 0.0138

  Average training loss: 0.0105

  Average training loss: 0.0073


In [29]:
    print("\nRunning Validation...")
    model.eval()
    val_preds, val_labels = [], []

    for batch in val_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Move predictions and labels to CPU to handle with NumPy
        logits = outputs.logits.squeeze(1).cpu().numpy()
        labels = b_labels.cpu().numpy()

        val_preds.extend(logits)
        val_labels.extend(labels)

    # CONVERT LISTS TO NUMPY ARRAYS BEFORE CALCULATION
    val_preds = np.array(val_preds)
    val_labels = np.array(val_labels)

    # Denormalize the scores for meaningful evaluation
    denormalized_preds = val_preds * (max_score - min_score) + min_score
    denormalized_labels = val_labels * (max_score - min_score) + min_score

    # Calculate evaluation metrics
    mae = mean_absolute_error(denormalized_labels, denormalized_preds)
    r2 = r2_score(denormalized_labels, denormalized_preds)

    print(f"  Validation MAE: {mae:.2f}")
    print(f"  Validation R-squared: {r2:.2f}")


Running Validation...
  Validation MAE: 0.50
  Validation R-squared: 0.61


In [30]:
torch.save(model.state_dict(), "bert_essay_scoring_1.pt")
print("Model saved successfully!")

Model saved successfully!


In [2]:
def predict_score(text, model, tokenizer, device, min_score, max_score, max_len=256):
    """
    Predicts the score of a single essay.
    """
    # Put the model in evaluation mode
    model.eval()

    # Tokenize the input text
    encoded_text = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Move tensors to the correct device
    input_ids = encoded_text['input_ids'].to(device)
    attention_mask = encoded_text['attention_mask'].to(device)

    # Make the prediction
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits.squeeze().cpu().numpy()

    # Denormalize and round the score
    predicted_score = round(logits * (max_score - min_score) + min_score)

    return predicted_score

# Example usage
new_essay="When cars where made by humans they put this device that is to have the car be more alarmed to the driver and everyone else that is in the car, but there are still some cars that are very old and don't have the same menufactors that are placed in the old cars that are like 1800s and 1900s. I think that the old cars should be put into some car meuseum or somthing to remind us how the very first engine car ever work without having a carage that has to have a horse to use as wheels even though that was a very very very very very very long time before even engine cars even excisted in time. i think that we've impoved on the safety of everyone that is driving and that are still learning to drive and trying our hardest to keep people safe from harm but useally that doesn't happen, we would have some difficulties with the cars that we are using today still needs some work to be done with them. I think it's a good idea to put more control on the car and less for the driver that has to drive the car. The only reason why we build cars and buses and trains and etc is to make us get to things like a if your running late for something and you need to get there really fast you could take your car and get there as fast as you can so your not offically late and not get fired and have to lose your job cause you were late for work. That's why we have cars to rely on and not our feet the whole time but most times if we dont want to use our car we can always walk there thats somewhere close and not far to walk like the gas station or the park or anything or if your just to dang LAZY to even walk that far then you would use a car or if you dont own a car you always use the bus to go where you want to go. It's not that hard to spend a little money on a bus toll to get to where you want to go and really it's not that hard at all but if it's for something really important then GO BUY A CAR i mean yeah cars are exspenive to even afford one but really if you save money then wouldn't have no problem at all on buying a car that has to fit your needs and get to places that you need to go."
predicted_score = predict_score(new_essay, model, tokenizer, device, min_score, max_score)
print(f"\nNew essay text:\n{new_essay}")
print(f"Predicted score: {predicted_score}")

NameError: name 'model' is not defined

In [None]:
import torch

# ------------------------
# 1. Save model after training
# ------------------------
output_model_file = "bert_essay_scoring_1.pt"

# Save the model (state dict is enough if you reload with the same class)
torch.save(model.state_dict(), output_model_file)
print(f"Model saved to {output_model_file}")


# ------------------------
# 2. Load model for inference
# ------------------------
# Make sure you re-create the same model architecture before loading weights
from transformers import BertForSequenceClassification

# Example: regression head (1 output neuron)
loaded_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
loaded_model.load_state_dict(torch.load(output_model_file, map_location=device))
loaded_model.to(device)
loaded_model.eval()
print("Model loaded successfully for inference!")


# ------------------------
# 3. Prediction function
# ------------------------
def predict_score(text, model, tokenizer, device, min_score, max_score, max_len=256):
    """
    Predicts the score of a single essay.
    """
    model.eval()
    
    encoded_text = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoded_text['input_ids'].to(device)
    attention_mask = encoded_text['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits.squeeze().cpu().numpy()

    # Denormalize (if you normalized labels during training)
    predicted_score = round(logits * (max_score - min_score) + min_score)

    return predicted_score


# ------------------------
# 4. Example usage
# ------------------------
new_essay = "When cars where made by humans they put this device..."
predicted_score = predict_score(new_essay, loaded_model, tokenizer, device, min_score, max_score)

print(f"\nNew essay text:\n{new_essay}")
print(f"Predicted score: {predicted_score}")


NameError: name 'model' is not defined