In [1]:
!pip install transformers
!pip install datasets transformers==4.28.0

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m84.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m77.3 MB/s[0m eta [36m0:00:0

In [1]:
import torch, gc, random
from transformers.file_utils import is_tf_available, is_torch_available
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/content/data_new.csv')

In [3]:

import string

def preprocess_text(text):
    # Remove punctuation and lowercase text
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    return text

In [4]:

df['text'] = df['text'].apply(preprocess_text)
# df['Answers'] = df['Answers'].apply(preprocess_text)

In [None]:

X = df['text']
y = df['wording']

max_length = 64

model_name = 'distilbert-base-uncased'


X_train, X_test, y_train, y_test = train_test_split(X.tolist(), y, test_size=0.1)


tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encode the text
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)



class MakeTorchData(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        item["labels"] = float(item["labels"])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = MakeTorchData(train_encodings, y_train.ravel())
valid_dataset = MakeTorchData(valid_encodings, y_test.ravel())

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [22]:
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           num_labels = 1).to("cuda")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.

In [23]:
def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    rmse = mean_squared_error(labels, logits, squared=False)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    smape = 1/len(labels) * np.sum(2 * np.abs(logits-labels) / (np.abs(labels) + np.abs(logits))*100)

    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}

In [24]:

training_args = TrainingArguments(
    output_dir ='./results',
    num_train_epochs = 50,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 20,
    weight_decay = 0.01,
    learning_rate = 2e-5,
    logging_dir = './logs',
    save_total_limit = 10,
    load_best_model_at_end = True,
    metric_for_best_model = 'rmse',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
)


trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset,
    compute_metrics = compute_metrics_for_regression,
)

l
trainer.train()



Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,No log,0.631718,0.631718,0.794807,0.609032,0.468061,101.681028
2,No log,0.575188,0.575188,0.758411,0.575195,0.515662,99.234113
3,No log,0.545936,0.545936,0.738875,0.565398,0.540293,99.385144
4,No log,0.572622,0.572622,0.756718,0.591197,0.517822,99.496274
5,0.441100,0.594523,0.594523,0.771053,0.596441,0.49938,95.547877
6,0.441100,0.57291,0.57291,0.756908,0.581245,0.517579,96.188023
7,0.441100,0.598679,0.598679,0.773744,0.594964,0.49588,97.286633
8,0.441100,0.568414,0.568414,0.753932,0.582276,0.521365,99.225353
9,0.441100,0.582002,0.582002,0.762891,0.579839,0.509924,95.284061
10,0.157700,0.594492,0.594492,0.771033,0.588002,0.499407,96.370162


TrainOutput(global_step=5050, training_loss=0.08730380657875891, metrics={'train_runtime': 1839.6829, 'train_samples_per_second': 175.248, 'train_steps_per_second': 2.745, 'total_flos': 5338340962406400.0, 'train_loss': 0.08730380657875891, 'epoch': 50.0})

In [25]:

trainer.evaluate()

{'eval_loss': 0.6317175030708313,
 'eval_mse': 0.6317175030708313,
 'eval_rmse': 0.7948065996170044,
 'eval_mae': 0.6090317964553833,
 'eval_r2': 0.4680605613592024,
 'eval_smape': 101.68102771966527,
 'eval_runtime': 1.2997,
 'eval_samples_per_second': 551.663,
 'eval_steps_per_second': 27.699,
 'epoch': 50.0}