In [1]:
#Open Balanced data that was saved
import pandas as pd

df = pd.read_csv('balanced_data (1).csv')


Some usernames were manually labeled to improve the training of the BERT model. These usernames fall under vet category but are students and technicians thus should be classified under Others.

In [2]:
usernames_to_filter = [
    'Ecstatic-Operation85', 'matcha-fiend', 'Shemoose', 'Active_Pitch4104', 'almostdonestudent',
    'According-Anybody-91', 'DarthChily', 'cappy267', 'Few-Depth-3039', 'Serious_Passage_1260',
    'f4un4', 'mojoburquano', 'Play_Persevere', 'Timsterific24', 'Greyscale_cats', 'glindsaynz',
    'eptesicusfscus', 'Asleep_Leopard182', 'Julitania', 'Gutterrrslut', 'Western_Gift6401'
]

# Filter the DataFrame
filtered_df = df[df['username'].isin(usernames_to_filter)]

In [3]:
#Length of usernames that fall under this category
len(filtered_df)

21

Next, we resample our dataset getting ready for training. Undersampling the 'others' column since it has the most rows.

In [4]:
sampled_others = df[df['standardized_label'] == 'Others'].sample(n=140, random_state=42)
sampled_veterinarian = df[df['standardized_label'] == 'Veterinarian']
sampled_medical_doctor = df[df['standardized_label'] == 'Medical Doctor']

# Combine the samples
sampled_df = pd.concat([filtered_df, sampled_others, sampled_veterinarian, sampled_medical_doctor])

# Reset the index
sampled_df.reset_index(drop=True, inplace=True)
sampled_df.standardized_label.value_counts()

standardized_label
Others            161
Medical Doctor    143
Veterinarian      108
Name: count, dtype: int64

Installing some packages needed.

In [5]:
!pip install accelerate>=0.21.0

In [6]:
!pip install transformers[torch]



In [9]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.5 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1


Beginning the process of model building.


In [16]:
#Import libraries
import optuna
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import KFold
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Sample dataset
data = sampled_df.copy()

# Extract features and labels
X = data['processed_comments']
y = data['standardized_label']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split data into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.25, random_state=42)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the custom dataset class
class RedditDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Define compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

def objective(trial):
    # Define hyperparameters to tune
    num_train_epochs = trial.suggest_int('num_train_epochs', 6, 10)
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8, 16])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-4, log=True)
    weight_decay = trial.suggest_float('weight_decay', 0.001, 0.1)
    gradient_accumulation_steps = trial.suggest_int('gradient_accumulation_steps', 1, 4)
    warmup_steps = trial.suggest_int('warmup_steps', 0, 1000)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.3)

    # Initialize KFold
    kf = KFold(n_splits=3)
    fold_losses = []

    for train_index, val_index in kf.split(X_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

        # Tokenize the data
        train_encodings = tokenizer(list(X_train_fold), truncation=True, padding=True, max_length=512)
        val_encodings = tokenizer(list(X_val_fold), truncation=True, padding=True, max_length=512)

        # Create dataset objects
        train_dataset = RedditDataset(train_encodings, y_train_fold)
        val_dataset = RedditDataset(val_encodings, y_val_fold)

        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=per_device_train_batch_size,
            learning_rate=learning_rate,
            per_device_eval_batch_size=8,
            warmup_steps=warmup_steps,
            weight_decay=weight_decay,
            logging_dir='./logs',
            logging_steps=10,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            gradient_accumulation_steps=gradient_accumulation_steps,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
        )

        model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics
        )

        trainer.train()
        eval_result = trainer.evaluate()
        fold_losses.append(eval_result['eval_loss'])

    return np.mean(fold_losses)

In [17]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=8)

print(f'Best trial: {study.best_trial.params}')

[I 2024-05-20 11:39:16,488] A new study created in memory with name: no-name-2e1e0564-7a00-4bf7-9028-060eebfa0d7e


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.0912,1.073714,0.427184,0.337393,0.309034,0.427184
2,1.0815,1.065169,0.475728,0.38864,0.351136,0.475728
3,1.0924,1.050402,0.563107,0.472811,0.413299,0.563107
4,1.0538,1.028788,0.640777,0.54527,0.474771,0.640777
5,1.0569,0.994637,0.679612,0.582403,0.513802,0.679612
6,1.0058,0.942674,0.679612,0.582403,0.513802,0.679612
7,0.9529,0.902074,0.68932,0.591547,0.523919,0.68932
8,0.9311,0.87417,0.699029,0.600743,0.534434,0.699029


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.1496,1.141227,0.330097,0.193634,0.232059,0.330097
2,1.1502,1.133883,0.339806,0.210323,0.264238,0.339806
3,1.1289,1.121653,0.427184,0.335401,0.356567,0.427184
4,1.1271,1.105458,0.446602,0.361267,0.342186,0.446602
5,1.0811,1.083418,0.504854,0.411099,0.368897,0.504854
6,1.0531,1.053156,0.514563,0.41877,0.372898,0.514563
7,1.0397,1.016618,0.524272,0.425156,0.370735,0.524272
8,0.9938,0.973677,0.514563,0.413617,0.350994,0.514563


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.1331,1.162555,0.242718,0.133736,0.146343,0.242718
2,1.1378,1.154995,0.242718,0.133736,0.146343,0.242718
3,1.1301,1.142539,0.291262,0.205489,0.249864,0.291262
4,1.122,1.124933,0.368932,0.297379,0.322115,0.368932
5,1.0754,1.101451,0.398058,0.3283,0.331479,0.398058
6,1.056,1.070799,0.417476,0.345863,0.340946,0.417476
7,1.0375,1.034596,0.466019,0.386404,0.361375,0.466019
8,0.9749,0.987493,0.475728,0.391634,0.35868,0.475728


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-20 11:56:17,032] Trial 0 finished with value: 0.9451134006182352 and parameters: {'num_train_epochs': 8, 'per_device_train_batch_size': 8, 'learning_rate': 1.3585249676289742e-05, 'weight_decay': 0.016276045828884814, 'gradient_accumulation_steps': 2, 'warmup_steps': 984, 'dropout_rate': 0.26241814681628617}. Best is trial 0 with value: 0.9451134006182352.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.108974,0.271845,0.128736,0.515991,0.271845
1,No log,1.10724,0.271845,0.128736,0.515991,0.271845
3,1.152300,1.098963,0.291262,0.166145,0.517379,0.291262
4,1.164400,1.093413,0.320388,0.217218,0.519568,0.320388
6,1.129400,1.076697,0.368932,0.291325,0.523533,0.368932
7,1.129400,1.067286,0.446602,0.388616,0.53086,0.446602
9,1.119400,1.051228,0.504854,0.449321,0.537341,0.504854


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.225614,0.330097,0.163844,0.108964,0.330097
1,No log,1.221999,0.330097,0.163844,0.108964,0.330097
3,1.225000,1.203831,0.320388,0.160194,0.106796,0.320388
4,1.202000,1.191229,0.320388,0.160194,0.106796,0.320388
6,1.174400,1.150656,0.320388,0.175411,0.209191,0.320388
7,1.174400,1.127705,0.339806,0.220215,0.216011,0.339806
9,1.114600,1.097308,0.38835,0.288773,0.274652,0.38835


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.240085,0.262136,0.11292,0.071959,0.262136
1,No log,1.236217,0.262136,0.11292,0.071959,0.262136
3,1.212500,1.216457,0.271845,0.116208,0.0739,0.271845
4,1.199000,1.202809,0.271845,0.116208,0.0739,0.271845
6,1.163700,1.160043,0.281553,0.136009,0.433847,0.281553
7,1.163700,1.135413,0.300971,0.172137,0.293204,0.300971
9,1.106800,1.095992,0.368932,0.26319,0.271318,0.368932


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-20 12:19:07,360] Trial 1 finished with value: 1.0815092325210571 and parameters: {'num_train_epochs': 10, 'per_device_train_batch_size': 16, 'learning_rate': 2.1704683953538205e-05, 'weight_decay': 0.09522612095131558, 'gradient_accumulation_steps': 3, 'warmup_steps': 753, 'dropout_rate': 0.19805449653840806}. Best is trial 0 with value: 0.9451134006182352.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.228983,0.252427,0.105667,0.066819,0.252427
1,No log,1.218767,0.252427,0.105667,0.066819,0.252427
2,No log,1.20045,0.252427,0.105667,0.066819,0.252427
4,1.204400,1.12071,0.281553,0.147917,0.368511,0.281553
5,1.204400,1.070665,0.38835,0.304619,0.375564,0.38835
6,1.134200,1.04349,0.466019,0.396828,0.687298,0.466019


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.079325,0.31068,0.147285,0.096522,0.31068
1,No log,1.073006,0.31068,0.147285,0.096522,0.31068
2,No log,1.061763,0.31068,0.147285,0.096522,0.31068
4,1.045100,1.013397,0.330097,0.187269,0.787754,0.330097
5,1.045100,0.969005,0.398058,0.302463,0.795084,0.398058
6,0.993300,0.934131,0.466019,0.397071,0.765141,0.466019


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.062681,0.368932,0.211074,0.316063,0.368932
1,No log,1.056643,0.368932,0.211074,0.316063,0.368932
2,No log,1.045963,0.368932,0.211074,0.316063,0.368932
4,1.058400,0.995034,0.38835,0.248075,0.4061,0.38835
5,1.058400,0.954097,0.456311,0.353169,0.39288,0.456311
6,0.995400,0.930035,0.475728,0.389472,0.553738,0.475728


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2024-05-20 12:36:45,175] Trial 2 finished with value: 0.9692188103993734 and parameters: {'num_train_epochs': 7, 'per_device_train_batch_size': 16, 'learning_rate': 6.371575050728227e-05, 'weight_decay': 0.002426721321457547, 'gradient_accumulation_steps': 4, 'warmup_steps': 461, 'dropout_rate': 0.1534735338480325}. Best is trial 0 with value: 0.9451134006182352.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.029419,0.436893,0.271582,0.19703,0.436893
2,1.080900,0.971043,0.456311,0.311459,0.463107,0.456311
4,0.983700,0.849797,0.747573,0.749209,0.765704,0.747573
6,0.860100,0.751313,0.84466,0.845772,0.862612,0.84466
8,0.789300,0.685413,0.854369,0.855377,0.868852,0.854369


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.113966,0.359223,0.189875,0.129041,0.359223
2,1.131300,1.059145,0.359223,0.192627,0.131597,0.359223
4,1.008300,0.946317,0.61165,0.547806,0.678947,0.61165
6,0.912100,0.82771,0.815534,0.815185,0.827472,0.815534
8,0.819500,0.74017,0.864078,0.864078,0.868093,0.864078


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.114233,0.368932,0.198857,0.136111,0.368932
2,1.137600,1.059245,0.378641,0.220466,0.410651,0.378641
4,1.010200,0.935338,0.699029,0.660575,0.774919,0.699029
6,0.917000,0.829737,0.786408,0.783015,0.800898,0.786408
8,0.814200,0.749551,0.815534,0.817234,0.831936,0.815534


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2024-05-20 12:57:44,809] Trial 3 finished with value: 0.7250445286432902 and parameters: {'num_train_epochs': 9, 'per_device_train_batch_size': 8, 'learning_rate': 6.217281142576173e-05, 'weight_decay': 0.09013490409327989, 'gradient_accumulation_steps': 4, 'warmup_steps': 630, 'dropout_rate': 0.18086512033322746}. Best is trial 3 with value: 0.7250445286432902.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.152582,0.291262,0.131396,0.084834,0.291262
2,1.122000,1.064394,0.31068,0.173842,0.441537,0.31068
4,0.980000,0.887397,0.728155,0.716897,0.730287,0.728155
5,0.980000,0.847261,0.796117,0.794507,0.807391,0.796117


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.173347,0.281553,0.177159,0.146836,0.281553
2,1.160100,1.105351,0.407767,0.31042,0.256935,0.407767
4,1.035700,0.98766,0.543689,0.432194,0.36836,0.543689
5,1.035700,0.938976,0.592233,0.531723,0.783949,0.592233


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.163038,0.339806,0.223159,0.181637,0.339806
2,1.157400,1.103635,0.407767,0.30883,0.251685,0.407767
4,1.028800,0.991039,0.524272,0.422426,0.36764,0.524272
5,1.028800,0.945623,0.543689,0.459619,0.779269,0.543689


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2024-05-20 13:09:03,496] Trial 4 finished with value: 0.9106202324231466 and parameters: {'num_train_epochs': 6, 'per_device_train_batch_size': 16, 'learning_rate': 7.879837353059789e-05, 'weight_decay': 0.0478734694083212, 'gradient_accumulation_steps': 2, 'warmup_steps': 593, 'dropout_rate': 0.1178610496080143}. Best is trial 3 with value: 0.7250445286432902.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.1913,1.126771,0.407767,0.307733,0.256896,0.407767
2,1.1808,1.112696,0.38835,0.304791,0.254216,0.38835
3,1.1494,1.091655,0.398058,0.322966,0.272526,0.398058
4,1.1442,1.0652,0.485437,0.411091,0.360389,0.485437
5,1.0962,1.026821,0.572816,0.493065,0.455289,0.572816
6,1.0758,0.973698,0.582524,0.506602,0.486946,0.582524
7,0.9925,0.91522,0.601942,0.538037,0.800242,0.601942
8,0.9354,0.835391,0.747573,0.761371,0.843517,0.747573


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.1343,1.140343,0.330097,0.193634,0.232059,0.330097
2,1.149,1.130536,0.339806,0.220787,0.263056,0.339806
3,1.1085,1.113968,0.427184,0.338415,0.346814,0.427184
4,1.1247,1.091661,0.485437,0.395258,0.36059,0.485437
5,1.0724,1.059871,0.514563,0.41877,0.372898,0.514563
6,1.0335,1.0149,0.533981,0.433663,0.380669,0.533981
7,0.9762,0.961872,0.495146,0.396393,0.333568,0.495146
8,0.916,0.91335,0.475728,0.379075,0.316663,0.475728


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.1187,1.161583,0.242718,0.133736,0.146343,0.242718
2,1.1262,1.151345,0.242718,0.133736,0.146343,0.242718
3,1.1064,1.133533,0.320388,0.242737,0.28401,0.320388
4,1.1077,1.108648,0.38835,0.319186,0.326363,0.38835
5,1.0617,1.074224,0.398058,0.329647,0.324885,0.398058
6,1.0313,1.027591,0.466019,0.386404,0.361375,0.466019
7,0.9821,0.97406,0.466019,0.384035,0.354801,0.466019
8,0.8912,0.919586,0.485437,0.399091,0.36246,0.485437


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-20 13:22:37,840] Trial 5 finished with value: 0.8894423246383667 and parameters: {'num_train_epochs': 8, 'per_device_train_batch_size': 16, 'learning_rate': 1.221257215023865e-05, 'weight_decay': 0.03874204174765752, 'gradient_accumulation_steps': 1, 'warmup_steps': 641, 'dropout_rate': 0.13587363857184645}. Best is trial 3 with value: 0.7250445286432902.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.105161,0.271845,0.128736,0.515991,0.271845
2,1.165100,1.064456,0.446602,0.388616,0.53086,0.446602
4,1.089400,0.974934,0.553398,0.494084,0.528639,0.553398
6,1.003700,0.825663,0.592233,0.525267,0.523918,0.592233
8,0.900600,0.627873,0.873786,0.877172,0.896398,0.873786
9,0.761200,0.57849,0.873786,0.877172,0.896398,0.873786


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.217863,0.330097,0.163844,0.108964,0.330097
2,1.199700,1.123743,0.349515,0.233929,0.225935,0.349515
4,1.070200,0.992912,0.495146,0.444156,0.658944,0.495146
6,0.958100,0.817008,0.796117,0.798045,0.82634,0.796117
8,0.829000,0.676135,0.864078,0.865553,0.874506,0.864078
9,0.696300,0.654763,0.864078,0.865553,0.874506,0.864078


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.232211,0.271845,0.116208,0.0739,0.271845
2,1.210100,1.137413,0.300971,0.172438,0.346303,0.300971
4,1.069700,1.000661,0.456311,0.361013,0.655809,0.456311
6,0.953600,0.863026,0.728155,0.733981,0.819268,0.728155
8,0.845400,0.68312,0.834951,0.839484,0.887331,0.834951
9,0.705400,0.666466,0.825243,0.828362,0.868015,0.825243


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2024-05-20 13:42:23,561] Trial 6 finished with value: 0.633239726225535 and parameters: {'num_train_epochs': 10, 'per_device_train_batch_size': 8, 'learning_rate': 7.891426108585595e-05, 'weight_decay': 0.04491877838405888, 'gradient_accumulation_steps': 4, 'warmup_steps': 733, 'dropout_rate': 0.1411093280797467}. Best is trial 6 with value: 0.633239726225535.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.230866,0.252427,0.105667,0.066819,0.252427
1,No log,1.22826,0.252427,0.105667,0.066819,0.252427
2,No log,1.223661,0.252427,0.105667,0.066819,0.252427
4,1.211800,1.205288,0.252427,0.105667,0.066819,0.252427
5,1.211800,1.193697,0.252427,0.105667,0.066819,0.252427
6,1.194600,1.179487,0.252427,0.105667,0.066819,0.252427
8,1.194600,1.1498,0.262136,0.125101,0.290782,0.262136


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.11842,0.359223,0.189875,0.129041,0.359223
1,No log,1.116918,0.359223,0.189875,0.129041,0.359223
2,No log,1.114308,0.359223,0.189875,0.129041,0.359223
4,1.137900,1.103928,0.359223,0.189875,0.129041,0.359223
5,1.137900,1.097248,0.359223,0.189875,0.129041,0.359223
6,1.119800,1.088951,0.359223,0.189875,0.129041,0.359223
8,1.119800,1.071261,0.359223,0.192627,0.131597,0.359223


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.118668,0.368932,0.198857,0.136111,0.368932
1,No log,1.117079,0.368932,0.198857,0.136111,0.368932
2,No log,1.11422,0.368932,0.198857,0.136111,0.368932
4,1.144500,1.102458,0.368932,0.198857,0.136111,0.368932
5,1.144500,1.094869,0.368932,0.198857,0.136111,0.368932
6,1.121300,1.085716,0.368932,0.198857,0.136111,0.368932
8,1.121300,1.066144,0.378641,0.220466,0.410651,0.378641


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2024-05-20 14:02:06,685] Trial 7 finished with value: 1.0957351525624592 and parameters: {'num_train_epochs': 9, 'per_device_train_batch_size': 16, 'learning_rate': 2.6785581369449074e-05, 'weight_decay': 0.044020545118521284, 'gradient_accumulation_steps': 4, 'warmup_steps': 755, 'dropout_rate': 0.13641904352734474}. Best is trial 6 with value: 0.633239726225535.


Best trial: {'num_train_epochs': 10, 'per_device_train_batch_size': 8, 'learning_rate': 7.891426108585595e-05, 'weight_decay': 0.04491877838405888, 'gradient_accumulation_steps': 4, 'warmup_steps': 733, 'dropout_rate': 0.1411093280797467}


In [25]:
# Extract the best hyperparameters
best_trial = study.best_trial

best_num_train_epochs = 14
best_per_device_train_batch_size = best_trial.params['per_device_train_batch_size']
best_learning_rate = best_trial.params['learning_rate']
best_weight_decay = best_trial.params['weight_decay']
best_gradient_accumulation_steps = best_trial.params['gradient_accumulation_steps']
best_warmup_steps = best_trial.params['warmup_steps']
best_dropout_rate = best_trial.params['dropout_rate']

# Tokenize the full dataset
full_train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=512)
full_train_dataset = RedditDataset(full_train_encodings, y_train)

# Tokenize the test dataset
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=512)
test_dataset = RedditDataset(test_encodings, y_test)

# Define training arguments using the best hyperparameters
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=best_num_train_epochs,
    per_device_train_batch_size=best_per_device_train_batch_size,
    learning_rate=best_learning_rate,
    per_device_eval_batch_size=8,
    warmup_steps=best_warmup_steps,
    weight_decay=best_weight_decay,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    gradient_accumulation_steps=best_gradient_accumulation_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Initialize the Trainer with the best hyperparameters
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
model.config.hidden_dropout_prob = best_dropout_rate
model.config.attention_probs_dropout_prob = best_dropout_rate

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train and evaluate the model
trainer.train()
trainer.evaluate()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.266932,0.184466,0.057457,0.034028,0.184466
1,1.194600,1.170342,0.23301,0.147116,0.629519,0.23301
2,1.154400,1.031501,0.436893,0.382233,0.676451,0.436893
4,0.976900,0.79136,0.796117,0.80673,0.874771,0.796117
5,0.849700,0.693206,0.834951,0.839761,0.88642,0.834951
6,0.727800,0.589654,0.834951,0.846583,0.905132,0.834951
8,0.523400,0.428866,0.864078,0.869696,0.907294,0.864078
9,0.445500,0.373711,0.864078,0.867974,0.90236,0.864078
10,0.359500,0.345324,0.864078,0.867974,0.90236,0.864078
12,0.269600,0.30151,0.893204,0.897098,0.919592,0.893204


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3015095293521881,
 'eval_accuracy': 0.8932038834951457,
 'eval_f1': 0.8970979993169215,
 'eval_precision': 0.9195917351257157,
 'eval_recall': 0.8932038834951457,
 'eval_runtime': 3.3302,
 'eval_samples_per_second': 30.929,
 'eval_steps_per_second': 3.904,
 'epoch': 12.923076923076923}

Saving the best model realized

In [26]:
save_directory = './fine-tuned-bert'
trainer.save_model(save_directory)  # Saves the model to the specified directory
tokenizer.save_pretrained(save_directory)  # Saves the tokenizer to the specified directory

print(f'Model and tokenizer saved to {save_directory}')

Model and tokenizer saved to ./fine-tuned-bert


Testing out the model on some sample comments

In [27]:
model_path = './fine-tuned-bert'
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Assuming label_encoder is already defined and trained from previous steps
label_encoder = LabelEncoder()
label_encoder.fit(["Others", "Veterinarian", "Medical Doctor"])  # Ensure the order matches your labels

def classify_comment(comment):
    # Preprocess the input comment
    inputs = tokenizer(comment, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Move inputs to the same device as the model
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Set the model to evaluation mode
    model.eval()

    # Make predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

    # Decode the predictions to label names
    predicted_label = label_encoder.inverse_transform(predictions.cpu().numpy())[0]
    return predicted_label

# Example usage
new_comment = "I'm planning to become a vet because I love animals."
classification = classify_comment(new_comment)
print(f'Classification: {classification}')


Classification: Medical Doctor


In [28]:
classification = classify_comment("It's important to stay up-to-date on the latest advances in medical research.")
print(f'Classification: {classification}')

Classification: Medical Doctor


In [29]:
classification = classify_comment("I play bet.")
print(f'Classification: {classification}')

Classification: Others


In [30]:
classification = classify_comment("Hi, LVT here.")
print(f'Classification: {classification}')

Classification: Others
