In [3]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import matplotlib.pyplot as plt

2023-05-03 11:54:50.244629: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
pip install transformers

Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m00:01[0mm0:01[0m
[?25hCollecting filelock
  Downloading filelock-3.12.0-py3-none-any.whl (10 kB)
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Installing collected packages: tokenizers, filelock, huggingface-hub, transformers
Successfully installed filelock-3.12.0 huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Load the data from the CSV file
df = pd.read_csv("bert_input_105.csv")

In [5]:
# Split the data into training and testing datasets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'].values, df['total_cost'].values, test_size=0.2)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize the texts and convert them to input IDs and attention masks
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=256)

In [6]:
train_inputs = torch.tensor(train_encodings['input_ids'])
train_masks = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(train_labels.astype(np.float32))

val_inputs = torch.tensor(val_encodings['input_ids'])
val_masks = torch.tensor(val_encodings['attention_mask'])
val_labels = torch.tensor(val_labels.astype(np.float32))


In [7]:
# Create a DataLoader for the training and validation datasets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)


In [8]:
# Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
# Define the optimizer and the learning rate
optimizer = AdamW(model.parameters(), lr=4e-5)
# Define lists to store the validation loss and epochs
val_loss_list = []
epoch_list = []




In [10]:
# Train the model
epochs = 30

for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    
    model.eval()
    with torch.no_grad():
        total_loss = 0
        for batch in val_dataloader:
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
            outputs = model(**inputs)
            total_loss += outputs.loss.item()
        
        avg_loss = total_loss / len(val_dataloader)
        val_loss_list.append(avg_loss)
        epoch_list.append(epoch+1)
        print('Epoch:', epoch+1, '\tValidation Loss:', avg_loss)
# Define the device to run the code
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


Epoch: 1 	Validation Loss: 7023.3890380859375
Epoch: 2 	Validation Loss: 6885.93603515625
Epoch: 3 	Validation Loss: 6768.652587890625
Epoch: 4 	Validation Loss: 6671.2257080078125
Epoch: 5 	Validation Loss: 6588.869873046875
Epoch: 6 	Validation Loss: 6512.185791015625
Epoch: 7 	Validation Loss: 6432.440185546875
Epoch: 8 	Validation Loss: 6366.9461669921875
Epoch: 9 	Validation Loss: 6301.5145263671875
Epoch: 10 	Validation Loss: 6250.058349609375
Epoch: 11 	Validation Loss: 6191.85302734375
Epoch: 12 	Validation Loss: 6153.570068359375
Epoch: 13 	Validation Loss: 6115.400146484375
Epoch: 14 	Validation Loss: 6069.88037109375
Epoch: 15 	Validation Loss: 6041.6546630859375
Epoch: 16 	Validation Loss: 6014.661865234375
Epoch: 17 	Validation Loss: 5980.7503662109375
Epoch: 18 	Validation Loss: 5957.921875
Epoch: 19 	Validation Loss: 5935.67578125
Epoch: 20 	Validation Loss: 5914.7811279296875
Epoch: 21 	Validation Loss: 5900.2841796875
Epoch: 22 	Validation Loss: 5879.704345703125
Epoch

In [45]:
# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    test_texts = df['text'].values
    test_labels = df['total_cost'].values
    
    test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=256)
    
    test_inputs = torch.tensor(test_encodings['input_ids'])
    test_masks = torch.tensor(test_encodings['attention_mask'])
    test_labels = torch.tensor(test_labels.astype(np.float32))
    
    # Create a DataLoader for the test set
    test_data = TensorDataset(test_inputs, test_masks, test_labels)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=8)
    
    # Evaluate the model on the test set
    total_cost_predictions = []
    total_cost_labels = []
    
    for batch in test_dataloader:
        batch_inputs, batch_masks, batch_labels = tuple(t.to(device) for t in batch)
        
        # Compute logits and probabilities
        batch_logits = model(batch_inputs, attention_mask=batch_masks)[0]
        batch_probs = torch.sigmoid(batch_logits)
        
        # Collect predictions and labels
        total_cost_predictions += batch_probs.flatten().tolist()
        total_cost_labels += batch_labels.flatten().tolist()
    
    # Convert predictions and labels to numpy arrays
    total_cost_predictions = np.array(total_cost_predictions)
    total_cost_labels = np.array(total_cost_labels)
    
    # Compute evaluation metrics
    mae = np.mean(np.abs(total_cost_predictions - total_cost_labels))
    mse = np.mean((total_cost_predictions - total_cost_labels) ** 2)
    rmse = np.sqrt(mse)
  
    
    # Print evaluation metrics
    print('MAE: {:.4f}'.format(mae))
    print('MSE: {:.4f}'.format(mse))
    print('RMSE: {:.4f}'.format(rmse))


MAE: 67.3558
MSE: 16671.0888
RMSE: 129.1166


In [7]:
# Plot the validation loss over epochs

plt.plot(epoch_list, val_loss_list, 'b', label='Validation Loss')
plt.title('Validation Loss over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()



NameError: name 'epoch_list' is not defined