In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

data_train = pd.read_csv('./cm_train.csv')
data_test = pd.read_csv('./cm_test.csv')

X_train = data_train['input']
y_train = data_train['label']

X_test = data_test['input']
y_test = data_test['label']

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.fit_transform(y_test)

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=128)

class EthicalDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = EthicalDataset(train_encodings, y_train)
test_dataset = EthicalDataset(test_encodings, y_test)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mluyi625[0m ([33mluyi625-seeking-alpha[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss
10,0.6875
20,0.7079
30,0.6764
40,0.6608
50,0.6757
60,0.6838
70,0.6971
80,0.7027
90,0.7006
100,0.6722


TrainOutput(global_step=2610, training_loss=0.4542080504684156, metrics={'train_runtime': 1354.1142, 'train_samples_per_second': 30.817, 'train_steps_per_second': 1.927, 'total_flos': 2744906085043200.0, 'train_loss': 0.4542080504684156, 'epoch': 3.0})

In [6]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `ppo_supervised` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `ppo

In [9]:
from huggingface_hub import HfApi
api = HfApi()

model_name = "ethical_supervised_bert"
repo_name = f"llptxx/{model_name}"
api.create_repo(repo_name, exist_ok=True)
model.push_to_hub(repo_name)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/llptxx/ethical_supervised_bert/commit/03338f2f82f90c955a5df0dd7e7e35da6b5a431c', commit_message='Upload BertForSequenceClassification', commit_description='', oid='03338f2f82f90c955a5df0dd7e7e35da6b5a431c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/llptxx/ethical_supervised_bert', endpoint='https://huggingface.co', repo_type='model', repo_id='llptxx/ethical_supervised_bert'), pr_revision=None, pr_num=None)

In [7]:
import json
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer

class EthicalDataset(Dataset):
    def __init__(self, json_file, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = []
        with open(json_file, 'r') as f:
          for line in f:
            self.data.append(json.loads(line))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):

        item = self.data[idx]

        action1 = item['actions'][0]['description']
        action2 = item['actions'][1]['description']

        label = item['gold_label']

        encoding1 = self.tokenizer(
            action1,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        encoding2 = self.tokenizer(
            action2,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'action1': encoding1['input_ids'].squeeze(0),
            'attention_mask1': encoding1['attention_mask'].squeeze(0),
            'action2': encoding2['input_ids'].squeeze(0),
            'attention_mask2': encoding2['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.float32)
        }

In [1]:
import torch.nn as nn

class RewardModel(nn.Module):
    def __init__(self, bert_model):
        super(RewardModel, self).__init__()
        self.bert = bert_model
        self.fc = nn.Linear(bert_model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        reward = self.fc(pooled_output)
        return reward

In [9]:
from torch.utils.data import DataLoader
from transformers import BertModel
import torch.optim as optim

bert_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertModel.from_pretrained(bert_model_name)

dataset = EthicalDataset(json_file='train.scruples-dilemmas.jsonl', tokenizer=tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

reward_model = RewardModel(bert_model)

optimizer = optim.Adam(reward_model.parameters(), lr=1e-5)
criterion = nn.BCEWithLogitsLoss()

num_epochs = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
reward_model.to(device)

for epoch in range(num_epochs):
    reward_model.train()
    total_loss = 0

    for batch in dataloader:
        action1 = batch['action1'].to(device)
        attention_mask1 = batch['attention_mask1'].to(device)
        action2 = batch['action2'].to(device)
        attention_mask2 = batch['attention_mask2'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        reward1 = reward_model(action1, attention_mask1)
        reward2 = reward_model(action2, attention_mask2)

        logits = reward1 - reward2
        loss = criterion(logits.squeeze(), labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader)}')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Epoch 1/3, Loss: 0.6073780397035308
Epoch 2/3, Loss: 0.5036605472180803
Epoch 3/3, Loss: 0.40482453308873256


In [12]:
torch.save(reward_model.state_dict(), './results/reward_model_weights.pth')

In [2]:
import torch
import torch.nn as nn
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv
import random
import gym
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification,BertModel


class EthicalDecisionEnv(gym.Env):
    def __init__(self, dilemmas, reward_model):
        super(EthicalDecisionEnv, self).__init__()
        self.dilemmas = dilemmas
        self.reward_model = reward_model
        self.current_dilemma_idx = 0
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


        max_length = 128
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(2 * max_length,)
        )
        self.action_space = gym.spaces.Discrete(2)

    def reset(self):
        self.current_dilemma_idx = np.random.randint(0, len(self.dilemmas))
        dilemma = self.dilemmas[self.current_dilemma_idx]
        encoding = self.tokenizer(
            dilemma, return_tensors='pt', padding='max_length', truncation=True, max_length=128
        )
        state = torch.cat([
            encoding['input_ids'].squeeze(), encoding['attention_mask'].squeeze()
        ]).numpy()
        return state

    def step(self, action):
        dilemma = self.dilemmas[self.current_dilemma_idx]
        encoding = self.tokenizer(
            dilemma, return_tensors='pt', padding='max_length', truncation=True, max_length=128
        )
        state = torch.cat([
            encoding['input_ids'].squeeze(), encoding['attention_mask'].squeeze()
        ])
        reward = self.reward_model(encoding['input_ids'], encoding['attention_mask']).item()
        done = True
        info = {}
        return state.numpy(), reward, done, info
    def seed(self, seed=None):
      np.random.seed(seed)
      return [seed]

In [5]:
from stable_baselines3.common.callbacks import ProgressBarCallback
import json
ethical_dilemmas = []

bert_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertModel.from_pretrained(bert_model_name)
reward_model = RewardModel(bert_model)
reward_model.load_state_dict(torch.load('reward_model_weights.pth'))

def train_rl_policy(dilemmas, reward_model):
    env = make_vec_env(lambda: EthicalDecisionEnv(dilemmas, reward_model), n_envs=1)
    model = PPO('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=10000)
    return model

with open('./dilemmas/train.scruples-dilemmas.jsonl', 'r') as f:
    for line in f:
      ethical_dilemmas.append(json.loads(line)['actions'][0]['description'])
      ethical_dilemmas.append(json.loads(line)['actions'][1]['description'])

rl_policy = train_rl_policy(ethical_dilemmas, reward_model)
rl_policy.save("./ppo_ethical_decision_model")

  reward_model.load_state_dict(torch.load('reward_model_weights.pth'))


Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -0.953   |
| time/              |          |
|    fps             | 2        |
|    iterations      | 1        |
|    time_elapsed    | 991      |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1            |
|    ep_rew_mean          | -1.3         |
| time/                   |              |
|    fps                  | 2            |
|    iterations           | 2            |
|    time_elapsed         | 1986         |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0034130807 |
|    clip_fraction        | 0.00103      |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.692       |
|    explained_variance   | -0.0494     