<a href="https://colab.research.google.com/github/Huertas97/Sentiment_Analysis/blob/main/sst2_models/DL/SST2_DL_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Regarding the Deep Learning models, all the models used are Transformer-based models from the Hugging Face Transformers library. For the English SST2 binary classification task, the Transformer-based models explored are XLM-RoBERTa base size, DistilBert multilingual cased base size, DistilRoBERTa base size, and DistilBert base size fine-tuned for NLI and STSBenchmark tasks.

SST2 has: 

| Train | Dev | Test |
|-------|-----|------|
| 67349 | 872 | 1821 |

0 = Negative

1 = Positive

In [None]:
# Install the packages this notebook uses (the paths used later assume a Colab runtime).
# NOTE(review): none of these installs pins a version, so a re-run may pull
# different releases than the ones recorded in the output below; consider pinning.
!pip install --upgrade wandb
!pip install transformers
!pip install seqeval
!pip install tensorboardx
!pip install -U simpletransformers
!pip install tqdm

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/ca/5e/9df94df3bfee51b92b54a5e6fa277d6e1fcdf1f27b1872214b98f55ec0f7/wandb-0.10.12-py2.py3-none-any.whl (1.8MB)
[K     |▏                               | 10kB 20.1MB/s eta 0:00:01[K     |▍                               | 20kB 16.0MB/s eta 0:00:01[K     |▌                               | 30kB 13.0MB/s eta 0:00:01[K     |▊                               | 40kB 12.2MB/s eta 0:00:01[K     |█                               | 51kB 8.0MB/s eta 0:00:01[K     |█                               | 61kB 8.7MB/s eta 0:00:01[K     |█▎                              | 71kB 8.8MB/s eta 0:00:01[K     |█▌                              | 81kB 8.9MB/s eta 0:00:01[K     |█▋                              | 92kB 9.0MB/s eta 0:00:01[K     |█▉                              | 102kB 7.7MB/s eta 0:00:01[K     |██                              | 112kB 7.7MB/s eta 0:00:01[K     |██▏                             | 122kB 7.7MB/s e



# Load the SST2 data 

Clone the GitHub repository

In [None]:
# Clone the project repository into ./Sentiment_Analysis; the SST-2 split files
# loaded further down are read from its sst_2_data/ folder.
# NOTE(review): presumably this fails if the folder already exists (e.g. when the
# cell is re-run in the same session) -- confirm.
!git clone https://github.com/Huertas97/Sentiment_Analysis.git

Cloning into 'Sentiment_Analysis'...
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 30 (delta 8), reused 23 (delta 4), pack-reused 0[K
Unpacking objects: 100% (30/30), done.


In [None]:
import io
import pandas as pd
# Load the data from SST2
def loadFile(fpath):
    """Read a tab-separated SST-2 split file into parallel text/label lists.

    Every non-blank line is expected to look like ``<sentence><TAB><label>``
    where the label is an integer (0 = negative, 1 = positive). Blank or
    whitespace-only lines are skipped.

    Args:
        fpath: Path of the UTF-8 encoded file to read.

    Returns:
        dict: ``{'X': [sentence, ...], 'y': [label, ...]}`` in file order.

    Raises:
        IndexError: A non-blank line has no tab-separated label field.
        ValueError: A label is not an integer, or the file holds no samples.
        AssertionError: The largest label found is not 1 (binary task check).
    """
    sst_data = {'X': [], 'y': []}
    with io.open(fpath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. an extra newline at the end of the file)
                # carries no sample; without this guard sample[1] raises.
                continue
            sample = line.split('\t')
            sst_data['y'].append(int(sample[1]))
            sst_data['X'].append(sample[0])
    # Sanity check for the binary task: the highest label must be 1.
    assert max(sst_data['y']) == 2 - 1
    return sst_data

In [None]:
# Read the three SST-2 splits from the cloned repository.
sst2_train = loadFile("/content/Sentiment_Analysis/sst_2_data/sentiment-train")
sst2_dev = loadFile("/content/Sentiment_Analysis/sst_2_data/sentiment-dev")
sst2_test = loadFile("/content/Sentiment_Analysis/sst_2_data/sentiment-test")

# Wrap each split in a DataFrame with a "text" and a "labels" column
# (the train and dev frames are later handed to train_model).
sst2_df_train = pd.DataFrame(dict(text=sst2_train["X"], labels=sst2_train["y"]))
sst2_df_dev = pd.DataFrame(dict(text=sst2_dev["X"], labels=sst2_dev["y"]))
sst2_df_test = pd.DataFrame(dict(text=sst2_test["X"], labels=sst2_test["y"]))

## Is the data balanced?

In [None]:
# Number of training sentences per label (0 = negative, 1 = positive).
(
    sst2_df_train
    .groupby(by=["labels"], as_index=False)
    .agg("count")
)

Unnamed: 0,labels,text
0,0,29780
1,1,37569


In [None]:
# Number of dev sentences per label (0 = negative, 1 = positive).
(
    sst2_df_dev
    .groupby(by=["labels"], as_index=False)
    .agg("count")
)

Unnamed: 0,labels,text
0,0,428
1,1,444


In [None]:
# Number of test sentences per label (0 = negative, 1 = positive).
(
    sst2_df_test
    .groupby(by=["labels"], as_index=False)
    .agg("count")
)

Unnamed: 0,labels,text
0,0,912
1,1,909


Here you can use any model from simpletransformers and train it. The available models and hyperparameters, and how to train them, are described in the [Simpletransformers docs](https://simpletransformers.ai/).

In [None]:
import logging

import pandas as pd
import sklearn

import wandb
from simpletransformers.classification import (
    ClassificationArgs,
    ClassificationModel,
)

# Search space and strategy of the W&B hyperparameter sweep.
sweep_config = {
    # NOTE(review): the sweep is named after xlm-roberta, but train() below
    # fine-tunes "distilroberta-base" -- confirm which label is intended.
    "name": "xlm-roberta-base_2",
    "method": "bayes",  # grid, random
    # "metric": {"name": "train_loss", "goal": "minimize"},
    # The metric name must match the key under which the score is reported.
    # train() passes the accuracy function as `acc=...`, and the evaluation
    # results list it under 'acc', so the sweep has to optimise "acc".
    "metric": {"name": "acc", "goal": "maximize"},
    "parameters": {
        "num_train_epochs": {
            "values": [2]  # 10
        },
        # NOTE(review): a lower bound of 0 allows near-zero learning rates; the
        # recorded trial with lr ~6.8e-08 finished with mcc 0.0. Consider a
        # strictly positive minimum.
        "learning_rate": {
            "min": 0, "max": 1e-5  # 5e-5
        },
        'weight_decay': {
            "min": 0.0005, "max": 0.01  # 0.05, 0.01
        },

        "train_batch_size": {"values": [32]},
        "gradient_accumulation_steps": {"values": [3, 5]},
        "scheduler": {"values": ["constant_schedule_with_warmup",
                                 "polynomial_decay_schedule_with_warmup",
                                 "cosine_schedule_with_warmup"
                                 ]}
    },
    "early_terminate": {"type": "hyperband", "min_iter": 6,}
}

# Register the sweep in the W&B project "SST2-pruebas"; the returned id is what
# the agent at the end of this cell uses to run trials.
sweep_id = wandb.sweep(sweep_config, project="SST2-pruebas")


# Logging: keep the `transformers` logger at WARNING while everything else
# is reported from INFO upwards.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

# Args: base training configuration shared by every trial.
model_args = ClassificationArgs()

# Reproducibility
model_args.manual_seed = 4

# Input processing
model_args.max_seq_length = 256
model_args.reprocess_input_data = True
model_args.use_multiprocessing = True
model_args.use_cached_eval_features = True

# Optimisation. sweep_config also lists train_batch_size and
# gradient_accumulation_steps among its parameters.
model_args.train_batch_size = 16  # 67349 training sentences / 16 ~ 4210 batches in 1 epoch
model_args.eval_batch_size = 8
model_args.gradient_accumulation_steps = 2  # Training steps (batches) executed before each optimizer.step(). More = less time
model_args.train_custom_parameters_only = False  # Train specific parameters? No, train all parameters

# Evaluation during training
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True
model_args.evaluate_during_training_steps = 2000

# Output and checkpoints
model_args.overwrite_output_dir = True
model_args.best_model_dir = "outputs/best_model"  # lower eval_loss
model_args.save_model_every_epoch = False
model_args.save_eval_checkpoints = False  # We don't have enough space

# model_args.labels_list = ["true", "false"]
model_args.wandb_project = "SST2-train"

def train():
    """Run one trial of the W&B sweep: fine-tune and evaluate a classifier.

    Takes no arguments because it is handed to the sweep agent as a callback.
    It starts a W&B run, builds a ``ClassificationModel`` ('roberta' /
    "distilroberta-base") from the module-level ``model_args`` with the
    trial's hyperparameters (``wandb.config``) passed as ``sweep_config``,
    trains on ``sst2_df_train`` while evaluating on ``sst2_df_dev``, and
    finally closes the run.
    """
    # Import the metric explicitly: a bare `import sklearn` does not guarantee
    # that the `sklearn.metrics` submodule has been loaded.
    from sklearn.metrics import accuracy_score

    # Initialize a new wandb run
    wandb.init()

    # Create a TransformerModel
    model = ClassificationModel(
        'roberta',
        "distilroberta-base",
        use_cuda=True,
        args=model_args,
        num_labels=2,
        sweep_config=wandb.config,
    )

    # Train the model and evaluate on the dev split during training.
    # The keyword name ("acc") is the key under which the score shows up in
    # the evaluation results.
    model.train_model(
        sst2_df_train,
        eval_df=sst2_df_dev,
        acc=accuracy_score,
    )

    # Evaluate the model (test-set evaluation is intentionally left disabled)
    # model.eval_model(sst2_df_test, acc=sklearn.metrics.accuracy_score,
                    #  verbose=True)

    # Sync wandb
    wandb.join()


# Start the sweep agent in this process; it runs train() for each trial of sweep_id.
# No `count` is passed, so the number of trials is not capped here.
wandb.agent(sweep_id, train)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
INFO:wandb.agents.pyagent:Starting sweep agent: entity=None, project=None, count=None


Create sweep with ID: irk0q576
Sweep URL: https://wandb.ai/huertas_97/SST2-pruebas/sweeps/irk0q576


[34m[1mwandb[0m: Agent Starting Run: xm9f8pc9 with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 5
[34m[1mwandb[0m: 	learning_rate: 6.821600492325831e-08
[34m[1mwandb[0m: 	num_train_epochs: 2
[34m[1mwandb[0m: 	scheduler: constant_schedule_with_warmup
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.0046927561283649135
[34m[1mwandb[0m: Currently logged in as: [33mhuertas_97[0m (use `wandb login --relogin` to force relogin)


INFO:filelock:Lock 139784044892232 acquired on /root/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46.lock


Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

INFO:filelock:Lock 139784044892232 released on /root/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46.lock
INFO:filelock:Lock 139784044892232 acquired on /root/.cache/huggingface/transformers/7a0115a4c463f49bc7ab011872fc4a4b81be681a0434075955d29ac3388e225b.a6127d76576e81475313180aceb31a8688f7a649b80e380d26b5d30302dc83c1.lock


Downloading:   0%|          | 0.00/331M [00:00<?, ?B/s]

INFO:filelock:Lock 139784044892232 released on /root/.cache/huggingface/transformers/7a0115a4c463f49bc7ab011872fc4a4b81be681a0434075955d29ac3388e225b.a6127d76576e81475313180aceb31a8688f7a649b80e380d26b5d30302dc83c1.lock
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

INFO:filelock:Lock 139781787834352 released on /root/.cache/huggingface/transformers/23e0f7484fc8a320856b168861166b48c2976bb4e0861602422e1b0c3fe5bf61.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
INFO:filelock:Lock 139781787834352 acquired on /root/.cache/huggingface/transformers/c7e8020011da613ff5a9175ddad64cd47238a9525db975eb50ecb965e9f7302f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock


Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

INFO:filelock:Lock 139781787834352 released on /root/.cache/huggingface/transformers/c7e8020011da613ff5a9175ddad64cd47238a9525db975eb50ecb965e9f7302f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


  0%|          | 0/67349 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
INFO:simpletransformers.classification.classification_model:{'mcc': 0.0, 'tp': 444, 'tn': 0, 'fp': 428, 'fn': 0, 'acc': 0.5091743119266054, 'eval_loss': 0.7170938026467595}


Running Epoch 1 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.0, 'tp': 444, 'tn': 0, 'fp': 428, 'fn': 0, 'acc': 0.5091743119266054, 'eval_loss': 0.7069177058858609}
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,0.65271
lr,0.0
global_step,842.0
_step,17.0
_runtime,1226.0
_timestamp,1609760539.0
tp,444.0
tn,0.0
fp,428.0
fn,0.0


0,1
Training loss,██▃▅▁▂▆▄▅▅▃▅█▅▃▄
lr,▁███████████████
global_step,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
_runtime,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
_timestamp,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
tp,▁▁
tn,▁▁
fp,▁▁
fn,▁▁


[34m[1mwandb[0m: Agent Starting Run: gghu0suk with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 3
[34m[1mwandb[0m: 	learning_rate: 7.667944504212015e-06
[34m[1mwandb[0m: 	num_train_epochs: 2
[34m[1mwandb[0m: 	scheduler: cosine_schedule_with_warmup
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.009059889995270654


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

  0%|          | 0/67349 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8120763302106621, 'tp': 409, 'tn': 381, 'fp': 47, 'fn': 35, 'acc': 0.9059633027522935, 'eval_loss': 0.23664745150151056}


Running Epoch 1 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8095821174009153, 'tp': 405, 'tn': 384, 'fp': 44, 'fn': 39, 'acc': 0.9048165137614679, 'eval_loss': 0.23900163988796397}
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,0.31301
lr,0.0
global_step,1402.0
_step,29.0
_runtime,1233.0
_timestamp,1609761781.0
tp,405.0
tn,384.0
fp,44.0
fn,39.0


0,1
Training loss,█▅▄▄▂▂▂▂▄▃▃▂▂▁▃▃▄▃▂▄▁▃▁▂▃▂▂▄
lr,█████▇▇▇▇▆▆▆▅▅▄▄▄▃▃▂▂▂▂▁▁▁▁▁
global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_runtime,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_timestamp,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
tp,█▁
tn,▁█
fp,█▁
fn,▁█


[34m[1mwandb[0m: Agent Starting Run: rpnpn3mo with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 3
[34m[1mwandb[0m: 	learning_rate: 1.1160845847504799e-06
[34m[1mwandb[0m: 	num_train_epochs: 2
[34m[1mwandb[0m: 	scheduler: constant_schedule_with_warmup
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.0037592216352824886


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

  0%|          | 0/67349 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7640949263412694, 'tp': 401, 'tn': 368, 'fp': 60, 'fn': 43, 'acc': 0.8818807339449541, 'eval_loss': 0.28195505435450363}


Running Epoch 1 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7889412265635786, 'tp': 401, 'tn': 379, 'fp': 49, 'fn': 43, 'acc': 0.8944954128440367, 'eval_loss': 0.2524562614773392}
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,0.3682
lr,0.0
global_step,1402.0
_step,29.0
_runtime,1225.0
_timestamp,1609763012.0
tp,401.0
tn,379.0
fp,49.0
fn,43.0


0,1
Training loss,▇█▇▇▆▃▂▄▄▃▃▃▂▁▂▃▄▃▃▃▂▄▁▃▄▄▃▄
lr,▁███████████████████████████
global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_runtime,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_timestamp,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
tp,▁▁
tn,▁█
fp,█▁
fn,▁▁


[34m[1mwandb[0m: Agent Starting Run: 0voxd0i5 with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 3
[34m[1mwandb[0m: 	learning_rate: 8.735561911003484e-06
[34m[1mwandb[0m: 	num_train_epochs: 2
[34m[1mwandb[0m: 	scheduler: cosine_schedule_with_warmup
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.00141310832631597


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

  0%|          | 0/67349 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8120763302106621, 'tp': 409, 'tn': 381, 'fp': 47, 'fn': 35, 'acc': 0.9059633027522935, 'eval_loss': 0.23721734289114083}


Running Epoch 1 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8142079103795918, 'tp': 407, 'tn': 384, 'fp': 44, 'fn': 37, 'acc': 0.9071100917431193, 'eval_loss': 0.24035267524609188}
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,0.31101
lr,0.0
global_step,1402.0
_step,29.0
_runtime,1208.0
_timestamp,1609764230.0
tp,407.0
tn,384.0
fp,44.0
fn,37.0


0,1
Training loss,█▅▄▄▂▂▂▃▄▃▃▂▂▁▂▃▄▃▂▄▁▃▁▂▃▂▂▄
lr,█████▇▇▇▇▆▆▆▅▅▄▄▄▃▃▂▂▂▂▁▁▁▁▁
global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_runtime,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_timestamp,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
tp,█▁
tn,▁█
fp,█▁
fn,▁█


[34m[1mwandb[0m: Agent Starting Run: 5p9oodcw with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 5
[34m[1mwandb[0m: 	learning_rate: 6.656311230482887e-06
[34m[1mwandb[0m: 	num_train_epochs: 2
[34m[1mwandb[0m: 	scheduler: constant_schedule_with_warmup
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.003385323944370692


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

  0%|          | 0/67349 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8035291181835673, 'tp': 412, 'tn': 374, 'fp': 54, 'fn': 32, 'acc': 0.9013761467889908, 'eval_loss': 0.24155889205429532}


Running Epoch 1 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8281653350683839, 'tp': 402, 'tn': 395, 'fp': 33, 'fn': 42, 'acc': 0.9139908256880734, 'eval_loss': 0.25282240294800096}
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,0.36027
lr,1e-05
global_step,842.0
_step,17.0
_runtime,1186.0
_timestamp,1609765422.0
tp,402.0
tn,395.0
fp,33.0
fn,42.0


0,1
Training loss,█▂▂▃▄▂▂▅▂▅▁▁▃▂▁▄
lr,▁███████████████
global_step,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
_runtime,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
_timestamp,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
tp,█▁
tn,▁█
fp,█▁
fn,▁█


[34m[1mwandb[0m: Agent Starting Run: t2y8usrl with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 5
[34m[1mwandb[0m: 	learning_rate: 4.721561288792744e-06
[34m[1mwandb[0m: 	num_train_epochs: 2
[34m[1mwandb[0m: 	scheduler: constant_schedule_with_warmup
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.006151007426508764


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

  0%|          | 0/67349 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7854966483402703, 'tp': 410, 'tn': 368, 'fp': 60, 'fn': 34, 'acc': 0.8922018348623854, 'eval_loss': 0.25052550947720853}


Running Epoch 1 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8096622458273856, 'tp': 400, 'tn': 389, 'fp': 39, 'fn': 44, 'acc': 0.9048165137614679, 'eval_loss': 0.24198504275890118}
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,0.39117
lr,0.0
global_step,842.0
_step,17.0
_runtime,1186.0
_timestamp,1609766614.0
tp,400.0
tn,389.0
fp,39.0
fn,44.0


0,1
Training loss,█▃▂▃▄▁▁▄▂▅▂▁▃▁▁▄
lr,▁███████████████
global_step,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
_runtime,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
_timestamp,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
tp,█▁
tn,▁█
fp,█▁
fn,▁█


[34m[1mwandb[0m: Agent Starting Run: po0e9tan with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 5
[34m[1mwandb[0m: 	learning_rate: 4.056532236292127e-06
[34m[1mwandb[0m: 	num_train_epochs: 2
[34m[1mwandb[0m: 	scheduler: cosine_schedule_with_warmup
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.005212032697426896


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

  0%|          | 0/67349 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7849775833116024, 'tp': 407, 'tn': 371, 'fp': 57, 'fn': 37, 'acc': 0.8922018348623854, 'eval_loss': 0.25251234374051795}


Running Epoch 1 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7937967791331788, 'tp': 406, 'tn': 376, 'fp': 52, 'fn': 38, 'acc': 0.8967889908256881, 'eval_loss': 0.24233012697702155}
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,0.45862
lr,0.0
global_step,842.0
_step,17.0
_runtime,1205.0
_timestamp,1609767829.0
tp,406.0
tn,376.0
fp,52.0
fn,38.0


0,1
Training loss,█▄▃▃▄▁▁▃▂▅▂▂▃▁▁▄
lr,███▇▇▆▆▅▄▄▃▂▂▂▁▁
global_step,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
_runtime,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
_timestamp,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
tp,█▁
tn,▁█
fp,█▁
fn,▁█


[34m[1mwandb[0m: Agent Starting Run: 17z03br8 with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 5
[34m[1mwandb[0m: 	learning_rate: 6.322107306183069e-06
[34m[1mwandb[0m: 	num_train_epochs: 2
[34m[1mwandb[0m: 	scheduler: cosine_schedule_with_warmup
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.0032341229276420464


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

  0%|          | 0/67349 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.796045555925553, 'tp': 406, 'tn': 377, 'fp': 51, 'fn': 38, 'acc': 0.8979357798165137, 'eval_loss': 0.2457629206821049}


Running Epoch 1 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8050780821623625, 'tp': 406, 'tn': 381, 'fp': 47, 'fn': 38, 'acc': 0.9025229357798165, 'eval_loss': 0.23604631613632407}
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,0.43332
lr,0.0
global_step,842.0
_step,17.0
_runtime,1208.0
_timestamp,1609769047.0
tp,406.0
tn,381.0
fp,47.0
fn,38.0


0,1
Training loss,█▁▂▂▄▁▁▄▂▅▂▁▃▁▁▄
lr,███▇▇▆▆▅▄▄▃▂▂▂▁▁
global_step,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
_runtime,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
_timestamp,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
tp,▁▁
tn,▁█
fp,█▁
fn,▁▁


[34m[1mwandb[0m: Agent Starting Run: gwfmldd3 with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 3
[34m[1mwandb[0m: 	learning_rate: 5.303983427554373e-06
[34m[1mwandb[0m: 	num_train_epochs: 2
[34m[1mwandb[0m: 	scheduler: cosine_schedule_with_warmup
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.0015229085034022295


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

  0%|          | 0/67349 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.805232074609112, 'tp': 408, 'tn': 379, 'fp': 49, 'fn': 36, 'acc': 0.9025229357798165, 'eval_loss': 0.23865828819470394}


Running Epoch 1 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8027547146169012, 'tp': 405, 'tn': 381, 'fp': 47, 'fn': 39, 'acc': 0.9013761467889908, 'eval_loss': 0.2356006168085364}
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,0.32793
lr,0.0
global_step,1402.0
_step,29.0
_runtime,1230.0
_timestamp,1609770287.0
tp,405.0
tn,381.0
fp,47.0
fn,39.0


0,1
Training loss,█▆▄▅▃▂▂▃▄▃▃▃▂▁▂▃▄▃▃▄▁▃▁▂▄▃▃▄
lr,█████▇▇▇▇▆▆▆▅▅▄▄▄▃▃▂▂▂▂▁▁▁▁▁
global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_runtime,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_timestamp,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
tp,█▁
tn,▁█
fp,█▁
fn,▁█


[34m[1mwandb[0m: Agent Starting Run: gqfrd05c with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 5
[34m[1mwandb[0m: 	learning_rate: 2.6791175216841314e-06
[34m[1mwandb[0m: 	num_train_epochs: 2
[34m[1mwandb[0m: 	scheduler: constant_schedule_with_warmup
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.004771844427553388


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

  0%|          | 0/67349 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7672887641733004, 'tp': 407, 'tn': 363, 'fp': 65, 'fn': 37, 'acc': 0.8830275229357798, 'eval_loss': 0.2661074406441745}


Running Epoch 1 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7935071126217067, 'tp': 401, 'tn': 381, 'fp': 47, 'fn': 43, 'acc': 0.8967889908256881, 'eval_loss': 0.24052131595517243}
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,0.45134
lr,0.0
global_step,842.0
_step,17.0
_runtime,1212.0
_timestamp,1609771509.0
tp,401.0
tn,381.0
fp,47.0
fn,43.0


0,1
Training loss,██▃▄▄▂▂▄▂▅▂▂▃▁▁▄
lr,▁███████████████
global_step,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
_runtime,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
_timestamp,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
tp,█▁
tn,▁█
fp,█▁
fn,▁█


[34m[1mwandb[0m: Agent Starting Run: 6041ub18 with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 3
[34m[1mwandb[0m: 	learning_rate: 2.047740156392698e-06
[34m[1mwandb[0m: 	num_train_epochs: 2
[34m[1mwandb[0m: 	scheduler: constant_schedule_with_warmup
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.002263145201168398


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

  0%|          | 0/67349 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7743017083757797, 'tp': 409, 'tn': 364, 'fp': 64, 'fn': 35, 'acc': 0.8864678899082569, 'eval_loss': 0.2631113874392772}


Running Epoch 1 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7981225619669111, 'tp': 399, 'tn': 385, 'fp': 43, 'fn': 45, 'acc': 0.8990825688073395, 'eval_loss': 0.24187598524427195}
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,0.3281
lr,0.0
global_step,1402.0
_step,29.0
_runtime,1233.0
_timestamp,1609772752.0
tp,399.0
tn,385.0
fp,43.0
fn,45.0


0,1
Training loss,▇█▇▅▅▂▂▃▄▃▃▃▃▁▂▃▄▃▃▃▂▃▁▂▄▃▃▄
lr,▁███████████████████████████
global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_runtime,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_timestamp,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
tp,█▁
tn,▁█
fp,█▁
fn,▁█


[34m[1mwandb[0m: Agent Starting Run: l8ildhd9 with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 3
[34m[1mwandb[0m: 	learning_rate: 4.49942347424062e-06
[34m[1mwandb[0m: 	num_train_epochs: 2
[34m[1mwandb[0m: 	scheduler: cosine_schedule_with_warmup
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.004101925688474225


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

  0%|          | 0/67349 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8005542687303825, 'tp': 406, 'tn': 379, 'fp': 49, 'fn': 38, 'acc': 0.9002293577981652, 'eval_loss': 0.24237376380575085}


Running Epoch 1 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8027547146169012, 'tp': 405, 'tn': 381, 'fp': 47, 'fn': 39, 'acc': 0.9013761467889908, 'eval_loss': 0.23705393885937306}
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,0.33858
lr,0.0
global_step,1402.0
_step,29.0
_runtime,1233.0
_timestamp,1609773996.0
tp,405.0
tn,381.0
fp,47.0
fn,39.0


0,1
Training loss,█▇▅▅▃▁▂▃▄▃▃▃▂▁▂▃▄▃▃▄▁▃▁▂▄▃▂▄
lr,█████▇▇▇▇▆▆▆▅▅▄▄▄▃▃▂▂▂▂▁▁▁▁▁
global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_runtime,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_timestamp,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
tp,█▁
tn,▁█
fp,█▁
fn,▁█


[34m[1mwandb[0m: Agent Starting Run: l3p1mnu6 with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 5
[34m[1mwandb[0m: 	learning_rate: 4.2501910329574946e-06
[34m[1mwandb[0m: 	num_train_epochs: 2
[34m[1mwandb[0m: 	scheduler: constant_schedule_with_warmup
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.003510981399368896


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

  0%|          | 0/67349 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/2105 [00:00<?, ?it/s]

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


# Train final model

Once we have selected the model and best hyperparameters, we train it and log the metrics in W&B. 

In [None]:
import logging

import pandas as pd
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` is loaded, so the attribute access below would only
# work if some other package happened to import it first.
import sklearn.metrics

import wandb
from simpletransformers.classification import (
    ClassificationArgs,
    ClassificationModel,
)


# Logger: INFO for the notebook in general, WARNING only for `transformers`.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Args
model_args = ClassificationArgs()
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.max_seq_length = 256
model_args.manual_seed = 4
model_args.use_multiprocessing = True
model_args.train_batch_size = 16 # Train = 67349 / 16 ≈ 4210 batches in 1 epoch
model_args.eval_batch_size = 8
model_args.evaluate_during_training_verbose = True
model_args.evaluate_during_training = True
model_args.best_model_dir = "outputs/best_model" # lower eval_loss
model_args.evaluate_during_training_steps = 1000
model_args.save_model_every_epoch = False
# NOTE(review): the original comment on this line read "We dont have enough
# space", which contradicts saving a checkpoint at every evaluation. The value
# is left unchanged; confirm whether False was intended.
model_args.save_eval_checkpoints = True
model_args.use_cached_eval_features = True
model_args.train_custom_parameters_only = False # Train specific parameters? No, train all parameters
model_args.gradient_accumulation_steps = 2 # Number of batches to run before each optimizer.step(). More = less time


# NOTE(review): the project name mentions RTE although this notebook trains on
# SST2 - presumably copied from another notebook; confirm before renaming.
model_args.wandb_project = "RTE - Hyperparameter Optimization"
model_args.wandb_kwargs = {"name": "vanilla"}


# Hyperparameters
model_args.learning_rate = 0.0000057
model_args.num_train_epochs = 2
model_args.weight_decay = 0.0005


# Create a TransformerModel
model = ClassificationModel(
        'roberta',
        "distilroberta-base",
        use_cuda=True,
        args=model_args,
        num_labels=2,
    )

# Train the model and evaluate on the dev split during training.
# The best model is selected according to that evaluation.
model.train_model(sst2_df_train,
                      eval_df=sst2_df_dev,
                      acc=sklearn.metrics.accuracy_score)

# Evaluate the model on the test split. The three return values are bound to
# names so the cell does not dump the raw outputs and the list of wrong
# predictions; only the metrics dict is displayed.
result, model_outputs, wrong_predictions = model.eval_model(
    sst2_df_test,
    acc=sklearn.metrics.accuracy_score,
    verbose=True,
)
result



INFO:filelock:Lock 140584132087480 acquired on /root/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46.lock


Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

INFO:filelock:Lock 140584132087480 released on /root/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46.lock
INFO:filelock:Lock 140584132221528 acquired on /root/.cache/huggingface/transformers/7a0115a4c463f49bc7ab011872fc4a4b81be681a0434075955d29ac3388e225b.a6127d76576e81475313180aceb31a8688f7a649b80e380d26b5d30302dc83c1.lock


Downloading:   0%|          | 0.00/331M [00:00<?, ?B/s]

INFO:filelock:Lock 140584132221528 released on /root/.cache/huggingface/transformers/7a0115a4c463f49bc7ab011872fc4a4b81be681a0434075955d29ac3388e225b.a6127d76576e81475313180aceb31a8688f7a649b80e380d26b5d30302dc83c1.lock
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

INFO:filelock:Lock 140584118273584 released on /root/.cache/huggingface/transformers/23e0f7484fc8a320856b168861166b48c2976bb4e0861602422e1b0c3fe5bf61.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
INFO:filelock:Lock 140584118211416 acquired on /root/.cache/huggingface/transformers/c7e8020011da613ff5a9175ddad64cd47238a9525db975eb50ecb965e9f7302f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock


Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

INFO:filelock:Lock 140584118211416 released on /root/.cache/huggingface/transformers/c7e8020011da613ff5a9175ddad64cd47238a9525db975eb50ecb965e9f7302f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


  0%|          | 0/67349 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Initializing WandB run for training.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Running Epoch 0 of 2:   0%|          | 0/4210 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7764960604021648, 'tp': 409, 'tn': 365, 'fp': 63, 'fn': 35, 'acc': 0.8876146788990825, 'eval_loss': 0.2701502483286852}
INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8118657807619595, 'tp': 405, 'tn': 385, 'fp': 43, 'fn': 39, 'acc': 0.9059633027522935, 'eval_loss': 0.23816452551325526}
INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8164554477970227, 'tp': 406, 'tn': 386, 'fp': 42, 'fn': 38, 'acc': 0.908256880733945, 'eval_loss': 0.22790221981020695}


Running Epoch 1 of 2:   0%|          | 0/4210 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8103432095785579, 'tp': 413, 'tn': 376, 'fp': 52, 'fn': 31, 'acc': 0.9048165137614679, 'eval_loss': 0.2780458481102293}
INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8054543283740327, 'tp': 410, 'tn': 377, 'fp': 51, 'fn': 34, 'acc': 0.9025229357798165, 'eval_loss': 0.25596597687546335}
INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_872
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8099289794589316, 'tp': 410, 'tn': 379, 'fp': 49, 'fn': 34, 'acc': 0.9048165137614679, 'eval_loss': 0.2526378021618232}
INFO:simpletransformers.classification.

  0%|          | 0/1821 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/228 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Initializing WandB run for evaluation.


VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,0.03752
lr,0.0
global_step,4210.0
_step,89.0
_runtime,1337.0
_timestamp,1609407428.0
tp,410.0
tn,379.0
fp,49.0
fn,34.0


0,1
Training loss,█▆▃▇▃▄▂▃▂▃▃▂▄▃▂▂▂▁▁▁▁▁▁▁▂▁▂▃▂▂▂▃▁▂▂▃▂▁▃▁
lr,▂▅████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
tp,▅▁▂█▅▅
tn,▁██▅▅▆
fp,█▁▁▄▄▃
fn,▅█▇▁▄▄


INFO:simpletransformers.classification.classification_model:{'mcc': 0.8635984931439648, 'tp': 867, 'tn': 829, 'fp': 83, 'fn': 42, 'acc': 0.9313563975837452, 'eval_loss': 0.18159797959940574}


({'acc': 0.9313563975837452,
  'eval_loss': 0.18159797959940574,
  'fn': 42,
  'fp': 83,
  'mcc': 0.8635984931439648,
  'tn': 829,
  'tp': 867},
 array([[ 0.57324219, -0.64160156],
        [ 2.59570312, -2.45507812],
        [-0.60107422,  0.55175781],
        ...,
        [-3.25976562,  3.1953125 ],
        [-2.23242188,  2.33398438],
        [-1.49023438,  1.68359375]]),
 [<simpletransformers.classification.classification_utils.InputExample at 0x7fdc494a06d8>,
  <simpletransformers.classification.classification_utils.InputExample at 0x7fdc494a0eb8>,
  <simpletransformers.classification.classification_utils.InputExample at 0x7fdc4a6ecc18>,
  <simpletransformers.classification.classification_utils.InputExample at 0x7fdc4a6ec358>,
  <simpletransformers.classification.classification_utils.InputExample at 0x7fdc4a6ec2b0>,
  <simpletransformers.classification.classification_utils.InputExample at 0x7fdc4a6ec278>,
  <simpletransformers.classification.classification_utils.InputExample at 0x7f

In [None]:
def f1_multiclass(labels, preds):
    """Return the F1 score of ``preds`` measured against ``labels``.

    Thin wrapper around ``sklearn.metrics.f1_score`` with the
    ``(labels, preds)`` signature, so it can be handed to ``eval_model``
    as an extra metric.

    NOTE(review): despite the name, no ``average`` argument is passed, so
    scikit-learn's default averaging applies - confirm this is the intended
    behaviour for the two-class SST2 labels.
    """
    score = sklearn.metrics.f1_score(y_true=labels, y_pred=preds)
    return score
# Evaluate on the test split again, this time reporting F1 next to accuracy.
result, model_outputs, wrong_predictions = model.eval_model(
    sst2_df_test,
    verbose=True,
    acc=sklearn.metrics.accuracy_score,
    f1=f1_multiclass,
)

INFO:simpletransformers.classification.classification_model: Features loaded from cache at cache_dir/cached_dev_roberta_256_2_1821


Running Evaluation:   0%|          | 0/228 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Initializing WandB run for evaluation.


VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
_step,2
_runtime,1
_timestamp,1609411805


0,1
_step,▁▅█
_runtime,▁▁▁
_timestamp,▁▁▁


INFO:simpletransformers.classification.classification_model:{'mcc': 0.8635984931439648, 'tp': 867, 'tn': 829, 'fp': 83, 'fn': 42, 'acc': 0.9313563975837452, 'eval_loss': 0.18159797959940574, 'f1': 0.9327595481441635}


In [None]:
# Smoke test on one ad-hoc sentence: `predict` returns the predicted labels
# (0 = Negative, 1 = Positive) together with the raw per-class outputs.
predictions, raw_outputs = model.predict(["HI my friend"])
print(predictions, raw_outputs)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[1] [[-2.21484375  1.97949219]]


# Visualizations and ELI5 examples

In [None]:
# Dependencies for the visualization / ELI5 section.
# `%pip` is used instead of `!pip` so the packages are installed into the
# environment of the running kernel rather than whatever `pip` is on the PATH.
# NOTE(review): versions are unpinned; the recorded run resolved
# transformers 4.2.2 - consider pinning for reproducibility.
%pip install transformers
%pip install -U simpletransformers
%pip install eli5

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 20.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 59.0MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 58.9MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=7cf58b6c085a7c6663

In [None]:
from simpletransformers.classification import (
    ClassificationArgs,
    ClassificationModel,
)

# Directory the saved model is loaded from (presumably a mounted Google Drive
# folder - verify it is mounted before running this cell).
path = "/content/drive/MyDrive/Datos_fake_news/Sentiment_Analysis/SST2/models"

# GPU-backed instance of the saved RoBERTa classifier, with caching disabled.
model_roberta = ClassificationModel(
    "roberta",
    path,
    args={"no_cache": True},
    use_cuda=True,
)

In [None]:
# CPU-only instance loaded from the same `path`, with caching disabled.
model_roberta_no_cuda = ClassificationModel(
    "roberta",
    path,
    args={"no_cache": True},
    use_cuda=False,
)

predict_proba : callable
    Black-box classification pipeline. predict_proba
    should be a function which takes a list of strings (documents)
    and return a matrix of shape (n_samples, n_classes) with
    probability values - a row per document and a column per output
    label.

## ELI5

https://towardsdatascience.com/adding-interpretability-to-multiclass-text-classification-models-c44864e8a13b

In [None]:
import torch.nn as nn
import torch

def predict_proba(sentence):
  """Return class probabilities for a list of documents.

  The documents are passed to ``model_roberta.predict``; its raw outputs are
  turned into a tensor and normalised with a softmax over dimension 1, giving
  one row per document. This is the callable handed to ELI5's
  ``TextExplainer.fit`` as ``predict_proba``.
  """
  _, raw_scores = model_roberta.predict(sentence)
  return torch.softmax(torch.tensor(raw_scores), dim=1)
# Sanity check on a single document: the result should have one row for the
# document and one column per class.
results = predict_proba(["Hi friend"])
results.shape

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

torch.Size([1, 2])

In [None]:
# Imported here so the cell also works on a fresh kernel: the notebook's other
# import of TextExplainer sits in a later cell, so running top to bottom
# would otherwise raise NameError at this point.
from eli5.lime import TextExplainer

# Explainer with a fixed seed. Note that `te` is re-created with
# n_samples=500 in the following cell before it is fitted.
te = TextExplainer(random_state=42, n_samples=1000)

In [None]:
import eli5
from eli5.lime import TextExplainer

# Fit the explainer on a single sentence, using `predict_proba` as the
# black-box classifier, then render the explanation with readable class names.
te = TextExplainer(n_samples=500, random_state=42)
te.fit(
    "I will find you and I will kill you",
    predict_proba=predict_proba,
)
h = te.show_prediction(target_names=["Negative", "Positive"])
h

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Contribution?,Feature
0.455,<BIAS>
-0.265,Highlighted in text (sum)


In [None]:
from IPython.display import display, HTML

# Render the explanation's HTML payload explicitly. `h` is the object returned
# by `te.show_prediction(...)` in the cell above. The former `type(...)`
# wrapper was leftover debugging: `display` returns None, so it only added a
# meaningless `NoneType` output to the cell.
display(HTML(h.data))

Contribution?,Feature
0.455,<BIAS>
-0.265,Highlighted in text (sum)


NoneType

In [None]:
# Fit-quality metrics of the explainer: a dict with `mean_KL_divergence` and
# `score`. Per the ELI5 documentation, a low KL divergence and a score close
# to 1 suggest the explanation tracks the black-box model well - confirm
# against the docs before relying on the explanation.
print(te.metrics_)

{'mean_KL_divergence': 0.009694694194737053, 'score': 0.9967585753109094}


In [None]:
# Show the explanation again, this time without custom target names.
# NOTE(review): the saved output lists features ("hi", "tonight", "friend",
# "eat", ...) that do not occur in the sentence fitted above, so it appears to
# come from an earlier fit - re-run the notebook top to bottom to refresh it.
te.show_prediction()

Contribution?,Feature
1.11,hi
0.862,will
0.784,my
0.765,tonight
0.721,friend
0.619,<BIAS>
0.528,to
0.32,we
0.311,come
0.287,eat
