# Introduction

In this notebook we fine-tune Transformer-based models on the TASS sentiment data. There are three classes: "P" (positive), "NEU" (neutral) and "N" (negative). We use simpletransformers to train the models and wandb to log the results.

| Negative | Neutral | Positive | Total |
|----------|---------|----------|-------|
| 22205 (41%)    | 3945 (7%)   | 27962 (52%)   | 54112 |



In [None]:
# Install (or upgrade) the libraries used in this notebook via shell commands.
# NOTE(review): no versions are pinned, so a fresh run may pull releases newer
# than the ones this notebook was originally executed with.
!pip install --upgrade wandb
!pip install -q transformers
!pip install -q seqeval
!pip install -q tensorboardx
# -U upgrades simpletransformers if it is already installed.
!pip install -U -q simpletransformers
!pip install -q tqdm

Requirement already up-to-date: wandb in /usr/local/lib/python3.6/dist-packages (0.10.12)


# Data

In [None]:
import pandas as pd
# Load the pickled TASS DataFrame.
# NOTE(review): read_pickle can execute code embedded in the file, so it must
# come from a trusted source; the absolute path also assumes Google Drive is
# already available under /content/drive — confirm before running.
TASS_data = pd.read_pickle(
    "/content/drive/MyDrive/Datos_fake_news/Sentiment_Analysis/TASS/TASS_data_df.pkl"
)

# Parallel lists taken from the `content` and `polarity` columns.
text_data = TASS_data["content"].tolist()
label_data = TASS_data["polarity"].tolist()

# Split data

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 5% of the data as the test set, stratified by label.
X_train_dev, X_test, y_train_dev, y_test = train_test_split(
    text_data,
    label_data,
    test_size=0.05,
    random_state=42,
    stratify=label_data,
)

# Step 2: split 2% of the remainder off as the dev set, again stratified.
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train_dev,
    y_train_dev,
    test_size=0.02,
    random_state=42,
    stratify=y_train_dev,
)

In [None]:
# Wrap each split in a two-column DataFrame: "text" and "labels".
TASS_train_df = pd.DataFrame(dict(text=X_train, labels=y_train))
TASS_dev_df = pd.DataFrame(dict(text=X_dev, labels=y_dev))
TASS_test_df = pd.DataFrame(dict(text=X_test, labels=y_test))

In [None]:

# Number of training examples per label.
TASS_train_df.groupby(["labels"], as_index=False).count()

Unnamed: 0,labels,text
0,N,20672
1,NEU,3673
2,P,26032


In [None]:
# Number of dev examples per label.
TASS_dev_df.groupby(["labels"], as_index=False).count()

Unnamed: 0,labels,text
0,N,422
1,NEU,75
2,P,532


In [None]:
# Number of test examples per label.
TASS_test_df.groupby(["labels"], as_index=False).count()

Unnamed: 0,labels,text
0,N,1111
1,NEU,197
2,P,1398


# TRAIN

In [None]:
import logging

import pandas as pd
import sklearn

import wandb
from simpletransformers.classification import (
    ClassificationArgs,
    ClassificationModel,
)

# Grid sweep for BETO: one epoch count, one learning rate and two weight-decay
# values, i.e. two grid points. Alternatives that were left commented out in
# the original cell are kept as notes next to each entry.
# (Also left commented out: per-layer parameters,
#  layer_parameters = {f"layer_{i}": {"values":5e-5} for i in range(0, 12, 1)},
#  and a "train_batch_size" entry with values [16, 32].)
sweep_config = dict(
    name="BETO",
    method="grid",  # other option noted: random
    metric=dict(name="train_loss", goal="minimize"),
    parameters=dict(
        num_train_epochs=dict(values=[2]),  # alternative noted: 10
        learning_rate=dict(values=[6e-6]),  # alternative noted: 5e-5
        weight_decay=dict(values=[0.0005, 0.005]),  # alternatives noted: 0.05, 0.01
    ),
    early_terminate=dict(type="hyperband", min_iter=6),
)

# Register the sweep in the "TASS-train" project; the returned id is what the
# agent is started with.
sweep_id = wandb.sweep(sweep_config, project="TASS-train")


# Logging setup: the "transformers" logger only reports warnings and above,
# while the root configuration is requested at INFO level.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

# Fixed training options for every sweep trial, applied to a fresh
# ClassificationArgs in the order listed. The sweep's own values are handed to
# the model separately (via sweep_config) when the model is created.
# wandb_project is left unset; the original cell kept a commented-out
# assignment to "SST2-train".
model_args = ClassificationArgs()
_fixed_options = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "max_seq_length": 256,
    "manual_seed": 4,
    "learning_rate": 5e-5,
    "use_multiprocessing": True,
    # Displayed train split: 20672 + 3673 + 26032 = 50377 rows,
    # i.e. about 3149 batches of 16 per epoch.
    "train_batch_size": 16,
    "eval_batch_size": 8,
    "evaluate_during_training_verbose": True,
    "evaluate_during_training": True,
    "best_model_dir": "outputs/best_model",  # original note: lower eval_loss
    "evaluate_during_training_steps": 1500,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,  # not enough disk space for checkpoints
    "use_cached_eval_features": True,
    "train_custom_parameters_only": False,  # train all parameters
    # Number of training batches to run before each optimizer.step().
    "gradient_accumulation_steps": 2,
    "labels_list": ["N", "NEU", "P"],
}
for _option, _value in _fixed_options.items():
    setattr(model_args, _option, _value)

from sklearn.metrics import f1_score, accuracy_score,  matthews_corrcoef, cohen_kappa_score


def f1_multiclass(labels, preds):
    """Return the macro-averaged F1 score of ``preds`` against ``labels``.

    Thin wrapper around ``f1_score`` that fixes ``average='macro'`` so it can
    be passed around as a two-argument metric callable.
    """
    macro_f1 = f1_score(labels, preds, average='macro')
    return macro_f1

def train():
    """Run one sweep trial: fine-tune BETO on the TASS train split.

    Takes no arguments so it can be handed to ``wandb.agent`` as the trial
    function. The trial's hyperparameters are read from ``wandb.config`` and
    passed to the model as ``sweep_config``; all other options come from the
    module-level ``model_args``. The dev split is passed as ``eval_df`` with
    accuracy, macro-F1, MCC and Cohen's kappa as extra metrics.

    Returns:
        None. Results are logged to the wandb run opened here.
    """
    # Initialize a new wandb run
    wandb.init()

    # Create the classifier (three labels, CPU only because use_cuda=False).
    model = ClassificationModel(
        'bert',
        "dccuchile/bert-base-spanish-wwm-uncased",
        use_cuda=False,
        args=model_args,
        num_labels=3,
        sweep_config=wandb.config,
    )

    # Train the model and evaluate on the dev split. All metric callables use
    # the names imported from sklearn.metrics in this cell, rather than mixing
    # `sklearn.metrics.X` attribute access with bare names.
    model.train_model(
        TASS_train_df,
        eval_df=TASS_dev_df,
        acc=accuracy_score,
        f1=f1_multiclass,
        mcc=matthews_corrcoef,
        kappa=cohen_kappa_score,
    )

    # NOTE: the held-out test split (TASS_test_df) is not evaluated here.

    # Close the run so its data is synced. wandb.finish() is the supported
    # call; wandb.join() is a deprecated alias for it.
    wandb.finish()


# Start the sweep agent; it calls train() for the trials of this sweep.
wandb.agent(sweep_id, function=train)

In [None]:
import wandb
from simpletransformers.classification import (
    ClassificationArgs,
    ClassificationModel,
)

In [None]:
import logging

import pandas as pd
import sklearn


# Minimal grid sweep with a single grid point: 1 epoch at learning rate 5e-5.
sweep_config = dict(
    method="grid",  # other option noted: random
    metric=dict(name="train_loss", goal="minimize"),
    parameters=dict(
        num_train_epochs=dict(values=[1]),
        learning_rate=dict(values=[5e-5]),
    ),
)

# Register the sweep under the "Hola" project (the original cell kept a
# commented-out variant that used project="Simple Sweep").
sweep_id = wandb.sweep(sweep_config, project="Hola")

# Logging setup: the "transformers" logger only reports warnings and above,
# while the root configuration is requested at INFO level.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

# Toy training set: two (text, label) pairs.
train_data = [
    ["Aragorn was the heir of Isildur", "true"],
    ["Frodo was the heir of Isildur", "false"],
]
train_df = pd.DataFrame(train_data, columns=["text", "labels"])

# Toy evaluation set with the same two-column layout.
eval_data = [
    ["Theoden was the king of Rohan", "true"],
    ["Merry was the king of Rohan", "false"],
]
eval_df = pd.DataFrame(eval_data, columns=["text", "labels"])

# Training options for the toy run, applied to a fresh ClassificationArgs in
# the order listed.
# NOTE(review): wandb_project is "Simple Sweep" here, while the sweep in this
# cell is created under project "Hola" — confirm which project is intended.
model_args = ClassificationArgs()
_toy_options = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "evaluate_during_training": True,
    "manual_seed": 4,
    "use_multiprocessing": True,
    "train_batch_size": 16,
    "eval_batch_size": 8,
    "labels_list": ["true", "false"],
    "wandb_project": "Simple Sweep",
}
for _option, _value in _toy_options.items():
    setattr(model_args, _option, _value)

def train():
    """Run one sweep trial: train and evaluate roberta-base on the toy data.

    Takes no arguments so it can be handed to ``wandb.agent`` as the trial
    function. The trial's hyperparameters are read from ``wandb.config`` and
    passed to the model as ``sweep_config``; all other options come from the
    module-level ``model_args``. ``eval_df`` is used both during training and
    for the final ``eval_model`` call.

    Returns:
        None. Results are logged to the wandb run opened here.
    """
    # Initialize a new wandb run
    wandb.init()

    # Create the classifier (CPU only because use_cuda=False).
    model = ClassificationModel(
        "roberta",
        "roberta-base",
        use_cuda=False,
        args=model_args,
        sweep_config=wandb.config,
    )

    # Train the model
    model.train_model(train_df, eval_df=eval_df)

    # Evaluate the model
    model.eval_model(eval_df)

    # Close the run so its data is synced. wandb.finish() is the supported
    # call; wandb.join() is a deprecated alias for it.
    wandb.finish()


# Start the sweep agent; it calls train() for the trials of this sweep.
wandb.agent(sweep_id, function=train)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
INFO:wandb.agents.pyagent:Starting sweep agent: entity=None, project=None, count=None


Create sweep with ID: rlyl5w3p
Sweep URL: https://wandb.ai/huertas_97/Hola/sweeps/rlyl5w3p


[34m[1mwandb[0m: Agent Starting Run: p0ba3z24 with config:
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	num_train_epochs: 1
[34m[1mwandb[0m: Currently logged in as: [33mhuertas_97[0m (use `wandb login --relogin` to force relogin)


INFO:filelock:Lock 139764250354688 acquired on /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b.lock


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

INFO:filelock:Lock 139764250354688 released on /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b.lock
INFO:filelock:Lock 139761839360544 acquired on /root/.cache/huggingface/transformers/51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7.lock


Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

INFO:filelock:Lock 139761839360544 released on /root/.cache/huggingface/transformers/51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7.lock
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of R

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

INFO:filelock:Lock 139761804319264 released on /root/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
INFO:filelock:Lock 139761804319264 acquired on /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock


Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

INFO:filelock:Lock 139761804319264 released on /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1 [00:00<?, ?it/s]

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.0, 'tp': 0, 'tn': 1, 'fp': 0, 'fn': 1, 'eval_loss': 0.693641185760498}


VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.96281918871…

0,1
global_step,1.0
tp,0.0
tn,1.0
fp,0.0
fn,1.0
mcc,0.0
train_loss,0.65265
eval_loss,0.69364
_step,3.0
_runtime,57.0


0,1
global_step,▁
tp,▁
tn,▁
fp,▁
fn,▁
mcc,▁
train_loss,▁
eval_loss,▁
_step,▁▃▆█
_runtime,▁███


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [None]:
# Reload the saved best model from disk as a RoBERTa classifier on CPU.
# NOTE(review): despite the variable name, the model type passed here is
# 'roberta', not a distilled model.
distilmodel = ClassificationModel(
    "roberta",
    "/content/outputs/best_model",
    use_cuda=False,
)

In [None]:
# Pin httplib2 and google-api-python-client to specific older releases
# (presumably for the PyDrive upload in the next cell — TODO confirm).
# NOTE(review): the recorded pip output reports that both pins conflict with
# other installed packages (google-api-python-client 1.7.12 and
# firebase-admin 4.4.0); check whether these downgrades are still required.
!pip install httplib2==0.15.0

!pip install google-api-python-client==1.6

Collecting httplib2==0.15.0
[?25l  Downloading https://files.pythonhosted.org/packages/be/83/5e006e25403871ffbbf587c7aa4650158c947d46e89f2d50dcaf018464de/httplib2-0.15.0-py3-none-any.whl (94kB)
[K     |███▌                            | 10kB 13.6MB/s eta 0:00:01[K     |███████                         | 20kB 19.1MB/s eta 0:00:01[K     |██████████▍                     | 30kB 9.0MB/s eta 0:00:01[K     |█████████████▉                  | 40kB 3.1MB/s eta 0:00:01[K     |█████████████████▎              | 51kB 3.8MB/s eta 0:00:01[K     |████████████████████▊           | 61kB 4.4MB/s eta 0:00:01[K     |████████████████████████▏       | 71kB 4.5MB/s eta 0:00:01[K     |███████████████████████████▋    | 81kB 5.0MB/s eta 0:00:01[K     |███████████████████████████████ | 92kB 5.2MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 4.0MB/s 
[31mERROR: google-api-python-client 1.7.12 has requirement httplib2<1dev,>=0.17.0, but you'll have httplib2 0.15.0 which is incompa

Collecting google-api-python-client==1.6
[?25l  Downloading https://files.pythonhosted.org/packages/ff/f9/89fa38198e8f2cdbb62f334545496e14f12b75075f56f8a3e7b487497997/google_api_python_client-1.6.0-py2.py3-none-any.whl (52kB)
[K     |██████▎                         | 10kB 14.8MB/s eta 0:00:01[K     |████████████▌                   | 20kB 18.7MB/s eta 0:00:01[K     |██████████████████▉             | 30kB 11.6MB/s eta 0:00:01[K     |█████████████████████████       | 40kB 6.6MB/s eta 0:00:01[K     |███████████████████████████████▍| 51kB 4.1MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.0MB/s 
[31mERROR: firebase-admin 4.4.0 has requirement google-api-python-client>=1.7.8, but you'll have google-api-python-client 1.6.0 which is incompatible.[0m
Installing collected packages: google-api-python-client
  Found existing installation: google-api-python-client 1.7.12
    Uninstalling google-api-python-client-1.7.12:
      Successfully uninstalled google-api-pytho

In [None]:
# Import PyDrive and associated libraries.
# This only needs to be done once in a notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and build a PyDrive client from the
# application-default credentials. Only needed once per notebook session.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Upload the local zip archive to Drive under the same title.
# NOTE(review): no visible cell creates model_prueba.zip, so it must already
# exist in the working directory before this cell runs.
archive_name = 'model_prueba.zip'
uploaded = drive.CreateFile({'title': archive_name})
uploaded.SetContentFile(archive_name)
uploaded.Upload()

uploaded_id = uploaded.get('id')
print('Uploaded file with ID {}'.format(uploaded_id))

Uploaded file with ID 1MJrpErQxnu8duKFM9_8wLrwDjxmkxDNl
