<a href="https://colab.research.google.com/github/NicolasMauge/learning_projects/blob/master/Pytorch_Toxic_Comments_w_Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up the Google Colab environment

In [0]:
# Print the installed PyTorch version so the environment used for this run is visible in the notebook.
import torch
print(torch.__version__)

1.1.0


In [None]:
# Clone NVIDIA apex only if it is not already present, so the cell can be re-run without a git error.
![ -d apex ] || git clone https://github.com/NVIDIA/apex.git

In [None]:
# Build and install apex with its C++ and CUDA extensions.
# `&&` (instead of `;`) stops here if `cd apex` fails, so pip never installs "." from the wrong directory.
!cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .

In [0]:
# List the working directory to check which folders (apex, data, logs, model, ...) are present.
!ls

apex  data  logs  model  sample_data


In [None]:
# Install the two libraries imported in the training section: fast_bert and pytorch_pretrained_bert.
# NOTE(review): neither package is version-pinned, so a fresh install may pull releases that differ
# from the ones this notebook was written against — consider pinning.
!pip install fast-bert
!pip install pytorch-pretrained-bert

# Downloading the data

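Before running the next cell, replace the placeholder username (`xxx`) and key (`yyy`) with your own Kaggle API credentials. Do not share or commit the notebook with real credentials filled in.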

In [0]:
%%writefile kaggle.json
{"username":"xxx","key":"yyy"}

Writing kaggle.json


In [0]:
# Move the API token into ~/.kaggle/ and make it readable by the owner only.
# The steps are chained with `&&` so a failure (e.g. kaggle.json missing) stops the chain
# instead of running the remaining commands anyway.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [0]:
# Create the dataset directory; `-p` makes this a no-op when it already exists (safe to re-run).
!mkdir -p data/toxic_comments
# Download the competition files into that directory; `&&` prevents a download into the wrong
# directory if the `cd` fails.
!cd data/toxic_comments && kaggle competitions download -c jigsaw-toxic-comment-classification-challenge
# Show the working directory, then the downloaded files.
!ls ; ls data/toxic_comments
# Unpack the train and test archives; `-o` overwrites existing files without the interactive
# prompt that would otherwise block the cell on a re-run.
!cd data/toxic_comments && unzip -o train.csv.zip && unzip -o test.csv.zip && ls

In [0]:
# Load the raw training set, print its column names, then display the first rows as the cell output.
import pandas as pd
dat = pd.read_csv("data/toxic_comments/train.csv")
print(dat.columns)
dat.head()

In [0]:
# The six label names, written to labels.csv one per line (no header row, no index column).
labels_list = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
labels_pandas = pd.DataFrame(labels_list)
labels_pandas.to_csv(
    "data/toxic_comments/labels.csv",
    header=False,
    index=False,
)

In [0]:
# Print the generated labels file to verify its content.
!cat data/toxic_comments/labels.csv

In [0]:
# List the dataset directory directly: with `cd ... ; ls`, a failed `cd` would silently list
# the wrong directory instead of reporting the problem.
!ls data/toxic_comments

data_train.csv	sample_submission.csv.zip  test_labels.csv.zip
data_valid.csv	test.csv		   train.csv
labels.csv	test.csv.zip		   train.csv.zip


Split train / valid (80% / 20%)

In [0]:
import numpy as np

# Shuffle the full training set once and cut it at the 80% mark: the first part becomes the
# training set, the remainder the validation set.
# The shuffle is seeded so that re-running the notebook produces the same split.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

df = pd.read_csv("data/toxic_comments/train.csv")
shuffled = df.sample(frac=1, random_state=SPLIT_SEED)
n_train = int(TRAIN_FRACTION * len(df))

# Positional slicing gives the same two parts as splitting at index n_train.
train = shuffled.iloc[:n_train]
valid = shuffled.iloc[n_train:]

# Every row must land in exactly one of the two parts.
assert len(train) + len(valid) == len(df)
print(len(df), len(train), len(valid))

train.to_csv("data/toxic_comments/data_train.csv", index=False)
valid.to_csv("data/toxic_comments/data_valid.csv", index=False)

159571 127656 31915


# Convert the Google BERT model

## Download the model

In [None]:
# Download the multilingual cased BERT checkpoint into model/.
# `mkdir -p` and `wget -nc` make the cell re-runnable (no error on an existing directory, no second
# copy of an already downloaded archive); `&&` stops the chain if a step fails.
!mkdir -p model && cd model && wget -nc "https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip"
# Unpack the archive; `-o` overwrites without the interactive prompt that would block a re-run.
!cd model && unzip -o multi_cased_L-12_H-768_A-12.zip

In [0]:
# List the unpacked model directory directly, so a missing directory is reported as an error
# instead of silently listing some other directory after a failed `cd`.
!ls model/multi_cased_L-12_H-768_A-12

bert_config.json		     bert_model.ckpt.index  pytorch_model.bin
bert_model.ckpt.data-00000-of-00001  bert_model.ckpt.meta   vocab.txt


## Convert the model to PyTorch

The converted model is written to `model/multi_cased_L-12_H-768_A-12/pytorch_model.bin`.

In [None]:
# Convert the TensorFlow checkpoint to PyTorch weights with the pytorch_pretrained_bert CLI.
# The three arguments are, in order: the checkpoint prefix, the BERT config file, and the output
# path of the converted weights (pytorch_model.bin in the same directory).
!export BERT_BASE_DIR=model/multi_cased_L-12_H-768_A-12 ; pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch $BERT_BASE_DIR/bert_model.ckpt $BERT_BASE_DIR/bert_config.json $BERT_BASE_DIR/pytorch_model.bin

# Train the model

In [0]:
# Ask PyTorch to release the GPU memory it holds in its cache before training starts.
import torch 
torch.cuda.empty_cache()

In [0]:
import torch
import apex

from pytorch_pretrained_bert.tokenization import BertTokenizer
from fast_bert.data import BertDataBunch
from fast_bert.learner import BertLearner
from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, roc_auc

from pathlib import Path


In [0]:
# --- Directories (all relative to the notebook's working directory) ---
DATA_PATH = Path('data/toxic_comments/')    # train / validation CSV files
LABEL_PATH = Path('data/toxic_comments/')   # labels.csv
# NOTE(review): the pretrained model lives under "model/" while this artifact directory is
# "models/" — confirm the two are meant to be different directories.
MODEL_PATH = Path('models/')                # model artifacts
LOG_PATH = Path('logs/')                    # log files

# Directory holding the pretrained BERT model (config, vocabulary, converted weights).
BERT_PRETRAINED_PATH = Path('model/multi_cased_L-12_H-768_A-12/')

# --- Run settings, looked up by key in the following cells ---
args = dict(
    run_text="multilabel toxic comments with freezable layers",
    max_seq_length=256,
    do_lower_case=False,
    train_batch_size=32,
    learning_rate=5e-6,
    num_train_epochs=12.0,
    warmup_proportion=0.002,
    local_rank=-1,
    gradient_accumulation_steps=1,
    fp16=False,
    loss_scale=128,
)

# The log directory must exist before the file handler of the logger is created.
LOG_PATH.mkdir(exist_ok=True)

In [0]:
import logging
import datetime
import sys
import pandas as pd

# Lift the column-width limit when DataFrames are displayed.
# NOTE(review): -1 is the legacy "no limit" value; pandas >= 1.0 expects None instead — confirm
# against the pandas version actually installed before changing it.
pd.set_option('display.max_colwidth', -1)

# The start time of the run becomes part of the log file name, together with the run description.
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')
logfile = str(LOG_PATH / 'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

# Every record goes both to the log file and to the cell output (stdout).
log_handlers = [
    logging.FileHandler(logfile),
    logging.StreamHandler(sys.stdout),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=log_handlers,
)

# Root logger, handed to the learner further down.
logger = logging.getLogger()

In [0]:
# Build the tokenizer from the vocabulary stored in the pretrained model directory.
# Lower-casing follows args['do_lower_case'] (False here, matching the cased model).
tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAINED_PATH, 
                                          do_lower_case=args['do_lower_case'])

06/04/2019 13:43:13 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file model/multi_cased_L-12_H-768_A-12/vocab.txt


In [0]:
# Training is pinned to the CUDA device.
device = torch.device('cuda')

# Multi-GPU mode is switched on only when more than one GPU is visible.
multi_gpu = torch.cuda.device_count() > 1

In [0]:
# Names of the label columns handed to the data bunch as `label_col`.
label_cols = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Data bunch over the train / validation CSV files written by the split cell;
# the comment text is read from the "comment_text" column.
databunch = BertDataBunch(
    DATA_PATH,
    LABEL_PATH,
    tokenizer,
    train_file='data_train.csv',
    val_file='data_valid.csv',
    text_col="comment_text",
    label_col=label_cols,
    bs=args['train_batch_size'],
    maxlen=args['max_seq_length'],
    multi_gpu=multi_gpu,
    multi_label=True,
)

In [0]:
## from fast_bert / learner.py
import os
from fast_bert.data import BertDataBunch, InputExample, InputFeatures
from fast_bert.modeling import BertForMultiLabelSequenceClassification
from torch.optim.lr_scheduler import _LRScheduler, Optimizer
from pytorch_pretrained_bert.optimization import BertAdam, ConstantLR, WarmupCosineSchedule, WarmupConstantSchedule, WarmupLinearSchedule, WarmupCosineWithWarmupRestartsSchedule, WarmupCosineWithHardRestartsSchedule
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertLayerNorm
from fastprogress.fastprogress import master_bar, progress_bar
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc

from fastai.torch_core import *
from fastai.callback import *

class BertLearner_freeze(BertLearner):
    """BertLearner whose factory freezes the pretrained BERT encoder.

    The factory below is a copy of fast_bert's ``BertLearner.from_pretrained_model``
    with one addition: after the model is loaded, the parameters of the BERT
    encoder are excluded from gradient updates, so only the remaining
    parameters are trained.
    """

    @staticmethod
    def from_pretrained_model(dataBunch, pretrained_path, metrics, device, logger, finetuned_wgts_path=None, 
                              multi_gpu=True, is_fp16=True, loss_scale=0, warmup_proportion=0.1, 
                              grad_accumulation_steps=1, multi_label=False):
        """Load a pretrained classifier, freeze its BERT encoder and wrap it in a learner.

        Args:
            dataBunch: data bunch; ``len(dataBunch.labels)`` sets the number of labels.
            pretrained_path: location of the pretrained BERT model.
            metrics: metric descriptors, forwarded unchanged to the learner.
            device: torch device the model is moved to.
            logger: logger forwarded to the learner.
            finetuned_wgts_path: optional path of a saved state dict to load
                instead of the pretrained weights.
            multi_gpu: if true the model is wrapped in ``torch.nn.DataParallel``;
                if ``False`` it is wrapped in apex's ``DistributedDataParallel``
                (CUDA devices only).
            is_fp16: if true the model is converted to half precision.
            loss_scale, warmup_proportion, grad_accumulation_steps: forwarded
                unchanged to the learner.
            multi_label: selects the multi-label classifier instead of the
                single-label one.

        Returns:
            A ``BertLearner`` holding the frozen (and possibly wrapped) model.

        Raises:
            ImportError: if apex is required (CUDA device, ``multi_gpu=False``)
                but not installed.
        """
        model_state_dict = None

        if finetuned_wgts_path:
            model_state_dict = torch.load(finetuned_wgts_path)

        if multi_label == True:
            model_class = BertForMultiLabelSequenceClassification
        else:
            model_class = BertForSequenceClassification

        # Both branches load the model the same way. The single-label call was previously left
        # unclosed (its `state_dict=...)` line had been turned into a comment), which made the
        # whole class a syntax error.
        model = model_class.from_pretrained(pretrained_path,
                                            num_labels=len(dataBunch.labels),
                                            state_dict=model_state_dict)

        # Freeze the pretrained encoder. The multi-label model is called through its own
        # freeze_bert_encoder(); for a model without that method the encoder parameters are
        # frozen directly.
        # NOTE(review): the fallback assumes the encoder is exposed as `model.bert` — confirm
        # against the installed pytorch_pretrained_bert version.
        if hasattr(model, 'freeze_bert_encoder'):
            model.freeze_bert_encoder()
        else:
            for param in model.bert.parameters():
                param.requires_grad = False

        if is_fp16:
            model = model.half()

        model.to(device)

        if device.type == 'cuda':
            # NOTE(review): kept exactly as in the copied fast_bert code — the single-GPU case
            # (multi_gpu == False) is the one wrapped in apex DistributedDataParallel.
            if multi_gpu == False:
                try:
                    from apex.parallel import DistributedDataParallel as DDP
                except ImportError:
                    raise ImportError("Please install apex to use distributed and fp16 training.")

                model = DDP(model)
            else:
                model = torch.nn.DataParallel(model)

        return BertLearner(dataBunch, model, pretrained_path, metrics, device, logger, 
                multi_gpu, is_fp16, loss_scale, warmup_proportion, grad_accumulation_steps, multi_label )

In [None]:
# Metrics handed to the learner: each entry pairs a display name with the metric function.
metrics = [
    {'name': 'accuracy_thresh', 'function': accuracy_thresh},
    {'name': 'roc_auc', 'function': roc_auc},
    {'name': 'accuracy_single', 'function': accuracy_multilabel},
]

# Build the learner with the frozen encoder, starting from the pretrained weights
# (no fine-tuned weights are loaded).
# NOTE(review): args["warmup_proportion"] and args["gradient_accumulation_steps"] are not
# forwarded, so the factory defaults (0.1 and 1) apply — confirm this is intended.
learner = BertLearner_freeze.from_pretrained_model(
    databunch, BERT_PRETRAINED_PATH, metrics, device, logger,
    finetuned_wgts_path=None,
    is_fp16=args['fp16'],
    loss_scale=args['loss_scale'],
    multi_gpu=multi_gpu,
    multi_label=True,
)


In [0]:
# Sanity check of the freeze: keep the requires_grad flag of every parameter that is still
# trainable, so the length of the list is the number of tensors left to train.
grad_flags = (p.requires_grad for p in learner.model.parameters())
list_grad = [flag for flag in grad_flags if flag]

list_grad

[True, True]

In [None]:
# Train with the configured learning rate and the "warmup_cosine_hard_restarts" schedule.
# NOTE(review): the first argument (presumably the number of epochs) is hard-coded to 4 while
# args["num_train_epochs"] is 12.0 — confirm which value is intended.
learner.fit(4, lr=args['learning_rate'], 
            schedule_type="warmup_cosine_hard_restarts")

# NB

(To be confirmed) fast-bert does not yet offer an option to freeze the pretrained layers. As a workaround, I created the class `BertLearner_freeze(BertLearner)`.
Problem: this class is incompatible with 16-bit floating-point (fp16) precision in apex (an error occurs when the tensors are flattened during back-propagation).