In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import  FlaubertTokenizer
from transformers import PreTrainedTokenizer
from transformers import PreTrainedModel,FlaubertWithLMHeadModel,AdamW,get_linear_schedule_with_warmup
from typing import Dict, List, Tuple
import logging
from tqdm import tqdm, trange
import random
import os
import numpy as np


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter
    
block_size=512
from torch.nn.utils.rnn import pad_sequence

In [2]:
def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
    """Prepare masked-LM inputs and labels: 80% MASK, 10% random, 10% original.

    Args:
        inputs: Batch of token-id sequences (each row is handed to
            ``tokenizer.get_special_tokens_mask``). The tensor is modified
            in place and is also the first element of the return value.
        tokenizer: Tokenizer supplying ``mask_token``, ``_pad_token``,
            ``pad_token_id``, ``get_special_tokens_mask``,
            ``convert_tokens_to_ids`` and ``len()``.
        args: Run configuration. If it exposes ``mlm_probability`` that value
            is used as the per-token selection probability; otherwise the
            previous fixed rate of 0.15 is used.

    Returns:
        ``(inputs, labels)``. ``labels`` holds the original token id at every
        selected position and ``-100`` everywhere else.

    Raises:
        ValueError: If the tokenizer has no mask token.
    """

    if tokenizer.mask_token is None:
        raise ValueError(
            "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
        )

    # Honour args.mlm_probability when the caller provides it; configurations
    # without that attribute keep the historical 0.15 rate.
    mlm_probability = getattr(args, "mlm_probability", None)
    mlm_probability = 0.15 if mlm_probability is None else float(mlm_probability)

    labels = inputs.clone()
    # Per-token probability of being selected for masked-LM training.
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    # Special tokens are never selected.
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    if tokenizer._pad_token is not None:
        # Padding positions are never selected either.
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    # (half of the selected positions that were not replaced by the mask token).
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels

In [3]:
import pandas as pd 
import re
def load_data(filepath):
    """Read the CSV file at ``filepath`` and report how many rows it holds.

    The row count is printed (with thousands separators) and the resulting
    pandas DataFrame is returned unchanged.
    """
    df = pd.read_csv(filepath)

    # Optional text clean-up, currently disabled:
    #df['text_clean']=df['text_clean'].astype(str)
    #df['text_clean']= df['text_clean'].map(lambda text_clean : re.sub('["#$%&()*+,-./:;<=>@[\]^_`{|}~\n\t’\']', ' ', text_clean))

    row_count = df.shape[0]
    summary = 'Number of sentences: {:,}\n'.format(row_count)
    print(summary)
    return df

# Directory holding both input files. Kept in one place so the notebook can be
# pointed at another machine's data by editing a single line.
DATA_DIR = '/data/aboumada/Data/3_Datasets'

# Both files are read with the same CSV loader; each call prints its row count.
df1 = load_data(os.path.join(DATA_DIR, 'full_df_Features.csv'))
df2 = load_data(os.path.join(DATA_DIR, 'all_unlabeled.txt'))

Number of sentences: 12,826

Number of sentences: 318,642



  if (await self.run_code(code, result,  async_=asy)):


In [4]:
# Stack both frames into a single corpus (use `df = df1` instead to train on
# the first file only). pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
df = pd.concat([df1, df2], ignore_index=True)
# Shuffle every row; the fixed random_state makes the order reproducible
# across kernel restarts.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)

In [5]:
# Preview the text column of the combined, shuffled frame (pandas truncates the display).
df["text_clean"]

0         l'aude en alerte rouge aux inondations : cinq ...
1         irma est ainsi devenue « un ouragan extrêmemen...
2         les partiels à l'ulco c'est inondation, coupur...
3         le mot solidarité on corse il existe encore ✊✊...
4         je pense réellement que l'on nous cache la vér...
                                ...                        
331463    philippines : la tempête tropicale tembin a fa...
331464    bruno qui m'envoie un message pour nouvel an p...
331465    ouragan irma: ottawa ne volera pas au secours ...
331466          wow vive la tempête putain j'ai pas partiel
331467    #chambery @villedechambery @chamberymetro  ale...
Name: text_clean, Length: 331468, dtype: object

In [6]:
# Load the pretrained FlauBERT tokenizer (cased, base size).
tokenizer = FlaubertTokenizer.from_pretrained("flaubert-base-cased")

# Plain Python list with one string per row. Every entry is cast to str first,
# so non-string cells (missing values included) end up as their string form.
lines = list(df["text_clean"].astype(str))

In [7]:
#train_dataset = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]

In [8]:
logger = logging.getLogger(__name__)


def set_seed(args):
    """Seed Python, NumPy and PyTorch random generators from ``args.seed``.

    Args:
        args: Run configuration. ``args.seed`` selects the seed (0 is used
            when the attribute is missing or None, which matches the value
            that used to be hard-coded). ``args.n_gpu`` decides whether the
            CUDA generators are seeded as well.
    """
    seed = getattr(args, "seed", None)
    seed = 0 if seed is None else int(seed)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if args.n_gpu > 0:
        # Seed the generator of every visible CUDA device.
        torch.cuda.manual_seed_all(seed)

In [9]:
class TextDataset():
    """In-memory, index-addressable collection of tokenised text lines.

    All lines are encoded once at construction time with
    ``tokenizer.batch_encode_plus``; ``__getitem__`` returns the token ids of
    one line as a ``torch.long`` tensor, and ``__len__`` the number of lines.
    """

    def __init__(self,lines,tokenizer: PreTrainedTokenizer, block_size=512):
        """Encode ``lines`` with ``tokenizer``.

        Args:
            lines: Sized sequence of strings, one example per entry.
            tokenizer: Tokenizer exposing ``batch_encode_plus``.
            block_size: Forwarded to the tokenizer as ``max_length``.
        """
        # Features are not cached: every construction re-tokenises all lines.
        # The previous message had a "%s" placeholder with no argument, so the
        # literal "%s" was logged; report the number of lines instead.
        logger.info("Creating features from %d lines of text", len(lines))

        if len(lines) == 0:
            # Nothing to encode; skip the tokenizer call entirely.
            self.examples = []
            return

        self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]

    def __len__(self):
        """Number of encoded examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Token ids of example ``i`` as a 1-D ``torch.long`` tensor."""
        return torch.tensor(self.examples[i], dtype=torch.long)

In [10]:
# Tokenise the whole corpus up front. The notebook-level block_size constant is
# passed explicitly so the maximum sequence length is controlled in one place
# instead of silently relying on the class default.
train_dataset = TextDataset(lines, tokenizer, block_size=block_size)

In [11]:
def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    """List ``<output_dir>/<prefix>-*`` paths ordered from oldest to newest.

    Ordering uses the file modification time when ``use_mtime`` is true,
    otherwise the integer that follows ``<prefix>-`` in the path (paths without
    such a number are ignored).
    """
    import glob
    import re

    pattern = os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix))
    step_regex = ".*{}-([0-9]+)".format(re.escape(checkpoint_prefix))

    keyed_paths = []
    for path in glob.glob(pattern):
        if use_mtime:
            keyed_paths.append((os.path.getmtime(path), path))
            continue
        match = re.match(step_regex, path)
        if match:
            keyed_paths.append((int(match.group(1)), path))
    return [path for _, path in sorted(keyed_paths)]


def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    """Delete the oldest checkpoints so at most ``args.save_total_limit`` remain.

    Does nothing when ``args.save_total_limit`` is unset, zero or negative, or
    when the number of existing checkpoints is already within the limit.

    Args:
        args: Run configuration providing ``save_total_limit`` and ``output_dir``.
        checkpoint_prefix: Directory-name prefix identifying checkpoints.
        use_mtime: Order checkpoints by modification time instead of step number.
    """
    # Imported locally: the notebook's import cell does not bring in shutil.
    import shutil

    limit = args.save_total_limit
    if not limit or limit <= 0:
        return

    # Check if we should delete older checkpoint(s)
    checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    surplus = len(checkpoints_sorted) - limit
    if surplus <= 0:
        return

    # The list is oldest-first, so the leading entries are the ones to drop.
    for checkpoint in checkpoints_sorted[:surplus]:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)

In [12]:
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """Fine-tune ``model`` on ``train_dataset`` and save it once training ends.

    Args:
        args: Run configuration accessed by attribute. Fields read here:
            ``local_rank``, ``n_gpu``, ``per_gpu_train_batch_size``,
            ``max_steps``, ``num_train_epochs``, ``gradient_accumulation_steps``,
            ``weight_decay``, ``learning_rate``, ``adam_epsilon``,
            ``warmup_steps``, ``fp16``, ``fp16_opt_level``, ``mlm``,
            ``max_grad_norm``, ``save_steps`` and ``output_dir``.
            ``train_batch_size`` is overwritten, and ``num_train_epochs`` is
            overwritten when ``max_steps`` is positive.
        train_dataset: Indexable dataset whose items are 1-D token-id tensors.
        model: Model to train; it must already be on its target device.
        tokenizer: Tokenizer used for padding, masking and embedding resizing.

    Returns:
        ``(global_step, average_loss)`` where ``average_loss`` is the summed
        loss divided by the number of optimizer steps (0.0 if no step ran).

    Raises:
        ImportError: If ``args.fp16`` is set and NVIDIA apex is not installed.
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    # Batches are moved to whichever device the model already lives on rather
    # than a hard-coded "cuda:0"; read it before the model gets wrapped below.
    device = next(model.parameters()).device

    def collate(examples: List[torch.Tensor]):
        # Pad every sequence in the batch to the length of the longest one.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    if args.local_rank == -1:
        train_sampler = RandomSampler(train_dataset)
    else:
        # Imported here because the notebook's import cell does not provide it.
        from torch.utils.data.distributed import DistributedSampler

        train_sampler = DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        # Guard against a zero divisor when there are fewer batches than
        # accumulation steps.
        steps_per_epoch = max(1, len(train_dataloader) // args.gradient_accumulation_steps)
        args.num_train_epochs = args.max_steps // steps_per_epoch + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    # Resuming from a checkpoint is not implemented, so this stays 0 and the
    # skip branch in the loop below is never taken.
    steps_trained_in_current_epoch = 0

    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            # With args.mlm set, tokens are masked and only those positions
            # carry labels; otherwise the batch serves as both input and label.
            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(device)
            labels = labels.to(device)
            model.train()
            # Both branches of the former conditional made this same call.
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Periodic TensorBoard logging is disabled (it was previously kept
                # as a no-op string literal). It relies on an evaluate() helper
                # that is not defined in the visible notebook cells.
                # if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                #     # Log metrics
                #     if (
                #         args.local_rank == -1 and args.evaluate_during_training
                #     ):  # Only evaluate when single GPU otherwise metrics may not average well
                #         results = evaluate(args, model, tokenizer)
                #         for key, value in results.items():
                #             tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                #     tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                #     tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                #     logging_loss = tr_loss

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    # A single save after the last epoch; args.save_steps only acts as an on/off switch here.
    if args.local_rank in [-1, 0] and args.save_steps > 0 :
        checkpoint_prefix = "checkpoint"
        # Save model checkpoint
        output_dir = args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        torch.save(args, os.path.join(output_dir, "training_args.bin"))
        logger.info("Saving model checkpoint to %s", output_dir)
        _rotate_checkpoints(args, checkpoint_prefix)
        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
        torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
        logger.info("Saving optimizer and scheduler states to %s", output_dir)

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # max(..., 1) avoids a ZeroDivisionError when no optimizer step was taken.
    return global_step, tr_loss / max(global_step, 1)


In [14]:
# Run configuration. train() reads these by attribute (args.learning_rate, ...),
# which a pandas Series allows for its index labels.
args = pd.Series(
    dict(
        train_batch_size=3,  # overwritten in train(): per_gpu_train_batch_size * max(1, n_gpu)
        per_gpu_train_batch_size=8,
        max_steps=-1,  # not positive, so the step budget is derived from num_train_epochs
        num_train_epochs=4.0,
        local_rank=-1,  # -1 selects the non-distributed code path in train()
        n_gpu=2,
        gradient_accumulation_steps=1,
        weight_decay=0.0,
        learning_rate=5e-5,
        adam_epsilon=1e-8,
        warmup_steps=0,
        seed=0,
        mlm=0.15,  # train() only tests this for truthiness to enable masking
        max_grad_norm=1.0,
        logging_steps=500,  # referenced only by the disabled logging block in train()
        save_steps=500,  # train() only checks that this is positive before the final save
        evaluate_during_training=True,  # referenced only by the disabled logging block in train()
        output_dir='flaubert_fine_tuned_alldata',
        save_total_limit=None,  # falsy, so _rotate_checkpoints() returns immediately
        fp16=True,
        fp16_opt_level="O1",
    )
)

In [15]:
# Load the pretrained FlauBERT weights with a language-modelling head.
model = FlaubertWithLMHeadModel.from_pretrained("flaubert-base-cased")
# Move the model to the GPU. As the last expression of the cell, the returned
# module is also echoed, which produces the long layer listing in the output.
model.cuda()

FlaubertWithLMHeadModel(
  (transformer): FlaubertModel(
    (position_embeddings): Embedding(512, 768)
    (embeddings): Embedding(68729, 768, padding_idx=2)
    (layer_norm_emb): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (attentions): ModuleList(
      (0): MultiHeadAttention(
        (q_lin): Linear(in_features=768, out_features=768, bias=True)
        (k_lin): Linear(in_features=768, out_features=768, bias=True)
        (v_lin): Linear(in_features=768, out_features=768, bias=True)
        (out_lin): Linear(in_features=768, out_features=768, bias=True)
      )
      (1): MultiHeadAttention(
        (q_lin): Linear(in_features=768, out_features=768, bias=True)
        (k_lin): Linear(in_features=768, out_features=768, bias=True)
        (v_lin): Linear(in_features=768, out_features=768, bias=True)
        (out_lin): Linear(in_features=768, out_features=768, bias=True)
      )
      (2): MultiHeadAttention(
        (q_lin): Linear(in_features=768, out_features=768, bia

In [None]:
# Run the fine-tuning; train() returns (global_step, average loss), which the cell would display on completion.
train(args, train_dataset, model , tokenizer) 

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]
Iteration:   0%|          | 0/20717 [00:00<?, ?it/s][A

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic



Iteration:   0%|          | 1/20717 [00:02<12:10:04,  2.11s/it][A

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0



Iteration:   0%|          | 2/20717 [00:02<9:13:11,  1.60s/it] [A

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0



Iteration:   0%|          | 3/20717 [00:03<7:17:19,  1.27s/it][A

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0



Iteration:   0%|          | 4/20717 [00:03<5:53:05,  1.02s/it][A
Iteration:   0%|          | 5/20717 [00:03<4:56:52,  1.16it/s][A
Iteration:   0%|          | 6/20717 [00:04<4:15:05,  1.35it/s][A
Iteration:   0%|          | 7/20717 [00:04<3:44:50,  1.54it/s][A
Iteration:   0%|          | 8/20717 [00:05<3:24:53,  1.68it/s][A
Iteration:   0%|          | 9/20717 [00:05<3:10:52,  1.81it/s][A
Iteration:   0%|          | 10/20717 [00:06<3:02:52,  1.89it/s][A
Iteration:   0%|          | 11/20717 [00:06<2:55:09,  1.97it/s][A
Iteration:   0%|          | 12/20717 [00:07<2:53:33,  1.99it/s][A
Iteration:   0%|          | 13/20717 [00:07<2:51:04,  2.02it/s][A
Iteration:   0%|          | 14/20717 [00:08<2:49:17,  2.04it/s][A
Iteration:   0%|          | 15/20717 [00:08<2:45:34,  2.08it/s][A
Iteration:   0%|          | 16/20717 [00:09<2:43:44,  2.11it/s][A
Iteration:   0%|          | 17/20717 [00:09<2:43:39,  2.11it/s][A
Iteration:   0%|          | 18/20717 [00:09<2:40:04,  2.16it/s][A


Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0



Iteration:   0%|          | 20/20717 [00:10<2:43:17,  2.11it/s][A
Iteration:   0%|          | 21/20717 [00:11<2:45:54,  2.08it/s][A
Iteration:   0%|          | 22/20717 [00:11<2:42:44,  2.12it/s][A
Iteration:   0%|          | 23/20717 [00:12<2:43:51,  2.10it/s][A
Iteration:   0%|          | 24/20717 [00:12<2:44:08,  2.10it/s][A
Iteration:   0%|          | 25/20717 [00:13<2:44:27,  2.10it/s][A
Iteration:   0%|          | 26/20717 [00:13<2:44:04,  2.10it/s][A
Iteration:   0%|          | 27/20717 [00:14<2:44:40,  2.09it/s][A
Iteration:   0%|          | 28/20717 [00:14<2:47:28,  2.06it/s][A
Iteration:   0%|          | 29/20717 [00:15<2:48:43,  2.04it/s][A
Iteration:   0%|          | 30/20717 [00:15<2:47:15,  2.06it/s][A
Iteration:   0%|          | 31/20717 [00:16<2:44:25,  2.10it/s][A
Iteration:   0%|          | 32/20717 [00:16<2:44:01,  2.10it/s][A
Iteration:   0%|          | 33/20717 [00:17<2:39:53,  2.16it/s][A
Iteration:   0%|          | 34/20717 [00:17<2:37:35,  2.19it/

In [None]:
#model2 = BertForMaskedLM.from_pretrained("fine_tuned_bert_ml")

In [None]:
#!cp fine_tuned_bert_ml/checkpoint-500/* fine_tuned_bert_ml/

In [None]:
#from transformers import FlaubertForSequenceClassification
#model3=FlaubertForSequenceClassification.from_pretrained("flaubert_fine_tuned")

In [None]:
#model3.cuda()

In [None]:
#!rm -rf 