Now we will continue on the [Conversation AI](https://conversationai.github.io/) dataset seen in [week 6 homework and lab](https://github.com/MIDS-scaling-up/v2/tree/master/week06). 
 
The original notebook ran on a single GPU. The goal of this lab is to evaluate different options for making it run on more than one GPU.
  
*Disclaimer: the dataset used contains text that may be considered profane, vulgar, or offensive.*

In [None]:
# download and prepare the datasets
# this only needs to be done once.. 
# notice where the data goes
!mkdir data
# Download the training and the test corpus
!wget -nv --show-progress -O data/test.csv.zip https://www.dropbox.com/s/xp6bo8yo1vbv5yg/test.csv.zip?dl=1
!wget -nv --show-progress -O data/train.csv.zip https://www.dropbox.com/s/xei6z41mfrcnxcd/train.csv.zip?dl=1
# Download the pretrained weights for bert base. 
!wget -nv --show-progress -O data/uncased_L-12_H-768_A-12.zip \
        https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!wget -nv --show-progress  -O data/cased_L-12_H-768_A-12.zip \
        https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip
# unzip weights & conifg and remove the original zip
!unzip -d data/ data/cased_L-12_H-768_A-12.zip && rm data/cased_L-12_H-768_A-12.zip
!unzip -d data/ data/uncased_L-12_H-768_A-12.zip && rm data/uncased_L-12_H-768_A-12.zip

In [None]:
import sys, os
import numpy as np 
import pandas as pd 
import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

%load_ext autoreload
%autoreload 2
%matplotlib inline
# from tqdm import tqdm, tqdm_notebook
# from tqdm.notebook import trange, tqdm
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings(action='once')
import pickle
import shutil

In [None]:
# Every tensor and the model are moved to this CUDA device handle later on.
device = torch.device("cuda")

Change the PATH variable to wherever your `week06/hw` directory is located.  
**For the final run we would like you to have a train_size of at least 1 Million rows, and a valid size of at least 500K rows. When you first run the script, feel free to work with a reduced train and valid size for speed.** 

In [None]:
# Root directory of the lab; point it at your checkout, e.g. '/root/v2/week06/hw'.
PATH = '.'
# Where the downloaded data lives and where working files go.
WORK_DIR = os.path.join(PATH, "workingdir")
DATA_DIR = os.path.join(PATH, "data")
# Seed shared by the row sampling and the RNG initialisation.
SEED = 1234
# We run a single epoch (i.e. one pass over the data).
EPOCHS = 1
# BERT needs fixed-length inputs; this is the cap handed to the tokenizer as
# `max_length` (counted in tokens, not characters).
MAX_SEQUENCE_LENGTH = 220

**Please use a small size here for experimentation purposes.  Once comfortable, we could increase it**

In [None]:
# Row counts for the train / validation split.
# Deliberately small so a first run finishes quickly; the final run should use
# 1000000 training rows and 500000 validation rows.
valid_size = 10000
train_size = 20000


These should be the files you downloaded earlier when you ran `download.sh` (or the download cell at the top of this notebook).

In [None]:
# List the contents of DATA_DIR so the downloaded files can be checked by eye.
os.listdir(DATA_DIR)

We shall use the PyTorch BERT implementation from the Hugging Face `transformers` package.   
If you would like to experiment with or view any code (purely optional, and not graded :) ), you can copy the files from the repo https://github.com/huggingface/pytorch-pretrained-BERT  

In [None]:
from transformers import BertModel, BertConfig, BertTokenizer, BertForSequenceClassification, BertTokenizerFast
from transformers import AdamW as BertAdam

We shall now load the model. When you run this, comment out the `capture` command to understand the architecture.

In [None]:
# %%capture
# bert_config = BertConfig()

Now we load the BERT Fast tokenizer and convert the sentences.

In [None]:
%%time
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased',do_lower_case=True)
train_all = pd.read_csv(os.path.join(DATA_DIR, "train.csv.zip")).sample(train_size+valid_size,random_state=SEED)
print('loaded %d records' % len(train_all))

# Make sure all comment_text values are strings
train_all['comment_text'] = train_all['comment_text'].astype(str) 

In [None]:
%%time
#### sequences = convert_lines(train_all["comment_text"].fillna("DUMMY_VALUE"),MAX_SEQUENCE_LENGTH,tokenizer)
# sequences = tokenizer(train_all["comment_text"].fillna("DUMMY_VALUE").tolist(), truncation=True, add_special_tokens=True, padding=True, max_length = MAX_SEQUENCE_LENGTH)
sequences = tokenizer(train_all["comment_text"].fillna("DUMMY_VALUE").tolist(), truncation=True, add_special_tokens=True, padding=True, max_length = MAX_SEQUENCE_LENGTH)
train_all=train_all.fillna(0)

In [None]:
# Keep only the `input_ids` field of the tokenizer output. From here on `sequences`
# is that field rather than the full tokenizer result, so this cell cannot simply
# be re-run without re-running the tokenization cell first.
sequences = sequences.input_ids

As it is a binary problem, we binarize the target to 0 or 1 (a comment counts as toxic when its score is at least 0.5) instead of keeping the raw float score.   
We also split the dataset into a training and a validation set.

In [None]:
# Binarize the toxicity score: values >= 0.5 become 1.0, everything else 0.0.
train_all['target'] = (train_all['target'] >= 0.5).astype(float)

# Labels as a 2-D array with a single column (`[['target']]` selects a one-column frame).
targets = train_all[['target']].values

# The first `train_size` rows are for training, the remainder for validation.
X, X_val = sequences[:train_size], sequences[train_size:]
y, y_val = targets[:train_size], targets[train_size:]

In [None]:
# DataFrame counterparts of the split: the first `train_size` rows for training,
# and an independent copy of the last `valid_size` rows for scoring.
train_df = train_all.head(train_size)
test_df = train_all.tail(valid_size).copy()

In [None]:
# Pair the training token ids (long) with their targets (float) in one dataset.
train_inputs = torch.tensor(X, dtype=torch.long)
train_targets = torch.tensor(y, dtype=torch.float)
train_dataset = torch.utils.data.TensorDataset(train_inputs, train_targets)

In [None]:
# Optimisation hyper-parameters.
batch_size = 32
accumulation_steps = 1  # number of batches whose gradients are summed per optimizer step
lr = 2e-5
# Output columns of the classifier; its length sets the number of labels.
y_columns = ['target']

# Seed the NumPy, PyTorch CPU and PyTorch CUDA generators with the same value
# and ask cuDNN for deterministic kernels.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [None]:
# Load pretrained BERT base (uncased) with a classification head that has one
# output per entry of `y_columns`, then move it to the GPU.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(y_columns),
)
model = model.cuda()

**Here, please count the number of available GPUs. Then, if you have more than one, use torch.nn.DataParallel.  You also may want to change the batch size**

In [None]:
# fill in with code

In [None]:
# Clear any existing gradients and make sure the model lives on `device`.
model.zero_grad()
model = model.to(device)

# Split the parameters into two optimizer groups: those whose name contains one
# of the `no_decay` markers get weight_decay 0.0, all others get 0.01.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
decayed_params, undecayed_params = [], []
for param_name, param_tensor in param_optimizer:
    if any(marker in param_name for marker in no_decay):
        undecayed_params.append(param_tensor)
    else:
        decayed_params.append(param_tensor)
optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': 0.01},
    {'params': undecayed_params, 'weight_decay': 0.0},
]

train = train_dataset

# Number of optimizer steps for the whole run (computed for reference).
num_train_optimization_steps = int(EPOCHS*len(train)/batch_size/accumulation_steps)

optimizer = BertAdam(optimizer_grouped_parameters, lr=lr)

**Train your model by iterating through batches in a single epoch of the data. Note how long it takes**    

In [None]:
# from tqdm.notebook import trange, tqdm, 
# from tqdm import tnrange
from tqdm import tqdm

In [None]:
from tqdm.notebook import trange, tqdm, tqdm_notebook
from tqdm import tnrange

In [None]:
# Number of passes over the training data for the loop below.
EPOCHS = 1

In [None]:
%%time
model=model.train()

from torch.cuda.amp import autocast
scaler = torch.cuda.amp.GradScaler()

# tq = tqdm(range(EPOCHS))
# tq = tqdm.notebook(range(EPOCHS))

tq = trange(EPOCHS)
for epoch in tq:
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    avg_loss = 0.
    avg_accuracy = 0.
    lossf=None
#    tk0 = tqdm_notebook.tqdm(enumerate(train_loader),total=len(train_loader),leave=False)
    tk0 = tqdm(enumerate(train_loader),total=len(train_loader),leave=False)
    optimizer.zero_grad()   # Bug fix - thanks to @chinhuic
    for i,(x_batch, y_batch) in tk0:
        if i % 100 == 0:
            print("batch: ", i, " of ", len(train_loader))
        with autocast():
            y_pred = model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)[0]
            loss =  F.binary_cross_entropy_with_logits(y_pred,y_batch.to(device))
        scaler.scale(loss).backward()
##        y_pred = model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)[0]
##        loss =  F.binary_cross_entropy_with_logits(y_pred,y_batch.to(device))
##        loss.backward()
        if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
#            optimizer.step()                            # Now we can do an optimizer step
#            optimizer.zero_grad()

#        if n_gpu > 1:
#            loss = loss.mean()
        if lossf:
            lossf = 0.98*lossf+0.02*loss.item()
        else:
            lossf = loss.item()

        tk0.set_postfix(loss = lossf)
        avg_loss += loss.item() / len(train_loader)
        avg_accuracy += torch.mean(((torch.sigmoid(y_pred[:,0])>0.5) == (y_batch[:,0]>0.5).to(device)).to(torch.float) ).item()/len(train_loader)
#    tq.set_postfix(avg_loss=avg_loss,avg_accuracy=avg_accuracy)
        
        

**Now make a prediction for your validation set.**  

In [None]:
# Turn off gradient tracking for every parameter before inference.
for frozen_param in model.parameters():
    frozen_param.requires_grad = False
# Switch the model to evaluation mode.
model.eval()

**Use DataParallel to run eval on more than one GPU**

In [None]:
# Run the model over the validation sequences in order and collect one raw
# logit per row in `valid_preds` (the sigmoid is applied in a later cell).
val_batch_size = 1024
valid_preds = np.zeros((len(X_val)))
valid = torch.utils.data.TensorDataset(torch.tensor(X_val, dtype=torch.long))
# shuffle=False keeps predictions aligned with the row order of X_val.
valid_loader = torch.utils.data.DataLoader(valid, batch_size=val_batch_size, shuffle=False)

tk0 = tqdm_notebook(valid_loader)
# no_grad guarantees no autograd graph is built during inference, even if the
# cell that freezes the parameters was skipped.
with torch.no_grad():
    for i, (x_batch,) in enumerate(tk0):
        start = i * val_batch_size
        # Ids > 0 are attended to (assumes the tokenizer pads with id 0).
        pred = model(x_batch.to(device), attention_mask=(x_batch > 0).to(device), labels=None)[0]
        # Slice by the real batch length so a short final batch (even of size 1) fits exactly.
        valid_preds[start:start + len(x_batch)] = pred[:, 0].cpu().numpy()

In [None]:
# Turn the collected logits into probabilities and score them against the
# binarized targets of the validation rows.
valid_logits = torch.tensor(valid_preds)
y_preds = torch.sigmoid(valid_logits).numpy()
auc = roc_auc_score(test_df['target'].values, y_preds)
print('AUC score {:.5f}'.format(auc))