# Deep learning challenge: fine-tuning BERT for sentence classification

## Import and installation

In [0]:
# Install the Kaggle command-line client (used further down to download the competition data).
# -q keeps pip's output quiet.
!pip install -q kaggle

In [2]:
# Install transformers library from huggingface
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |▊                               | 10kB 20.5MB/s eta 0:00:01[K     |█▌                              | 20kB 26.3MB/s eta 0:00:01[K     |██▏                             | 30kB 28.9MB/s eta 0:00:01[K     |███                             | 40kB 31.6MB/s eta 0:00:01[K     |███▋                            | 51kB 33.8MB/s eta 0:00:01[K     |████▍                           | 61kB 36.6MB/s eta 0:00:01[K     |█████▏                          | 71kB 34.3MB/s eta 0:00:01[K     |█████▉                          | 81kB 35.1MB/s eta 0:00:01[K     |██████▋                         | 92kB 31.2MB/s eta 0:00:01[K     |███████▎                        | 102kB 31.6MB/s eta 0:00:01[K     |████████                        | 112kB 31.6MB/s eta 0:00:01[K     |████████▉                       | 

In [3]:
# Colab library to upload files to notebook
from google.colab import files

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
import torch

from transformers import BertTokenizer

from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup

import time
import datetime
import random

Using TensorFlow backend.


In [4]:
# Open Colab's file-upload dialog; pick the kaggle.json file, which the next cell
# copies into ~/.kaggle for the Kaggle command-line client.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [5]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!kaggle competitions download -c efreiparisdeeplearning2019

Downloading example_submission_test.csv to /content
  0% 0.00/315k [00:00<?, ?B/s]
100% 315k/315k [00:00<00:00, 47.0MB/s]
Downloading dataset_train.csv.zip to /content
 33% 9.00M/27.4M [00:00<00:01, 10.9MB/s]
100% 27.4M/27.4M [00:00<00:00, 35.2MB/s]
Downloading dataset_test_no_labels.csv.zip to /content
  0% 0.00/1.34M [00:00<?, ?B/s]
100% 1.34M/1.34M [00:00<00:00, 89.2MB/s]


In [0]:
# Read the train and test sets directly from the downloaded zip archives.
# Both files are tab-separated, so they share the same reader options.
csv_options = dict(compression='zip', sep='\t')
df_train = pd.read_csv('dataset_train.csv.zip', **csv_options)
df_test = pd.read_csv('dataset_test_no_labels.csv.zip', **csv_options)

## Data processing

In [7]:
# Preview the first rows of the training set
df_train.head()

Unnamed: 0,index,sentence_1,sentence_2,label
0,0,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,neutral
1,1,you know during the season and i guess at at y...,You lose the things to the following level if ...,entailment
2,2,One of our number will carry out your instruct...,A member of my team will execute your orders w...,entailment
3,3,How do you know? All this is their information...,This information belongs to them.,entailment
4,4,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.,neutral


In [8]:
# List the column names of the training set
df_train.columns

Index(['index', 'sentence_1', 'sentence_2', 'label'], dtype='object')

In [9]:
# Print every field of one training example in full
# (the head() preview truncates the long sentences)
i = 1
for column_name, value in df_train.loc[i].items():
    print(column_name, value)

index 1
sentence_1 you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him
sentence_2 You lose the things to the following level if the people recall.
label entailment


In [10]:
# Distinct label values present in the training set
df_train.label.unique()

array(['neutral', 'entailment', 'contradiction'], dtype=object)

In [11]:
# Number of training examples per label (class-balance check)
df_train.groupby(['label'])['label'].count()

label
contradiction    130889
entailment       130886
neutral          130887
Name: label, dtype: int64

In [12]:
# Encode the string labels as integer class ids.
# The guard keeps this cell safe to re-run: once `df_train.label` is numeric, fitting
# a fresh encoder on it would learn the integer ids instead of the class names, and the
# `label_encoder.inverse_transform` call used for the submission would no longer
# return label strings.
if not pd.api.types.is_numeric_dtype(df_train['label']):
    label_encoder = LabelEncoder()
    df_train['label'] = label_encoder.fit_transform(df_train['label'])

# Class id assigned to the first training example
df_train.label[0]

2

In [0]:
# Pull the two sentence columns and the encoded labels out of the DataFrame as arrays
sentences_1, sentences_2, labels = (
    df_train[column].values
    for column in ('sentence_1', 'sentence_2', 'label')
)

In [0]:
# Load the tokenizer that goes with the uncased BERT-base checkpoint
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,  # lower-case the text before tokenizing
)

In [16]:
# Show one sentence as raw text, as tokens, and as vocabulary ids
example_sentence = sentences_1[0]
example_tokens = tokenizer.tokenize(example_sentence)

print(' Original: ', example_sentence)
print('Tokenized: ', example_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(example_tokens))

 Original:  Conceptually cream skimming has two basic dimensions - product and geography.
Tokenized:  ['conceptual', '##ly', 'cream', 'ski', '##mming', 'has', 'two', 'basic', 'dimensions', '-', 'product', 'and', 'geography', '.']
Token IDs:  [17158, 2135, 6949, 8301, 25057, 2038, 2048, 3937, 9646, 1011, 4031, 1998, 10505, 1012]


In [17]:
# Length limit handed to the tokenizer here and to the padding step afterwards
MAX_LEN = 64

# Encode every first sentence into a list of vocabulary ids
input_ids_1 = [
    tokenizer.encode(
        sent,
        add_special_tokens=True,  # '[CLS]' at the start, '[SEP]' at the end
        max_length=MAX_LEN,       # truncate to MAX_LEN
    )
    for sent in sentences_1
]

print('Original: ', sentences_1[0])
print('Token IDs:', input_ids_1[0])

Original:  Conceptually cream skimming has two basic dimensions - product and geography.
Token IDs: [101, 17158, 2135, 6949, 8301, 25057, 2038, 2048, 3937, 9646, 1011, 4031, 1998, 10505, 1012, 102]


In [0]:
# Bring every encoded first sentence to exactly MAX_LEN ids:
# shorter ones get trailing 0s, longer ones are cut at the end.
input_ids_1 = pad_sequences(
    input_ids_1,
    maxlen=MAX_LEN,
    dtype="long",
    value=0,
    padding="post",
    truncating="post",
)
print('Padded Token IDs:', input_ids_1[0])

In [20]:
# Encode every second sentence the same way as the first ones
input_ids_2 = [
    tokenizer.encode(
        sent,
        add_special_tokens=True,  # '[CLS]' at the start, '[SEP]' at the end
        max_length=MAX_LEN,       # truncate to MAX_LEN
    )
    for sent in sentences_2
]

print('Original: ', sentences_2[0])
print('Token IDs:', input_ids_2[0])

Original:  Product and geography are what make cream skimming work. 
Token IDs: [101, 4031, 1998, 10505, 2024, 2054, 2191, 6949, 8301, 25057, 2147, 1012, 102]


In [21]:
# Bring every encoded second sentence to exactly MAX_LEN ids:
# shorter ones get trailing 0s, longer ones are cut at the end.
input_ids_2 = pad_sequences(
    input_ids_2,
    maxlen=MAX_LEN,
    dtype="long",
    value=0,
    padding="post",
    truncating="post",
)
print('Padded Token IDs:', input_ids_2[0])

Padded Token IDs: [  101  4031  1998 10505  2024  2054  2191  6949  8301 25057  2147  1012
   102     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]


In [22]:
# Join the two padded id matrices side by side so that each row carries both sentences:
# the ids of sentence 1 followed by the ids of sentence 2.
# NOTE(review): each half keeps its own [CLS]/[SEP] and its own padding, which differs
# from the usual BERT pair layout `[CLS] A [SEP] B [SEP]` — confirm this is intended.
input_ids = np.hstack((input_ids_1, input_ids_2))
print('Our input:', input_ids[0])

Our input: [  101 17158  2135  6949  8301 25057  2038  2048  3937  9646  1011  4031
  1998 10505  1012   102     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0   101  4031  1998 10505  2024  2054  2191  6949
  8301 25057  2147  1012   102     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]


In [0]:
# Create attention masks: 1 for real tokens, 0 for padding (the padding id is 0).
# A single vectorised comparison replaces the Python loop over every token id of
# every example; the result holds the same 0/1 integers, as an int64 array.
attention_masks = (input_ids > 0).astype(np.int64)

In [0]:
# Keep 90% of the examples for training and hold out 10% for validation.
# Ids, masks and labels go through one call, so all three are split with the
# same shuffled indices.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels,
    random_state=42, test_size=0.1,
)

In [0]:
# Wrap each split as a torch tensor so it can be fed to the DataLoaders
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)

In [0]:
# Number of examples per batch
batch_size = 32

# Training set: batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation set: batches are read in their stored order
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    validation_data,
    batch_size=batch_size,
    sampler=validation_sampler,
)


## Training the model

In [0]:
# Pick the device for training and inference: the GPU when CUDA is present, else the CPU
if not torch.cuda.is_available():
    print('No GPU available')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('%d GPU available.' % torch.cuda.device_count())

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [27]:
# Load BertForSequenceClassification for sentence classification
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",          # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 3,               # The number of output labels
    output_attentions = False,    # Do not return attention weights.
    output_hidden_states = False, # Do not return all hidden states.
)

# Move the model to the device selected earlier (GPU when available, otherwise CPU).
# `model.cuda()` would raise on a machine without a GPU even though the notebook
# defines a CPU fallback in `device`.
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [28]:
# Collect the model's parameters as (name, tensor) pairs
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# Slices of the parameter list to display, each with the header printed above it
sections = [
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
]

for header, section_params in sections:
    print(header)
    for name, tensor in section_params:
        # Parameter name, left-aligned, followed by its shape
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))


The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [0]:
# Optimizer: the AdamW class imported from the transformers library
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,   # learning rate
    eps=1e-8,  # epsilon passed to AdamW
)


In [0]:
# Number of passes over the training set
epochs = 2

# The scheduler is stepped once per batch, so it spans batches-per-epoch * epochs steps
total_steps = epochs * len(train_dataloader)

# Learning-rate schedule built by get_linear_schedule_with_warmup, with no warmup steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [0]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    preds  -- 2-D array of scores, one row per example and one column per class
    labels -- array of integer class ids, one per example (flattened before comparing)
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)


In [0]:
def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    The duration is rounded to a whole number of seconds before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


In [0]:
# Seed every random number generator used below so that runs are repeatable
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch (plotted after training)
loss_values = []

for epoch_i in range(0, epochs):

    # ---------------- Training ----------------
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_loss = 0  # Sum of the batch losses for this epoch

    model.train()  # Switch the model to training mode

    for step, batch in enumerate(train_dataloader):
        if step % 40 == 0 and not step == 0:        # Progress update every 40 batches.
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Each batch holds (input ids, attention mask, labels); move them to the device
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()  # Clear previously calculated gradients

        # With labels supplied, the first element of the output is the loss
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        loss = outputs[0]
        total_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Advance the learning-rate schedule once per batch
        scheduler.step()

    # Average loss over all training batches of this epoch
    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ---------------- Validation ----------------
    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()  # Switch the model to evaluation mode

    # Count correct predictions and examples so the accuracy is weighted per example.
    # Averaging per-batch accuracies would over-weight a smaller last batch.
    eval_correct, nb_eval_examples = 0, 0

    for batch in validation_dataloader:

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for evaluation
        with torch.no_grad():

            # Without labels, the first element of the output is the logits
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # flat_accuracy returns the fraction correct in this batch; scale it back to a count
        eval_correct += flat_accuracy(logits, label_ids) * len(label_ids)
        nb_eval_examples += len(label_ids)

    print("  Accuracy: {0:.2f}".format(eval_correct / nb_eval_examples))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")



Training...
  Batch    40  of  11,044.    Elapsed: 0:00:18.
  Batch    80  of  11,044.    Elapsed: 0:00:36.
  Batch   120  of  11,044.    Elapsed: 0:00:54.
  Batch   160  of  11,044.    Elapsed: 0:01:12.
  Batch   200  of  11,044.    Elapsed: 0:01:30.
  Batch   240  of  11,044.    Elapsed: 0:01:48.
  Batch   280  of  11,044.    Elapsed: 0:02:06.
  Batch   320  of  11,044.    Elapsed: 0:02:24.
  Batch   360  of  11,044.    Elapsed: 0:02:42.
  Batch   400  of  11,044.    Elapsed: 0:03:00.
  Batch   440  of  11,044.    Elapsed: 0:03:18.
  Batch   480  of  11,044.    Elapsed: 0:03:36.
  Batch   520  of  11,044.    Elapsed: 0:03:54.
  Batch   560  of  11,044.    Elapsed: 0:04:12.
  Batch   600  of  11,044.    Elapsed: 0:04:30.
  Batch   640  of  11,044.    Elapsed: 0:04:48.
  Batch   680  of  11,044.    Elapsed: 0:05:06.
  Batch   720  of  11,044.    Elapsed: 0:05:24.
  Batch   760  of  11,044.    Elapsed: 0:05:42.
  Batch   800  of  11,044.    Elapsed: 0:06:00.
  Batch   840  of  11,044. 

In [0]:
# Plot the average training loss recorded for each epoch
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

plt.plot(loss_values, 'b-o')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training loss")

plt.show()

In [0]:
### Save the model
# Model and tokenizer are written to one relative directory, the same one the reload
# cell reads from. The previous code used two different absolute Windows paths, so the
# two artefacts landed in different folders that only exist on one machine.
import os

output_dir = './my_saved_model_directory/'
os.makedirs(output_dir, exist_ok=True)  # save_pretrained expects an existing directory

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [0]:
### Reload the model and the tokenizer
model = BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
tokenizer = BertTokenizer.from_pretrained('./my_saved_model_directory/')

# A freshly loaded model lives on the CPU. Move it to `device` so it matches the
# batches that the prediction loop sends to `device`.
model = model.to(device)

## Prediction of the labels for the Test Dataset

In [0]:
# Sentence pairs of the test set (this file carries no labels)
sentences_1 = df_test.sentence_1.values
sentences_2 = df_test.sentence_2.values


def _encode_and_pad(sentences):
    """Encode each sentence to token ids and pad/truncate the result to MAX_LEN columns.

    Mirrors the preprocessing applied to the training sentences: special tokens
    are added, and padding and truncation both happen at the end of the sequence.
    """
    encoded = [
        tokenizer.encode(
            sent,
            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
            max_length = MAX_LEN,
        )
        for sent in sentences
    ]
    return pad_sequences(encoded, maxlen=MAX_LEN,
                         dtype="long", truncating="post", padding="post")


# Encode and pad both sentences of every pair
input_ids_1 = _encode_and_pad(sentences_1)
input_ids_2 = _encode_and_pad(sentences_2)

# Concatenate the two padded tokenized sentences to have only one input
input_ids = np.concatenate((input_ids_1, input_ids_2), axis=1)

# Attention masks: 1 for real tokens, 0 for padding. Computed in one vectorised step
# and kept as integers, the same dtype as the masks used for training.
attention_masks = (input_ids > 0).astype(np.int64)

# Convert to tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)

# Set the batch size.
batch_size = 32

# DataLoader that walks the test set in order, so predictions line up with the rows of df_test
prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)


In [0]:
# Run the model over the test set and collect the logits batch by batch
print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

# Evaluation mode for inference
model.eval()

predictions = []

for batch in prediction_dataloader:
    # Each batch holds (input ids, attention mask); move both to the device
    b_input_ids, b_input_mask = (t.to(device) for t in batch)

    # No gradients are needed for inference
    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    # First output element is the logits; keep them as a CPU numpy array
    predictions.append(outputs[0].detach().cpu().numpy())

print('    DONE.')


In [0]:
# Stack the per-batch logits into one array (one row per example) and keep,
# for each row, the index of the highest-scoring class
flat_predictions = np.vstack(predictions).argmax(axis=1)
print(flat_predictions)

In [0]:
# Map the predicted class ids back to label values with the encoder
# that was fitted on the training labels
flat_predictions = label_encoder.inverse_transform(flat_predictions)
print(flat_predictions)

In [0]:
# Build the submission: one row per test example with its id and predicted label.
# `test_X` is never defined in this notebook, so the ids are taken from the test
# DataFrame instead: its `index` column when the file provides one (the training
# file does), otherwise the DataFrame's own row index.
submission_index = df_test['index'] if 'index' in df_test.columns else df_test.index

my_submission = pd.DataFrame({'index': submission_index, 'label': flat_predictions})
my_submission.to_csv('submission.csv', index=False)
print(my_submission.shape)
my_submission.head()

In [0]:
# Download the submission file from the Colab runtime through the browser
files.download('submission.csv')
