In [1]:
import torch
import pandas as pd
from transformers import CanineTokenizer
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, WeightedRandomSampler, SequentialSampler,RandomSampler
from transformers import CanineForSequenceClassification, AdamW, BertConfig
# from transformers import get_linear_schedule_with_warmup
from transformers import get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
import numpy as np
import time
import datetime
import random
import re
from sklearn.metrics import classification_report, accuracy_score
from datasets import load_dataset
from tqdm.auto import tqdm

In [2]:
import torch

# Select the compute device once; later cells move the model and every batch
# onto `device`.
if not torch.cuda.is_available():
    # CPU fallback when no CUDA device is visible to PyTorch.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many CUDA devices exist and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1650


In [3]:
# cleaning data
def clean_post(post):
    """Normalise a raw post before it is tokenised.

    The text is lower-cased, every newline is replaced by a single space, and
    each shortest span that starts with ``<`` or ``[`` and ends with ``>`` or
    ``]`` (for example ``<b>`` or ``[removed]``) is deleted.

    Args:
        post: The text to clean (a ``str``).

    Returns:
        The cleaned text. Whitespace surrounding a deleted span is kept, so
        removing a tag between two spaces leaves both spaces in place.
    """
    post = post.lower()
    post = post.replace("\n", " ")
    # Raw-string pattern: the previous non-raw literal depended on invalid
    # escape sequences (e.g. "\<"), which newer Python versions warn about.
    # Inside a character class only "[" and "]" need escaping.
    post = re.sub(r"[<\[].*?[>\]]", "", post)
    return post

In [13]:
# Download (or load from cache) the Yelp full-review dataset, then shuffle
# each split with a fixed seed. The trailing `[:]` slices the whole shuffled
# split in one go, so `train` / `test` are indexed by column name below.
# NOTE(review): the output saved under this cell mentions dbpedia_14 and shows
# 14 labels, which does not match the dataset name used here -- re-run the
# cell to refresh it.
dataset = load_dataset("yelp_review_full")
train, test = (
    dataset[split_name].shuffle(seed=42)[:]
    for split_name in ('train', 'test')
)

# Number of distinct label values present in the training split.
num_labels = len(set(train['label']))
num_labels

Reusing dataset d_bpedia14 (C:\Users\KAWSHIK\.cache\huggingface\datasets\d_bpedia14\dbpedia_14\2.0.0\7f0577ea0f4397b6b89bfe5c5f2c6b1b420990a1fc5e8538c7ab4ec40e46fa3e)
100%|██████████| 2/2 [00:00<00:00, 105.28it/s]
Loading cached shuffled indices for dataset at C:\Users\KAWSHIK\.cache\huggingface\datasets\d_bpedia14\dbpedia_14\2.0.0\7f0577ea0f4397b6b89bfe5c5f2c6b1b420990a1fc5e8538c7ab4ec40e46fa3e\cache-4f9bd4edcc24be65.arrow
Loading cached shuffled indices for dataset at C:\Users\KAWSHIK\.cache\huggingface\datasets\d_bpedia14\dbpedia_14\2.0.0\7f0577ea0f4397b6b89bfe5c5f2c6b1b420990a1fc5e8538c7ab4ec40e46fa3e\cache-d75e0d932d1c3620.arrow


14

In [None]:
# import csv
# data = pd.read_csv('reddit_dataset/reddit_dataset.csv')[['post','mental_disorder']]
# data = shuffle(data)
# # data = data[:500]
# # Class split stats
# print(data.groupby(['mental_disorder'])[['mental_disorder']].describe())
# X = data['post'].apply(lambda post: clean_post(post))
# label_encoder = LabelEncoder()
# y = label_encoder.fit_transform(np.array(data['mental_disorder']))
# num_labels = 15

In [5]:
# Load the CANINE tokenizer belonging to the 'google/canine-s' checkpoint
# (the same checkpoint name is used when the model is built).
# NOTE(review): `do_lower_case=True` is simply forwarded to from_pretrained;
# confirm that CanineTokenizer actually acts on it.
print('Loading Canine tokenizer...')
tokenizer = CanineTokenizer.from_pretrained('google/canine-s', do_lower_case=True)


Loading Canine tokenizer...


Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


In [6]:
# Scan the training texts for the longest one. The length is taken with
# len() on the raw text (no tokenizer call), printing a progress marker
# every 10,000 examples.
max_len = 0

for position, text in enumerate(train['text']):
    if position % 10000 == 0:
        print(position)

    # Keep the largest length seen so far.
    text_length = len(text)
    if text_length > max_len:
        max_len = text_length

print('Max sentence length: ', max_len)

281633it [00:00, 1210006.82it/s]

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000

560000it [00:00, 1449109.11it/s]


300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
Max sentence length:  13561





In [14]:
# Encode every training text into model inputs padded/truncated to 512 ids.
id_rows = []
mask_rows = []

for text in train['text']:
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,        # include the tokenizer's special tokens
        max_length=512,                 # pad and truncate to this length
        padding='max_length',
        return_attention_mask=True,     # mask separating padding from content
        truncation=True,
        return_tensors='pt',            # PyTorch tensors
    )
    id_rows.append(encoding['input_ids'])
    mask_rows.append(encoding['attention_mask'])

# Join the per-example tensors along dim 0 and build the label tensor.
input_ids = torch.cat(id_rows, dim=0)
attention_masks = torch.cat(mask_rows, dim=0)
labels = torch.tensor(train['label'], dtype=torch.long)

# Show the example at index 1 next to its encoded ids.
print('Original: ', train['text'][1])
print('Token IDs:', input_ids[1])

100%|██████████| 1000/1000 [00:00<00:00, 1622.76it/s]

Original:   The American Letter Mail Company was started by Lysander Spooner in 1844 competing with the legal monopoly of the United States Post Office (USPO now the USPS) in violation of the Private Express Statutes. It succeeded in delivering mail for lower prices but the U.S. Government challenged Spooner with legal measures eventually forcing him to cease operations in 1851.
Token IDs: tensor([57344,    32,    84,  ...,     0,     0,     0])





In [15]:
from torch.utils.data import TensorDataset, random_split

# Combine the encoded inputs and labels into a single TensorDataset.
# NOTE(review): this rebinds `dataset`, which the data-loading cell used for
# the Hugging Face dataset object.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 70 / 15 / 15 train / validation / test split. The test share takes
# whatever remains after the two int() truncations so the sizes always sum to
# len(dataset).
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size

# Use a dedicated, seeded generator so the split is identical on every run;
# the global seeds are only set later, in the training cell.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print('{:>5,} test samples'.format(test_size))

  700 training samples
  150 validation samples


In [None]:
# labels = train_dataset[:][2]
# labels = shuffle(labels).long()
# class_count = [torch.sum(labels == i) for i in range(num_labels)]
# class_weights = 1./torch.tensor(class_count, dtype=torch.float) 
# class_weights_all = class_weights[labels]
# weighted_sampler = WeightedRandomSampler(
#     weights=class_weights_all,
#     num_samples=len(class_weights_all),
#     replacement=True
# )

In [16]:

# NOTE(review): the recorded run of the training cell ended in a CUDA
# out-of-memory error on a 4 GiB GPU; a smaller batch size may be needed.
batch_size = 32

# Training batches are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size,
)

# Validation and test batches are read sequentially with the same batch size.
validation_dataloader, test_dataloader = (
    DataLoader(
        held_out,
        sampler=SequentialSampler(held_out),
        batch_size=batch_size,
    )
    for held_out in (val_dataset, test_dataset)
)


In [17]:
 
model = CanineForSequenceClassification.from_pretrained(
    "google/canine-s", # Pretrained CANINE-S checkpoint (same name as the tokenizer).
    num_labels = num_labels, # Size of the classification head: one output per
                    # distinct label counted in the training split.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
# Ask PyTorch to release cached, unused GPU memory before the model is moved.
torch.cuda.empty_cache()
# model = torch.nn.DataParallel(model)
# Move the model to the selected device; as the last expression of the cell
# this also displays the module structure.
model.to(device)

Some weights of CanineForSequenceClassification were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CanineForSequenceClassification(
  (canine): CanineModel(
    (char_embeddings): CanineEmbeddings(
      (HashBucketCodepointEmbedder_0): Embedding(16384, 96)
      (HashBucketCodepointEmbedder_1): Embedding(16384, 96)
      (HashBucketCodepointEmbedder_2): Embedding(16384, 96)
      (HashBucketCodepointEmbedder_3): Embedding(16384, 96)
      (HashBucketCodepointEmbedder_4): Embedding(16384, 96)
      (HashBucketCodepointEmbedder_5): Embedding(16384, 96)
      (HashBucketCodepointEmbedder_6): Embedding(16384, 96)
      (HashBucketCodepointEmbedder_7): Embedding(16384, 96)
      (char_position_embeddings): Embedding(16384, 768)
      (token_type_embeddings): Embedding(16, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (initial_char_encoder): CanineEncoder(
      (layer): ModuleList(
        (0): CanineLayer(
          (attention): CanineAttention(
            (self): CanineSelfAttention(
            

In [18]:
# Use PyTorch's AdamW: the AdamW class exported by `transformers` is
# deprecated in favour of torch.optim.AdamW. weight_decay is set to 0.0
# explicitly because torch's default (0.01) differs from the no-decay default
# of the transformers implementation used before.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-8,
    weight_decay=0.0,
)

# Number of passes over the training dataloader.
epochs = 4

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * epochs steps.
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule over all training steps, without warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [19]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: NumPy array of integer class ids; it is flattened before the
            comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == expected_classes)
    return n_correct / len(expected_classes)

In [20]:
def format_time(elapsed):
    '''
    Render a duration given in seconds as text.

    The value is rounded to a whole number of seconds and formatted the way
    ``datetime.timedelta`` prints itself, e.g. ``0:01:00`` or
    ``1 day, 1:00:00``.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [21]:
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
def get_metrics(y_true, y_pred,epoch):
    """Print a classification report and accuracy, and save both to a file.

    The same two results are written to ``report{epoch}.txt`` in the current
    working directory; an existing file with that name is overwritten.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        epoch: Value interpolated into the report file name.
    """
    report_text = classification_report(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)

    print('Classification Report: ', report_text)
    print('Accuracy: ', accuracy, "\n\n")

    # The context manager guarantees the file is flushed and closed; the
    # previous version opened the handle and never closed it.
    with open(f'report{epoch}.txt', 'w') as report_file:
        report_file.write('\nClassification Report: \n')
        report_file.write(report_text)
        report_file.write('\nAccuracy Report: \n')
        report_file.write(str(accuracy))

In [22]:
def save_model(epoch):
    """Save the global model and tokenizer for the given epoch.

    Nothing is written when ``epoch`` is below 1, so the first epoch of the
    training loop (which passes index 0) is skipped. Otherwise the model and
    tokenizer are stored with ``save_pretrained`` under
    ``./model_save_epoch{epoch}/`` and the whole model object is additionally
    saved to ``best-model.pt``.

    Args:
        epoch: Epoch index used in the output directory name.
    """
    import os

    if epoch < 1:
        return
    output_dir = f'./model_save_epoch{epoch}/'

    # exist_ok=True creates the directory only when it is missing, without a
    # separate existence check that could race with another process.
    os.makedirs(output_dir, exist_ok=True)

    print("Saving model to %s" % output_dir)

    # Unwrap a parallel/distributed wrapper (anything exposing `.module`) so
    # the underlying model is what gets serialised.
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # NOTE(review): this file is overwritten on every call regardless of the
    # validation score, so despite its name it holds the most recently saved
    # model rather than the best one.
    torch.save(model, 'best-model.pt')

In [23]:
def test_eval(epoch):
    """Evaluate the global model on ``test_dataloader`` and report metrics.

    The model is switched to evaluation mode, every test batch is scored
    without gradient tracking, and the arg-max predictions together with the
    true labels are handed to ``get_metrics``.

    Args:
        epoch: Passed through to ``get_metrics``, which uses it in the report
            file name.
    """
    # Report the size of the test subset itself; the previous message printed
    # len(input_ids), i.e. the size of the whole encoded dataset.
    print('Predicting labels for {:,} test sentences...'.format(len(test_dataloader.dataset)))
    model.eval()
    predicted_labels, true_labels = [], []
    for batch in test_dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(b_input_ids,
                            attention_mask=b_input_mask)

        # No labels are passed to the model here, so the first output holds
        # the logits.
        logits = outputs[0].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predicted_labels += np.argmax(logits, axis=1).flatten().tolist()
        true_labels += label_ids.tolist()

    get_metrics(true_labels, predicted_labels, epoch)
    print('    DONE.')
In [24]:
# Seed Python, NumPy and PyTorch (CPU and all CUDA devices).
# NOTE(review): the seeds are set in this cell only, i.e. after the cells
# that build the dataloaders and the model have already run.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


training_stats = []    # one dict of metrics per completed epoch
total_t0 = time.time()
valid_loss_min = 1e9   # lowest average validation loss seen so far

for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # ---- Training pass -------------------------------------------------
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Progress line every 100 batches (not for the very first batch).
        if step % 100 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()
        # Labels are supplied, so the first output is the loss.
        outputs = model(b_input_ids,attention_mask=b_input_mask,labels=b_labels)
        loss = outputs[0]
        total_train_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The scheduler was sized in batches, so it advances once per step.
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ---- Validation pass -----------------------------------------------
    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    total_eval_correct = 0
    total_eval_examples = 0
    total_eval_loss = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad():
            outputs = model(b_input_ids,attention_mask=b_input_mask,labels=b_labels)
            loss = outputs[0]
            logits = outputs[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight each batch accuracy by its size so that a shorter final
        # batch does not count as much as a full one.
        batch_examples = len(label_ids)
        total_eval_correct += flat_accuracy(logits, label_ids) * batch_examples
        total_eval_examples += batch_examples

    avg_val_accuracy = total_eval_correct / total_eval_examples
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Keep the running minimum up to date; it was previously initialised but
    # never updated.
    if avg_val_loss < valid_loss_min:
        valid_loss_min = avg_val_loss
        print("  New best validation loss: {0:.2f}".format(valid_loss_min))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

    # Per-epoch test report and checkpoint; both receive the 0-based index.
    test_eval(epoch_i)
    save_model(epoch_i)

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))



Training...


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 4.00 GiB total capacity; 2.69 GiB already allocated; 0 bytes free; 2.74 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF