# Imports

In [1]:
import numpy as np
import pandas as pd
import torch.nn.functional as F
from transformers import BertForSequenceClassification, BertConfig
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer
from torch.utils.data import TensorDataset, random_split
import gc
import torch
import re
import time
import datetime
import os
# Project layout: the data, model and output folders all live next to the
# directory this notebook is run from (i.e. inside its parent directory).
root_dir = os.path.abspath(os.curdir)
subs_dir = f"{os.path.dirname(root_dir)}/ExtractedSubmissions/"
model_dir = f"{os.path.dirname(root_dir)}/Models/pytorchmodel/"
# Predictions are written here; make sure the folder exists up front.
pred_dir = f"{os.path.dirname(root_dir)}/SubmissionPredictions/"
os.makedirs(pred_dir, exist_ok=True)

# Functions and helpers

In [2]:
# Maximum number of tokens kept per sample; passed as `max_length` to the
# tokenizer in `make_smart_batches`, which truncates longer inputs.
max_len = 512
# Number of samples per batch; `run` passes this to `make_smart_batches`.
batch_size = 128
def good_update_interval(total_iters, num_desired_updates):
    '''
    Pick a "round" progress-reporting interval for a loop.

    The ideal interval (`total_iters / num_desired_updates`) is rounded to the
    nearest multiple of a power of ten that is one order of magnitude below
    the order of magnitude of `total_iters`, so that progress messages land
    on tidy numbers (e.g. every 1,000 iterations out of 10,917).

    Parameters:
      `total_iters` - The number of iterations in the for-loop.
      `num_desired_updates` - Roughly how many progress updates are wanted
                              over the course of the for-loop.

    Returns:
      The interval as an int; never zero (a zero result is bumped to 1).
    '''
    # The unrounded spacing between two consecutive updates.
    raw_step = total_iters / num_desired_updates

    # `round(x, -k)` rounds to the nearest multiple of 10**k. The order of
    # magnitude of the total is (number of digits - 1) and we round one
    # magnitude below that, which makes the `ndigits` argument:
    #   -((digits - 1) - 1) == 2 - digits
    ndigits = 2 - len(str(total_iters))

    step = int(round(raw_step, ndigits))

    # An interval of zero would break the caller's modulo test.
    return step or 1

def format_time(elapsed):
    '''
    Render a duration given in seconds as an h:mm:ss string.

    The value is first rounded to a whole number of seconds; the text itself
    is whatever `str(datetime.timedelta)` produces (so durations of a day or
    more are prefixed with "N day(s), ").
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

def clean(text, newline=True, quote=True, bullet_point=True, 
          link=True, strikethrough=True, spoiler=True,
          code=True, superscript=True, table=True, heading=True):
    r"""
    Cleans text (string).
    Removes common Reddit special characters/symbols:
      * \n (newlines)
      * &gt; (> quotes)
      * * or &amp;#x200B; (bullet points)
      * []() (links, including the link text)
      * ~ (strikethrough), ` (code), ^() (superscript), | and :- (tables),
        # (headings), &gt;!...!&lt; (spoilers, text is preserved)
    Specific removals can be turned off, but everything is on by default.
    Standard punctuation etc is deliberately not removed, can be done in a
    second round manually, or may be preserved in any case.

    Parameters:
      `text` - The string to clean.
      remaining flags - One boolean per removal step; set to False to skip it.

    Returns:
      The cleaned string.
    """
    # Newlines (replaced with space to preserve cases like word1\nword2)
    if newline:
        text = re.sub(r'\n+', ' ', text)

        # Trim the ends and collapse the runs of whitespace this leaves behind.
        text = text.strip()
        text = re.sub(r'\s\s+', ' ', text)

    # Spoiler: &gt;!hidden text!&lt; (Preserves the text)
    # This has to run before the quote step, which would otherwise consume
    # the opening `&gt;`. Only the full spoiler marker is unwrapped, so
    # ordinary exclamation marks ("Wow! Great!") are left alone.
    if spoiler:
        text = re.sub(r'&gt;!(.*?)!&lt;', r'\1', text)
        text = re.sub('&lt;', '', text)

    # > Quotes
    # The `&` is mandatory: with an optional `&` the pattern also deleted the
    # letters "gt" inside ordinary words such as "length" or "strength".
    if quote:
        text = re.sub(r'\"?\\?&gt;?', '', text)

    # Bullet points/asterisk (bold/italic)
    if bullet_point:
        text = re.sub(r'\*', '', text)
        text = re.sub('&amp;#x200B;', '', text)

    # []() Link (Also removes the hyperlink)
    if link:
        text = re.sub(r'\[.*?\]\(.*?\)', '', text)

    # Strikethrough
    if strikethrough:
        text = re.sub('~', '', text)

    # Code, inline and block
    if code:
        text = re.sub('`', '', text)

    # Superscript (Preserves the text)
    if superscript:
        text = re.sub(r'\^\((.*?)\)', r'\1', text)

    # Table (cell separators become spaces, alignment markers are dropped)
    if table:
        text = re.sub(r'\|', ' ', text)
        text = re.sub(':-', '', text)

    # Heading
    if heading:
        text = re.sub('#', '', text)
    return text

def make_smart_batches(text_samples, labels, batch_size):
    '''
    This function combines all of the required steps to prepare batches:
    tokenize every sample, sort the samples by token length, cut the sorted
    list into batches of similar-length samples, and pad each batch only up
    to the length of its own longest sample.

    The tokenizer is loaded from the module-level `model_dir`, and samples
    longer than the module-level `max_len` tokens are truncated.

    Parameters:
      `text_samples` - Sized sequence of strings to tokenize.
      `labels` - Sized sequence with one entry per sample; each entry travels
                 with its sample through sorting and batching.
      `batch_size` - Maximum number of samples per batch (positive int).

    Returns:
      A tuple `(py_inputs, py_attn_masks, py_labels)` of three lists with one
      tensor per batch: padded token ids, attention masks (1 = real token,
      0 = padding) and the labels of the samples in that batch. Batches are
      picked at random positions of the sorted list, so their order does not
      follow the input order.

    Raises:
      ValueError - if `batch_size` is not positive, or if `text_samples` and
                   `labels` differ in length.
    '''
    import random

    # A non-positive batch size would never shrink `samples` below, so the
    # selection loop would spin forever.
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer, got {!r}'.format(batch_size))

    # `zip` would silently drop the surplus of the longer sequence.
    if len(text_samples) != len(labels):
        raise ValueError('text_samples and labels must have the same length, got {:,} and {:,}'.format(
            len(text_samples), len(labels)))

    print('Creating Smart Batches from {:,} examples with batch size {:,}...\n'.format(len(text_samples), batch_size))

    # =========================
    #   Tokenize & Truncate
    # =========================
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    full_input_ids = []

    # Tokenize all examples
    print('Tokenizing {:,} samples...'.format(len(labels)))

    # Choose an interval on which to print progress updates.
    update_interval = good_update_interval(total_iters=len(labels), num_desired_updates=10)

    # For each example...
    for text in text_samples:

        # Report progress.
        if len(full_input_ids) % update_interval == 0:
            print('  Tokenized {:,} samples.'.format(len(full_input_ids)))

        # Tokenize the sample.
        input_ids = tokenizer.encode(text=text,              # Text to encode.
                                    add_special_tokens=True, # Do add specials.
                                    max_length=max_len,      # Do Truncate!
                                    truncation=True,         # Do Truncate!
                                    padding=False)           # DO NOT pad.

        # Add the tokenized result to our list.
        full_input_ids.append(input_ids)

    print('DONE.')
    print('{:>10,} samples\n'.format(len(full_input_ids)))

    # =========================
    #      Select Batches
    # =========================

    # Sort the two lists together by the length of the input sequence.
    samples = sorted(zip(full_input_ids, labels), key=lambda x: len(x[0]))

    print('{:>10,} samples after sorting\n'.format(len(samples)))

    # List of batches that we'll construct.
    batch_ordered_sentences = []
    batch_ordered_labels = []

    print('Creating batches of size {:}...'.format(batch_size))

    # Progress is reported per *batch*, so the interval has to be derived
    # from the number of batches (ceiling division), not the sample count.
    num_batches = (len(samples) + batch_size - 1) // batch_size
    update_interval = good_update_interval(total_iters=num_batches, num_desired_updates=10)

    # Loop until every sample has been assigned to a batch...
    while samples:

        # Report progress.
        num_selected = len(batch_ordered_sentences)
        if num_selected != 0 and num_selected % update_interval == 0:
            print('  Selected {:,} batches.'.format(num_selected))

        # `to_take` is our actual batch size. It will be `batch_size` until
        # we get to the last batch, which may be smaller.
        to_take = min(batch_size, len(samples))

        # Pick a random index in the list of remaining samples to start
        # our batch at.
        select = random.randint(0, len(samples) - to_take)

        # Select a contiguous batch of samples starting at `select`. Because
        # the list is sorted, neighbours have similar lengths.
        batch = samples[select:(select + to_take)]

        # Each sample is a tuple--split them apart to create a separate list of
        # sequences and a list of labels for this batch.
        batch_ordered_sentences.append([s[0] for s in batch])
        batch_ordered_labels.append([s[1] for s in batch])

        # Remove these samples from the list.
        del samples[select:select + to_take]

    print('\n  DONE - Selected {:,} batches.\n'.format(len(batch_ordered_sentences)))

    # =========================
    #        Add Padding
    # =========================

    print('Padding out sequences within each batch...')

    py_inputs = []
    py_attn_masks = []
    py_labels = []

    # For each batch...
    for (batch_inputs, batch_labels) in zip(batch_ordered_sentences, batch_ordered_labels):

        # New version of the batch, this time with padded sequences and now with
        # attention masks defined.
        batch_padded_inputs = []
        batch_attn_masks = []

        # First, find the longest sample in the batch.
        # Note that the sequences do currently include the special tokens!
        max_size = max(len(sen) for sen in batch_inputs)

        # For each input in this batch...
        for sen in batch_inputs:

            # How many pad tokens do we need to add?
            num_pads = max_size - len(sen)

            # Add `num_pads` padding tokens to the end of the sequence.
            batch_padded_inputs.append(sen + [tokenizer.pad_token_id] * num_pads)

            # Define the attention mask--it's just a `1` for every real token
            # and a `0` for every padding token.
            batch_attn_masks.append([1] * len(sen) + [0] * num_pads)

        # Our batch has been padded, so we need to save this updated batch.
        # We also need the inputs to be PyTorch tensors, so we'll do that here.
        # Todo - Michael's code specified "dtype=torch.long"
        py_inputs.append(torch.tensor(batch_padded_inputs))
        py_attn_masks.append(torch.tensor(batch_attn_masks))
        py_labels.append(torch.tensor(batch_labels))

    print('  DONE.')

    # Return the smart-batched dataset!
    return (py_inputs, py_attn_masks, py_labels)


# Main function

In [3]:
# Announce the set-up phase and release memory left over from earlier runs.
print("_______________________________________________________________")
print("Setting-up PyTorch...........")
torch.cuda.empty_cache()
gc.collect()

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

# Load the fine-tuned model and its tokenizer from `model_dir`, then place
# the model on the selected device.
model = BertForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model.to(device)
    
def run(year):
    '''
    Classify every submission of one year and save the predictions as CSV.

    Reads `<subs_dir><year>.csv` (no header row; columns ID, Author, LinkID,
    Subreddit, Text, Time, Link), cleans the text with `clean`, batches it
    with `make_smart_batches`, runs the module-level `model` over every batch
    and writes `<pred_dir><year>.csv` with the columns ID, Author, Subreddit,
    Text, Time, Link, Probability, Label. The written Text is the original
    (uncleaned) text; Probability is the softmax score of the predicted class.

    Parameters:
      `year` - Used (via `str`) as the base name of the input and output file.

    Returns:
      None.
    '''
    print("_______________________________________________________________")
    print("Loading Data...........")
    cols = ['ID','Author','LinkID','Subreddit','Text','Time','Link']
    data = pd.read_csv(
        subs_dir+str(year)+".csv",
        header=None,
        names=cols,
        engine="python",
        encoding="utf-8"
    )
    data.drop(['LinkID'],
              axis=1,
              inplace=True)
    print("_______________________________________________________________")
    print("Data Pre-processing...........")
    data_clean = [clean(post) for post in data.Text.astype('str')]
    print("dataset length:",len(data_clean))
    print("_______________________________________________________________")
    print("Prepare smart batches...........")
    # The row index is passed in place of labels: batching reorders the
    # samples, and the index lets us restore the original order afterwards.
    (py_inputs, py_attn_masks, py_labels) = make_smart_batches(data_clean, data.index, batch_size)
    print("_______________________________________________________________")
    print("Run prediction on data...........")
    print('Predicting labels for {:,} test sentences...'.format(len(data.index)))

    # Put model in evaluation mode
    model.eval()

    # Per-batch class probabilities and the row indices they belong to.
    predictions, row_ids = [], []

    # Choose an interval on which to print progress updates.
    num_batches = len(py_inputs)
    update_interval = good_update_interval(total_iters=num_batches, num_desired_updates=10)

    # Measure elapsed time.
    t0 = time.time()

    # For each batch...
    for step in range(num_batches):

        # Periodic progress update.
        if step % update_interval == 0 and step != 0:
            elapsed_sec = time.time() - t0

            # Estimate the time remaining from the average time per batch.
            sec_per_step = elapsed_sec / step
            remaining = format_time(sec_per_step * (num_batches - step))

            # Report progress.
            print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}.  Remaining: {:}'.format(
                step, num_batches, format_time(elapsed_sec), remaining))

        # Copy the batch to the device the model lives on. The row indices
        # are not needed by the model, so they stay on the CPU.
        b_input_ids = py_inputs[step].to(device)
        b_input_mask = py_attn_masks[step].to(device)

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            # Forward pass; the first output holds the logits.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
            proba = F.softmax(outputs[0], dim=1)

        # Store the probabilities and the matching row indices.
        predictions.append(proba.detach().cpu().numpy())
        row_ids.append(py_labels[step].numpy())

    print('    DONE.')
    print("_______________________________________________________________")
    print("Preparing prediction file...........")
    predictions = np.concatenate(predictions, axis=0)
    row_ids = np.concatenate(row_ids, axis=0)

    # Choose the label with the highest score as our prediction.
    dataset = pd.DataFrame({'Index': row_ids,
                            'Probability': np.max(predictions, axis=1).flatten(),
                            'labels': np.argmax(predictions, axis=1).flatten()})
    # NOTE(review): assumes class ids 0/1/2 follow the label order the model
    # was fine-tuned with -- confirm against the training code.
    dataset['Label'] = dataset['labels'].map({0:'highly_toxic', 1:'slightly_toxic',2:'non_toxic'})

    # Restore the original row order so the predictions line up with `data`
    # (whose index is the default 0..n-1 produced by `read_csv`).
    dataset = dataset.sort_values(by=['Index']).reset_index(drop=True)
    for col in ['ID', 'Author', 'Text', 'Subreddit', 'Time', 'Link']:
        dataset[col] = data[col]
    dataset = dataset[['ID','Author','Subreddit','Text','Time','Link','Probability','Label']]
    print(dataset.head())

    out = pred_dir+str(year)+".csv"
    os.makedirs(os.path.dirname(out), exist_ok=True)
    dataset.to_csv(out,index=False)

    # Drop the large intermediates before the next year is processed; the
    # model and tokenizer are kept because they are shared across calls.
    del dataset
    del data
    del data_clean
    del py_inputs, py_attn_masks, py_labels, predictions, row_ids
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

_______________________________________________________________
Setting-up PyTorch...........
There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1080


In [None]:
# Run the prediction pipeline for each year, announcing the year first.
for year in (2006, 2007):
    print(year)
    run(year)

2006
_______________________________________________________________
Loading Data...........
_______________________________________________________________
Data Pre-processing...........
dataset length: 10917
_______________________________________________________________
Prepare smart batches...........
Creating Smart Batches from 10,917 examples with batch size 128...

Tokenizing 10,917 samples...
  Tokenized 0 samples.
  Tokenized 1,000 samples.
  Tokenized 2,000 samples.
  Tokenized 3,000 samples.
  Tokenized 4,000 samples.
  Tokenized 5,000 samples.
  Tokenized 6,000 samples.
  Tokenized 7,000 samples.
  Tokenized 8,000 samples.
  Tokenized 9,000 samples.
  Tokenized 10,000 samples.
DONE.
    10,917 samples

    10,917 samples after sorting

Creating batches of size 128...

  DONE - Selected 86 batches.

Padding out sequences within each batch...
  DONE.
_______________________________________________________________
Run prediction on data...........
Predicting labels for 10,917 