In [1]:
import pandas
import ast
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
from collections import Counter
import json
import numpy as np

In [2]:
import torch
import tensorflow as tf
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
import ast

from utils import CLASSES

In [3]:
def final_preprocess(filename):
    df = pandas.read_csv("intermediate.csv", delimiter="#", encoding="utf-8")
    with open(filename, "w+", encoding="utf-8") as file:
        file.writelines("topic#text\n")
        for i in range(len(df)):
            topic = df["topic"][i]

            stems = ast.literal_eval(df["text"][i])
            stems = stems[:100]
            stems.insert(0,'[CLS]')
            stems.append('[SEP]')

            file.writelines(str(topic) + "#" + str(stems) + "\n")

def final_preprocess2(filename):
    df = pandas.read_csv("filtered.csv", delimiter="#", encoding="utf-8")
    with open(filename, "w+", encoding="utf-8") as file:
        file.writelines("topic#text\n")
        for i in range(len(df)):
            topic = df["topic"][i]

            stems = ast.literal_eval(df["text"][i])
            stems.insert(0,'[CLS]')
            stems.append('[SEP]')

            file.writelines(str(topic) + "#" + str(stems) + "\n")

In [4]:
sentences = []
with open("final.csv", "r", encoding="utf-8") as file:
    file.readline()
    for line in file:
        res = ast.literal_eval(line.split("#")[1][:-1])
        res = ' '.join(res)
        sentences.append(res)

labels = []
with open("final.csv", "r", encoding="utf-8") as file:
    file.readline()
    for line in file:
            labels.append(int(line.split("#")[0]))

nb_labels = len(CLASSES)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

# Tokenize
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

In [5]:
# maximum sequence length.
MAX_LEN = 128

input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [6]:
# attention masks
attention_masks = []

for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

# split data into train and validation
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels,
                                                                                    random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                                       random_state=2018, test_size=0.1)

In [7]:
# Convert all of our data into tensors
train_inputs = torch.LongTensor(train_inputs)
validation_inputs = torch.LongTensor(validation_inputs)
train_labels = torch.LongTensor(train_labels)
validation_labels = torch.LongTensor(validation_labels)
train_masks = torch.Tensor(train_masks)
validation_masks = torch.Tensor(validation_masks)

# batch size for training.
batch_size = 16


In [8]:
# DataLoader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [9]:
# Load BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=nb_labels)
model.cuda()

# BERT fine-tuning parameters
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

In [10]:
# accuracy function
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# loss and accuracy
train_loss_set = []
# Number of training epochs
epochs = 20


In [11]:
# BERT training
for _ in trange(epochs, desc="Epoch"):

    # training mode
    model.train()
    
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    
    # Train
    for step, batch in enumerate(train_dataloader):
        
        batch = tuple(t for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        
        loss = model(b_input_ids.to(device), token_type_ids=None, attention_mask=b_input_mask.to(device), labels=b_labels.to(device))
        train_loss_set.append(loss.item())
        loss.backward()
        optimizer.step()
        
        tr_loss += loss.item()
        del loss
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
    print("Train loss: {}".format(tr_loss / nb_tr_steps))

    # evaluation mode
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Evaluate
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))

In [12]:
# training performance
plt.figure(figsize=(15, 8))
plt.title("Training loss")
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.plot(train_loss_set)
plt.show()
torch.save({"state_dict": model.state_dict(),
            "epoch": 20},
           "topicClassifier.pth")

In [13]:
# test
sentences = ["[CLS] " + query + " [SEP]" for query in query_data_test]
labels = intent_data_label_test

# tokenize
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
MAX_LEN = 128
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")


In [14]:
# attention masks
attention_masks = []

for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)

In [15]:
# test tensors
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)
batch_size = 32
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)


In [16]:
# evaluation mode
model.eval()
predictions, true_labels = [], []

# Predict
for batch in prediction_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
  

    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    predictions.append(logits)
    true_labels.append(label_ids)

In [17]:
from sklearn.metrics import matthews_corrcoef

matthews_set = []
for i in range(len(true_labels)):
    matthews = matthews_corrcoef(true_labels[i],
                                 np.argmax(predictions[i], axis=1).flatten())
    matthews_set.append(matthews)

In [18]:
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = [item for sublist in true_labels for item in sublist]

print('Classification accuracy using BERT Fine Tuning: {0:0.2%}'.format(
    matthews_corrcoef(flat_true_labels, flat_predictions)))