In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import re
import random
import warnings
import tensorflow as tf

from tensorflow import keras as ks
from tensorflow.keras import layers
from keras.models import load_model
from keras import callbacks 

# Preprocessing data

We start by loading the dataset and clean both training and validation data with regex replacements.

In [2]:
def clean_data(text):
    text = text.lower()
    text = re.sub(r'[^(a-zA-Z0-9)\s\*\+-\/\(\)=&|]','', text)
    return text

train = pd.read_csv("/kaggle/input/60k-stack-overflow-questions-with-quality-rate/train.csv")
valid = pd.read_csv("/kaggle/input/60k-stack-overflow-questions-with-quality-rate/valid.csv")

train['Body'] = train['Body'].apply(clean_data)
valid['Body'] = valid['Body'].apply(clean_data)


# Tokenize data
The Tokenizer is initialized and fitted on the training data. The cleaned columns are then converted to integer vectors, before being padded with zeroes or shortened to an uniform length of 75 words.

In [3]:
maxlength = 75

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000, filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n\r', lower=True, split=" ")
tokenizer.fit_on_texts(train['Body'])

train_x = ks.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(train['Body']), maxlen=maxlength)
valid_x = ks.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(valid['Body']), maxlen=maxlength)

# One-hot encode labels
The label columns are then converted from classes to to integers, before being encoded as a one-hot matrix representation.

In [4]:
train_y = train['Y'].map({'LQ_CLOSE':0, 'LQ_EDIT':1, 'HQ':2})
valid_y = valid['Y'].map({'LQ_CLOSE':0, 'LQ_EDIT':1, 'HQ':2})

train_y = ks.utils.to_categorical(train_y, num_classes=3)
valid_y = ks.utils.to_categorical(valid_y, num_classes=3)

# Count word occurences
Print number of word occurences in order to identify an appropriate input dimension.

In [5]:
sortedwordindex = sorted(tokenizer.word_counts.items(), key=lambda x: x[1], reverse=True)
# print(sortedwordindex)

# Defining the model
Define a Keras Sequential model. The first layer is a word embedding layer that creates vectors between similar or associated words. Two layers of LSTM with half the dimensionality of the embedding layer is then applied before adding a final softmax activation layer with three possible outputs. The model is then compiled with an SGD-optimizer, using categorical crossentropy as loss function, and recording the accuracy as a metric.

In [6]:
model = ks.Sequential()
model.add(layers.Embedding(input_length=maxlength, input_dim=10000, output_dim=128))    
model.add(layers.LSTM(64, return_sequences=True))
model.add(layers.LSTM(64))
model.add(layers.Dense(3, activation="softmax"))
model.compile(optimizer=ks.optimizers.SGD(learning_rate=(0.55)), loss=ks.losses.CategoricalCrossentropy(), metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 75, 128)           1280000   
_________________________________________________________________
lstm (LSTM)                  (None, 75, 64)            49408     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense (Dense)                (None, 3)                 195       
Total params: 1,362,627
Trainable params: 1,362,627
Non-trainable params: 0
_________________________________________________________________


# Model training
The model is trained with the training data, recording the performance on, but not training on, the validation set after each training epoch. The model is using an EarlyStopping callback in order to terminate when the model has stopped improving (more precisely when the validation loss starts increasing for at least 5 epochs). The model is then tested on the validation set and final loss and accuracy is printed.


In [7]:
earlystopping = callbacks.EarlyStopping(monitor ="val_loss", mode ="min", patience = 5, restore_best_weights = True) 
  
history_callback = model.fit(train_x, train_y, batch_size = 128, epochs = 20, validation_data =(valid_x, valid_y), callbacks =[earlystopping]) 

loss_history = history_callback.history

loss, acc = model.evaluate(valid_x, valid_y, verbose=1)
print('Loss:\t\t', loss, '\nAccuracy:\t', acc)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Loss:		 0.3030911087989807 
Accuracy:	 0.8784666657447815


# Printing metrics
Prints the loss and accuracy values for both training and validation set for each epoch during training.

In [8]:
print("Validation loss:\n")
print(str(loss_history["val_loss"]) + "\n\n")
print("Validation accuracy:\n")
print(str(loss_history["val_accuracy"]) + "\n\n")
print("Training loss:\n")
print(str(loss_history["loss"]) + "\n\n")
print("Training accuracy:\n")
print(str(loss_history["accuracy"]) + "\n\n")

Validation loss:

[0.4851851761341095, 0.4611198604106903, 0.4196327328681946, 0.3634752035140991, 0.3502151072025299, 0.45915427803993225, 0.4284970462322235, 0.36386317014694214, 0.31658536195755005, 0.32163509726524353, 0.3013896048069, 0.2828918993473053, 0.35380280017852783, 0.2645973563194275, 0.3163857161998749, 0.2856082022190094, 0.2657149136066437, 0.2614314556121826, 0.26874974370002747, 0.30309104919433594]


Validation accuracy:

[0.7031999826431274, 0.7075333595275879, 0.781000018119812, 0.8294000029563904, 0.8429333567619324, 0.7454000115394592, 0.7779333591461182, 0.8358666896820068, 0.8560666441917419, 0.8572666645050049, 0.8681333065032959, 0.8757333159446716, 0.846666693687439, 0.8844666481018066, 0.8644000291824341, 0.8831333518028259, 0.8902000188827515, 0.8915333151817322, 0.8884000182151794, 0.8784666657447815]


Training loss:

[0.6785090565681458, 0.45103707909584045, 0.43752461671829224, 0.40499600768089294, 0.3743339478969574, 0.43556246161460876, 0.442637532