In [None]:
import pickle
import random
import itertools

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, f1_score, auc

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer


# Seeds for reproducibility. Seeding `random` alone is not enough here: the
# DataFrame shuffle and train_test_split draw from NumPy's global RNG, and the
# Keras weight initialisers draw from TensorFlow's, so all three are seeded.
# NOTE(review): bit-exact repeatability on TPU is not guaranteed by this alone.
SEED = 7
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# for TPU debugging
#ctpu up --tpu-size=[TPU_VERSION] --tf-version=[TF VERSION]

# detect and init the TPU
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
# instantiate a distribution strategy: use the stable `tf.distribute.TPUStrategy`
# when this TensorFlow build has it, otherwise fall back to the deprecated
# experimental alias the notebook originally used
_tpu_strategy_cls = getattr(tf.distribute, "TPUStrategy", None) or tf.distribute.experimental.TPUStrategy
tpu_strategy = _tpu_strategy_cls(tpu)

# Load the two-column CSV; the file's own header row is skipped and replaced
# by the explicit column names below.
df = pd.read_csv("../input/6-class-mvp/6_class_MVP_dataset_2D.csv", names=["text", "target"],
                 encoding="UTF-8", skiprows=1, low_memory=False, index_col=False)
print("done loading")

# Shuffle the rows. train_test_split shuffles again by default, so this is
# redundant for the split itself but harmless.
df = df.sample(frac=1)
# Force both columns to str so the tokenizer and the label encoder only ever
# see strings (missing values become the literal string "nan").
df = df.astype(str)

# seaborn countplot to see class balance
# (fixed typo: `coutplot` does not exist; data is passed by keyword because
# recent seaborn releases no longer accept it positionally)
sns.countplot(x=df.target)

# NOTE: DataFrame.size counts cells (rows x columns); the row count is the
# "index size" printed on the next line.
print("dataset size: {size} data points.".format(size=df.size))
print("index size:{size}".format(size=df.index.size))
num_classes = df.target.nunique()
print('class count: {c}'.format(c=num_classes))

inputs = df.text
targets = df.target

# label targets with label encoder: class strings -> integer ids 0..n-1
# (ids follow the sorted order stored in `le.classes_`)
le = LabelEncoder()
# Plain assignment: the original `-=` tried to subtract the integer codes from
# the string Series, which raises a TypeError.
targets = le.fit_transform(targets)
# column vector, one label per row
targets = targets.reshape(-1, 1)

# split training set from rest of data (80/20); a fixed random_state keeps the
# split identical between runs
train_inputs, test_inputs, train_targets, test_targets = train_test_split(
    inputs, targets, test_size=0.2, random_state=7
)

# vocabulary cap handed to the tokenizer (also used as the Embedding input size)
max_words = 1000

# fixed sequence length after padding/truncation (can be reduced)
max_len = 300

# build the word index from the training texts only
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(train_inputs)

# training texts -> lists of word ids
sequences = tok.texts_to_sequences(train_inputs)

# force every sequence to exactly max_len ids; "pre" is the Keras default for
# both options, i.e. zeros are prepended to short sequences and long sequences
# lose their leading tokens
sequences_matrix = sequence.pad_sequences(
    sequences, maxlen=max_len, padding="pre", truncating="pre"
)

# the test texts go through the same fitted tokenizer and the same padding
test_sequences = tok.texts_to_sequences(test_inputs)
test_sequences_matrix = sequence.pad_sequences(
    test_sequences, maxlen=max_len, padding="pre", truncating="pre"
)

# define Neural Network Sequential Keras model
def RNN():
    """Assemble the uncompiled classifier: Embedding -> LSTM -> dense head -> softmax.

    Reads the module-level ``max_len`` (input width), ``max_words`` (embedding
    vocabulary size) and ``num_classes`` (output width).

    Returns:
        A Keras ``Sequential`` model; compiling and fitting are left to the caller.
    """
    model = Sequential()
    stack = (
        Input(name='inputs', shape=[max_len]),
        Embedding(max_words, 150, input_length=max_len),
        LSTM(256, dropout=0.1, recurrent_dropout=0.1),
        Dense(512, name='Dense1'),
        Activation('relu'),
        Dropout(0.5),
        # NOTE(review): 'dense1' and 'dense2' have no activation, so together
        # with the output layer they form a purely linear stack - confirm
        # this is intended.
        Dense(300, name='dense1'),
        Dropout(0.5),
        Dense(300, name='dense2'),
        Dropout(0.5),
        Dense((num_classes), name='out_layer'),
        Activation('softmax'),
    )
    for layer in stack:
        model.add(layer)
    return model

# plotting functions
def plot(history, info_type='loss'):
    """Plot one training metric and, when available, its validation counterpart.

    Draws on the current matplotlib figure and adds a title and a legend.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value sequences (e.g. the value returned by ``model.fit``).
        info_type: key of the metric to plot; defaults to ``'loss'``.

    Raises:
        KeyError: if ``info_type`` is not present in ``history.history``.
    """
    metrics = history.history
    # The label is a plain string: a one-element list is rendered as
    # "['loss']" by some matplotlib versions.
    plt.plot(metrics[info_type], label=info_type)

    val_key = 'val_' + info_type
    # Explicit membership test instead of a blanket `except Exception`, so
    # genuine plotting errors are no longer reported as "no val_...".
    if val_key in metrics:
        plt.plot(metrics[val_key], label=val_key)
    else:
        print(f'no val_{info_type}')

    plt.title(info_type)
    plt.legend()

def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a row-normalised confusion matrix on the current matplotlib figure.

    Each row is divided by its sum, so a cell shows the fraction of that row's
    true class that was assigned to the column's predicted class.

    Args:
        cm: square array-like of counts, rows = true labels, columns =
            predicted labels.
        classes: tick labels, in the same order as the rows/columns of ``cm``.
        title: figure title.
        cmap: matplotlib colormap used for the heat map.
    """
    counts = np.asarray(cm, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)
    # A class with no true samples has a zero row sum; leave that row at 0
    # instead of dividing by zero, which would fill it (and the colour
    # threshold below) with NaN.
    cm = np.divide(counts, row_totals, out=np.zeros_like(counts), where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90, fontsize=22)
    plt.yticks(tick_marks, classes, fontsize=22)

    fmt = '.2f'
    # cells darker than half the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontsize=25)
    plt.xlabel('Predicted label', fontsize=25)

In [None]:
# Build, train and evaluate the model under the TPU distribution strategy.
with tpu_strategy.scope():
    model = RNN()
    model.summary()
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=(tf.keras.optimizers.Adam(learning_rate=0.001)),
        metrics=['accuracy'],
    )
    # The last 20% of the training matrix is held out for validation; training
    # stops once val_accuracy has not improved by 0.0005 for 5 epochs.
    history = model.fit(
        sequences_matrix,
        train_targets,
        batch_size=8000,
        epochs=250,
        validation_split=0.2,
        callbacks=[EarlyStopping(monitor='val_accuracy', min_delta=0.0005, patience=5)],
    )
    accr = model.evaluate(test_sequences_matrix, test_targets)
    print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))
    # Flatten the (n, 1) label column into a plain list of ints for sklearn.
    test_targets_list = np.ravel(test_targets).tolist()
    # `Sequential.predict_classes` no longer exists in current TensorFlow; for
    # a softmax output it was the argmax over the predicted probabilities.
    class_probabilities = model.predict(test_sequences_matrix, verbose=1, batch_size=8000)
    predictions = np.argmax(class_probabilities, axis=-1)

In [None]:
# training vs. validation loss per epoch
plot(history, info_type='loss')

In [1]:
# training vs. validation accuracy per epoch
plot(history, info_type='accuracy')

In [None]:
# Class names must be listed in the order of the integer ids produced by the
# LabelEncoder (`le.classes_`, sorted). `df.target.unique()` returns them in
# order of first appearance in the shuffled frame, which attaches the wrong
# name to each row of the report.
classes = le.classes_.tolist()
print(classes)
# `labels` pins the report to every encoded class, so the names stay aligned
# even if some class is missing from the test split or the predictions.
print(classification_report(test_targets_list, predictions,
                            labels=list(range(len(classes))), target_names=classes))
accuracy_score(test_targets_list, predictions)

In [None]:
# Tick labels are taken from the LabelEncoder so that they line up with the
# integer ids indexing the matrix rows/columns; a list in order of first
# appearance (as from `df.target.unique()`) would mislabel the axes.
class_names = le.classes_.tolist()
# `labels` forces one row/column per encoded class, even for a class that is
# absent from both the test labels and the predictions.
cnf_matrix = confusion_matrix(test_targets_list, predictions,
                              labels=list(range(len(class_names))))
plt.figure(figsize=(50, 50))
plot_confusion_matrix(cnf_matrix, classes=class_names, title="Confusion matrix")
plt.show()

In [None]:
# Persist the trained model (HDF5) together with the fitted tokenizer, which
# is needed to encode new text the same way at inference time.
model.save('6_C_Sequence.h5')
# Context manager so the file is flushed and closed even if pickling fails;
# the original left the handle open. protocol=0 is kept from the original.
with open('6_C_Sequence.pkl', "wb") as tokenizer_file:
    pickle.dump(tok, tokenizer_file, protocol=0)
print('Model Saved')