In [1]:
# # Model training using pre-split test/train data instead of splitting in
# model trained on Kaggle - Notebook set up to use cloud TPU on kaggle or GCD
%pip install keras


Note: you may need to restart the kernel to use updated packages.


In [2]:
import tensorflow as tf
import pandas as pd
import seaborn as sns
import numpy as np
import random
import gensim
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Activation
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

BATCH_SIZE = 8000
EPOCHS = 500
random.seed(7)

# detect and init the TPU
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
# instantiate a distribution strategy
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

df_train = pd.read_csv('../input/name-num-other/NAME_NUM_OTHER_train_2D_over.csv', names=['input','target'], encoding='ISO-8859-1', skiprows=1, low_memory=False, index_col=False)
print('Dataset Size {size} rows.'.format(size=len(df_train.index)))

df_test = pd.read_csv('../input/name-num-other/NAME_NUM_OTHER_test_2D.csv', names=['input','target'], encoding='ISO-8859-1', skiprows=1, low_memory=False, index_col=False)
print('Dataset Size {size} rows.'.format(size=len(df_test.index)))

sns.countplot(df_train.target)

num_classes = df_train.target.nunique()

train_inputs = df_train.input
train_targets = df_train.target

test_inputs = df_test.input
test_targets = df_test.target

# word2vec
docs = []
for t in df_train.input:
    docs.append(t.split())
w2v_model = gensim.models.Word2Vec(vector_size=300, window=7, min_count=10, workers=8)
w2v_model.build_vocab(docs)
words = w2v_model.wv
vocab_size = len(words)
w2v_model.train(docs, total_examples=len(docs), epochs=32)

# tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.input)
vocab_size = len(tokenizer.word_index) + 1

# tokenize input
x = pad_sequences(tokenizer.texts_to_sequences(df_train.input), maxlen=300)
x_test = pad_sequences(tokenizer.texts_to_sequences(df_test.input), maxlen=300)

# labelencoder
encoder = LabelEncoder()
encoder.fit(df_train.target.tolist())
encoder.fit(df_test.target.tolist())

y = encoder.transform(df_train.target.tolist())
y = y.reshape(-1,1)

y_test = encoder.transform(df_test.target.tolist())
y_test = y_test.reshape(-1,1)

embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# embedding layer for NN
embedding_layer = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=300, trainable=False)

# callbacks
callbacks = [EarlyStopping(monitor='accuracy', min_delta=0.0001, patience=5, restore_best_weights=True)]

# merge inputs and targets
train_inputs = x
train_targets = y

test_inputs = x_test
test_targets = y_test

# K-Fold cross-validation
#kfold = KFold(n_splits=3, shuffle=True)

# split data into test/train sets
#train_inputs, test_inputs, train_targets, test_targets = train_test_split(inputs, targets, test_size=0.2)


# training loop
with tpu_strategy.scope():
    #for train, test in kfold.split(inputs, targets):
    model = Sequential()
    model.add(embedding_layer)
    model.add(Dropout(0.5))
    model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(300))
    model.add(Dropout(0.5))
    model.add(Dense(300))
    model.add(Dropout(0.5))
    model.add(Dense((num_classes)))
    model.add(Activation('softmax'))
    model.summary()
    model.compile(loss='sparse_categorical_crossentropy', optimizer=(tf.keras.optimizers.Adam(learning_rate=0.001)),metrics=['accuracy'])
    #history = model.fit(inputs[train], targets[train], batch_size = BATCH_SIZE, epochs = EPOCHS, verbose = 1, callbacks=callbacks)
    history = model.fit(train_inputs, train_targets, batch_size = BATCH_SIZE, epochs = EPOCHS, verbose = 1, callbacks=callbacks)


ValueError: Please provide a TPU Name to connect to.

In [None]:
accr = model.evaluate(test_inputs,test_targets)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
import matplotlib.pyplot as plt
def plot(history, info_type='loss'):
    plt.plot(history.history[info_type], label=[info_type])
    try:
        plt.plot(history.history['val_' + info_type], label=['val_' + info_type])
    except Exception:
        print(f'no val_{info_type}')
    plt.title(info_type)
    plt.legend()

plot(history)
plot(history, 'accuracy')

In [None]:
with tpu_strategy.scope():
    test_targets_list = list(test_targets)
    predictions = model.predict_classes(test_inputs, verbose=1, batch_size=BATCH_SIZE)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

classes=df_train.target.unique()
classes = list(classes)
print(classes)
print(classification_report(test_targets_list, predictions, target_names=classes))
accuracy_score(test_targets_list, predictions)

In [None]:
import itertools

def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):

    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90, fontsize=22)
    plt.yticks(tick_marks, classes, fontsize=22)

    fmt = '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontsize=25)
    plt.xlabel('Predicted label', fontsize=25)

classes=df_train.target.unique()
cnf_matrix = confusion_matrix(test_targets_list, predictions)
plt.figure(figsize=(50, 50))
plot_confusion_matrix(cnf_matrix, classes=classes, title="Confusion matrix")
plt.show()

In [None]:
import pickle
model.save('NAME_EAN_PRICE_COLOR_OTHER_SIZE_DESC_OVER_WORDEMB.h5')
pickle.dump(tokenizer, open('NAME_EAN_PRICE_COLOR_OTHER_SIZE_DESC_OVER_WORDEMB.pkl', "wb"), protocol=0)
print('Model Saved')
