In [1]:
import os
#!pip install -U numpy
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf

from sklearn.model_selection import train_test_split
from ast import literal_eval

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [2]:
data = pd.read_csv("spring/total.csv")

In [3]:
data["categorias"] = data["categorias"].apply(
    lambda x: literal_eval(x)
)
data["categorias"].values[:5]

array([list(['Engineering', 'Computational Intelligence', 'Artificial Intelligence']),
       list(['Engineering', 'Computational Intelligence', 'Artificial Intelligence']),
       list(['Engineering', 'Computational Intelligence', 'Artificial Intelligence', 'Professional Computing']),
       list(['Engineering', 'Computational Intelligence', 'Control, Robotics, Mechatronics', 'Artificial Intelligence']),
       list(['Computer Science', 'Computer Systems Organization and Communication Networks', 'Communications Engineering, Networks', 'Wireless and Mobile Communication'])],
      dtype=object)

In [4]:
test_split = 0.4

# Initial train and test split.
train_df, test_df = train_test_split(
    data,
    test_size=test_split,
    stratify=data["categorias"].values,
)
# Splitting the test set further into validation
# and new test sets.
val_df = test_df.sample(frac=0.5)
test_df.drop(val_df.index, inplace=True)

In [5]:
terms = tf.ragged.constant(train_df["categorias"].values)
lookup = tf.keras.layers.StringLookup(output_mode="multi_hot")
lookup.adapt(terms)
vocab = lookup.get_vocabulary()


def invert_multi_hot(encoded_labels):
    """Reverse a single multi-hot encoded label to a tuple of vocab terms."""
    hot_indices = np.argwhere(encoded_labels == 1.0)[..., 0]
    return np.take(vocab, hot_indices)


print("Vocabulary:\n")
print(vocab)

Vocabulary:

['[UNK]', 'Engineering', 'Artificial Intelligence', 'Computational Intelligence', 'Communications Engineering, Networks', 'Computer Science', 'Cyber-physical systems, IoT', 'Professional Computing', 'Machine Learning', 'Control, Robotics, Mechatronics', 'Data Structures and Information Theory', 'Science, Humanities and Social Sciences, multidisciplinary', 'Data Engineering', 'Industrial and Production Engineering', 'Mechanical Engineering', 'Computer Communication Networks', 'Signal, Image and Speech Processing', 'Medicine & Public Health', 'Statistics, general', 'Computer Imaging, Vision, Pattern Recognition and Graphics', 'Data Mining and Knowledge Discovery', 'Computer Science, general', 'Robotics and Automation', 'Manufacturing, Machines, Tools, Processes', 'Mobile and Network Security', 'Business and Management', 'Science, multidisciplinary', 'Biomedical Engineering and Bioengineering', 'Science and Technology Studies', 'Image Processing and Computer Vision', 'Systems

In [6]:
sample_label = train_df["categorias"].iloc[0]
print(f"Original label: {sample_label}")

label_binarized = lookup([sample_label])
print(f"Label-binarized representation: {label_binarized}")

Original label: ['Engineering', 'Construction Management', 'Sustainable Architecture/Green Buildings', 'Sustainable Development', 'Building Materials', 'Building Construction and Design']
Label-binarized representation: [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [7]:
train_df["abstract"].apply(lambda x: len(x.split(" "))).describe()

count    8308.000000
mean      112.098339
std        48.845628
min         1.000000
25%        82.000000
50%       108.000000
75%       138.000000
max      1050.000000
Name: abstract, dtype: float64

In [8]:
max_seqlen = 139
batch_size = 128
padding_token = "<pad>"
auto = tf.data.AUTOTUNE


def make_dataset(dataframe, is_train=True):
    labels = tf.ragged.constant(dataframe["categorias"].values)
    label_binarized = lookup(labels).numpy()
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["abstract"].values, label_binarized)
    )
    dataset = dataset.shuffle(batch_size * 10) if is_train else dataset
    return dataset.batch(batch_size)

In [9]:
train_dataset = make_dataset(train_df, is_train=True)
validation_dataset = make_dataset(val_df, is_train=False)
test_dataset = make_dataset(test_df, is_train=False)

In [10]:
next(iter(train_dataset))

(<tf.Tensor: shape=(128,), dtype=string, numpy=
 array([b"family business groups (fbgs) similar non-family owned conglomerates need focus innovation stay relevant keep growing sustainably maintain competitive edges vis-\xc3\xa0-vis competitors extant research highlights family firms\xe2\x80\x99 lack proactive interest towards investing innovation present counter-narrative fbgs organize manage innovation across affiliated group companies multiple case study four fbgs based secondary data theorize \xe2\x80\x9chow fbg's innovation activities managed\xe2\x80\x9d found fbg owners\xe2\x80\x99 aspirations globally competitive top management teams\xe2\x80\x99 aspirations support innovation-led growth critical antecedents increased attention innovation fbg firms also found family business groups support innovation building enabling structure context group affiliated firm levels support includes shared group resources building inter-organizational intra-organizational knowledge exchange mechanis

In [11]:
text_batch, label_batch = next(iter(train_dataset))

for i, text in enumerate(text_batch[:3]):
    label = label_batch[i].numpy()[None, ...]
    print(f"Abstract: {text}")
    print(f"Label(s): {invert_multi_hot(label[0])}")
    print(" ")

Abstract: b'synthesizing program code natural language challenging task natural language utterances tend ambiguous require substantial prior knowledge interpret recent solutions approach difficulties different ways models constrain inputs operate large variety sentences tend less accurate others limit space possible inputs requiring meet fixed structure makes similar code language paper offers middle ground approaches train transition-based neural network descriptions programming tasks generated using context-free grammar templates show model able generalize solve synthesis problems described natural language'
Label(s): ['Engineering' 'Artificial Intelligence' 'Computational Intelligence'
 'Control, Robotics, Mechatronics']
 
Abstract: b'collecting cybercrime evidence internet typically involves reconnaissance analyses information extracted scouring internet especially deep web often requires manual effort time-consuming hence imperative efficient framework intelligent tool gather cybe

In [12]:
# Source: https://stackoverflow.com/a/18937309/7636462
vocabulary = set()
train_df["abstract"].str.lower().str.split().apply(vocabulary.update)
vocabulary_size = len(vocabulary)
print( vocabulary_size)

67977


In [13]:
text_vectorizer = layers.TextVectorization(
    max_tokens=vocabulary_size, ngrams=2, output_mode="tf_idf"
)

# `TextVectorization` layer needs to be adapted as per the vocabulary from our
# training set.
with tf.device("/CPU:0"):
    text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

train_dataset = train_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto
).prefetch(auto)
validation_dataset = validation_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto
).prefetch(auto)
test_dataset = test_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto
).prefetch(auto)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [14]:
def make_model():
    shallow_mlp_model = keras.Sequential(
        [

            layers.Dense(512, activation="relu"),
            layers.Dense(256, activation="relu"),
            layers.Dense(lookup.vocabulary_size(), activation="sigmoid"),
            #layers.Dense( len(vocab), activation="sigmoid"),
        ]  # More on why "sigmoid" has been used here in a moment.
    )
    return shallow_mlp_model



def make_model():
    shallow_mlp_model = tf.keras.models.Sequential([
        layers.Dense(512, activation="relu"),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)),
        tf.keras.layers.Dense(6, activation='softmax')
        ])
    return shallow_mlp_model

In [15]:
epochs = 10
#5 para acelerar las primeras versiones, solo queremos ver que no explota
opt = tf.optimizers.Adam()
shallow_mlp_model = make_model()
shallow_mlp_model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy","binary_accuracy"])
#categorical_crossentropy
#sparse_categorical_crossentropy

In [None]:
history = shallow_mlp_model.fit(train_dataset, validation_data=validation_dataset, epochs=epochs)

Epoch 1/10
 4/65 [>.............................] - ETA: 16s - loss: 34.1911 - accuracy: 0.4141 - binary_accuracy: 0.5366 

In [None]:
def plot_result(item):
    plt.plot(history.history[item], label=item)
    plt.plot(history.history["val_" + item], label="val_" + item)
    plt.xlabel("Epochs")
    plt.ylabel(item)
    plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14)
    plt.legend()
    plt.grid()
    plt.show()


plot_result("loss")
plot_result("accuracy")
plot_result("binary_accuracy")

In [None]:
keras.models.save_model(shallow_mlp_model,'spring/radEd_model_v2.hdf5')
#shallow_mlp_model = tf.keras.models.load_model('spring/radEd_model_v2.hdf5')

In [None]:
_,accura, binary_acc  = shallow_mlp_model.evaluate(test_dataset)
print(f"Categorical accuracy on the test set: {round(binary_acc * 100, 2)}%.")
print(f"Categorical accuracy on the test set: {round(accura * 100, 2)}%.")

In [None]:
# Create a model for inference.
model_for_inference = keras.Sequential([text_vectorizer, shallow_mlp_model])

# Create a small dataset just for demoing inference.
inference_dataset = make_dataset(test_df.sample(100), is_train=False)
text_batch, label_batch = next(iter(inference_dataset))
predicted_probabilities = model_for_inference.predict(text_batch)

# Perform inference.
for i, text in enumerate(text_batch[:10]):
    label = label_batch[i].numpy()[None, ...]
    #print(f"Abstract: {text}")
    print(f"Label(s): {invert_multi_hot(label[0])}")
    predicted_proba = [proba for proba in predicted_probabilities[i]]
    top_3_labels = [
        x
        for _, x in sorted(
            zip(predicted_probabilities[i], lookup.get_vocabulary()),
            key=lambda pair: pair[0],
            reverse=True,
        )
    ][:3]
    print(f"Predicted Label(s): ({', '.join([label for label in top_3_labels])})")
    print(" ")

In [None]:

[
        x
        for  x in sorted(
            zip(predicted_probabilities[60], lookup.get_vocabulary()),
            key=lambda pair: pair[0],
            reverse=True,
        )
    ][:6]