In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
import pandas as pd

# Load the CSV stored under the mounted Drive folder into `df`.
df = pd.read_csv('/content/gdrive/My Drive/Mestrado/pre_processing_important.csv')

In [None]:
from ast import literal_eval
# The `labels` column is stored as text; literal_eval parses each cell back into a Python
# literal (presumably a list of label names — TODO confirm against the CSV).
df['labels'] = df['labels'].apply(literal_eval)

In [None]:
! pip install iterative-stratification

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# One-hot (multi-hot) encode the label collections into the indicator matrix `y`.
# NOTE(review): rows with empty label lists are only dropped from `df` in a later cell,
# after X and y have already been built here.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['labels'])

# Keep the raw text column as the model input.

X = df['text']


In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
# Load the Universal Sentence Encoder (v4) from TF Hub; `embed` is called with a list of strings.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

from nltk.tokenize import sent_tokenize



In [None]:
# Sanity check: embed a single string and inspect the shape of the result.
embed(['Hello World']).shape


In [None]:
import nltk
# sent_tokenize needs the Punkt sentence-boundary model. Newer NLTK releases look for the
# 'punkt_tab' resource instead of 'punkt', so fetch both. On an NLTK version that does not
# know 'punkt_tab', download() just reports the error and returns False instead of raising.
nltk.download('punkt')
nltk.download('punkt_tab')


def vectorize_sentences(X):
    """Embed every document in ``X`` sentence by sentence.

    Each document is split with ``sent_tokenize`` and the resulting sentences are
    passed to ``embed`` in one call, giving one array per document.

    Args:
        X: iterable of document strings.

    Returns:
        A NumPy array with one entry per document. If every document produced an
        array of the same shape, the arrays are stacked into a regular array;
        otherwise a 1-D ``object`` array holding the per-document arrays is returned.
    """
    per_document = [embed(sent_tokenize(document)).numpy() for document in X]

    # Uniform (or empty) input: plain stacking is well defined.
    if len({emb.shape for emb in per_document}) <= 1:
        return np.array(per_document)

    # Documents normally differ in sentence count. NumPy >= 1.24 raises ValueError when
    # asked to build an array from such ragged input implicitly, so create the object
    # container explicitly and fill it element by element.
    ragged = np.empty(len(per_document), dtype=object)
    for position, emb in enumerate(per_document):
        ragged[position] = emb
    return ragged




In [None]:
from sklearn.decomposition import PCA

def vectorize_sentences_PCA(X, n_components=128, pca=None):
    """Embed each document sentence by sentence and reduce the embeddings with PCA.

    The previous implementation called ``pca.fit`` on a single sentence vector and
    stored the return value of ``fit`` (the estimator itself) instead of reduced data.
    PCA is now fitted once on all sentence embeddings of ``X`` and then applied to
    each document.

    Args:
        X: iterable of document strings.
        n_components: number of principal components to keep when a new PCA is
            fitted (default 128, the previously hard-coded value).
        pca: optional, already fitted PCA-like object exposing ``transform``. When
            given it is used as-is and nothing is fitted, e.g. to project test data
            with a projection learned on training data.

    Returns:
        A NumPy array with one entry per document, each entry being the reduced
        embeddings of that document's sentences. Stacked into a regular array when
        all documents have the same shape, otherwise a 1-D ``object`` array.

    Raises:
        ValueError: raised by scikit-learn when ``n_components`` exceeds the number
            of sentences or embedding features available for fitting.
    """
    per_document = [embed(sent_tokenize(document)).numpy() for document in X]
    if not per_document:
        return np.array(per_document)

    if pca is None:
        # Fit on every sentence of every document so the projection is shared.
        pca = PCA(n_components=n_components)
        pca.fit(np.concatenate(per_document, axis=0))

    reduced = [pca.transform(emb) for emb in per_document]

    if len({emb.shape for emb in reduced}) <= 1:
        return np.array(reduced)

    # Ragged sentence counts: build the object container explicitly, because
    # NumPy >= 1.24 refuses to do it implicitly.
    ragged = np.empty(len(reduced), dtype=object)
    for position, emb in enumerate(reduced):
        ragged[position] = emb
    return ragged

In [None]:
def vectorize_sentences_flat(X):
    """Embed all sentences of all documents into one flat collection.

    Every document in ``X`` is split with ``sent_tokenize`` and embedded with
    ``embed``; the per-sentence vectors of all documents are gathered in document
    order and returned as a single NumPy array, without document boundaries.
    """
    sentence_rows = []
    for document in X:
        # Extending with the 2-D result appends its rows one by one.
        sentence_rows.extend(embed(sent_tokenize(document)).numpy())
    return np.array(sentence_rows)


In [None]:
class DenseTranspose(tf.keras.layers.Layer):
    """Decoder layer that reuses the transposed kernel of an existing Dense layer.

    The layer owns only a bias vector; the multiplication uses the kernel of the
    wrapped ``dense`` layer with ``transpose_b=True``, so encoder and decoder
    weights stay tied.

    Args:
        dense: the Dense layer whose kernel is reused. It must already be built
            (called on an input) before this layer is built.
        activation: activation name or callable, resolved through
            ``tf.keras.activations.get``.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """
    def __init__(self, dense, activation=None, **kwargs):
        self.dense = dense
        self.activation = tf.keras.activations.get(activation)
        super().__init__(**kwargs)

    def build(self, batch_input_shape):
        # weights[0] is the tied layer's kernel, shaped (input_dim, units). This layer
        # maps units -> input_dim, so the bias width is kernel.shape[0]. Reading it from
        # the kernel avoids `Dense.input_shape`, which Keras 3 layers do not provide.
        tied_kernel = self.dense.weights[0]
        self.biases = self.add_weight(name="bias", initializer="zeros", shape=[tied_kernel.shape[0]])
        super().build(batch_input_shape)

    def call(self, inputs):
        # inputs @ kernel^T, then the layer's own bias and activation.
        z = tf.matmul(inputs, self.dense.weights[0], transpose_b=True)
        return self.activation(z + self.biases)


def get_encoder(autoencoder, n_bottleneck):
    """Return a model mapping the autoencoder input to its bottleneck output.

    The bottleneck is taken to be the LAST layer whose output has ``n_bottleneck``
    entries along axis 1. An IndexError is raised when no layer matches.
    """
    matching_positions = [
        position
        for position, layer in enumerate(autoencoder.layers)
        if layer.output.shape[1] == n_bottleneck
    ]
    bottleneck_layer = autoencoder.layers[matching_positions[-1]]
    return tf.keras.Model(inputs=autoencoder.input, outputs=bottleneck_layer.output)

class TIED_WEIGHTS_NORMALIZED_AUTOENCODER_trainer():
    """Builds, trains and saves a tied-weight autoencoder with an L2-normalised bottleneck.

    Architecture (see ``build_autoencoder``): input -> Flatten -> Dense(middle_dims, selu)
    -> Dense(bottleneck_dims, selu) -> L2 normalisation -> two ``DenseTranspose`` layers
    that reuse the encoder kernels in reverse order.

    Attributes set in ``__init__``:
        autoencoder: the full Keras model, built immediately.
        encoder: ``None`` until ``build_tf_model`` (or ``save``) creates it.
        trained: ``True`` once ``train`` has completed.
    """
    def __init__(self, input_dims, middle_dims, bottleneck_dims):
        self.input_dims = input_dims
        self.middle_dims = middle_dims
        self.bottleneck_dims = bottleneck_dims
        self.autoencoder = self.build_autoencoder()
        self.encoder = None
        self.trained = False

    def build_autoencoder(self):
        """Create and return a fresh (uncompiled) autoencoder model."""
        dense_1 = tf.keras.layers.Dense(self.middle_dims, activation="selu")
        dense_2 = tf.keras.layers.Dense(self.bottleneck_dims, activation="selu")

        # Keras expects a shape tuple; `(n)` is just the integer n, so wrap scalars.
        if np.ndim(self.input_dims) == 0:
            input_shape = (int(self.input_dims),)
        else:
            input_shape = self.input_dims

        # tied encoder
        inputs = tf.keras.Input(shape=input_shape)
        l_flatten = tf.keras.layers.Flatten()(inputs)
        l_en1 = dense_1(l_flatten)
        l_en2 = dense_2(l_en1)

        # normalized bottleneck
        l_en2 = tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis = 1))(l_en2)

        # tied decoder: mirrors the encoder, sharing its kernels
        l_dc1 = DenseTranspose(dense_2, activation="selu")(l_en2)
        l_dc2 = DenseTranspose(dense_1, activation="linear")(l_dc1)

        return tf.keras.models.Model(inputs, l_dc2)

    def train(self, embedded_sentences,  bs = 128, lr = 1e-4, ep = 100, verbose = 0):
        """Compile the autoencoder (MSE loss, Adam) and fit it to reconstruct its input.

        Args:
            embedded_sentences: training data, used as both input and target.
            bs: batch size.
            lr: Adam learning rate.
            ep: number of epochs.
            verbose: Keras ``fit`` verbosity level.
        """
        self.autoencoder.compile(loss="mse",optimizer=tf.keras.optimizers.Adam(learning_rate=lr))
        # summary() prints by itself and returns None; wrapping it in print() added a stray "None".
        self.autoencoder.summary()

        self.autoencoder.fit(embedded_sentences,
                    embedded_sentences,
                    epochs=ep,
                    batch_size=bs,
                    verbose=verbose,
                    callbacks = [])

        self.trained = True

    def build_tf_model(self):
        """Create ``self.encoder`` from the trained autoencoder; only prints a notice if untrained."""
        if not self.trained:
            print("Model is not trained, encoder has not been built.")
        else:
            encoder = get_encoder(self.autoencoder, self.bottleneck_dims)
            output_ = encoder(encoder.input)
            self.encoder = tf.keras.Model(encoder.input, output_)

    def save(self, output_path):
        """Save encoder and autoencoder under ``./<output_path>/``.

        The encoder is built on demand when the model is trained but
        ``build_tf_model`` has not been called yet.

        Raises:
            RuntimeError: if no encoder exists because the model was never trained.
        """
        if self.encoder is None:
            self.build_tf_model()
        if self.encoder is None:
            raise RuntimeError("Cannot save: the encoder has not been built; call train() first.")
        self.encoder.save('./{}/encoder'.format(output_path))
        self.autoencoder.save('./{}/autoencoder'.format(output_path))

In [None]:
# NOTE(review): `embedded_sentences` was never assigned in the notebook, so this cell failed
# on a fresh kernel. The flat per-sentence embeddings are used here because they are the only
# helper output that is a plain 2-D matrix, matching the 512-wide input the trainer is built
# with — confirm this is the intended training set.
embedded_sentences = vectorize_sentences_flat(X)

# The constructor already builds the autoencoder, so no separate build_autoencoder() call is needed.
nn = TIED_WEIGHTS_NORMALIZED_AUTOENCODER_trainer(512, 256, 128)
nn.train(embedded_sentences)

In [None]:
import gensim
import gensim.downloader
# Load the 'glove-twitter-50' pretrained vectors through gensim's downloader;
# `embedding` is later queried by word in word_embed.
embedding = gensim.downloader.load('glove-twitter-50')

In [None]:
def word_embed(X):
  """Turn each text in ``X`` into an array of word vectors.

  Texts are tokenised with NLTK's ``wordpunct_tokenize``; tokens missing from the
  module-level ``embedding`` are skipped. Returns a list with one NumPy array per text.
  """
  from nltk.tokenize import wordpunct_tokenize

  def _embed_text(text):
    known_vectors = [embedding[token] for token in wordpunct_tokenize(text) if token in embedding]
    return np.array(known_vectors)

  return [_embed_text(text) for text in X]

In [None]:
# NOTE(review): `new_X` is not assigned at notebook level in any earlier cell shown here (it
# only exists as a local inside the helper functions), so on a fresh kernel this raises
# NameError. Presumably the output of one of the vectorize_*/word_embed helpers was meant — confirm.
new_X = np.array(new_X)

In [None]:
# univariate lstm example
from numpy import array
from keras.models import Sequential
from keras.layers import LSTM, GRU, Input,  Bidirectional, SimpleRNN
from keras.layers import Dense, Dropout, LeakyReLU, Conv1D, MaxPool1D, GlobalMaxPool1D
from keras.regularizers import l1

def create_model(n_labels=3, embedding_dim=512):
  """Build and compile the bidirectional-LSTM multi-label classifier.

  The network takes ragged sequences of ``embedding_dim``-sized vectors, runs them
  through a bidirectional LSTM, a Dense(48) layer with dropout, and a sigmoid
  output layer with one unit per label.

  Args:
    n_labels: number of output units, one per label (default 3, the value that
      was previously hard-coded).
    embedding_dim: size of each timestep vector (default 512, as before).

  Returns:
    The compiled ``Sequential`` model.
  """
  model = Sequential()
  model.add(Input(shape=[None, embedding_dim], ragged=True))
  model.add(Bidirectional(LSTM(4, activation=LeakyReLU(alpha=0.1), activity_regularizer=l1(0.0001))))
  model.add(Dense(48))
  model.add(Dropout(0.3))
  # binary_crossentropy (with its default from_logits=False) and the 0.5 threshold applied
  # to predictions both expect per-label probabilities, so the output must be a sigmoid.
  model.add(Dense(n_labels, activation="sigmoid"))
  # binary_accuracy scores every label independently; categorical_accuracy only compares
  # the arg-max position, which is not meaningful for multi-label targets.
  model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["binary_accuracy"])
  return model

In [None]:
# Instantiate the classifier once so it can be inspected.
model = create_model()

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Inspect the shape of the `new_X` array.
new_X.shape

In [None]:
import tensorflow as tf
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
import numpy as np
from matplotlib import pyplot
from sklearn.decomposition import PCA

# Single stratified 80/20 split that keeps the label distribution similar in both parts.
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

# The splitter yields positional indices. `X` is a pandas Series, where `X[indices]` would
# look the integers up as index labels, so index plain arrays instead.
texts = np.asarray(X)
targets = np.asarray(y)

for train_index, test_index in msss.split(texts, targets):

  X_train, X_test = texts[train_index], texts[test_index]
  y_train, y_test = targets[train_index], targets[test_index]

  # ragged_rank=1 keeps only the sentence axis ragged and the embedding axis dense, which
  # is the layout the model's ragged Input(shape=[None, 512]) expects.
  X_train = tf.ragged.constant(vectorize_sentences(X_train), ragged_rank=1)
  X_test = tf.ragged.constant(vectorize_sentences(X_test), ragged_rank=1)

  # Train five independently initialised models to see run-to-run variation.
  for i in range(5):
    model = create_model()
    history = model.fit(X_train, y_train, epochs=25, batch_size=16, validation_data=(X_test, y_test))
    print(history.history)
    pyplot.plot(history.history['loss'], color='blue', label='loss')
    pyplot.plot(history.history['val_loss'], color='orange', label='val_loss')
    pyplot.xlabel('epochs')
    pyplot.ylabel('Loss value')
    pyplot.legend()
    # show() renders this run's curves as their own figure; the previous argument-less
    # plot() call drew nothing, so all runs ended up overlaid in a single figure.
    pyplot.show()
    # Threshold the per-label outputs; y_pred from the last run stays available afterwards.
    y_pred = model.predict(X_test)
    y_pred = y_pred > 0.5

In [None]:
from sklearn.metrics import classification_report, multilabel_confusion_matrix, f1_score, accuracy_score # we can use gmean

# Per-label confusion matrices on the held-out split; `y_pred` is overwritten on every
# training run, so this reflects the last run only.
print(multilabel_confusion_matrix(y_test, y_pred))

In [None]:
# Keep only the rows whose label collection is non-empty.
# NOTE(review): X and y were derived from `df` in an earlier cell, so they are not affected by this filter.
df = df[df['labels'].apply(lambda x : len(x) > 0)]

In [None]:
# Row/column count of `df`.
df.shape

In [None]:
# Length of the first entry in the text column.
len(df['text'].iloc[0])

In [None]:
# Wrap the TF Hub encoder as a Keras layer and run it on `text_preprocessed`.
# NOTE(review): `tfhub_handle_encoder` and `text_preprocessed` are not defined in any cell
# shown here — confirm where they come from before running on a fresh kernel.
bert_model = hub.KerasLayer(tfhub_handle_encoder)
bert_results = bert_model(text_preprocessed)



In [None]:
def build_classifier_model():
  """Assemble the BERT-based text classifier.

  A scalar string input is run through the TF Hub preprocessing layer and the
  trainable TF Hub encoder; the encoder's 'pooled_output' goes through dropout (0.1)
  into a single-unit Dense layer without activation, so the model emits raw logits.
  Relies on the module-level `tfhub_handle_preprocess` and `tfhub_handle_encoder`.
  """
  raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

  to_bert_inputs = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  bert_inputs = to_bert_inputs(raw_text)

  bert_encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  pooled = bert_encoder(bert_inputs)['pooled_output']

  regularised = tf.keras.layers.Dropout(0.1)(pooled)
  logits = tf.keras.layers.Dense(1, activation=None, name='classifier')(regularised)
  return tf.keras.Model(raw_text, logits)

In [None]:
# Build the classifier and run it once on sample text, before any fitting in this notebook;
# the sigmoid maps the raw model output into (0, 1).
# NOTE(review): `text_test` is not defined in any cell shown here — confirm.
classifier_model = build_classifier_model()
bert_raw_result = classifier_model(tf.constant(text_test))
print(tf.sigmoid(bert_raw_result))

In [None]:
# Render a diagram of the classifier's layer graph.
tf.keras.utils.plot_model(classifier_model)


In [None]:
# from_logits=True because the 'classifier' Dense layer has no activation, i.e. the model outputs logits.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

In [None]:
# Training schedule: total steps = batches per epoch * epochs, with the first 10% used as warm-up.
# NOTE(review): `train_ds` and `optimization` are not defined/imported in any cell shown here
# (presumably a tf.data dataset and `from official.nlp import optimization` — TODO confirm).
epochs = 5
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

# AdamW optimizer with initial learning rate 3e-5, configured with the step counts above.
init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [None]:
# Attach the optimizer, logits-based loss and accuracy metric defined in the previous cells.
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)