In [55]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import random as rd
from tensorflow.keras import layers
import tensorflow_hub as hub
import string
import tf_keras

# Import Datasets

In [56]:
dev_dir_10p = "/mnt/c/Users/iosif/Desktop/Python/Tensorflow/skimlit/10p/dev.txt"
test_dir_10p = "/mnt/c/Users/iosif/Desktop/Python/Tensorflow/skimlit/10p/test.txt"
train_dir_10p = "/mnt/c/Users/iosif/Desktop/Python/Tensorflow/skimlit/10p/train.txt"

In [57]:
#-----------------------------My Method--------------------
dict_train = {"line_number":0,
              "target":"",
              "text":"",
              "total_lines":0}

list_dict_train = [{"line_number":0,
              "target":"",
              "text":"",
              "total_lines":0}]

def read_lines(filename):
    with open(filename, "r") as file:
        return file.readlines()

def list_of_dicts(list_name,lines):
    for i in range(1,len(lines)):
        parts = lines[i].split("\t")
        if len(parts) > 1:
            list_name.append({'line_number': i, 'target': lines[i].split("\t")[0],'text': lines[i].split("\t")[1], 'total_lines': 0})
    return list_name

train_lines = read_lines(train_dir_10p)
len(train_lines)

#-------------------Tutorial Method------------------------
def preprocess_text_to_line_numbers(filename):
    input_lines = read_lines(filename) #get all lines from the file
    abstract_lines = "" #create empty abstract
    abstract_samples = [] # create an empty list of abstracts
    for line in input_lines:
        if line.startswith("###"): #check if it's an ID line
            abstract_id = line
            abstract_lines = "" #reset abstract string if it's an ID line
            
        elif line.isspace(): #check if it's a new line
            abstract_line_split = abstract_lines.splitlines() #split abstract into separate lines 
            
            #itterate through every line and count them
            for abstract_line_number, abstract_line in enumerate(abstract_line_split):
                line_data = {} 
                target_text_split = abstract_line.split("\t") #split target label from text
                line_data["target"] = target_text_split[0] #get target label
                line_data["text"] = target_text_split[1].lower() #get text
                line_data["line_number"] = abstract_line_number #line thta appears in the abstract
                line_data["total_lines"] = len(abstract_line_split) -1 #numbers of lines of the abstract
                abstract_samples.append(line_data)
        else: #if nothing is fulfilled, then its a label
            abstract_lines += line
    return abstract_samples
            

In [58]:
train_samples = preprocess_text_to_line_numbers(train_dir_10p)
val_samples = preprocess_text_to_line_numbers(dev_dir_10p)
test_samples = preprocess_text_to_line_numbers(test_dir_10p)

In [59]:
train_df = pd.DataFrame(train_samples)
val_df = pd.DataFrame(val_samples)
test_df = pd.DataFrame(test_samples)

# Get lists of sentencees and make 'em numbers

In [60]:
train_sentences = train_df["text"].tolist()
val_sentences = val_df["text"].tolist()
test_sentences = test_df["text"].tolist()
len(train_sentences), len(val_sentences), len(test_sentences)

(2211861, 28932, 29493)

In [61]:
one_hot_encoder = OneHotEncoder(sparse_output=False)
train_labels_oh = one_hot_encoder.fit_transform(train_df["target"].to_numpy().reshape(-1,1))
val_labels_oh = one_hot_encoder.transform(val_df["target"].to_numpy().reshape(-1,1))
test_labels_oh = one_hot_encoder.transform(test_df["target"].to_numpy().reshape(-1,1))

In [62]:
train_labels_oh

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]])

In [63]:
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_df["target"].to_numpy())
val_labels_encoded = label_encoder.transform(val_df["target"].to_numpy())
test_labels_encoded = label_encoder.transform(test_df["target"].to_numpy())

# Start experimenting

## Model 0: Scikit learn

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score

In [65]:
model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB())
])

model_0.fit(X=train_sentences,
            y=train_labels_encoded)

In [66]:
baseline_preds = model_0.predict(val_sentences)

baseline_validation = (baseline_preds == val_labels_encoded).sum()/len(val_labels_encoded)
print(baseline_validation)

0.7497580533665146


## Model_1: Conv1D

In [67]:
# Preparing the text
sent_length = [len(sentence.split()) for sentence in train_sentences]
avg_sent_length = int(np.mean(sent_length))
percentile_95 = int(np.percentile(sent_length, 95))
print(f"average length = {avg_sent_length} \nlength of 95%  = {percentile_95}")

average length = 26 
length of 95%  = 54


In [68]:
# Text Vectorization
max_tokens = 68000
text_vectorizer = tf.keras.layers.TextVectorization(output_sequence_length=percentile_95, #number of words in vocabulary
                                                    max_tokens=max_tokens)              # Desired length of sentences

# Adapt Vectorizer to training data
text_vectorizer.adapt(train_sentences)

# Proper vocab size
vocab_size = text_vectorizer.vocabulary_size()

# Make the vocabulary
text_vocab_20k = text_vectorizer.get_vocabulary()

# check the configuration of the vectorizer
text_vectorizer.get_config()

# Create an embedding. Vectorizer int to Embedding float
token_embed = tf.keras.layers.Embedding(input_dim=vocab_size, # length of vocabulary
                                        output_dim=128, # usualy works good
                                        mask_zero=True) # Way to handle difference sentence lengths. Removes the zero values of the vectorizer

In [69]:
# Create tensorflow datasets to run faster
train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences, train_labels_oh)).batch(128).cache().prefetch(tf.data.AUTOTUNE)
valid_dataset = tf.data.Dataset.from_tensor_slices((val_sentences, val_labels_oh)).batch(128).cache().prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_labels_oh)).batch(128).cache().prefetch(tf.data.AUTOTUNE)

In [70]:
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = token_embed(x)
x = layers.Conv1D(filters=128, kernel_size=5, activation="relu", padding="same")(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(5, activation="softmax")(x)
model_1 = tf.keras.Model(inputs, outputs)

model_1.compile(loss="categorical_crossentropy",
                optimizer="Adam",
                metrics=["accuracy"])

model_1.fit(x=train_dataset,
            validation_data=valid_dataset,
            epochs=2)

Epoch 1/2




[1m17281/17281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 7ms/step - accuracy: 0.8025 - loss: 0.5481 - val_accuracy: 0.8476 - val_loss: 0.4275
Epoch 2/2
[1m17281/17281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 7ms/step - accuracy: 0.8520 - loss: 0.4191 - val_accuracy: 0.8513 - val_loss: 0.4139


<keras.src.callbacks.history.History at 0x7efe5cb69bd0>

# Model_2: Pretrained embeddings

In [71]:
import tf_keras
 
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=False)
 
model_2 = tf_keras.Sequential([
  sentence_encoder_layer,
  # tf_keras.layers.Dense(256, activation="relu"),
  tf_keras.layers.Dense(128, activation="relu"),
  tf_keras.layers.Dense(64, activation="relu"),
  tf_keras.layers.Dense(5, activation="softmax")
])
    
model_2.compile(loss="categorical_crossentropy",
                optimizer="Adam",
                metrics=["accuracy"])

model_2.fit(batch_size=128,
            epochs=3,
            x=train_dataset,
            validation_data=valid_dataset)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7f004ab80ad0>

## Model_3: Conv1D Character level embeddings

In [101]:
# sentences to characters
def split_chars(text):
    return " ".join(list(text))

# split datasets to letter level
train_chars = [split_chars(sentence) for sentence in train_sentences]
val_chars = [split_chars(sentence) for sentence in val_sentences]
test_chars = [split_chars(sentence) for sentence in test_sentences]

# finding the average length
char_lens = [len(sentence) for sentence in train_sentences]
mean_char_len = np.mean(char_lens)

# find the character length of 95% of sentences
character_length = int(np.percentile(char_lens, 95))

# get all keyboard characters
alphabet = string.ascii_lowercase + string.digits + string.punctuation

# character level vectorizer
num_char_tokens = len(alphabet) +2 #for space and OOV token
char_vectorizer = tf.keras.layers.TextVectorization(max_tokens=num_char_tokens,
                                    output_sequence_length=character_length)

# adapt the vectorizer
char_vectorizer.adapt(train_chars)

# actual size of the vocabulary
char_vocab = char_vectorizer.get_vocabulary()

# character level embedding
char_embed = layers.Embedding(input_dim=len(char_vocab),
                              output_dim=25, # some number he found on the paper
                              mask_zero=False)

In [116]:
inputs = layers.Input(shape=(1,), dtype="string")
char_vectors = char_vectorizer(inputs)
char_embeddings = char_embed(char_vectors)
x = layers.Conv1D(64, kernel_size=10, padding="same", activation="relu")(char_embeddings)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(256, activation="relu")(x)
outputs = layers.Dense(5,activation="softmax")(x)
model_3 = tf.keras.Model(inputs, outputs)

model_3.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.SGD(),
                metrics=["accuracy"])

model_3.fit(validation_data=valid_dataset,
            batch_size=32,
            epochs=3,
            x=train_dataset)

Epoch 1/3
[1m17281/17281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 6ms/step - accuracy: 0.4194 - loss: 1.3242 - val_accuracy: 0.4298 - val_loss: 1.2864
Epoch 2/3
[1m17281/17281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 6ms/step - accuracy: 0.4323 - loss: 1.2827 - val_accuracy: 0.4310 - val_loss: 1.2846
Epoch 3/3
[1m17281/17281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 6ms/step - accuracy: 0.4334 - loss: 1.2806 - val_accuracy: 0.4313 - val_loss: 1.2834


<keras.src.callbacks.history.History at 0x7f000c265290>

# Model_4: Character and token embedding

In [126]:
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=True)

In [132]:
# 1 token model
token_inputs = layers.Input(shape=[], dtype=tf.string, name="token_input")

token_embeddings = sentence_encoder_layer(token_inputs)
token_outputs = layers.Dense(128, activation="relu")(token_embeddings)
token_model = (token_inputs, token_outputs)

# 2 character model
char_inputs = layers.Input(shape=(1,), dtype=tf.string, name="character_input")
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = layers.Bidirectional(layers.LSTM(24))(char_embeddings)
char_model = tf.keras.Model(inputs=char_inputs, outputs=char_bi_lstm)

# 3 concatenate
token_char_concat = layers.Concatenate(name="token_hybrid")([token_model.output,
                                                             char_model.output])

# 4 create output layers
combined_dropout = layers.Dropout(0.5)(token_char_concat)
combined_dense = layers.Dense(128, activation="relu")(combined_dropout)
final_dropout = layers.Dropout(0.5)(combined_dense)
output_layer = layers.Dense(5, activation="softmax")(final_dropout)

# 5 build the model
model_4 = tf.keras.Model(inputs=[token_inputs, char_inputs], 
                         outputs=[token_model.output, char_model.output], name= "model_4")

TypeError: Exception encountered when calling layer 'keras_layer_27' (type KerasLayer).

Binding inputs to tf.function failed due to `A KerasTensor cannot be used as input to a TensorFlow function. A KerasTensor is a symbolic placeholder for a shape and dtype, used when constructing Keras Functional models or Keras Functions. You can only use it as input to a Keras layer or a Keras operation (from the namespaces `keras.layers` and `keras.operations`). You are likely doing something like:

```
x = Input(...)
...
tf_fn(x)  # Invalid.
```

What you should do instead is wrap `tf_fn` in a layer:

```
class MyLayer(Layer):
    def call(self, x):
        return tf_fn(x)

x = MyLayer()(x)
```
`. Received args: (<KerasTensor shape=(None,), dtype=string, sparse=False, name=token_input>,) and kwargs: {} for signature: (inputs: TensorSpec(shape=<unknown>, dtype=tf.string, name=None)).

Call arguments received by layer 'keras_layer_27' (type KerasLayer):
  • inputs=<KerasTensor shape=(None,), dtype=string, sparse=False, name=token_input>
  • training=None

# Evaluation

In [117]:
print(f"model 0= {model_0.score(X=test_sentences, y=test_labels_encoded)}")
print(f"model 1= {model_1.evaluate(test_dataset)}")
print(f"model 2= {model_2.evaluate(test_dataset)}")
print(f"model 3= {model_3.evaluate(test_dataset)}")

model 0= 0.7517377004712983
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8534 - loss: 0.4242
model 1= [0.42994996905326843, 0.850235641002655]
model 2= [0.5383727550506592, 0.7994778156280518]
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.4383 - loss: 1.2764
model 3= [1.2805085182189941, 0.43498456478118896]


# Metrics 

In [76]:
mod_1_preds = model_1.predict(test_dataset)
mod_1_preds = np.argmax(mod_1_preds, axis=1)
mod_1_preds

[1m 68/231[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 2ms/step



[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


array([0, 0, 0, ..., 4, 1, 1])

In [77]:
accuracy = accuracy_score(y_true = val_labels_encoded,y_pred =baseline_preds )
f1 = f1_score(y_true = val_labels_encoded,y_pred =baseline_preds, average="macro")
precision = precision_score(y_true = val_labels_encoded,y_pred =baseline_preds,average="macro")
print(accuracy, f1, precision)

0.7497580533665146 0.6595220846821199 0.7134207274186407


In [78]:
accuracy = accuracy_score(y_true = test_labels_encoded,y_pred =mod_1_preds)
f1 = f1_score(y_true = test_labels_encoded,y_pred =mod_1_preds, average="macro")
precision = precision_score(y_true = test_labels_encoded,y_pred =mod_1_preds,average="macro")
print(accuracy, f1, precision)

0.8502356491370834 0.7870200876397437 0.8126855611836474
