In [None]:
!pip install keras

In [None]:
from google.colab import drive

drive.mount("/content/drive")

import pandas as pd
import numpy as np
import io
from pathlib import Path
import shutil
import urllib
import sys
import zipfile
import json

import tqdm
import random
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import load_model
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import text
from tensorflow.keras.preprocessing import sequence
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    confusion_matrix,
    classification_report,
)
import os
from typing import List, Callable, Dict

# Task 1 Corpus

### Download the corpus

In [None]:
def download_url(download_path: Path, url: str):
    """Download the resource at ``url`` and store it at ``download_path``.

    Args:
        download_path: Destination file for the downloaded content.
        url: Address of the resource to fetch.
    """
    # A bare `import urllib` does not load the `request` submodule, so
    # `urllib.request` is only reachable when some other package happened to
    # import it first. Import it explicitly to avoid an AttributeError.
    import urllib.request

    urllib.request.urlretrieve(url, filename=download_path)

In [None]:
dataset_url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"
dataset_name = "dependency_treebank"

# All corpus files live in a "Datasets" folder inside the working directory.
# `exist_ok=True` keeps the cell safe to re-run without a separate check.
dataset_folder = Path.cwd().joinpath("Datasets")
dataset_folder.mkdir(parents=True, exist_ok=True)

# The archive name is derived from `dataset_name` so the two cannot drift apart.
dataset_zip_path = dataset_folder.joinpath(f"{dataset_name}.zip")
if not dataset_zip_path.exists():
    print("Downloading dataset... ", end="")
    # Download to a temporary name and rename only once the transfer finished:
    # an interrupted download must not be mistaken for a complete archive on
    # the next run.
    partial_zip_path = dataset_zip_path.with_name(dataset_zip_path.name + ".part")
    download_url(url=dataset_url, download_path=partial_zip_path)
    partial_zip_path.replace(dataset_zip_path)
    print("Download complete!")
else:
    print("Dataset already downloaded!")

# The archive is expected to unpack into a folder named after the dataset.
dataset_path = dataset_folder.joinpath(dataset_name)
if not dataset_path.exists():
    print("Extracting dataset... (it may take a while...) ", end="")
    shutil.unpack_archive(dataset_zip_path, dataset_folder)
    print("Extraction completed!")
else:
    print("Dataset already extracted!")

#### Encode the corpus into a pandas DataFrame object

In [None]:
folder = dataset_folder.joinpath(dataset_name)

# The schema is fixed up front so the frame has the expected columns even when
# no sentence is found, and FILE_ID / WORD / TAG can always be unpacked below.
dataframe_columns = ["file_id", "sentence", "tag"]

dataframe_rows = []
for file_path in sorted(folder.glob("*.dp")):
    # The file id is the integer after the first underscore of the file stem.
    # It is identical for every sentence of the file, so compute it once.
    file_id = int(file_path.stem.split("_")[1])

    with file_path.open(mode="r", encoding="utf-8") as text_file:
        # Deliberately not called `text`: that name is bound to the imported
        # `tensorflow.keras.preprocessing.text` module and must not be overwritten.
        raw_text = text_file.read()

    # Sentences are separated by a blank line (\n\n).
    for sentence_block in raw_text.split("\n\n"):
        sentence = []
        tags = []
        # One token per line, columns separated by tabs.
        for line in sentence_block.split("\n"):
            columns = line.split("\t")
            # Keep only lines that carry a word, a tag and at least one more column.
            if len(columns) > 2:
                sentence.append(columns[0])
                tags.append(columns[1])

        # Chunks without any token line (e.g. trailing separators) would
        # otherwise be stored as empty sentences; skip them.
        if not sentence:
            continue

        dataframe_rows.append({"file_id": file_id, "sentence": sentence, "tag": tags})

# Create the dataframe
df = pd.DataFrame(dataframe_rows, columns=dataframe_columns)

FILE_ID, WORD, TAG = df.columns.values

In [None]:
df.head()

#### Splitting Data Train-Test-Validation
Before splitting, the sentences are converted to lower case as a small preprocessing step.

#### Lower Case

In [None]:
### Make a list lowercase
def lowercase_list(input_list):
    """Return a new list containing the lower-cased form of every item."""
    lowered_items = []
    for item in input_list:
        lowered_items.append(item.lower())
    return lowered_items

In [None]:
# Lower-case every word of every sentence; the tag column is left untouched.
df["sentence"] = df["sentence"].map(lowercase_list)

#### Splitting

In [None]:
### file indices for train/validation/test
# `np.arange` excludes its end value: train = files 1-100,
# validation = files 101-150, test = files 151-199.
train_ids = np.arange(1, 101)
val_ids = np.arange(101, 151)
test_ids = np.arange(151, 200)

# Select the rows of each split by file id.
df_train, df_val, df_test = (
    df[df[FILE_ID].isin(split_ids)] for split_ids in (train_ids, val_ids, test_ids)
)

# Task 2 Text encoding

#### Reproducibility

In [None]:
def set_reproducibility(seed):
    """Seed the random number generators used in this notebook.

    Seeds Python's built-in ``random`` module, NumPy and TensorFlow with
    ``seed`` and then sets the ``TF_DETERMINISTIC_OPS`` environment variable
    to ``"1"``.
    """
    # Python built-in, NumPy and TensorFlow generators, in that order.
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)

    # Set an environment variable for deterministic TensorFlow operations
    os.environ["TF_DETERMINISTIC_OPS"] = "1"

#### Hyperparameters for Embedding and First Training Phase

In [None]:
# Sequence length = 99th percentile of the training sentence lengths,
# truncated to an integer.
train_sentence_lengths = [len(seq) for seq in df_train["sentence"]]
max_sequence_length = int(np.quantile(train_sentence_lengths, 0.99))

# NOTE(review): vocab_size and tag_size are hard-coded; presumably they match
# the vocabularies built by the tokenizers further down - confirm.
hparams = dict(
    batch_size=128,
    embedding_dim=100,
    embedding_trainable=False,
    learning_rate=0.005,
    max_sequence_length=max_sequence_length,
    vocab_size=7405,
    tag_size=46,
)

#### Vocabulary Creation & Tokenization
Keras Tokenizer Class is used for tokenization and vocabulary creation.

In [None]:
### Use Keras Tokenizer to create Vocabulary

# One tokenizer for the words (with an explicit out-of-vocabulary token)
# and one for the tags.
tokenizer = Tokenizer(oov_token="OOV")
tag_tokenizer = Tokenizer()

# Both vocabularies are built from the training split only.
tokenizer.fit_on_texts(df_train["sentence"])
tag_tokenizer.fit_on_texts(df_train["tag"])


# Turns texts into padded sequences.
def prep_text(texts, tokenizer, max_sequence_length):
    """Encode ``texts`` with ``tokenizer`` and pad the result at the end.

    Args:
        texts: Input passed to ``tokenizer.texts_to_sequences``.
        tokenizer: Object exposing a ``texts_to_sequences`` method.
        max_sequence_length: Value forwarded as ``maxlen`` to ``pad_sequences``.

    Returns:
        The output of ``sequence.pad_sequences`` called with ``padding="post"``.
    """
    encoded_texts = tokenizer.texts_to_sequences(texts)
    padded_texts = sequence.pad_sequences(
        encoded_texts,
        padding="post",
        maxlen=max_sequence_length,
    )
    return padded_texts


# Encode the words of every split with the word tokenizer ...
text_train, text_test, text_val = (
    prep_text(split["sentence"], tokenizer, hparams["max_sequence_length"])
    for split in (df_train, df_test, df_val)
)

# ... and the tags with the tag tokenizer, padded to the same length.
tag_train, tag_test, tag_val = (
    prep_text(split["tag"], tag_tokenizer, hparams["max_sequence_length"])
    for split in (df_train, df_test, df_val)
)

In [None]:
text_train.shape

#### One-Hot-Encoding
One-hot encoding is applied to the tag sequences so that they can be used in the training and evaluation phases.

In [None]:
from keras.utils import to_categorical

# One class per tag index, plus one extra class for index 0
# (presumably the padding value - TODO confirm).
num_classes = len(tag_tokenizer.word_index) + 1

# One-hot encode the padded tag sequences of every split.
y_train, y_test, y_val = (
    to_categorical(tag_ids, num_classes) for tag_ids in (tag_train, tag_test, tag_val)
)

#### Tag-Tokens


In [None]:
# Every tag known to the tag tokenizer and its integer token.
all_classes = list(tag_tokenizer.word_index.keys())
all_tokens = list(tag_tokenizer.word_index.values())

# Tags treated as punctuation/symbols.
punct_classes = [",", ".", ":", "``", "''", "$", "#", "sym", "-rrb-", "-lrb-"]
# A punctuation tag that never occurs in the training tags has no token.
# Skip it instead of failing with a KeyError.
punct_tokens = [
    tag_tokenizer.word_index[p]
    for p in punct_classes
    if p in tag_tokenizer.word_index
]

# Everything that is not punctuation is kept.
allowed_classes = [
    word for word in tag_tokenizer.index_word.values() if word not in punct_classes
]
allowed_tokens = [token for token in all_tokens if token not in punct_tokens]

print(
    f"Tags: {all_classes}\n"
    + f"All tag-tokens: {all_tokens}\n\n"
    + f"Punctuations: {punct_classes}\n"
    + f"Tokenized punctuations {punct_tokens}\n\n"
    + f"Tags without punctuation: {allowed_classes}\n"
    + f"Tokens will be used in evaluations: {allowed_tokens}"
)

In [None]:
y_train.shape

### Embeddings

#### Downloading Pre-Trained Glove Embeddings
This may take a few minutes to complete.

In [None]:
# A bare `import urllib` does not guarantee that the `request` submodule is loaded.
import urllib.request

zip_file_url = "http://nlp.stanford.edu/data/glove.6B.zip"

# Keep the archive on disk instead of holding the whole download in memory:
# re-running the notebook then reuses the cached file, and no HTTP response
# object is left open. `zip_file` is the path of the cached archive.
zip_file = Path.cwd().joinpath("Datasets", "glove.6B.zip")
zip_file.parent.mkdir(parents=True, exist_ok=True)
if not zip_file.exists():
    # Download under a temporary name and rename afterwards so that an
    # interrupted transfer is never mistaken for a complete archive.
    partial_zip_file = zip_file.with_name(zip_file.name + ".part")
    urllib.request.urlretrieve(zip_file_url, filename=partial_zip_file)
    partial_zip_file.replace(zip_file)

archive = zipfile.ZipFile(zip_file)

#### Creating Embedding Matrix
Downloaded GloVe embeddings were used to create an embedding matrix, where the rows contain the word embeddings for the tokens in the Tokenizer's vocabulary.

In [None]:
glove_file = "glove.6B.100d.txt"

# Map every word of the GloVe file to its vector. Lines are read as bytes:
# the first field is the word, the remaining fields are the coefficients.
embeddings_index = {}
with archive.open(glove_file) as f:
    for line in f:
        fields = line.split()
        glove_word = fields[0].decode("utf-8")
        embeddings_index[glove_word] = np.asarray(fields[1:], dtype="float32")

# Row i holds the vector of the word whose tokenizer index is i.
# Rows of words without a GloVe vector (and row 0) stay all-zero.
matrix_shape = (len(tokenizer.word_index) + 1, hparams["embedding_dim"])
embedding_matrix = np.zeros(matrix_shape)
num_words_in_embedding = 0
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    num_words_in_embedding += 1
    embedding_matrix[i] = embedding_vector

In [None]:
### Inspect tokens' embedding vectors
idx_token = 2
# Rows of `embedding_matrix` are addressed by tokenizer index (see the loop
# that fills it from `tokenizer.word_index.items()`), so the matching word
# must be looked up through `index_word`. Positional indexing into
# `list(word_index.keys())` is 0-based and would show a different word than
# the one the vector belongs to.
print(
    f"Token: {tokenizer.index_word[idx_token]} \nVector: {embedding_matrix[idx_token]}"
)

# [Task 3 - 1.0 points] Model definition

## Model Creation
Keras Subclassing method was used while creating the model.


An instance of the CreateModel class represents the model, and its architecture is specified based on the configuration provided.

The model includes:
* An Embedding Layer
* A Bidirectional LSTM Layer
* Optional Bidirectional LSTM Layer
* Optional dense layer.
* Time-distributed Dense Layer

It generates output sequences using a time-distributed dense layer. The architecture is configured through parameters such as vocabulary size, embedding dimension, LSTM units, and additional layers.

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    LSTM,
    Embedding,
    Dense,
    TimeDistributed,
    Dropout,
    Bidirectional,
    Input,
)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense
import tensorflow as tf


class CreateModel(tf.keras.Model):
    """Sequence tagger: embedding -> BiLSTM -> (BiLSTM) -> (Dense) -> softmax per timestep.

    The architecture is driven by a ``config`` dictionary.

    Required keys:
        vocab_size: Vocabulary size; the embedding layer uses ``vocab_size + 1`` rows.
        embedding_dim: Size of each embedding vector.
        max_sequence_length: Passed as ``input_length`` to the embedding layer.
        embedding_matrix: Initial weights of the embedding layer.
        tag_size: Number of units of the softmax output layer.
        lstm_units: Units of the first bidirectional LSTM.

    Optional keys:
        Additional_LSTM: Add a second bidirectional LSTM (default ``False``).
        add_lstm_units: Units of that second LSTM; required when it is enabled.
        Additional_Dense: Add a ReLU dense layer before the output (default ``False``).
        add_dense_units: Units of that dense layer; required when it is enabled.
        embedding_trainable: Whether the embedding weights are updated during
            training (default ``False``, i.e. frozen).
    """

    def __init__(self, config):
        """Create the layers described by ``config`` (see the class docstring).

        Raises:
            KeyError: If a required key is missing from ``config``.
            ValueError: If an optional layer is enabled without its unit count.
        """
        super().__init__()

        vocab_size = config["vocab_size"]
        embedding_dim = config["embedding_dim"]
        max_sequence_length = config["max_sequence_length"]
        embedding_matrix = config["embedding_matrix"]
        tag_size = config["tag_size"]
        lstm_units = config["lstm_units"]

        # Settings of the optional layers only matter when those layers are
        # enabled, so a config that leaves them out is accepted.
        use_additional_lstm = config.get("Additional_LSTM", False)
        use_additional_dense = config.get("Additional_Dense", False)
        add_lstm_units = config.get("add_lstm_units")
        add_dense_units = config.get("add_dense_units")
        # Frozen by default, which matches the previous hard-coded behaviour.
        embedding_trainable = config.get("embedding_trainable", False)

        # Fail early with a clear message instead of deep inside a Keras layer.
        if use_additional_lstm and add_lstm_units is None:
            raise ValueError("'add_lstm_units' must be set when 'Additional_LSTM' is enabled")
        if use_additional_dense and add_dense_units is None:
            raise ValueError("'add_dense_units' must be set when 'Additional_Dense' is enabled")

        # Embedding layer, initialised from the provided embedding matrix
        self.embedding_layer = Embedding(
            vocab_size + 1,
            embedding_dim,
            input_length=max_sequence_length,
            weights=[embedding_matrix],
            trainable=embedding_trainable,
        )

        # Bidirectional LSTM layer
        self.bi_lstm = Bidirectional(LSTM(lstm_units, return_sequences=True))
        # Additional LSTM (None when disabled)
        self.additional_lstm = (
            Bidirectional(LSTM(add_lstm_units, return_sequences=True))
            if use_additional_lstm
            else None
        )
        # Additional Dense (None when disabled)
        self.additional_dense = (
            Dense(add_dense_units, activation="relu") if use_additional_dense else None
        )

        # Dense output layer, applied to every timestep
        self.dense_output = TimeDistributed(Dense(tag_size, activation="softmax"))

    def call(self, inputs):
        """Run the forward pass and return the output of the softmax layer."""
        x = self.embedding_layer(inputs)
        x = self.bi_lstm(x)

        # Optional layers are stored as None when disabled; compare against
        # None explicitly rather than relying on the truthiness of a layer.
        if self.additional_lstm is not None:
            x = self.additional_lstm(x)

        if self.additional_dense is not None:
            x = self.additional_dense(x)

        outputs = self.dense_output(x)
        return outputs

    def build(self, shape):
        """Return a functional ``tf.keras.Model`` wrapping this model's layers.

        Args:
            shape: Length of the input sequences; the input has shape ``(shape,)``.

        Returns:
            A ``tf.keras.Model`` from a fresh ``Input`` to ``self.call`` of that input.
        """
        # NOTE(review): this replaces the `build` inherited from tf.keras.Model
        # with a different contract (integer argument, returns a new model).
        x = tf.keras.layers.Input(shape=(shape,))
        return tf.keras.Model(inputs=x, outputs=self.call(x))

In [None]:
# Sizes are derived from the objects built earlier instead of being repeated as
# literals, so the model always matches the embedding matrix and the one-hot
# targets:
#   - CreateModel adds 1 to "vocab_size", giving one embedding row per row of
#     `embedding_matrix`;
#   - "tag_size" equals the depth of the one-hot encoded tags (`num_classes`).
config_dict = {
    "vocab_size": embedding_matrix.shape[0] - 1,
    "embedding_dim": embedding_matrix.shape[1],
    "max_sequence_length": max_sequence_length,
    "embedding_matrix": embedding_matrix,
    "tag_size": num_classes,
    "lstm_units": 64,
    "Additional_LSTM": False,
    "Additional_Dense": False,
    "add_lstm_units": None,
    "add_dense_units": None,
}
# Create an instance of the custom model
custom_model = CreateModel(config_dict).build(config_dict["max_sequence_length"])

# Compile the model
custom_model.compile(
    optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
)

# Print model summary

custom_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 56)]              0         
                                                                 
 embedding (Embedding)       (None, 56, 100)           740600    
                                                                 
 bidirectional (Bidirection  (None, 56, 128)           84480     
 al)                                                             
                                                                 
 time_distributed (TimeDist  (None, 56, 46)            5934      
 ributed)                                                        
                                                                 
Total params: 831014 (3.17 MB)
Trainable params: 90414 (353.18 KB)
Non-trainable params: 740600 (2.83 MB)
_________________________________________________________________
