<a href="https://colab.research.google.com/github/Michael-David-Lam/NLP-Final-Project/blob/main/Experiment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# !pip install datasets
# !pip install gensim
# !pip install --upgrade numpy gensim
# !pip install tensorflow
# !pip install seqeval



Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [1]:
from datasets import load_dataset
import pandas as pd
from gensim.models import Word2Vec
import ast

In [2]:
dataset = load_dataset("surrey-nlp/PLOD-CW-25")

print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/268 [00:00<?, ?B/s]

PLOD-CW-25-Train.parquet:   0%|          | 0.00/343k [00:00<?, ?B/s]

PLOD-CW-25-Test.parquet:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

PLOD-CW-25-Val.parquet:   0%|          | 0.00/35.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/250 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/150 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'pos_tags', 'ner_tags'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['tokens', 'pos_tags', 'ner_tags'],
        num_rows: 250
    })
    validation: Dataset({
        features: ['tokens', 'pos_tags', 'ner_tags'],
        num_rows: 150
    })
})


In [3]:
df_train = pd.DataFrame(dataset["train"])
df_val = pd.DataFrame(dataset["validation"])
df_test = pd.DataFrame(dataset["test"])

In [4]:
#To combine train,test,split for vectorisation to build a rich embedding space

import ast

def safe_parse(col):
    return [ast.literal_eval(row) if isinstance(row, str) else row for row in col]

train_tokens = safe_parse(df_train["tokens"])
val_tokens = safe_parse(df_val["tokens"])
test_tokens = safe_parse(df_test["tokens"])

all_tokens = train_tokens + val_tokens + test_tokens

In [5]:
type(df_train['tokens'][0])

list

In [6]:
#List of Parameters (Just using Lab as reference)
num_features = 300
min_word_count = 1
num_workers = 2
window_size = 3
subsampling = 1e-3

In [7]:
w2v_model = Word2Vec(
    sentences=all_tokens,
    vector_size=num_features,
    window=window_size,
    min_count=min_word_count,
    workers=num_workers,
    sample=subsampling
)

Preparation of data (Creating vocab + embeding matrix from model itself)

In [8]:
from gensim.models import KeyedVectors
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

vocab = w2v_model.wv.index_to_key
word_index = {word: idx + 1 for idx, word in enumerate(vocab)}

embedding_dim = w2v_model.vector_size
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_matrix[i] = w2v_model.wv[word]


In [9]:
all_lengths = [len(seq) for seq in train_tokens + val_tokens + test_tokens]

print(max(all_lengths))
print(np.mean(all_lengths))
print(np.percentile(all_lengths, 90))
#likelihood to chose 76 to get the best balance of computational power + covers 90% of sequence

371
41.17041666666667
76.0


In [10]:
# Encode as Bi-LSTM requires numbers to process
from tensorflow.keras.preprocessing.sequence import pad_sequences

def encode_sentences(token_lists, word_index, max_len):
    sequences = [[word_index.get(token, 0) for token in tokens] for tokens in token_lists]
    return pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")

#using 90th percentile first
max_len = 76

X_train = encode_sentences(train_tokens, word_index, max_len)
X_val = encode_sentences(val_tokens, word_index, max_len)
X_test = encode_sentences(test_tokens, word_index, max_len)

In [11]:
from sklearn.preprocessing import LabelEncoder

all_tags = df_train["ner_tags"].tolist() + df_val["ner_tags"].tolist() + df_test["ner_tags"].tolist()

tag_encoder = LabelEncoder()
tag_encoder.fit([tag for seq in all_tags for tag in seq])
num_classes = len(tag_encoder.classes_)

def encode_tags(tag_lists, max_len):
    encoded = [tag_encoder.transform(tags) for tags in tag_lists]
    padded = pad_sequences(encoded, maxlen=max_len, padding="post", truncating="post", value=-1)  # -1 for masking
    return padded

y_train = encode_tags(df_train["ner_tags"].tolist(), max_len)
y_val = encode_tags(df_val["ner_tags"].tolist(), max_len)
y_test = encode_tags(df_test["ner_tags"].tolist(), max_len)


In [12]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense, Masking
from tensorflow.keras.utils import to_categorical

input = Input(shape=(max_len,))
model = Embedding(input_dim=embedding_matrix.shape[0],
                  output_dim=embedding_matrix.shape[1],
                  weights=[embedding_matrix],
                  input_length=max_len,
                  mask_zero=True,
                  trainable=False)(input)
model = Bidirectional(LSTM(units=64, return_sequences=True))(model)
model = Dense(num_classes, activation="softmax")(model)

model = Model(input, model)
model.summary()




In [13]:
def create_sample_weights(y_padded):
    return (y_padded != -1).astype("float32")

sample_weights_train = create_sample_weights(y_train)
sample_weights_val = create_sample_weights(y_val)

y_train = np.where(y_train == -1, 0, y_train)
y_val = np.where(y_val == -1, 0, y_val)



In [14]:
#Training of model (using Adam as a baseline optimiser)

from tensorflow.keras.optimizers import Adam

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)


history = model.fit(
    X_train, y_train[..., np.newaxis],
    validation_data=(X_val, y_val[..., np.newaxis], sample_weights_val),
    sample_weight=sample_weights_train,
    batch_size=32,
    epochs=5
)


Epoch 1/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 172ms/step - accuracy: 0.4081 - loss: 0.8244 - val_accuracy: 0.3661 - val_loss: 0.7002
Epoch 2/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 167ms/step - accuracy: 0.3731 - loss: 0.6609 - val_accuracy: 0.3856 - val_loss: 0.6009
Epoch 3/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 161ms/step - accuracy: 0.4044 - loss: 0.5805 - val_accuracy: 0.3946 - val_loss: 0.5632
Epoch 4/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 148ms/step - accuracy: 0.4006 - loss: 0.5491 - val_accuracy: 0.3932 - val_loss: 0.5656
Epoch 5/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 167ms/step - accuracy: 0.4002 - loss: 0.5306 - val_accuracy: 0.3983 - val_loss: 0.5282


In [15]:
from seqeval.metrics import classification_report, f1_score
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=-1)

idx2tag = {i: t for i, t in enumerate(tag_encoder.classes_)}

true_labels = [[idx2tag[idx] for idx in row if idx != -1] for row in y_test]
pred_labels = [[idx2tag[idx] for idx in row[:len(true_labels[i])]] for i, row in enumerate(y_pred_labels)]

print("F1 Score:", f1_score(true_labels, pred_labels))
# print(classification_report(true_labels, pred_labels))

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 178ms/step
F1 Score: 0.46535580524344566
