In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Bidirectional, Dense, Dropout, Conv1D, TimeDistributed
import matplotlib.pyplot as plt

from jiahao_funcs import *

# Data and preprocessing

In [None]:
!pip install gdown
!gdown "https://drive.google.com/uc?id=1u2wzXvsuscLeFHwXcDwMDaNDy0u_99-t"
!tar -zxf nlu_ATIS_data.tar.gz

In [None]:
# Read train.csv once and hold out its last VAL_SIZE rows for validation.
# Splitting a single frame guarantees that train and validation never overlap
# and that no row is dropped, whatever the length of the file.
VAL_SIZE = 900
full_train_data = pd.read_csv('./data/train.csv', header=None)
val_data = full_train_data.tail(VAL_SIZE).copy()
train_data = full_train_data.iloc[:-VAL_SIZE].copy()
test_data = pd.read_csv('./data/test.csv', header=None)

print('-------------- Dataset original --------------')
print('Training size:', len(train_data))
print('Validation dataset size:', len(val_data))
print('Test dataset size:', len(test_data))

# `preprocess_entity_recognition` comes from jiahao_funcs; the keys read below
# are the ones this notebook relies on.
data = preprocess_entity_recognition(train_data, val_data, test_data, num_words=500)

print('-------------- Dataset preprocesado entity recognition --------------')
print('Vocab size:', data['vocab_size'])
print('Maxlen:', data['maxlen'])
print('Num classes:', data['num_classes'])
print(data["train_X"].shape, data["train_y"].shape)
print(data["val_X"].shape, data["val_y"].shape)
print(data["test_X"].shape, data["test_y"].shape)


# Embeddings

In [None]:
def model_build(num_classes, vocab_size, maxlen, embedding_dim, num_head = 4, ff_dim = 256):
    """Assemble a Keras ``Sequential`` model with one transformer block.

    Layers, in order: ``TokenAndPositionEmbedding`` -> ``TransformerBlock`` ->
    ``Dense`` softmax. The first two layers are provided by ``jiahao_funcs``;
    their internals are not visible in this notebook.

    Args:
        num_classes: Number of units of the final softmax layer.
        vocab_size: Forwarded to ``TokenAndPositionEmbedding``.
        maxlen: Forwarded to ``TokenAndPositionEmbedding``.
        embedding_dim: Forwarded to both the embedding layer and the
            transformer block.
        num_head: Forwarded to ``TransformerBlock`` (presumably the number of
            attention heads -- confirm in jiahao_funcs).
        ff_dim: Forwarded to ``TransformerBlock`` (presumably the feed-forward
            width -- confirm in jiahao_funcs).

    Returns:
        The ``Sequential`` model; it is not compiled here.
    """
    return Sequential([
        TokenAndPositionEmbedding(maxlen, vocab_size, embedding_dim),
        TransformerBlock(embedding_dim, num_head, ff_dim),
        # Use the `Dense` imported at the top of the notebook: the bare name
        # `keras` is never imported here and would only resolve if the star
        # import from jiahao_funcs happened to leak it.
        Dense(num_classes, activation="softmax"),
    ])
# Run the embedding-size experiment through the jiahao_funcs helper.
# NOTE(review): presumably builds and trains one model per entry of
# `embedding_dims`, `runs` times each -- confirm in jiahao_funcs.
results = provar_embeddings(
    model_build,
    preprocessed_data=data,
    batch_size=32,
    epochs=30,
    embedding_dims=[32, 64, 128, 256, 384, 512],
    patience=5,
    runs=5,
)

# Balanceo de clases
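Usar los pesos de clase al entrenar: `model.fit(class_weight=class_weights)`. Como aquí hay una etiqueta por token, en las celdas siguientes los pesos de clase se convierten en pesos por token (`train_sample_weights`).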

In [None]:
# Per-class weights from the jiahao_funcs helper; the loop below relies on the
# result being a dict (it calls `.items()`).
class_weights = calculate_class_weights(data)

print('Class weights:')
for class_index, weight in class_weights.items():
    rounded_weight = np.round(weight, 2)
    print('{}: {}'.format(class_index, rounded_weight))

In [None]:
def compute_sample_weights_for_sequences(data, class_weights):
    """Expand per-class weights into one weight per token of the training labels.

    Args:
        data: Mapping with ``'train_y'``, an array indexed as
            ``[sample, position, class]`` whose largest entry along the last
            axis marks the label of each token, and ``'maxlen'``, the number
            of positions per sample to weight.
        class_weights: Dict mapping class index to weight. Classes that are
            missing from the dict get a weight of 1.0.

    Returns:
        Float array of shape ``(num_samples, maxlen)`` holding the weight of
        the label found at every position.
    """
    train_y = np.asarray(data['train_y'])
    maxlen = data['maxlen']

    # Label of every token in one vectorised pass. An all-zero row resolves to
    # class 0, because argmax returns the first maximum.
    class_indices = train_y[:, :maxlen, :].argmax(axis=-1)

    # Look each distinct class up once instead of once per token, keeping the
    # dict semantics (including the 1.0 default) of a plain `.get`.
    present_classes, inverse = np.unique(class_indices, return_inverse=True)
    weight_per_class = np.array(
        [class_weights.get(int(class_index), 1.0) for class_index in present_classes],
        dtype=float,
    )

    # The shape of `inverse` differs between NumPy versions; reshape explicitly.
    return weight_per_class[inverse].reshape(class_indices.shape)

# Per-token sample weights for the training split (only the training labels
# are weighted here).
train_sample_weights = compute_sample_weights_for_sequences(
    data=data,
    class_weights=class_weights,
)

print('Shape of train_sample_weights:', train_sample_weights.shape)
print('First 5 sample weights for train_data:')
# Prints the full weight rows of the first five training samples.
print(train_sample_weights[:5])

In [None]:
def model_build(num_classes, vocab_size, maxlen, embedding_dim, num_head = 4, ff_dim = 256):
    """Assemble a Keras ``Sequential`` model with one transformer block.

    NOTE(review): this repeats the ``model_build`` defined in the Embeddings
    section of this notebook; it is kept so this cell can run on its own.

    Layers, in order: ``TokenAndPositionEmbedding`` -> ``TransformerBlock`` ->
    ``Dense`` softmax. The first two layers are provided by ``jiahao_funcs``;
    their internals are not visible in this notebook.

    Args:
        num_classes: Number of units of the final softmax layer.
        vocab_size: Forwarded to ``TokenAndPositionEmbedding``.
        maxlen: Forwarded to ``TokenAndPositionEmbedding``.
        embedding_dim: Forwarded to both the embedding layer and the
            transformer block.
        num_head: Forwarded to ``TransformerBlock`` (presumably the number of
            attention heads -- confirm in jiahao_funcs).
        ff_dim: Forwarded to ``TransformerBlock`` (presumably the feed-forward
            width -- confirm in jiahao_funcs).

    Returns:
        The ``Sequential`` model; it is not compiled here.
    """
    return Sequential([
        TokenAndPositionEmbedding(maxlen, vocab_size, embedding_dim),
        TransformerBlock(embedding_dim, num_head, ff_dim),
        # Use the `Dense` imported at the top of the notebook: the bare name
        # `keras` is never imported here and would only resolve if the star
        # import from jiahao_funcs happened to leak it.
        Dense(num_classes, activation="softmax"),
    ])
# Compare training without sample weights (None) against training with the
# per-token weights computed above, through the jiahao_funcs helper.
# NOTE(review): this rebinds `results`, replacing the value produced by the
# embedding experiment earlier in the notebook.
results = probar_sample_weights(
    model_build,
    preprocessed_data=data,
    batch_size=16,
    epochs=15,
    sample_weights_list=[None, train_sample_weights],
    patience=5,
    runs=5,
)