In [98]:
import ast

import numpy as np
import optuna
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import KFold
from tensorflow.keras import Input
from tensorflow.keras import models
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam

In [48]:
# Show the path of the Python interpreter that is running this kernel.
import sys

interpreter_path = sys.executable
print(interpreter_path)

/Users/iangosling/PycharmProjects/Imp ML Course/.venv/bin/python


In [49]:
# Confirm which TensorFlow version the kernel has loaded.
# `tf` comes from `import tensorflow as tf` in the notebook's import cell.
print(tf.__version__)

2.16.2


Most of this code comes from Google's machine learning guide on text classification: https://developers.google.com/machine-learning/guides/text-classification. It has been adapted to use k-fold cross-validation, which gives a more consistent training result given that the volume of training data is low.

### Step 1 - Import Train and Test Data and convert to required data type

In [138]:
def _load_preprocessed_texts(csv_path):
    """Read a CSV and turn its 'preprocessed_text' column into plain strings.

    Each cell of 'preprocessed_text' is parsed with `ast.literal_eval`
    (the CSV stores the list as text) and its items are joined with spaces.

    # Arguments
        csv_path: str, path of the CSV file to read.

    # Returns
        frame: the loaded DataFrame with 'preprocessed_text' rewritten as strings.
        texts: NumPy array of the joined strings, one per row.
    """
    frame = pd.read_csv(csv_path)
    parsed_lists = frame['preprocessed_text'].apply(ast.literal_eval)
    frame['preprocessed_text'] = parsed_lists.apply(' '.join)
    return frame, frame['preprocessed_text'].to_numpy()


# Training set: DataFrame plus array of joined text strings.
train_data, train_texts = _load_preprocessed_texts("train_data.csv")

# Test set: same conversion as the training set.
test_data, test_texts = _load_preprocessed_texts("test_data.csv")


In [166]:
# Build a label -> integer lookup so predictions can be converted back to labels at a later date
def create_label_dict(df, col_name):
    """Map each distinct value of a column to an integer code.

    Codes start at 0 and follow the order in which values first appear
    in the column.

    # Arguments
        df: DataFrame holding the column.
        col_name: str, name of the column to encode.

    # Returns
        dict mapping each distinct column value to its integer code.
    """
    distinct_labels = df[col_name].unique()
    return dict(zip(distinct_labels, range(len(distinct_labels))))

# The mapping is derived from the training labels only.
label_dict = create_label_dict(df=train_data, col_name='clause_type')

# Convert the text labels to their integer codes (0-based, stored as float32) using the dictionary
def transform_column(df, col_name, mapping_dict):
    """Encode a label column with a value -> code mapping.

    # Arguments
        df: DataFrame holding the column.
        col_name: str, name of the column to encode.
        mapping_dict: dict mapping each expected column value to a numeric code.

    # Returns
        np.ndarray of dtype float32 with one code per row.

    # Raises
        ValueError: if the column contains a value (or a missing entry) that
            has no code in `mapping_dict`. Without this check such rows would
            silently become NaN labels.
    """
    source = df[col_name]
    encoded = source.map(mapping_dict)

    # `Series.map` yields NaN for anything absent from the mapping.
    unmapped = encoded.isna().to_numpy()
    if unmapped.any():
        unknown = sorted(set(map(str, source[unmapped].unique())))
        raise ValueError(
            f"Column {col_name!r} contains values with no entry in the mapping: {unknown}"
        )

    return encoded.values.astype(np.float32)

# Encode train and test labels with the same mapping so their codes agree.
train_labels, test_labels = (
    transform_column(frame, 'clause_type', label_dict)
    for frame in (train_data, test_data)
)


### Step 2 - Function for Vectorisation

In [126]:
# Vectorization parameters
# Range (inclusive) of n-gram sizes for tokenizing text: (1, 2) means unigrams and bigrams.
NGRAM_RANGE = (1, 2)
# Upper limit on the number of features kept by feature selection. We use the top 20K features.
TOP_K = 20000
# Whether text should be split into word or character n-grams. One of 'word', 'char'.
TOKEN_MODE = 'word'
# Minimum document/corpus frequency below which a token will be discarded
# (passed to the vectorizer as `min_df`).
MIN_DOCUMENT_FREQUENCY = 2

def ngram_vectorize(x_train_fold, y_train_fold, x_val_fold):
    """Vectorizes texts as tf-idf weighted n-gram vectors.

    1 text = 1 tf-idf vector the length of vocabulary of unigrams + bigrams,
    reduced to at most TOP_K features chosen by their `f_classif` score.
    The vocabulary and the feature selection are fitted on the training
    fold only and then applied unchanged to the validation fold.

    # Arguments
        x_train_fold: list, training text strings.
        y_train_fold: np.ndarray, training labels (integers).
        x_val_fold: list, validation text strings.

    # Returns
        x_train: vectorized training texts (float32).
        x_val: vectorized validation texts (float32).
    """
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
        'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
        # TfidfVectorizer only accepts float dtypes. The former 'int32' made
        # scikit-learn emit a UserWarning and fall back to float64, so ask for
        # float64 directly: same numbers, no warning.
        'dtype': np.float64,
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': TOKEN_MODE,  # Split text into word tokens.
        'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train_tfidf = vectorizer.fit_transform(x_train_fold).astype(np.float32)

    # Vectorize validation texts with the vocabulary learned above.
    x_val_tfidf = vectorizer.transform(x_val_fold).astype(np.float32)

    # Select top 'k' of the vectorized features; k cannot exceed the
    # vocabulary size actually found in this fold.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train_tfidf.shape[1]))
    selector.fit(x_train_tfidf, y_train_fold)
    x_train = selector.transform(x_train_tfidf).astype('float32')
    x_val = selector.transform(x_val_tfidf).astype('float32')

    # Only the vectorized texts are returned; the labels are left untouched
    # with the caller.
    return x_train, x_val

### Step 3 - Define Function to create last layer according to data parameters

In [54]:
def _get_last_layer_units_and_activation(num_classes):
    """Gets the # units and activation function for the last network layer.
    # Arguments
        num_classes: int, number of classes.
    # Returns
        units, activation values.
    """
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation

### Step 4 - Define function to build simple multi-layer perceptron  

In [165]:
def mlp_model(layers, units, dropout_rate, input_shape, num_classes):
    """Creates an instance of a multi-layer perceptron model.

    The network is: input -> Dropout -> (layers - 1) x [Dense(relu) -> Dropout]
    -> output Dense layer sized by `_get_last_layer_units_and_activation`.

    # Arguments
        layers: int, number of `Dense` layers in the model (including the
            output layer).
        units: int, output dimension of the hidden layers.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        input_shape: tuple, shape of input to the model.
        num_classes: int, number of output classes.

    # Returns
        An MLP model instance (not compiled).
    """
    op_units, op_activation = _get_last_layer_units_and_activation(num_classes)
    model = models.Sequential()

    # Declare the input explicitly. Passing `input_shape` to the first layer
    # is deprecated in Keras 3 and emits a UserWarning on every model build.
    model.add(Input(shape=input_shape))
    model.add(Dropout(rate=dropout_rate))

    # Hidden layers: the output layer below counts as one of the `layers`.
    for _ in range(layers - 1):
        model.add(Dense(units=units, activation='relu'))
        model.add(Dropout(rate=dropout_rate))

    model.add(Dense(units=op_units, activation=op_activation))

    return model

### Step 5 - Create a function to train the n_gram model

In [163]:
# train model with kfolds validation
def train_ngram_model_kfolds(data,k=5,
                      learning_rate=1e-3,
                      epochs=1000,
                      batch_size=128,
                      layers=2,
                      units=64,
                      dropout_rate=0.2,
                      num_classes=10,
                      model_path='Clause_class_model.keras'):

    """Trains n-gram model on the given dataset using k-fold cross-validation.

    For every fold a fresh vectorizer and a fresh model are fitted on the
    training part and validated on the held-out part. The validation accuracy
    and loss of the last epoch run in each fold are averaged over the folds.
    Because early stopping uses patience=2 without restoring weights, that
    last epoch is not necessarily the best epoch.

    Note: the model written to `model_path` is the one from the final fold
    only, i.e. it has been trained on k-1 of the k folds.

    # Arguments
        data: tuple of (training texts, training labels); both must support
            NumPy-style indexing with an index array.
        k: no of folds
        learning_rate: float, learning rate for training model.
        epochs: int, maximum number of epochs per fold.
        batch_size: int, number of samples per batch.
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of Dense layers in the model.
        dropout_rate: float: percentage of input to drop at Dropout layers.
        num_classes: int, number of output classes.
        model_path: str or None, where to save the final fold's model.
            Pass None to skip saving.

    # Returns
        avg_val_accuracy, avg_val_loss: mean over folds of the last-epoch
        validation accuracy and validation loss.
    """
    # Get the data.
    train_texts, train_labels = data

    # Prepare k-fold cross-validation
    kfold = KFold(n_splits=k, shuffle=True, random_state=42)

    # The loss has to match the output layer built by `mlp_model`: a single
    # sigmoid unit for two classes, a softmax over classes otherwise.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'

    # Store validation results for each fold
    val_accuracies = []
    val_losses = []

    # Perform K-Fold Cross-Validation
    for train_idx, val_idx in kfold.split(train_texts):
        # Split data into training and validation for this fold
        x_train_fold = train_texts[train_idx]
        y_train_fold = train_labels[train_idx]
        x_val_fold = train_texts[val_idx]
        y_val_fold = train_labels[val_idx]

        # Vectorize texts (fitted on this fold's training part only).
        x_train, x_val = ngram_vectorize(x_train_fold, y_train_fold, x_val_fold)

        # Create model instance.
        model = mlp_model(layers=layers,
                          units=units,
                          dropout_rate=dropout_rate,
                          input_shape=x_train.shape[1:],
                          num_classes=num_classes)

        optimizer = Adam(learning_rate=learning_rate)
        model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

        # Create callback for early stopping on validation loss. If the loss does not decrease in two consecutive tries, stop training.
        callbacks = [EarlyStopping(monitor='val_loss', patience=2)]

        # Train and validate the model on this fold
        history = model.fit(
            x_train,
            y_train_fold,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=(x_val, y_val_fold),
            verbose=2,
            batch_size=batch_size
        )
        # Collect the last validation accuracy and loss
        history = history.history
        val_accuracies.append(history['val_accuracy'][-1])
        val_losses.append(history['val_loss'][-1])

    # Compute average validation accuracy and loss
    avg_val_accuracy = np.mean(val_accuracies)
    avg_val_loss = np.mean(val_losses)

    # Print results.
    print('Average Validation Accuracy: {:.4f}, Average Loss: {:.4f}'.format(avg_val_accuracy, avg_val_loss))

    # Save model (the one trained in the last fold).
    if model_path is not None:
        model.save(model_path)

    return avg_val_accuracy, avg_val_loss

### Step 6 - Build Model and train model

In [164]:
# Bundle texts and labels into the tuple the training function expects,
# then run k-fold training with the default hyperparameters.
data = train_texts, train_labels
train_ngram_model_kfolds(data=data)



10 softmax
Epoch 1/1000


  super().__init__(**kwargs)


11/11 - 0s - 43ms/step - accuracy: 0.5834 - loss: 2.2311 - val_accuracy: 0.7988 - val_loss: 2.0974
Epoch 2/1000
11/11 - 0s - 10ms/step - accuracy: 0.8728 - loss: 1.9840 - val_accuracy: 0.8142 - val_loss: 1.8538
Epoch 3/1000
11/11 - 0s - 11ms/step - accuracy: 0.8999 - loss: 1.7105 - val_accuracy: 0.8359 - val_loss: 1.6045
Epoch 4/1000
11/11 - 0s - 10ms/step - accuracy: 0.9209 - loss: 1.4495 - val_accuracy: 0.8452 - val_loss: 1.3685
Epoch 5/1000
11/11 - 0s - 11ms/step - accuracy: 0.9255 - loss: 1.2014 - val_accuracy: 0.8576 - val_loss: 1.1532
Epoch 6/1000
11/11 - 0s - 11ms/step - accuracy: 0.9325 - loss: 0.9936 - val_accuracy: 0.8576 - val_loss: 0.9798
Epoch 7/1000
11/11 - 0s - 10ms/step - accuracy: 0.9325 - loss: 0.8166 - val_accuracy: 0.8545 - val_loss: 0.8440
Epoch 8/1000
11/11 - 0s - 11ms/step - accuracy: 0.9426 - loss: 0.6811 - val_accuracy: 0.8545 - val_loss: 0.7399
Epoch 9/1000
11/11 - 0s - 10ms/step - accuracy: 0.9403 - loss: 0.5774 - val_accuracy: 0.8576 - val_loss: 0.6649
Epoch



10 softmax
Epoch 1/1000


  super().__init__(**kwargs)


11/11 - 0s - 41ms/step - accuracy: 0.5834 - loss: 2.2359 - val_accuracy: 0.8050 - val_loss: 2.1075
Epoch 2/1000
11/11 - 0s - 11ms/step - accuracy: 0.8852 - loss: 1.9914 - val_accuracy: 0.8452 - val_loss: 1.8624
Epoch 3/1000
11/11 - 0s - 11ms/step - accuracy: 0.9092 - loss: 1.7238 - val_accuracy: 0.8514 - val_loss: 1.6111
Epoch 4/1000
11/11 - 0s - 11ms/step - accuracy: 0.9154 - loss: 1.4486 - val_accuracy: 0.8669 - val_loss: 1.3679
Epoch 5/1000
11/11 - 0s - 11ms/step - accuracy: 0.9209 - loss: 1.1911 - val_accuracy: 0.8607 - val_loss: 1.1527
Epoch 6/1000
11/11 - 0s - 11ms/step - accuracy: 0.9317 - loss: 0.9769 - val_accuracy: 0.8607 - val_loss: 0.9779
Epoch 7/1000
11/11 - 0s - 11ms/step - accuracy: 0.9333 - loss: 0.7983 - val_accuracy: 0.8700 - val_loss: 0.8444
Epoch 8/1000
11/11 - 0s - 11ms/step - accuracy: 0.9379 - loss: 0.6649 - val_accuracy: 0.8793 - val_loss: 0.7422
Epoch 9/1000
11/11 - 0s - 11ms/step - accuracy: 0.9496 - loss: 0.5606 - val_accuracy: 0.8885 - val_loss: 0.6648
Epoch



10 softmax
Epoch 1/1000


  super().__init__(**kwargs)


11/11 - 0s - 40ms/step - accuracy: 0.6109 - loss: 2.2347 - val_accuracy: 0.8509 - val_loss: 2.1018
Epoch 2/1000
11/11 - 0s - 11ms/step - accuracy: 0.8837 - loss: 1.9888 - val_accuracy: 0.8385 - val_loss: 1.8527
Epoch 3/1000
11/11 - 0s - 11ms/step - accuracy: 0.8899 - loss: 1.7106 - val_accuracy: 0.8447 - val_loss: 1.5967
Epoch 4/1000
11/11 - 0s - 11ms/step - accuracy: 0.9078 - loss: 1.4395 - val_accuracy: 0.8540 - val_loss: 1.3518
Epoch 5/1000
11/11 - 0s - 11ms/step - accuracy: 0.9155 - loss: 1.1845 - val_accuracy: 0.8571 - val_loss: 1.1367
Epoch 6/1000
11/11 - 0s - 11ms/step - accuracy: 0.9295 - loss: 0.9797 - val_accuracy: 0.8602 - val_loss: 0.9599
Epoch 7/1000
11/11 - 0s - 11ms/step - accuracy: 0.9380 - loss: 0.7953 - val_accuracy: 0.8665 - val_loss: 0.8232
Epoch 8/1000
11/11 - 0s - 11ms/step - accuracy: 0.9426 - loss: 0.6613 - val_accuracy: 0.8602 - val_loss: 0.7209
Epoch 9/1000
11/11 - 0s - 14ms/step - accuracy: 0.9395 - loss: 0.5551 - val_accuracy: 0.8540 - val_loss: 0.6444
Epoch



10 softmax
Epoch 1/1000


  super().__init__(**kwargs)


11/11 - 0s - 40ms/step - accuracy: 0.6209 - loss: 2.2340 - val_accuracy: 0.7919 - val_loss: 2.1110
Epoch 2/1000
11/11 - 0s - 10ms/step - accuracy: 0.8837 - loss: 1.9865 - val_accuracy: 0.7950 - val_loss: 1.8772
Epoch 3/1000
11/11 - 0s - 11ms/step - accuracy: 0.8860 - loss: 1.7176 - val_accuracy: 0.7919 - val_loss: 1.6387
Epoch 4/1000
11/11 - 0s - 10ms/step - accuracy: 0.9163 - loss: 1.4465 - val_accuracy: 0.7857 - val_loss: 1.4083
Epoch 5/1000
11/11 - 0s - 11ms/step - accuracy: 0.9248 - loss: 1.1947 - val_accuracy: 0.7981 - val_loss: 1.2067
Epoch 6/1000
11/11 - 0s - 11ms/step - accuracy: 0.9256 - loss: 0.9833 - val_accuracy: 0.8199 - val_loss: 1.0415
Epoch 7/1000
11/11 - 0s - 11ms/step - accuracy: 0.9341 - loss: 0.8084 - val_accuracy: 0.8354 - val_loss: 0.9117
Epoch 8/1000
11/11 - 0s - 11ms/step - accuracy: 0.9457 - loss: 0.6776 - val_accuracy: 0.8447 - val_loss: 0.8120
Epoch 9/1000
11/11 - 0s - 10ms/step - accuracy: 0.9388 - loss: 0.5716 - val_accuracy: 0.8447 - val_loss: 0.7351
Epoch



10 softmax
Epoch 1/1000


  super().__init__(**kwargs)


11/11 - 0s - 40ms/step - accuracy: 0.5783 - loss: 2.2374 - val_accuracy: 0.8540 - val_loss: 2.1051
Epoch 2/1000
11/11 - 0s - 11ms/step - accuracy: 0.8581 - loss: 1.9997 - val_accuracy: 0.8634 - val_loss: 1.8628
Epoch 3/1000
11/11 - 0s - 10ms/step - accuracy: 0.8845 - loss: 1.7361 - val_accuracy: 0.8571 - val_loss: 1.6166
Epoch 4/1000
11/11 - 0s - 11ms/step - accuracy: 0.9093 - loss: 1.4781 - val_accuracy: 0.8634 - val_loss: 1.3798
Epoch 5/1000
11/11 - 0s - 10ms/step - accuracy: 0.9240 - loss: 1.2275 - val_accuracy: 0.8665 - val_loss: 1.1716
Epoch 6/1000
11/11 - 0s - 11ms/step - accuracy: 0.9240 - loss: 1.0187 - val_accuracy: 0.8727 - val_loss: 0.9968
Epoch 7/1000
11/11 - 0s - 11ms/step - accuracy: 0.9341 - loss: 0.8452 - val_accuracy: 0.8696 - val_loss: 0.8577
Epoch 8/1000
11/11 - 0s - 11ms/step - accuracy: 0.9395 - loss: 0.7119 - val_accuracy: 0.8789 - val_loss: 0.7529
Epoch 9/1000
11/11 - 0s - 11ms/step - accuracy: 0.9411 - loss: 0.5999 - val_accuracy: 0.8727 - val_loss: 0.6684
Epoch

(0.8653904676437378, 0.39823356866836546)

### Step 8 - Tune Hyperparameters

### Step 10 - Test Model
