In [101]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import tensorflow as tf
from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout
import ast

Most of this code comes from Googles Training course on text classification:  https://developers.google.com/machine-learning/guides/text-classification

### Step 1 - Import Train and Test Data and convert to required data type

In [51]:
# Load training set from CSV, convert CSV text to list and join into a single string
train_data = pd.read_csv("train_data.csv")
train_data['preprocessed_text'] = train_data['preprocessed_text'].apply(ast.literal_eval)
train_data['preprocessed_text'] = train_data['preprocessed_text'].apply(' '.join)
train_texts = train_data['preprocessed_text'].to_list()
train_labels = train_data['clause_type'].to_numpy()

# Load test set from CSV, convert CSV text to list and join into a single string
test_data = pd.read_csv("test_data.csv")
test_data['preprocessed_text'] = test_data['preprocessed_text'].apply(ast.literal_eval)
test_data['preprocessed_text'] = test_data['preprocessed_text'].apply(' '.join)
val_texts = test_data['preprocessed_text'].to_list()
val_labels = test_data['clause_type'].to_numpy()


In [62]:
unique, counts = np.unique(train_labels, return_counts=True)
for u, c in zip(unique, counts):
    print(f"Element: {u}, Count: {c}")

Element: arbitration, Count: 178
Element: capitalization, Count: 146
Element: confidential-information, Count: 182
Element: confidentiality, Count: 159
Element: contribution, Count: 132
Element: indemnification, Count: 151
Element: indemnification-and-contribution, Count: 140
Element: indemnification-by-the-company, Count: 184
Element: participations, Count: 159
Element: payment-of-expenses, Count: 181


In [61]:
unique, counts = np.unique(val_labels, return_counts=True)
for u, c in zip(unique, counts):
    print(f"Element: {u}, Count: {c}")

Element: arbitration, Count: 62
Element: capitalization, Count: 54
Element: confidential-information, Count: 58
Element: confidentiality, Count: 61
Element: contribution, Count: 48
Element: indemnification, Count: 59
Element: indemnification-and-contribution, Count: 40
Element: indemnification-by-the-company, Count: 46
Element: participations, Count: 51
Element: payment-of-expenses, Count: 59


TypeError: can only concatenate str (not "int") to str

### Step 2 - Function for Vectorisation

In [52]:
# Vectorization parameters
# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (1, 2)
# Limit on the number of features. We use the top 20K features.
TOP_K = 20000
# Whether text should be split into word or character n-grams. One of 'word', 'char'.
TOKEN_MODE = 'word'
# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

def ngram_vectorize(train_texts, train_labels, val_texts):
    """Vectorizes texts as n-gram vectors.

    1 text = 1 tf-idf vector the length of vocabulary of unigrams + bigrams.

    # Arguments
        train_texts: list, training text strings.
        train_labels: np.ndarray, training labels.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val: vectorized training and validation texts
    """
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
            'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,  # Split text into word tokens.
            'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts).astype(np.float32)

    # Vectorize validation texts.
    x_val = vectorizer.transform(val_texts).astype(np.float32)

    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val

### Step 3 - Define Function to create last layer according to data parameters

In [6]:
def _get_last_layer_units_and_activation(num_classes):
    """Gets the # units and activation function for the last network layer.
    # Arguments
        num_classes: int, number of classes.
    # Returns
        units, activation values.
    """
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation

### Step 4 - Define function to build simple multi-layer perceptron  

In [9]:
def mlp_model(layers, units, dropout_rate, input_shape, num_classes):
    """Creates an instance of a multi-layer perceptron model.

    # Arguments
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of the layers.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        input_shape: tuple, shape of input to the model.
        num_classes: int, number of output classes.

    # Returns
        An MLP model instance.
    """
    op_units, op_activation = _get_last_layer_units_and_activation(num_classes)
    model = models.Sequential()
    model.add(Dropout(rate=dropout_rate, input_shape=input_shape))

    for _ in range(layers-1):
        model.add(Dense(units=units, activation='relu'))
        model.add(Dropout(rate=dropout_rate))

    model.add(Dense(units=op_units, activation=op_activation))
    return model

### Step 5 - Create a function to train the n_gram model

In [84]:
def train_ngram_model(data,
                      learning_rate=1e-3,
                      epochs=1000,
                      batch_size=128,
                      layers=2,
                      units=64,
                      dropout_rate=0.2):
    """Trains n-gram model on the given dataset.

    # Arguments
        data: tuples of training and test texts and labels.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of Dense layers in the model.
        dropout_rate: float: percentage of input to drop at Dropout layers.
    """
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

        # Vectorize texts.
    x_train, x_val = ngram_vectorize(train_texts, train_labels, val_texts)

    # Create model instance.
    model = mlp_model(layers=layers,
                      units=units,
                      dropout_rate=dropout_rate,
                      input_shape=x_train.shape[1:],
                      num_classes=num_classes)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)]

    # Train and validate model.
    history = model.fit(
            x_train,
            train_labels,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=(x_val, val_labels),
            verbose=2,  # Logs once per epoch.
            batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
            acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('IMDb_mlp_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]

### Step 6 - Vectorise data

In [85]:
ngram_vectorize(train_texts, train_labels, val_texts)



(<1612x17581 sparse matrix of type '<class 'numpy.float32'>'
 	with 233016 stored elements in Compressed Sparse Row format>,
 <538x17581 sparse matrix of type '<class 'numpy.float32'>'
 	with 71621 stored elements in Compressed Sparse Row format>)

### Step 7 - Build Model and train model

In [102]:
print(tf.__version__)

2.12.0


In [86]:
num_classes = 10
data = ((train_texts, train_labels), (val_texts, val_labels))
train_ngram_model(data)



ValueError: Could not interpret optimizer identifier: <keras.optimizers.legacy.adam.Adam object at 0x16dcd8050>

### Step 8 - Tune Hyperparameters

### Step 10 - Test Model
