In [35]:
# Binary classifier - neural network edition
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers
from tensorflow import keras

In [36]:
batch_size = 64

raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "datasets/artist",
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=1337,
)
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "datasets/artist",
    batch_size=batch_size,
    validation_split=0.2,
    subset="validation",
    seed=1337,
)

print(
    "Number of batches in raw_train_ds: %d"
    % tf.data.experimental.cardinality(raw_train_ds)
)
print(
    "Number of batches in raw_val_ds: %d" % tf.data.experimental.cardinality(raw_val_ds)
)

Found 79279 files belonging to 2 classes.
Using 63424 files for training.
Found 79279 files belonging to 2 classes.
Using 15855 files for validation.
Number of batches in raw_train_ds: 991
Number of batches in raw_val_ds: 248


In [37]:
# Extract labels from tf.data.Dataset
y_val = np.concatenate([y for _, y in val_vec], axis=0)

neg, pos = np.bincount(y_val)
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

Examples:
    Total: 15855
    Positive: 1805 (11.38% of total)



In [38]:
# Set up initial bias
initial_bias=np.log([pos/neg])
print(initial_bias)

[-2.0520618]


In [39]:
# Model constants
max_features = 500000
embedding_dim = 128
sequence_length = 50

# Vectorization layer
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Function to vectorize text
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [40]:
# Create a model
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
]

def create_model(metrics=METRICS, output_bias=None):
    inputs = tf.keras.Input(shape=(None,), dtype="int64")

    # Add an Embedding layer
    x = layers.Embedding(max_features, embedding_dim)(inputs)
    x = layers.Dropout(0.5)(x)

    # Add Conv1D + global max pooling
    x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
    x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
    x = layers.GlobalMaxPooling1D()(x)

    # Hidden layer
    x = layers.Dense(128, activation="relu")(x)
    x = layers.Dropout(0.5)(x)

    # Prediction layer
    predictions = layers.Dense(1, activation="sigmoid", name="predictions", bias_initializer=output_bias)(x)

    model = tf.keras.Model(inputs, predictions)

    # Compile the model with binary crossentropy loss and an adam optimizer.
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=METRICS)
    
    # Show model summary
    model.summary()
    
    return model

In [41]:
# Vectorize datasets
train_vec = raw_train_ds.map(vectorize_text)
val_vec = raw_val_ds.map(vectorize_text)

In [42]:
# Training parameters
epochs = 3

# Call create_model
model = create_model(output_bias=keras.initializers.Constant(initial_bias[0]))

# Fit the model using the train and test datasets.
model.fit(train_vec, validation_data=val_vec, epochs=epochs)

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_4 (Embedding)      (None, None, 128)         64000000  
_________________________________________________________________
dropout_8 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, None, 128)         114816    
_________________________________________________________________
conv1d_9 (Conv1D)            (None, None, 128)         114816    
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 128)               1651

<tensorflow.python.keras.callbacks.History at 0x7fd94c498ee0>

In [10]:
y_pred = model.predict(val_vec)

In [12]:
# Print max_value of predictions
print(min(y_pred))

[0.09630746]


In [None]:
# Resampled model
