In [91]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.metrics import f1_score

print(tf.__version__)

1.12.0


In [None]:
## Import Data
GENRES_IDX = {"blues":0, "gospel":1, "rap":2, "country":3, "rock":4}
GENRE_NAMES = ["blues", "gospel", "rap", "country", "rock"]

NON_SPARSE_FEATS = ['n_wrds', 'avg_wrd_len',
       'n_lines', 'avg_line_len', 'n_contractions', 'contraction_density',
       'vocab_size', 'edge_density',
       'edge_density_weighted', 'edge_weight_var', 'degree_var',
       'degree_var_weighted', 'degree_avg', 'degree_avg_weighted',
       'comp_size_avg']

SPARSE_FEATS = ['word2vec_avg', 'topk']

N_NONSPARSE = len(NON_SPARSE_FEATS)
N_SPARSE = len(SPARSE_FEATS)

def df_to_arr(df):
    x = df[NON_SPARSE_FEATS]
    x_arr = x.values
    
    if N_SPARSE > 0:
        y_arr_list = []
        for sparse_feat in SPARSE_FEATS:
            y = df[sparse_feat]
            y_list = [list(y.values[i]) for i in range(y.values.shape[0])]
            y_arr = np.array(y_list)
            y_arr_list.append(y_arr)

        y_arr = y_arr_list[0]
        for i in range(len(y_arr_list) - 1):
            y_arr = np.concatenate((y_arr, y_arr_list[i+1]), axis=1)
    
    # Use one hot encoding labels
    kron = np.eye(5)
    labels = df["genre"].values
    labels_one_hot = np.array([kron[:, GENRES_IDX[genre]] for genre in labels])
    
    # Use integer labels
    labels_sparse = np.array([GENRES_IDX[genre] for genre in labels])
    
    if N_SPARSE > 0:
        data = np.concatenate((x_arr, y_arr), axis=1)
    else:
        data = x_arr
    return data, labels_sparse

def read_data(fn):
    df = pd.read_pickle(fn)
    df.fillna(0, inplace=True)
    df_train = df.query("data_split == 'train'").copy()
    df_test = df.query("data_split == 'test'").copy()
    df_val = df.query("data_split == 'val'").copy()
    
    lyrics_train = np.array(df_train["lyrics_stripped"].values)
    lyrics_test = np.array(df_test["lyrics_stripped"].values)
    lyrics_val = np.array(df_val["lyrics_stripped"].values)
    
    X_train, Y_train = df_to_arr(df_train)
    X_val, Y_val = df_to_arr(df_val)
    X_test, Y_test = df_to_arr(df_test)
    
    if N_NONSPARSE > 0:
        ## Standardize Data
        scaler = preprocessing.StandardScaler().fit(X_train[:, :N_NONSPARSE])
        X_train[:, :len(NON_SPARSE_FEATS)] = scaler.transform(X_train[:, :N_NONSPARSE])
        X_val[:, :len(NON_SPARSE_FEATS)] = scaler.transform(X_val[:, :N_NONSPARSE])
        X_test[:, :len(NON_SPARSE_FEATS)] = scaler.transform(X_test[:, :N_NONSPARSE])
    if N_SPARSE > 0:
        scaler_sparse = preprocessing.MinMaxScaler().fit(X_train[:, N_NONSPARSE:])
        X_train[:, len(NON_SPARSE_FEATS):] = scaler_sparse.transform(X_train[:, N_NONSPARSE:])
        X_val[:, len(NON_SPARSE_FEATS):] = scaler_sparse.transform(X_val[:, N_NONSPARSE:])
        X_test[:, len(NON_SPARSE_FEATS):] = scaler_sparse.transform(X_test[:, N_NONSPARSE:])
    
    return X_train, Y_train, X_val, Y_val, X_test, Y_test, lyrics_train, lyrics_val, lyrics_test

ALL_DATA_FN = "all.data"
X_train, Y_train, X_val, Y_val, X_test, Y_test,  lyrics_train, lyrics_val, lyrics_test = read_data(ALL_DATA_FN)
DIM = X_train.shape[1]

In [None]:
# Shuffle X_train and Y_train

N = X_train.shape[0]
idx = np.array([i for i in range(N)])
np.random.shuffle(idx)
X_train = X_train[idx]
Y_train = Y_train[idx]

In [None]:
# Calculate the gradients of each output node w.r.t. each input feature

inputs = tf.keras.Input(shape=(DIM,))  # Returns a placeholder tensor

# A layer instance is callable on a tensor, and returns a tensor.
x = keras.layers.Dense(200, activation=tf.nn.relu)(inputs)
predictions = keras.layers.Dense(5, activation=tf.nn.softmax)(x)
model = tf.keras.Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=tf.train.AdamOptimizer(), 
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

callbacks = [
  # Interrupt training if `val_loss` stops improving for over 2 epochs
  tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_loss'),
  # Write TensorBoard logs to `./logs` directory
  tf.keras.callbacks.TensorBoard(log_dir='./logs')
]

model.fit(X_train, Y_train,
          epochs=100,
          validation_data = (X_val, Y_val), 
          batch_size=32,
          callbacks=callbacks
         )

# Evaluate Model
model.evaluate(X_test, Y_test, batch_size=32)

In [None]:
X = X_val
Y = Y_val
lyrics = lyrics_val

Y_hat = model.predict(X)
Y_pred = np.argmax(Y_hat, axis=1)

In [None]:
def plot_value_array(i, predictions_array, true_label):
    predictions_array, true_label = predictions_array[i], true_label[i]
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    thisplot = plt.bar(range(5), predictions_array, color="#777777")
    plt.ylim([0, 1]) 
    predicted_label = np.argmax(predictions_array)

    thisplot[predicted_label].set_color('red')
    thisplot[true_label].set_color('blue')
    
i = 4
plt.figure(figsize=(10,5))
plt.subplot(1,2,1)
plot_value_array(i, Y_hat,  Y)
_ = plt.xticks(range(5), GENRE_NAMES, rotation=45)

In [None]:
### DISPLAY PERFORMANCE STATISTICS

## Adapted from https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()


# Compute confusion matrix
cnf_matrix = confusion_matrix(Y, Y_pred)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=GENRE_NAMES,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=GENRE_NAMES, normalize=True,
                      title='Normalized confusion matrix')

plt.show()

# Print F1 Scores
print("F1 Scores:")
print(GENRE_NAMES)
f1_score(Y, Y_pred, average=None)


In [None]:
# Calculate Gradients
def calculate_gradients():
    output_tens = predictions
    input_tens = inputs

    grad = [tf.gradients(output_tens[i], input_tens) for i in range(output_tens.shape[1])]

    sess = tf.keras.backend.get_session()
    grad_val = np.squeeze(np.array(sess.run(grad, feed_dict={input_tens: X})))
    return grad_val

grad_val = calculate_gradients()

In [90]:

def get_activations(label_true, label_pred):
    pred_idx = set(np.squeeze(np.argwhere(Y_pred == GENRES_IDX[label_pred])))
    true_idx = set(np.squeeze(np.argwhere(Y == GENRES_IDX[label_true])))
    target_idx = pred_idx.intersection(true_idx)
    return target_idx, np.average(grad_val[:, list(target_idx), :], axis=1)

label_true = "rock"
label_pred = "gospel"
for label_true in GENRE_NAMES:
    for label_pred in GENRE_NAMES:
        print("True label: "+ label_true)
        print("Predicted label: "+ label_pred)
        target_idx, activations = get_activations(label_true, label_pred)
        print(activations)




True label: blues
Predicted label: blues
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
True label: blues
Predicted label: gospel
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
True label: blues
Predicted label: rap
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
True label: blues
Predicted label: country
[[ 0.00e+00  0.00e+00  0.00e+00  0.00e+00  0.00e+00  0.00e+00  0.00e+00
   0.00e+00  0.00e+00  0.00e+00  0.00e+0

In [6]:
df = pd.read_pickle("all.data")

need me a string darlin'
ahoney tied all around your heart
ayeah need me a string baby
girl tied all around your heart
well then i'll know darlin'
i tell ya that we would never part

stop cryin' baby
honey come here and dry your eyes
i want you to stop cryin' baby
girl come here and dry ya eyes
well it's the reason i know yeah babe
ahoney it hurts way down deep inside

i'm goin' crazy
ajust as crazy as i can be
i'm goin' crazy
just as crazy as i can be
well i know what's drivin' me crazy darlin'
ahoney you so sweet ta me
