In [1]:
# General Imports
import pandas as pd
import random as rd
import numpy as np
import math as m
import sklearn.cluster as skc
import sklearn.neighbors as skn
import sklearn.feature_selection as skf
import matplotlib.pyplot as plt
import scipy as sp

In [2]:
# Kicked Tensorflow to it's own cell so I can work on K-Means at work
import tensorflow as tf
from keras import layers, models

In [2]:
data = pd.read_csv('../Datasets/data.csv', header=0)
genreKey = {"blues": 0,
            "classical": 1,
            "country": 2,
            "disco": 3,
            "hiphop": 4,
            "jazz": 5,
            "metal": 6,
            "pop": 7,
            "reggae": 8,
            "rock": 9}

def dimReduce(data, dims=10):
    # Select Data
    X = np.array(data.iloc[:, 1:29])
    y = np.array(data.iloc[:, 29])
    # Make the feature selection model with ANOVA F Measure
    fs = skf.SelectKBest(score_func=skf.f_classif, k=dims)
    # Run the data through selection to obtain final set
    X_sel = fs.fit_transform(X, y)

    return X_sel, y

def prepData(data, key, split=0.2, dims= 10, doSplit=True):
    # Reduce the dimensionality of the dataset to a set number of features
    x, y = dimReduce(data, dims)
    # Define the number of elements in the test set
    splitRange = int(len(data) * split)
    # Create and randomize an array representing data ordering
    rand = [i for i in range(len(data))]
    rd.shuffle(rand)
    # Create Testing and training arrays
    x_train = np.array([])
    y_train = np.array([])
    x_test = np.array([])
    y_test = np.array([])

    # Populate test arrays with a random split% of the whole set
    for i in range(splitRange):
        # Set the dimensions of the x_test array
        if x_test.ndim == 1:
            x_test = np.array([x[rand[i]]])
        # Append further elements to the existing array
        else:
            x_test = np.append(x_test, [x[rand[i]]], 0)
        # Add label in integer form
        y_test = np.append(y_test, key[y[rand[i]]])
            
    for i in range(splitRange, len(data)):
        # Set the dimensions of the x_train array
        if x_train.ndim == 1:
            x_train = np.array([x[rand[i]]])
        # Append further elements to the existing array
        else:
            x_train = np.append(x_train, [x[rand[i]]], 0)
        # Add label in integer form
        y_train = np.append(y_train, key[y[rand[i]]])

    if (doSplit):
        return x_train, y_train, x_test, y_test
    else:
        return np.concatenate((x_train, x_test)), np.concatenate((y_train, y_test))

def normalize(data):
    # Array for the max value of each feature. Used in normalization
    normax = []
    # Array for the min value of each feature. Used to eliminate negatives
    normin = []

    # Populate max and min arrays
    for i in range(data.shape[1]):
        normin.append(min(data[:, i]))
        normax.append(max(data[:, i]) - normin[i])

    # Normalize each vector in the dataset
    for i in data:
        for j in range(len(i)):
            i[j] = (i[j] - normin[j]) / normax[j]

    return data

# Split the dataset into test and training sets.
x_train, y_train, x_test, y_test = prepData(data, genreKey, dims=10)
x_train = normalize(x_train)
x_test = normalize(x_test)

# Unsplit Data for K-Means
x, y = prepData(data, genreKey, doSplit=False)
x = normalize(x)

In [4]:
upper = data.shape[1] - 1
corrList = [0 for i in range(upper-1)]
for i in range(1, upper):
    for j in range(i + 1, upper):
        corr = sp.stats.pearsonr(data.iloc[:, i], data.iloc[:, j])
        if corr[0] >= 0.7:
            corrList[i] += 1
            corrList[j] += 1
print(corrList)

[0, 1, 1, 0, 1, 4, 2, 4, 2, 3, 0, 0, 0, 1, 3, 3, 3, 2, 3, 3, 4, 2, 1, 1, 0, 0, 0, 0]


In [30]:
#MLP
#Reformat the labels to work with this model
mlp_train_labels = tf.keras.utils.to_categorical(y_train)
mlp_test_labels = tf.keras.utils.to_categorical(y_test)

#Set up & run the network
mlpModel = models.Sequential()
mlpModel.add(layers.Dense(20, activation='relu', input_shape = np.shape(x_train[0])))
mlpModel.add(layers.Dense(60))
mlpModel.add(layers.Dense(120))
mlpModel.add(layers.Dense(30))
mlpModel.add(layers.Dense(10))
mlpModel.compile(
    optimizer='adam',
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)
#mlpModel.build()
#mlpModel.summary()
history = mlpModel.fit(x_train, mlp_train_labels, epochs=10, shuffle=True, validation_data=(x_test, mlp_test_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
def printConf(acc, labels, clusters=10):
    for i in range(clusters):
        for j in acc[i]:
            print(f"{j}\t", end="")
        print()
    
    for i in range(clusters):
        print(f"Cluster value for label {i}: {labels[i]}")


def defLabels(acc):
    labels = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9:[]}
    for key, val in acc.items():
        for i in range(len(val)):
            if val[i] == max(val):
                labels[key].append(i)
    
    return labels


# K-Means Classification
def kMeans(x, y):
    # Array for accuracy counts
    acc = {}
    kmeans = skc.KMeans(n_clusters=10, n_init=100, max_iter=1000, verbose=0).fit(x)

    for i in range(len(y)):
        if y[i] not in acc:
            acc[y[i]] = [0 for i in range(10)]
        acc[y[i]][kmeans.labels_[i]] += 1
    
    return acc

acc = kMeans(x, y)
labels = defLabels(acc)
printConf(acc, labels)

In [31]:
#ART
#This does not work very well

#Create a new starting model
def createModel():
    newModel = models.Sequential()
    newModel.add(layers.Dense(1, input_shape = np.shape(x_train[0]), kernel_initializer = "random_normal"))
    newModel.compile(
        optimizer='adam',
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"]
    )
    return newModel

#Make a list of models to compete with each other
modelList = []

#Make the first model
modelList.append(createModel())

#Evaluate the first input, training the model on it
centroids = []
clusters = []
output = modelList[0](np.asarray([x_train[0]]))
centroids.append(tf.keras.backend.get_value(output[0][0]))
clusters.append([0])
modelList[0].fit(np.asarray([x_train[0]]), np.asarray([centroids[0]]), verbose = 0)

#Do this for every input
vigilance = 0.4
for i in range(1, len(x_train)):
    currentFeature = x_train[i]
    foundModel = False

    #Find the best model for evaluating the current input
    minDistance = float('inf')
    bestModel = None
    bestModelIndex = -1
    for j in range(len(modelList)):
        currentModel = modelList[j]

        #Get an output
        output = currentModel(np.asarray([currentFeature]))
        output = tf.keras.backend.get_value(output[0][0])

        #Check if the distance between the output and the centroid is the lowest found
        if(abs(output - centroids[j]) < minDistance):
            minDistance = abs(output - centroids[j])
            bestModel = currentModel
            bestModelIndex = j

    #Check if the best model is close enough according to the vigilance
    if(minDistance < vigilance):
        #Update the centroid and cluster
        centroids[bestModelIndex] = ((centroids[bestModelIndex] * len(clusters[bestModelIndex])) + output) / (len(clusters[bestModelIndex]) + 1)
        clusters[bestModelIndex].append(i)

        #Train the model on the input
        bestModel.fit(np.asarray([currentFeature]), np.asarray([centroids[bestModelIndex]]), verbose = 0)

    #If it's not, make a new model for the unhandled value
    else:
        print("making new model")

        #Get the output from the new model
        newModel = createModel()
        output = newModel(np.asarray([currentFeature]))

        #Make new entries for the model
        modelList.append(newModel)
        centroids.append(tf.keras.backend.get_value(output[0][0]))
        clusters.append([i])

        #Train the model on it's new centroid
        newModel.fit(np.asarray([currentFeature]), np.asarray([tf.keras.backend.get_value(output[0][0])]), verbose = 0)

print(str(len(modelList)))

making new model
making new model
3


In [26]:
#Alternative ART
#Works ever so slightly better

#Make the model
artModel = models.Sequential()
artModel.add(layers.Dense(1, input_shape = np.shape(x_train[0]), kernel_initializer = "random_normal"))
artModel.compile(
    optimizer='adam',
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)

for roundNum in range(100):

    #Start the list of centroids & clusters
    centroids = []
    clusters = []

    #Loop through every input
    vigilance = 0.4 * (roundNum + 1)
    for i in range(1, len(x_train)):
        currentFeature = x_train[i]

        #Get the output from the network 
        output = artModel(np.asarray([currentFeature]))
        output = tf.keras.backend.get_value(output[0][0])

        #Find the closest centroid
        minDistance = float('inf')
        bestClusterIndex = -1
        for j in range(len(centroids)):
            dist = abs(centroids[j] - output)
            if(dist < minDistance):
                minDistance = dist
                bestClusterIndex = j

        #Put the input in that cluster, if it's within vigilance distance
        if(minDistance < vigilance):
            centroids[bestClusterIndex] = ((centroids[bestClusterIndex] * len(clusters[bestClusterIndex])) + output) / (len(clusters[bestClusterIndex]) + 1)
            clusters[bestClusterIndex].append(i)

            #Also train the model on that centroid & input
            artModel.fit(np.asarray([currentFeature]), np.asarray([centroids[bestClusterIndex]]), verbose = 0)

        #If it's not within the vigilance distance, make a new cluster
        else:
            centroids.append(output)
            clusters.append([i])
            artModel.fit(np.asarray([currentFeature]), np.asarray([output]), verbose = 0)

    print("Completed round " + str(roundNum))

Completed round 0
Completed round 1
Completed round 2
Completed round 3
Completed round 4
Completed round 5
Completed round 6
Completed round 7
Completed round 8
Completed round 9
Completed round 10
Completed round 11
Completed round 12
Completed round 13
Completed round 14
Completed round 15
Completed round 16
Completed round 17
Completed round 18
Completed round 19
Completed round 20
Completed round 21
Completed round 22
Completed round 23
Completed round 24
Completed round 25
Completed round 26
Completed round 27
Completed round 28
Completed round 29
Completed round 30
Completed round 31
Completed round 32
Completed round 33
Completed round 34
Completed round 35
Completed round 36
Completed round 37
Completed round 38
Completed round 39
Completed round 40
Completed round 41
Completed round 42
Completed round 43
Completed round 44
Completed round 45
Completed round 46
Completed round 47
Completed round 48
Completed round 49
Completed round 50
Completed round 51
Completed round 52
Com

In [27]:
#ART printout
print("Printing " + str(len(clusters)) + " clusters")

#Initialize the cluster count list
clusterCounts = [[0 for i in range(10)] for i in range(len(clusters))]

#For each cluster, count the occurances of each label
for i in range(len(clusters)):
    for labelIndex in clusters[i]:
        clusterCounts[i][round(y_train[labelIndex])] += 1

for count in clusterCounts:
    print(count)

Printing 8 clusters
[10, 0, 26, 13, 19, 21, 0, 15, 42, 25]
[17, 0, 5, 20, 23, 0, 39, 22, 4, 13]
[6, 31, 9, 1, 1, 23, 0, 2, 1, 0]
[19, 0, 17, 43, 34, 9, 19, 39, 25, 36]
[0, 0, 0, 1, 5, 0, 18, 1, 1, 0]
[0, 28, 0, 0, 0, 1, 0, 0, 0, 0]
[27, 12, 25, 0, 1, 25, 0, 6, 6, 6]
[0, 7, 0, 0, 0, 0, 0, 0, 0, 0]
