In [12]:
# How To Implement Learning Vector Quantization (LVQ) From Scratch With Python
# https://machinelearningmastery.com/implement-learning-vector-quantization-scratch-python/

# LVQ for the Ionosphere Dataset
from random import seed
from random import randrange
from csv import reader
from math import sqrt

# Load a CSV file
def load_csv(filename):
	"""Read a CSV file into a list of rows.

	Args:
		filename: Path of the CSV file to read.

	Returns:
		A list holding one list of strings per non-empty record, in file
		order. Blank lines are skipped.
	"""
	# newline='' leaves line-ending handling to the csv module, as its
	# documentation requires; without it, newlines embedded in quoted fields
	# are rewritten by universal-newline translation.
	with open(filename, 'r', newline='') as handle:
		return [row for row in reader(handle) if row]

# Convert string column to float
def str_column_to_float(dataset, column):
	"""Parse one column of every row as a float, in place.

	Args:
		dataset: List of mutable rows; each row is modified in place.
		column: Index of the column whose string values are converted.

	Surrounding whitespace is stripped from each value before conversion.
	"""
	for record in dataset:
		text = record[column]
		record[column] = float(text.strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
	"""Replace the values of one column with integer class codes, in place.

	Codes are handed out in order of first appearance in ``dataset`` (the
	first distinct value gets 0, the next new value 1, and so on). Iterating
	a set of strings instead would make the mapping depend on hash ordering,
	which can differ from one interpreter run to the next.

	Args:
		dataset: List of mutable rows; each row is modified in place.
		column: Index of the column to encode.

	Returns:
		A dict mapping every original value to its integer code.
	"""
	lookup = dict()
	for row in dataset:
		value = row[column]
		if value not in lookup:
			# len(lookup) is the next unused code
			lookup[value] = len(lookup)
	for row in dataset:
		row[column] = lookup[row[column]]
	return lookup

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	"""Randomly partition ``dataset`` into ``n_folds`` equally sized folds.

	Rows are drawn without replacement from a shallow copy, so ``dataset``
	itself is not modified. Each fold receives int(len(dataset) / n_folds)
	rows; rows left over after the last fold is filled are not used.

	Returns:
		A list of ``n_folds`` lists of rows.
	"""
	remaining = list(dataset)
	rows_per_fold = int(len(dataset) / n_folds)
	folds = []
	for _ in range(n_folds):
		# one randrange draw per row, each taken from the shrinking pool
		folds.append([remaining.pop(randrange(len(remaining))) for _ in range(rows_per_fold)])
	return folds

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	"""Return the percentage of positions where ``predicted`` equals ``actual``.

	Args:
		actual: Sequence of true labels; its length sets the number of
			positions compared.
		predicted: Sequence of predicted labels, indexed in parallel.

	Returns:
		A float between 0.0 and 100.0.
	"""
	total = len(actual)
	hits = sum(1 for idx in range(total) if actual[idx] == predicted[idx])
	return hits / float(total) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	"""Score ``algorithm`` with k-fold cross validation.

	Each fold is held out once as the test set while the remaining folds are
	concatenated into the training set. Test rows are copies whose last
	element is replaced by None, so the algorithm never sees the true label.

	Args:
		dataset: List of rows; the last element of each row is the label.
		algorithm: Callable invoked as algorithm(train_set, test_set, *args)
			that returns one prediction per test row.
		n_folds: Number of folds to split the dataset into.
		*args: Extra positional arguments forwarded to ``algorithm``.

	Returns:
		A list with the accuracy percentage obtained on each fold.
	"""
	folds = cross_validation_split(dataset, n_folds)
	scores = []
	for fold_number, held_out in enumerate(folds, start=1):
		print("Training fold", fold_number, "of", n_folds)
		others = list(folds)
		others.remove(held_out)
		train_set = [row for part in others for row in part]
		test_set = []
		for row in held_out:
			masked = list(row)
			masked[-1] = None  # hide the label from the algorithm
			test_set.append(masked)
		predicted = algorithm(train_set, test_set, *args)
		actual = [row[-1] for row in held_out]
		fold_accuracy = accuracy_metric(actual, predicted)
		scores.append(fold_accuracy)
		print("Fold accuracy =", fold_accuracy)
	return scores

# calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
	"""Return the Euclidean distance between the feature parts of two rows.

	The last element of ``row1`` is treated as the label and skipped; the
	elements of ``row2`` at the same indices are used.
	"""
	squared_sum = 0.0
	for idx, value in enumerate(row1[:-1]):
		squared_sum += (value - row2[idx])**2
	return sqrt(squared_sum)

# Locate the best matching unit
def get_best_matching_unit(codebooks, test_row):
	"""Return the codebook vector closest to ``test_row``.

	Distance is measured with euclidean_distance. When several codebooks are
	equally close, the one that comes first in ``codebooks`` is returned.

	Args:
		codebooks: Iterable of codebook vectors.
		test_row: Row to match against the codebooks.

	Returns:
		The closest codebook object itself (not a copy).

	Raises:
		IndexError: If ``codebooks`` is empty.
	"""
	# A single min() pass replaces building and sorting the full distance
	# list; min() returns the first minimal item, the same one the stable
	# sort used to put in front.
	closest = min(codebooks, key=lambda codebook: euclidean_distance(codebook, test_row), default=None)
	if closest is None:
		raise IndexError('no codebook vectors to match against')
	return closest

# Make a prediction with codebook vectors
def predict(codebooks, test_row):
	"""Predict the class of ``test_row`` as the label of its best matching unit.

	The label is the last element of the codebook returned by
	get_best_matching_unit.
	"""
	return get_best_matching_unit(codebooks, test_row)[-1]

# Create a random codebook vector
def random_codebook(train):
	"""Build one codebook vector from randomly chosen training values.

	Every position, including the last one, is copied from an independently
	drawn training row, so a single codebook may mix values from several rows.

	Args:
		train: Non-empty list of rows; the first row sets the vector length.

	Returns:
		A new list with as many elements as ``train[0]``.
	"""
	record_count = len(train)
	codebook = []
	for position in range(len(train[0])):
		donor = train[randrange(record_count)]
		codebook.append(donor[position])
	return codebook

# Train a set of codebook vectors
def train_codebooks(train, n_codebooks, lrate, epochs):
	"""Fit a set of LVQ codebook vectors to the training rows.

	Args:
		train: List of rows; the last element of each row is the label.
		n_codebooks: Number of codebook vectors to create.
		lrate: Initial learning rate.
		epochs: Number of passes over ``train``.

	Returns:
		The list of trained codebook vectors.
	"""
	codebooks = []
	for _ in range(n_codebooks):
		codebooks.append(random_codebook(train))
	for epoch in range(epochs):
		# the learning rate shrinks linearly from lrate towards zero
		rate = lrate * (1.0-(epoch/float(epochs)))
		for row in train:
			bmu = get_best_matching_unit(codebooks, row)
			for idx in range(len(row)-1):
				step = rate * (row[idx] - bmu[idx])
				# same label: pull the codebook towards the row;
				# different label: push it away
				bmu[idx] = bmu[idx] + step if bmu[-1] == row[-1] else bmu[idx] - step
	return codebooks

# LVQ Algorithm
def learning_vector_quantization(train, test, n_codebooks, lrate, epochs):
	"""Train LVQ codebooks on ``train`` and predict a label for each test row.

	Args:
		train: Training rows, label in the last position.
		test: Rows to classify.
		n_codebooks, lrate, epochs: Forwarded to train_codebooks.

	Returns:
		A list with one predicted label per row of ``test``.
	"""
	codebooks = train_codebooks(train, n_codebooks, lrate, epochs)
	return [predict(codebooks, row) for row in test]

# Baseline experiment: LVQ with Euclidean distance on the Ionosphere dataset.
# Seeding first makes the fold assignment and codebook initialisation repeatable.
seed(1)

# Hyperparameters of the baseline run; the comparison cell reads them again.
n_folds_original = 5
learn_rate_original = 0.3
n_epochs_original = 50
n_codebooks_original = 20

# Load the rows; every column except the last is parsed as a float feature.
filename = 'ionosphere.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])-1):
	str_column_to_float(dataset, i)
# The last column holds the class label; replace it with integer codes.
str_column_to_int(dataset, len(dataset[0])-1)

# Cross-validate and report the mean of the per-fold accuracies.
scores = evaluate_algorithm(dataset, learning_vector_quantization, n_folds_original, n_codebooks_original, learn_rate_original, n_epochs_original)
mean_accuracy_original = sum(scores)/float(len(scores))
print('Mean Accuracy: %.3f%%' % mean_accuracy_original)

Training fold 1 of 5
Fold accuracy = 88.57142857142857
Training fold 2 of 5
Fold accuracy = 90.0
Training fold 3 of 5
Fold accuracy = 88.57142857142857
Training fold 4 of 5
Fold accuracy = 88.57142857142857
Training fold 5 of 5
Fold accuracy = 80.0
Mean Accuracy: 87.143%


# Modificado

In [19]:

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	"""Split ``dataset`` into ``n_folds`` random folds of equal size.

	Rows are removed at random from a shallow copy of ``dataset``, so the
	argument is left untouched. Every fold holds int(len(dataset) / n_folds)
	rows; any remainder rows are not placed in a fold.

	Returns:
		A list of ``n_folds`` lists of rows.
	"""
	pool = list(dataset)
	per_fold = int(len(dataset) / n_folds)

	def draw_fold():
		# remove per_fold randomly chosen rows from the shared pool
		drawn = []
		for _ in range(per_fold):
			drawn.append(pool.pop(randrange(len(pool))))
		return drawn

	return [draw_fold() for _ in range(n_folds)]

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	"""Compute classification accuracy as a percentage.

	Compares ``actual`` and ``predicted`` position by position over the
	length of ``actual`` and returns the share of matches, scaled to 0-100.
	"""
	matching = [pos for pos in range(len(actual)) if actual[pos] == predicted[pos]]
	return len(matching) / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	"""Run k-fold cross validation and collect the accuracy of every fold.

	For each fold, the other folds are merged into the training set and the
	fold's rows are copied with their last element set to None to form the
	test set. ``algorithm`` is called as algorithm(train_set, test_set, *args)
	and its predictions are scored against the fold's true labels.

	Returns:
		A list of accuracy percentages, one per fold.
	"""
	folds = cross_validation_split(dataset, n_folds)
	scores = []
	for position, fold in enumerate(folds):
		print("Training fold", position + 1, "of", n_folds)
		training_folds = list(folds)
		training_folds.remove(fold)
		train_set = []
		for part in training_folds:
			train_set.extend(part)
		# copy each row and blank its label so the algorithm cannot read it
		test_set = [list(row) for row in fold]
		for unlabeled in test_set:
			unlabeled[-1] = None
		predicted = algorithm(train_set, test_set, *args)
		expected = [row[-1] for row in fold]
		accuracy = accuracy_metric(expected, predicted)
		scores.append(accuracy)
		print("Fold accuracy =", accuracy)
	return scores

# calculate the manhattan distance between two vectors
def manhattan_distance(row1,row2):
    """Return the Manhattan (L1) distance between the feature parts of two rows.

    The last element of ``row1`` is treated as the label and skipped; the
    elements of ``row2`` at the same indices are used.
    """
    total = 0.0
    for idx, value in enumerate(row1[:-1]):
        total += abs(value - row2[idx])
    return total

# Locate the best matching unit
def get_best_matching_unit(codebooks, test_row):
	"""Return the codebook vector closest to ``test_row``.

	Distance is measured with manhattan_distance. When several codebooks are
	equally close, the one that comes first in ``codebooks`` is returned.

	Args:
		codebooks: Iterable of codebook vectors.
		test_row: Row to match against the codebooks.

	Returns:
		The closest codebook object itself (not a copy).

	Raises:
		IndexError: If ``codebooks`` is empty.
	"""
	# A single min() pass replaces building and sorting the full distance
	# list; min() returns the first minimal item, the same one the stable
	# sort used to put in front.
	closest = min(codebooks, key=lambda codebook: manhattan_distance(codebook, test_row), default=None)
	if closest is None:
		raise IndexError('no codebook vectors to match against')
	return closest

# Make a prediction with codebook vectors
def predict(codebooks, test_row):
	"""Return the label stored in the codebook nearest to ``test_row``.

	The nearest codebook is found with get_best_matching_unit; its last
	element is the predicted class.
	"""
	nearest = get_best_matching_unit(codebooks, test_row)
	label = nearest[-1]
	return label

# Create a random codebook vector
def random_codebook(train):
	"""Create a codebook vector by sampling each position from a random row.

	A fresh training row is drawn for every position (the last one included),
	so the result generally combines values from different rows.

	Returns:
		A new list as long as ``train[0]``.
	"""
	n_records = len(train)
	width = len(train[0])
	codebook = [None] * width
	for col in range(width):
		codebook[col] = train[randrange(n_records)][col]
	return codebook

# Train a set of codebook vectors
def train_codebooks(train, n_codebooks, lrate, epochs):
	"""Train ``n_codebooks`` LVQ codebook vectors on ``train``.

	Codebooks start as random_codebook samples. In every epoch each training
	row moves its best matching unit towards itself when their labels (last
	elements) agree and away from itself otherwise.

	Returns:
		The list of trained codebook vectors.
	"""
	codebooks = [random_codebook(train) for _ in range(n_codebooks)]
	epoch = 0
	while epoch < epochs:
		# linearly decaying learning rate: lrate at epoch 0, approaching 0
		rate = lrate * (1.0-(epoch/float(epochs)))
		for sample in train:
			bmu = get_best_matching_unit(codebooks, sample)
			for feature in range(len(sample)-1):
				delta = rate * (sample[feature] - bmu[feature])
				if bmu[-1] != sample[-1]:
					bmu[feature] -= delta
				else:
					bmu[feature] += delta
		epoch += 1
	return codebooks

# LVQ Algorithm
def learning_vector_quantization(train, test, n_codebooks, lrate, epochs):
	"""Fit LVQ on ``train`` and return the predicted label of every test row.

	The codebooks come from train_codebooks(train, n_codebooks, lrate, epochs);
	each row of ``test`` is then classified with predict.
	"""
	trained = train_codebooks(train, n_codebooks, lrate, epochs)
	return list(map(lambda row: predict(trained, row), test))



# Evaluate the Manhattan-distance LVQ on the admission survey dataset.
# Re-seed here so the folds and initial codebooks of this cell do not depend
# on how many random numbers previously executed cells have consumed.
seed(1)
n_folds = 6
learn_rate = 0.1
n_epochs = 200
n_codebooks = 30
dataset = load_csv("Undergraduate Admission Test Survey in Bangladesh.csv")
# Skip the first row (presumably the CSV header -- confirm against the file).
dataset = dataset[1:]
# Every column except the last is parsed as a float feature.
for i in range(len(dataset[0])-1):
    str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)

scores = evaluate_algorithm(dataset, learning_vector_quantization, n_folds, n_codebooks, learn_rate, n_epochs)
# Keep the mean in a variable: the comparison cell reports it again.
mean_accuracy = sum(scores)/float(len(scores))
print('Mean Accuracy: %.3f%%' % mean_accuracy)

Training fold 1 of 6
Fold accuracy = 42.0
Training fold 2 of 6
Fold accuracy = 55.00000000000001
Training fold 3 of 6
Fold accuracy = 46.0
Training fold 4 of 6
Fold accuracy = 46.0
Training fold 5 of 6
Fold accuracy = 38.0
Training fold 6 of 6
Fold accuracy = 56.99999999999999
Mean Accuracy: 47.333%


In [20]:
# Side-by-side report of the two experiments. The printed labels are Spanish:
# "Modelo original" is the baseline run and "Modelo nuevo" the modified run.
# All values printed here are globals assigned by the two evaluation cells,
# so both of them must have been executed before this cell.
# NOTE(review): the baseline was evaluated on 'ionosphere.csv' and the new
# model on the Bangladesh admission survey, so the two accuracies come from
# different datasets.
print("Modelo original")
print("Accuracy original:", mean_accuracy_original, "Hiperparámetros:", "n_folds:", n_folds_original, "learn_rate:", learn_rate_original, "n_epochs:", n_epochs_original, "n_codebooks:", n_codebooks_original)
print("Modelo nuevo")
print("Accuracy nuevo:", mean_accuracy, "Hiperparámetros:", "n_folds:", n_folds, "learn_rate:", learn_rate, "n_epochs:", n_epochs, "n_codebooks:", n_codebooks)

Modelo original
Accuracy original: 87.14285714285714 Hiperparámetros: n_folds: 5 learn_rate: 0.3 n_epochs: 50 n_codebooks: 20
Modelo nuevo
Accuracy nuevo: 47.333333333333336 Hiperparámetros: n_folds: 6 learn_rate: 0.1 n_epochs: 200 n_codebooks: 30


# Conclusión:

El dataset usado sirve para predecir en qué tipo de universidad quedó admitido un estudiante de Bangladés: en una privada o en una pública. Esto hace el problema aún más difícil, pues el resultado no solo depende de los antecedentes o de las actividades del estudiante; a veces también puede ser cuestión de suerte.

La primera modificación fue cambiar la función de distancia de euclidiana a **Manhattan**. Este ajuste fue rápido de implementar. Sin embargo, es importante notar que esta métrica es uno de los elementos más influyentes en un modelo como este, pues determina tanto la manera en la que evalúa sus soluciones como la manera en la que entrena.

En cuanto a la arquitectura, incrementé el número de **codebooks** a 30. Modificar este parámetro puede mejorar la precisión del modelo, ya que un mayor número de codebooks permite una mejor representación del espacio de características; sin embargo, reduce la velocidad de cómputo del modelo y puede llevar a sobreentrenamiento.

De igual manera, ajusté la tasa de aprendizaje y el número de épocas. Aumentar las épocas a 200 y reducir la tasa de aprendizaje a 0.1 hizo que el entrenamiento fuera más lento, pero permitió que el modelo recorriera los datos un mayor número de veces.

Esta actividad me ayudó a entender el algoritmo. También aprendí que, una vez que se entiende el algoritmo, ajustar los hiperparámetros es importante para obtener mejores resultados, y que es importante balancear el rendimiento del modelo con su coste computacional.