In [1]:
from scipy.io import arff
import numpy as np

In [2]:
# # # # # # # # # # # # #
# function to obtain min and max values of all numeric features:
# # # # # # # # # # # # #
def getKeyMinMaxDict(data, meta):
    """Collect the minimum and maximum of every numeric feature.

    Parameters
    ----------
    data : indexable by feature name
        ``data[key]`` must be something ``np.nanmin`` / ``np.nanmax`` accept.
    meta : iterable of feature names that is also indexable by name
        ``meta[key][0]`` is compared against ``'numeric'`` to select features.

    Returns
    -------
    dict
        ``{key: {'min': ..., 'max': ...}}`` with one entry per numeric feature.
    """
    keyMinMaxDict = {}
    for key in meta:
        # only numeric features have a meaningful value range:
        if meta[key][0] != 'numeric':
            continue

        column = data[key]
        # nanmin/nanmax ignore NaN entries, so a single NaN in a column no
        # longer turns both bounds (and every later normalized value) into NaN.
        # Local names deliberately avoid shadowing the builtins min/max.
        keyMinMaxDict[key] = {
            'min': np.nanmin(column),
            'max': np.nanmax(column),
        }
    return keyMinMaxDict

# # # # # # # # # # # # #
# function to normalize all numeric values of the data:
# requires a keyMinMaxDict provided by getKeyMinMaxDict
# # # # # # # # # # # # #
def normalizeNumericFeatures(data, meta, keyMinMaxDict):
    """Min-max scale every numeric feature of ``data`` in place.

    Each numeric column becomes ``(value - min) / (max - min)`` using the
    bounds stored in ``keyMinMaxDict`` (as produced by ``getKeyMinMaxDict``).

    Parameters
    ----------
    data : indexable and assignable by feature name
        Modified in place: every numeric ``data[key]`` is overwritten.
    meta : iterable of feature names that is also indexable by name
        ``meta[key][0]`` is compared against ``'numeric'`` to select features.
    keyMinMaxDict : dict
        ``{key: {'min': ..., 'max': ...}}`` for every numeric feature.

    Returns
    -------
    tuple
        The same ``data`` and ``meta`` objects that were passed in.
    """
    for key in meta:
        if meta[key][0] != 'numeric':
            continue

        lower = keyMinMaxDict[key]['min']
        valueRange = keyMinMaxDict[key]['max'] - lower
        # A constant feature has a zero range; dividing by it would fill the
        # column with NaN/inf. Treat the range as 1 so such a column simply
        # becomes (value - min).
        if valueRange == 0:
            valueRange = 1

        data[key] = np.subtract(data[key], lower)
        data[key] = np.divide(data[key], valueRange)

    return data, meta

# # # # # # # # # # # # #
# function to denormalize all numeric values of the data:
# requires a keyMinMaxDict provided by getKeyMinMaxDict
# # # # # # # # # # # # #
def denormalizeNumericFeatures(data, meta, keyMinMaxDict):
    """Map min-max scaled numeric features back to their original range.

    Every numeric column is overwritten in place with
    ``value * (max - min) + min`` using the bounds in ``keyMinMaxDict``.
    Returns the same ``data`` and ``meta`` objects as a tuple.
    """
    # lazily walk over the names of the numeric features only:
    numericNames = (name for name in meta if meta[name][0] == 'numeric')

    for name in numericNames:
        bounds = keyMinMaxDict[name]
        lower = bounds['min']
        upper = bounds['max']

        # stretch back to the original range, then shift by the minimum:
        stretched = np.multiply(data[name], (upper - lower))
        data[name] = stretched
        data[name] = np.add(data[name], lower)

    return data, meta

# # # # # # # # # # # # #
# prints some descriptive statistics measures of the data's numeric features
# set maxKeys to -1 to print for all numeric features
# # # # # # # # # # # # #
def printNumericFeatureDescription(data, meta, maxKeys=1):
    """Print mean, standard deviation, minimum and maximum of numeric features.

    At most ``maxKeys`` numeric features are described, in the order in which
    ``meta`` yields them; ``maxKeys=-1`` removes the limit.
    """
    unlimited = maxKeys == -1
    described = 0
    for name in meta:
        # skip everything that is not a numeric feature:
        if meta[name][0] != 'numeric':
            continue
        # stop as soon as the requested number of features was printed:
        if not unlimited and described == maxKeys:
            return
        described += 1

        column = data[name]
        print(name + ':')
        print('mean: ' + str(np.mean(column)))
        print('stdev: ' + str(np.std(column)))
        print('min: ' + str(np.min(column)))
        print('max: ' + str(np.max(column)))
        print()
            
# # # # # # # # # # # # #
# function to obtain categories of all nominal features:
# # # # # # # # # # # # #
def getKeyCategoriesDict(data, meta):
    """Assign a number to every category of every nominal feature.

    For each feature whose ``meta[key][0]`` equals ``'nominal'``, the
    categories listed in ``meta[key][1]`` are numbered 1, 2, 3, ... in order.
    Numbering starts at 1 because 0 is reserved for missing values.
    ``data`` is accepted but not read.

    Returns ``{key: {category: number}}``.
    """
    return {
        name: {
            label: number
            for number, label in enumerate(meta[name][1], start=1)
        }
        for name in meta
        if meta[name][0] == 'nominal'
    }

# # # # # # # # # # # # #
# function to encode nominal data with one-hot encoding:
# requires a keyCategoriesDict for consistent encoding
# # # # # # # # # # # # #
def normalizeNominalFeatures(data, meta, keyCategoriesDict):
    """One-hot encode every nominal feature of ``data``.

    Each value of a nominal feature is replaced by a string of ``'0'`` and
    ``'1'`` characters of length ``len(categories) + 1``. Position 0 marks a
    value that is not a known category (e.g. a missing value); position ``n``
    marks the category numbered ``n`` in ``keyCategoriesDict``.

    Parameters
    ----------
    data : indexable and assignable by feature name
        Modified in place: every nominal ``data[key]`` is overwritten.
    meta : iterable of feature names that is also indexable by name
        ``meta[key][0]`` is compared against ``'nominal'`` to select features.
    keyCategoriesDict : dict
        ``{key: {category: number}}`` for every nominal feature.

    Returns
    -------
    tuple
        ``(data, meta)``. Normally ``data`` is the object that was passed in.
        If ``data`` is a NumPy structured array and one of its string fields
        is narrower than the one-hot code, the in-place write cuts the code
        off at the field width. In that case a warning is issued and a new
        array with sufficiently wide fields, holding the full codes, is
        returned instead. (Earlier versions of this function returned
        nothing; callers that ignore the return value keep working.)
    """
    import warnings  # function-scope import, so the cell's import block stays as is

    def toCategoryName(value):
        # Decode bytes properly instead of slicing the "b'...'" repr: the repr
        # escapes backslashes and non-ASCII bytes, and slicing a plain str
        # would chop off real characters.
        if isinstance(value, bytes):
            return value.decode('utf-8', errors='replace')
        return str(value)

    # field names if data is a structured array, otherwise None:
    fieldNames = getattr(getattr(data, 'dtype', None), 'names', None)
    encoded = {}    # key -> array with the full one-hot codes
    tooNarrow = {}  # key -> dtype string wide enough for that key's codes

    for key in meta:
        if meta[key][0] != 'nominal':
            continue

        categories = keyCategoriesDict[key]
        # +1, since position 0 is reserved for missing/unknown values
        oneHotLength = len(categories) + 1

        oneHotCodes = []
        for value in data[key]:
            position = categories.get(toCategoryName(value), 0)
            oneHot = ['0'] * oneHotLength
            oneHot[position] = '1'
            oneHotCodes.append(''.join(oneHot))
        oneHotCodes = np.array(oneHotCodes)
        encoded[key] = oneHotCodes

        # detect fixed-width string fields that cannot hold the whole code:
        if fieldNames and key in fieldNames:
            fieldType = data.dtype[key]
            if fieldType.kind in ('S', 'U'):
                bytesPerChar = 4 if fieldType.kind == 'U' else 1
                if fieldType.itemsize // bytesPerChar < oneHotLength:
                    tooNarrow[key] = '%s%d' % (fieldType.kind, oneHotLength)

        # in-place write, kept for callers that ignore the return value:
        data[key] = oneHotCodes

    if not tooNarrow:
        return data, meta

    warnings.warn(
        'one-hot codes were truncated in place for: '
        + ', '.join(sorted(tooNarrow))
        + '; use the returned array, which holds the full codes',
        RuntimeWarning,
        stacklevel=2,
    )
    widened = np.empty(
        data.shape,
        dtype=[(name, tooNarrow.get(name, data.dtype[name])) for name in fieldNames],
    )
    for name in fieldNames:
        widened[name] = encoded[name] if name in tooNarrow else data[name]
    return widened, meta
            
# # # # # # # # # # # # #
# prints a sample of the values of the data's nominal features
# set maxKeys to -1 to print for all nominal features
# # # # # # # # # # # # #
def printNominalFeatureDescription(data, meta, maxKeys=1):
    """Print a short excerpt of the values of nominal features.

    At most ``maxKeys`` nominal features are shown, in the order in which
    ``meta`` yields them; ``maxKeys=-1`` removes the limit.
    """
    limited = maxKeys != -1
    shown = 0
    for name in meta:
        # skip everything that is not a nominal feature:
        if meta[name][0] != 'nominal':
            continue
        # stop as soon as the requested number of features was printed:
        if limited and shown == maxKeys:
            return
        shown += 1

        # note: the slice starts at index 1, so the first entry is not shown
        excerpt = data[name][1:20]
        print(name + ':')
        print(excerpt)
        print()

In [3]:
# load one training fold of the adult dataset:
f = 'datasetsCBR/adult/adult.fold.000000.train.arff'
data, meta = arff.loadarff(f)

# numeric features: normalize, undo, and normalize again, printing the
# statistics of the first numeric feature after each step to check the
# round trip. The bounds are taken once, from the unmodified data.
keyMinMaxDict = getKeyMinMaxDict(data, meta)
print('# # # numeric features before normalization: # # #')
printNumericFeatureDescription(data, meta)
data, meta = normalizeNumericFeatures(data, meta, keyMinMaxDict)
print('# # # numeric features after normalization: # # #')
printNumericFeatureDescription(data, meta)
data, meta = denormalizeNumericFeatures(data, meta, keyMinMaxDict)
print('# # # numeric features after denormalization: # # #')
printNumericFeatureDescription(data, meta)
# leave the numeric features normalized for the following cells:
data, meta = normalizeNumericFeatures(data, meta, keyMinMaxDict)

print('- ' * 40)
print()

# nominal features: replace each category by its one-hot code
print('# # # nominal features before normalization: # # #')
printNominalFeatureDescription(data, meta)
keyCategoriesDict = getKeyCategoriesDict(data, meta)
normalizeNominalFeatures(data, meta, keyCategoriesDict)
# this label used to say "numeric" although nominal features are printed
print('# # # nominal features after normalization: # # #')
printNominalFeatureDescription(data, meta)

# # # numeric features before normalization: # # #
age:
mean: 38.6138131853
stdev: 13.6789028108
min: 17.0
max: 90.0

# # # numeric features after normalization: # # #
age:
mean: 0.296079632676
stdev: 0.187382230284
min: 0.0
max: 1.0

# # # numeric features after denormalization: # # #
age:
mean: 38.6138131853
stdev: 13.6789028108
min: 17.0
max: 90.0

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# # # nominal features before normalization: # # #
workclass:
[b'Private' b'Private' b'Private' b'Private' b'Private' b'Private'
 b'Private' b'?' b'Private' b'Local-gov' b'Private' b'Private' b'?'
 b'Local-gov' b'Private' b'?' b'Local-gov' b'Private' b'Private']

# # # numeric features after normalization: # # #
workclass:
[b'010000000' b'010000000' b'010000000' b'010000000' b'010000000'
 b'010000000' b'010000000' b'100000000' b'010000000' b'000001000'
 b'010000000' b'010000000' b'100000000' b'000001000' b'010000000'
 b'100000000' b'000001000' b'010000000' b

In [26]:
def getFeatureDistances(p1, p2, meta):
    """Compute the per-feature distances between two data points.

    Parameters
    ----------
    p1, p2 : indexable by position
        The two points; ``p2`` must have at least ``len(p1)`` entries.
    meta : object with a ``types()`` method
        ``meta.types()[i]`` is compared against ``'numeric'``; entry ``i`` is
        handled by ``getNumericDistance`` if it matches and by
        ``getNominalDistance`` otherwise.

    Returns
    -------
    numpy.ndarray
        One distance per entry of ``p1``.
    """
    # look the feature types up once instead of once per feature:
    featureTypes = meta.types()

    distances = []
    for i in range(len(p1)):
        n1 = p1[i]
        n2 = p2[i]
        if featureTypes[i] == 'numeric':
            distances.append(getNumericDistance(n1, n2))
        else:
            distances.append(getNominalDistance(n1, n2))
    return np.array(distances)
                

def getNumericDistance(n1, n2):
    """Return the squared difference of two numeric values."""
    difference = np.subtract(n1, n2)
    return np.square(difference)

def getNominalDistance(n1, n2):
    """Return the distance between two encoded nominal values.

    Equal values have distance 0; otherwise the distance is
    ``(2 / len(n1)) ** 2``.
    """
    if n1 == n2:
        distance = 0
    else:
        ratio = np.divide(2, len(n1))
        distance = np.square(ratio)
    return distance

def getDistance(p1, p2, meta):
    """Return the sum of the per-feature distances between two data points."""
    perFeature = getFeatureDistances(p1, p2, meta)
    return np.sum(perFeature)

print('# # # Distance metrics # # #')
print('\nPoint data[0]:')
print(data[0])
print('\nPoint data[1]:')
print(data[1])

# sanity checks: a point has distance 0 to itself, and the distance between
# two points does not depend on the argument order
print('\ndistance(data[0], data[0]):')
distance = getDistance(data[0], data[0], meta)
print(distance)
print('\ndistance(data[0], data[1]):')
distance = getDistance(data[0], data[1], meta)
print(distance)
print('\ndistance(data[1], data[0]):')
distance = getDistance(data[1], data[0], meta)
print(distance)


# Linear scan for the nearest and the most distant point relative to data[0].
# The running minimum starts at infinity instead of a large magic number, so
# it is replaced by the first distance no matter how large that distance is.
minDistance = np.inf
maxDistance = 0
minIndex = 0
maxIndex = 0
# start at 1: data[0] is the reference point and is not compared to itself
for i in range(1, len(data)):
    distance = getDistance(data[0], data[i], meta)
    if distance < minDistance:
        minDistance = distance
        minIndex = i
    if distance > maxDistance:
        maxDistance = distance
        maxIndex = i

print('\nNearest neighbor of data[0]:')
print(data[minIndex])
print('with a distance of:')
print(minDistance)
print('\nMost different from data[0]:')
print(data[maxIndex])
print('with a distance of:')
print(maxDistance)

# # # Distance metrics # # #

Point data[0]:
(0.6712328767123288, b'001000000', 0.10993934842688154, b'001000000000', 0.6, b'00010000', b'000010000000000', b'0000100', b'010000', b'010', 0.0, 0.0, 0.19387755102040816, b'01000000000000000000000000', b'001')

Point data[1]:
(0.6986301369863014, b'010000000', 0.13454771787039574, b'000010000000', 0.5333333333333333, b'01000000', b'000001000000000', b'0001000', b'010000', b'001', 0.0, 0.0, 0.3979591836734694, b'01000000000000000000000000', b'001')

distance(data[0], data[0]):
0.0

distance(data[0], data[1]):
0.730965308059

distance(data[1], data[0]):
0.730965308059

Nearest neighbor of data[0]:
(0.6438356164383562, b'001000000', 0.023798554239690416, b'001000000000', 0.6, b'00010000', b'000000100000000', b'0000100', b'010000', b'010', 0.0, 0.0, 0.3469387755102041, b'01000000000000000000000000', b'001')
with a distance of:
0.0493763625138

Most different from data[0]:
(0.3424657534246575, b'000100000', 0.129183453249578, b'000001000000', 0