In [1]:
from scipy.io import arff
import numpy as np
import time

In [2]:
# # # # # # # # # # # # #
# function to obtain min and max values of all numeric features:
# # # # # # # # # # # # #
def getKeyMinMaxDict(data, meta):
    keyMinMaxDict = {}
    # loop over all features:
    for key in meta:
        # find all numeric features:
        if meta[key][0] == 'numeric':
            
            # estimate min and max:
            min = np.min(data[key])
            max = np.max(data[key])
            
            # persist min and max:
            minMaxDict = {}
            minMaxDict['min'] = min
            minMaxDict['max'] = max
            keyMinMaxDict[key] = minMaxDict
    return keyMinMaxDict

# # # # # # # # # # # # #
# function to normalize all numeric values of the data:
# requires a keyMinMaxDict provided by getKeyMinMaxDict
# # # # # # # # # # # # #
def normalizeNumericFeatures(data, meta, keyMinMaxDict):
    # loop over all features:
    for key in meta:
        # find all numeric features:
        if meta[str(key)][0] == 'numeric':

            # get the min and max value:
            min = keyMinMaxDict[key]['min']
            max = keyMinMaxDict[key]['max']
            
            # normalize the data for this feature:
            data[key] = np.subtract(data[key], min)
            data[key] = np.divide(data[key], (max-min))       

# # # # # # # # # # # # #
# function to denormalize all numeric values of the data:
# requires a keyMinMaxDict provided by getKeyMinMaxDict
# # # # # # # # # # # # #
def denormalizeNumericFeatures(data, meta, keyMinMaxDict):
    for key in meta:
        # find all numeric features:
        if meta[key][0] == 'numeric':
            
            # get the min and max value:
            min = keyMinMaxDict[key]['min']
            max = keyMinMaxDict[key]['max']
            
            # denormalize:
            data[key] = np.multiply(data[key], (max-min))
            data[key] = np.add(data[key], min)

# # # # # # # # # # # # #
# prints some descriptive statistics measures of the data's numeric features
# set maxKeys to -1 to print for all numeric features
# # # # # # # # # # # # #
def printNumericFeatureDescription(data, meta, maxKeys=1):
    keyCounter = 0
    for key in meta:
        # find all numeric features:
        if meta[key][0] == 'numeric':
            if keyCounter == maxKeys and maxKeys != -1:
                break
            keyCounter = keyCounter + 1
            print(key + ':')
            print('mean: ' + str(np.mean(data[key])))
            print('stdev: ' + str(np.std(data[key])))
            print('min: ' + str(np.min(data[key])))
            print('max: ' + str(np.max(data[key])))
            print()
            
# # # # # # # # # # # # #
# function to obtain categories of all nominal features:
# # # # # # # # # # # # #
def getKeyCategoriesDict(data, meta):
    keyCategoriesDict = {}
    # loop over all features:
    for key in meta:
        # find all nominal features:
        if meta[key][0] == 'nominal':
            keyCategoriesDict[key] = {}
            counter = 1
            for category in meta[key][1]:
                # assign a numeric value to each category
                # counter starts at 1 since 0 is reserved for missing values
                keyCategoriesDict[key][category] = counter
                counter = counter + 1
    return keyCategoriesDict

# # # # # # # # # # # # #
# function to encode nominal data with one-hot encoding:
# requires a keyCategoriesDict for consistent encoding
# # # # # # # # # # # # #
def normalizeNominalFeatures(data, meta, keyCategoriesDict):
    # loop over all features:
    for key in meta:
        # find all nominal features:
        if meta[key][0] == 'nominal':
            oneHotCodes = []
            # the length of the code
            # +1, since 0 will be reserved for missing values
            oneHotLength = len(keyCategoriesDict[key]) + 1
            for value in data[key]:
                category = str(value)[2:-1]
                # default for missing values is 0
                categoryToNumeric = 0
                if category in keyCategoriesDict[key]:
                    # if category exists in dictionary: obtain category number
                    categoryToNumeric = keyCategoriesDict[key][category]
                # construct one hot code with 1 at the correct position:
                oneHot = ['0'] * oneHotLength
                oneHot[categoryToNumeric] = '1'
                oneHot = ''.join(oneHot)
                oneHotCodes.append(oneHot)
            oneHotCodes = np.array(oneHotCodes)
            data[key] = oneHotCodes 
            
# # # # # # # # # # # # #
# prints some values measures of the data's nominal features
# set maxKeys to -1 to print for all nominal features
# # # # # # # # # # # # #
def printNominalFeatureDescription(data, meta, maxKeys=1):
    keyCounter = 0
    for key in meta:
        # find all numeric features:
        if meta[key][0] == 'nominal':
            if keyCounter == maxKeys and maxKeys != -1:
                break
            keyCounter = keyCounter + 1
            print(key + ':')
            print(data[key][1:20])
            print()

In [3]:
f = 'datasetsCBR/adult/adult.fold.000000.train.arff'
data, meta = arff.loadarff(f)

print(meta['class'])

keyMinMaxDict = getKeyMinMaxDict(data, meta)
print('# # # numeric features before normalization: # # #')
printNumericFeatureDescription(data, meta)
normalizeNumericFeatures(data, meta, keyMinMaxDict)
print('# # # numeric features after normalization: # # #')
printNumericFeatureDescription(data, meta)
denormalizeNumericFeatures(data, meta, keyMinMaxDict)
print('# # # numeric features after denormalization: # # #')
printNumericFeatureDescription(data, meta)
normalizeNumericFeatures(data, meta, keyMinMaxDict)

print('- ' * 40)
print()
print('# # # nominal features before normalization: # # #')
printNominalFeatureDescription(data, meta)
keyCategoriesDict = getKeyCategoriesDict(data, meta)
normalizeNominalFeatures(data, meta, keyCategoriesDict)
print('# # # numeric features after normalization: # # #')
printNominalFeatureDescription(data, meta)

('nominal', ('>50K', '<=50K'))
# # # numeric features before normalization: # # #
age:
mean: 38.6138131853
stdev: 13.6789028108
min: 17.0
max: 90.0

# # # numeric features after normalization: # # #
age:
mean: 0.296079632676
stdev: 0.187382230284
min: 0.0
max: 1.0

# # # numeric features after denormalization: # # #
age:
mean: 38.6138131853
stdev: 13.6789028108
min: 17.0
max: 90.0

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# # # nominal features before normalization: # # #
workclass:
[b'Private' b'Private' b'Private' b'Private' b'Private' b'Private'
 b'Private' b'?' b'Private' b'Local-gov' b'Private' b'Private' b'?'
 b'Local-gov' b'Private' b'?' b'Local-gov' b'Private' b'Private']

# # # numeric features after normalization: # # #
workclass:
[b'010000000' b'010000000' b'010000000' b'010000000' b'010000000'
 b'010000000' b'010000000' b'100000000' b'010000000' b'000001000'
 b'010000000' b'010000000' b'100000000' b'000001000' b'010000000'
 b'1000000

In [4]:
def dataToCaseAndTestBase(dataset, s):
    
    f = 'datasetsCBR/' + dataset + '/' + dataset
    f += '.fold.{:0>6d}'.format(s)
    fTrain = f + '.train.arff'
    fTest = f + '.test.arff'
    dataTrain, metaTrain = arff.loadarff(fTrain)
    dataTest, metaTest = arff.loadarff(fTest)
    
    keyMinMaxDict = getKeyMinMaxDict(dataTrain, metaTrain)
    normalizeNumericFeatures(dataTrain, metaTrain, keyMinMaxDict)
    normalizeNumericFeatures(dataTest, metaTest, keyMinMaxDict)
    keyCategoriesDict = getKeyCategoriesDict(dataTrain, metaTrain)
    normalizeNominalFeatures(dataTrain, metaTrain, keyCategoriesDict)
    normalizeNominalFeatures(dataTest, metaTest, keyCategoriesDict)
    
    CBproblems = []
    CBsolutions = []
    for rowIndex in range(0, len(dataTrain)-1):
        problemRow = []
        row = dataTrain[rowIndex]
        for colIndex in range(0,len(row)-1):
            value = row[colIndex]
            if metaTrain.types()[colIndex] == 'numeric':
                problemRow.append(value)
            else:
                for c in str(value)[2:-1]:
                    if int(c) == 0:
                        problemRow.append(0)
                    else:
                        problemRow.append(np.divide(1, np.sqrt(2)))
        solutionRow = []
        for colIndex in range(len(row)-1,len(row)):
            value = row[colIndex]
            for c in str(value)[2:-1]:
                if int(c) == 0:
                    solutionRow.append(0)
                else:
                    solutionRow.append(np.divide(1, np.sqrt(2)))
        
        CBproblems.append(problemRow)
        CBsolutions.append(solutionRow)
        
    CBproblems = np.array(CBproblems)
    CBsolutions = np.array(CBsolutions)
    
    TCproblems = []
    TCsolutions = []
    for rowIndex in range(0, len(dataTest)-1):
        problemRow = []
        row = dataTest[rowIndex]
        for colIndex in range(0,len(row)-1):
            value = row[colIndex]
            if metaTest.types()[colIndex] == 'numeric':
                problemRow.append(value)
            else:
                for c in str(value)[2:-1]:
                    if int(c) == 0:
                        problemRow.append(0)
                    else:
                        problemRow.append(np.divide(1, np.sqrt(2)))
        solutionRow = []
        for colIndex in range(len(row)-1,len(row)):
            value = row[colIndex]
            for c in str(value)[2:-1]:
                if int(c) == 0:
                    solutionRow.append(0)
                else:
                    solutionRow.append(np.divide(1, np.sqrt(2)))
        
        TCproblems.append(problemRow)
        TCsolutions.append(solutionRow)
        
    TCproblems = np.array(TCproblems)
    TCsolutions = np.array(TCsolutions)
    
    return (CBproblems, CBsolutions, TCproblems, TCsolutions)

(CBproblems, CBsolutions, TCproblems, TCsolutions) = dataToCaseAndTestBase('adult', 0)

print(CBproblems.shape)
print(CBsolutions.shape)
print(TCproblems.shape)
print(TCsolutions.shape)

(43957, 92)
(43957, 3)
(4883, 92)
(4883, 3)


In [39]:
def getDistance(p1, p2):
    return np.sum(np.square(np.abs(np.subtract(p1, p2))))

print(getDistance(CBproblems[0], CBproblems[0]))
print(getDistance(CBproblems[1], CBproblems[0]))
print(getDistance(CBproblems[0], CBproblems[1]))

start = time.time()
for j in range(0,1):
    for i in range(0, 1):
        getDistance(CBproblems[i], CBproblems[i])
end = time.time()
print(end-start)

0.0
6.04744993895
6.04744993895
7.987022399902344e-05


In [6]:
def getKnn(CBproblems, problem, k):
    knnIndices = [0] * k
    knnDistances = [float("inf")] * k
    
    for otherProblemIndex in range(0, len(CBproblems)):
        otherProblem = CBproblems[otherProblemIndex]
        distance = getDistance(problem, otherProblem)
        if distance < knnDistances[-1]:
            # find correct location in sortedList of knnIndices:
            tempIndices = []
            tempDistances = []
            for i in range(0,k):
                if knnDistances[i] > distance:
                    # insert newly found point at correct location
                    tempIndices.append(otherProblemIndex)
                    tempDistances.append(distance)
                    for j in range (i,k):
                        tempIndices.append(knnIndices[j])
                        tempDistances.append(knnDistances[j])
                else:
                    tempIndices.append(knnIndices[i])
                    tempDistances.append(knnDistances[i])
            # cut off the result to correct length again
            knnIndices = tempIndices[:k]
            knnDistances = tempDistances[:k]
            
    return knnIndices, knnDistances

In [30]:
def acbrAlgorithm(dataset, fold, k=5, alpha=0.2):
    (CBproblems, CBsolutions, TCproblems, TCsolutions) = dataToCaseAndTestBase(dataset, fold)
    goodnesses = [0.5] * len(CBproblems)
    goodnesses = np.array(goodnesses)
    CM = [goodnesses]
    for j in range(0, len(TCproblems)):
        cNew = TCproblems[j]
        K = acbrRetrievalPhase(CBproblems, cNew, k)
        cSol = acbrReusePhase(cNew, K, CBsolutions)
        acbrRevisionPhase()
        CBproblems, CBsolutions, newGoodnesses, CM = acbrReviewPhase(CBproblems, CBsolutions, K, TCsolutions[j], CM, alpha)
    return CM, CBproblems, CBsolutions

def acbrRetrievalPhase(CBproblems, cNew, k):
    return getKnn(CBproblems, cNew, k)[0]

def acbrReusePhase(cNew, K, CBsolutions):
    return CBsolutions[K[0]]

def acbrRevisionPhase():
    pass

def acbrReviewPhase(CBproblems, CBsolutions, K, cNewClass, CM, alpha):
    lastGoodnesses = CM[-1]
    newGoodnesses = []
    for goodness in lastGoodnesses:
        newGoodnesses.append(goodness)
    for k in K:
        cKClass = CBsolutions[k]
        r = getDistance(cKClass, cNewClass)
        g = lastGoodnesses[k]
        newGoodnesses[k] = g + alpha * (r - g)
    newGoodnesses = np.array(newGoodnesses)
    CBproblems, CBsolutions, newGoodnesses, CM = oblivionByGoodnessFS(K, CM, CBproblems, CBsolutions, newGoodnesses)
    CM.append(newGoodnesses)
    CM = [CM[0], newGoodnesses]
    return CBproblems, CBsolutions, newGoodnesses, CM
    
def oblivionByGoodnessFS(K, CM, CBproblems, CBsolutions, newGoodnesses):
    firstGoodnesses = CM[0]
    deleteRows = []
    for k in K:
        if newGoodnesses[k] < firstGoodnesses[k]:
            deleteRows.append(k)
    CBproblems = np.delete(CBproblems, deleteRows, axis=0)
    CBsolutions = np.delete(CBsolutions, deleteRows, axis=0)
    newGoodnesses = np.delete(newGoodnesses, deleteRows, axis=0)
    CM[0] = np.delete(CM[0], deleteRows, axis=0)
    
    return CBproblems, CBsolutions, newGoodnesses, CM

In [None]:
def crossValidation(dataset, folds):
    for s in range(0, folds):
        print('Fold ' + str(s+1) + '...')
        start = time.time()
        CM, CBproblems, CBsolutions = acbrAlgorithm(dataset, s)
        end = time.time()
        print('acbrAlgorithm terminated after ' +str(end-start))
        print(CM[1].shape)
        print('- ' * 30)
        
crossValidation('pen-based', 10)

Fold 1...
