## General Functions

In [47]:
import os, struct
import matplotlib as plt
import numpy as np
import numpy.linalg as LA
import pandas as pd
from pylab import *
import random
import operator

def readExcelSheet1(excelfile):
    from pandas import read_excel
    return (read_excel(excelfile)).values

#This function is used in the function readExcel(...) defined further below
def readExcelRange(excelfile,sheetname="Sheet1",startrow=1,endrow=1,startcol=1,endcol=1):
    from pandas import read_excel
    values=(read_excel(excelfile, sheetname,header=None)).values;
    return values[startrow-1:endrow,startcol-1:endcol]

#This is the function you can actually use within your program.
#See manner of usage further below in the section "Prepare Data"

def readExcel(excelfile,**args):
    if args:
        data=readExcelRange(excelfile,**args)
    else:
        data=readExcelSheet1(excelfile)
    if data.shape==(1,1):
        return data[0,0]
    elif (data.shape)[0]==1:
        return data[0]
    else:
        return data

def writeExcelData(x,excelfile,sheetname,startrow,startcol):
    from pandas import DataFrame, ExcelWriter
    from openpyxl import load_workbook
    df=DataFrame(x)
    book = load_workbook(excelfile)
    writer = ExcelWriter(excelfile, engine='openpyxl') 
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    df.to_excel(writer, sheet_name=sheetname,startrow=startrow-1, startcol=startcol-1, header=False, index=False)
    writer.save()
    writer.close()

def getSheetNames(excelfile):
    from pandas import ExcelFile
    return (ExcelFile(excelfile)).sheet_names
sheetname = 'Results'
startcol = 2
excelfile=r"/Volumes/Macintosh HD/Users/louisecabansay/Dropbox (Personal)/UBX - Machine Learning w: Python/FinalProject/KmeansResults.xlsx";



In [48]:
def TestTrainDataSplit(dataset, split):#dataset = full dataset; 
    #split = percent of dataset to be training set (enter as decimal)
    trainingSet = []
    testSet = []
    for x in range(len(dataset)):
            dataset[x] = dataset[x]
            if random.random() < split:
                trainingSet.append(dataset[x])
            else:
                testSet.append(dataset[x])
    return np.array(trainingSet), np.array(testSet)


## PCA Functions

In [49]:
def XUZCVPR(dataset):
    X = dataset
    Uvector = np.mean(X,axis=0)
    U = np.array([Uvector])
    Z = X - U
    meanZ = np.mean(Z,axis=0) # axis to calculate column means, should be 0
    meanZround = [round(x) for x in meanZ]
    emptymeanZ=filter(lambda x:x != 0, meanZround) # all the column mean of z should be 0
    
    C = np.cov(Z,rowvar=False)
    Ctranspose = C.transpose()
    checkC = np.array_equal(C,Ctranspose)
    
    aEighV=LA.eigh(C)#descending
    V = np.flipud((aEighV[1].T))
    Evals = np.flipud(aEighV[0])
    Vrows = V[0,:]
    checkVrows = (np.dot(C, Vrows))/(Evals[0]*Vrows)
    
    P=np.dot(Z,V.T)
    R=np.dot(P,V)
    Xrec = R+U
    print 'X-shape: ' +repr(X.shape)
    print 'U-shape: ' +repr(U.shape)
    print 'Z-shape: ' +repr(Z.shape)
    print 'C-shape: ' +repr(C.shape)
    print 'V-shape: ' +repr(V.shape)
    print 'P-shape: ' +repr(P.shape)
    print 'R-shape: ' +repr(R.shape)
    print 'Xrec-shape: '+ repr(Xrec.shape)
    print 'meanZround: ' + repr(meanZround)
    print 'emptymeanZ: ' + repr(emptymeanZ)
    print 'C equals C.T : ' + repr(checkC)
    print 'Rows are eigenvectors if values are 1: ' + repr(checkVrows)    
    print 'Note: Eigenvectors and values returned in order most to least importance'
    return np.array(X), np.array(U), np.array(Z), np.array(C), np.array(V), np.array(Evals), np.array(P), np.array(R), np.array(Xrec)

def DimensionReduction(X, P, V, U):
    reducedDims = []
    Xdiffavg = []
    for d in range(len(U.T)):
        i = d+1
        Xrec = (np.dot(P[:,0:i],V[0:i,:]))+U
        reducedDims.append(np.array(Xrec))

    for m in range(len(U.T)):
        Xdiffnorms = []
        for w in range(len(X)):
            tXdim = reducedDims[m][w]
            Xdiffs = X[w]-tXdim
            normXdiff = LA.norm(Xdiffs)
            Xdiffnorms.append(normXdiff)
        meanXdiff = np.mean(Xdiffnorms)
        Xdiffavg.append(meanXdiff)
    for a in range(len(Xdiffavg)):
        print 'Using '+repr(a+1)+' principle component(s) the average difference between X and Xrec is '+repr(Xdiffavg[a])    
    return np.array(Xdiffavg)

## Bayesian Functions

In [50]:
def pdf(x,mu,sigma):
    #print x
    #print mu
    xf = x.astype(float)
    muf = mu.astype(float)

    d=np.alen(muf)
    dfact1=(2*np.pi)**d
    dfact2=LA.det(sigma)
    fact=1/np.sqrt(dfact1*dfact2)
    xc=xf-muf
    isigma=LA.inv(sigma)
#    isigxc = np.dot(isigma,xc.T)
#    ex = np.dot(xc,isigxc)
    npdf = fact * np.exp(-0.5 * np.einsum('ij,jk,ik->i',xc,isigma,xc))
    return npdf  

In [51]:
def BuildNDBayesianClassifier(Dataset, D, Classlabels):
    ClassStats = {}
    for n in range(len(Classlabels)):
        ClassStats[Classlabels[n]]={}
        Class = Dataset[Dataset[:,-1] == Classlabels[n]]
        ClassData = Class[:,:D]
        ClassStats[Classlabels[n]]['Num'] = len(Class)
        ClassStats[Classlabels[n]]['Data'] = ClassData
        ClassStats[Classlabels[n]]['Mean'] = np.mean(ClassData,axis=0)
        ClassStats[Classlabels[n]]['Cov'] = np.cov(ClassData, rowvar=False)
    return ClassStats
    

def ApplyNDBayesianClassifier(TrainDataset, TestDataset, D, Classlabels):
    ClassStats = BuildNDBayesianClassifier(TrainDataset, D, Classlabels)
    w=1; #width of the bin
    CountC_all = []
    for n in range(len(Classlabels)):
        NC = ClassStats[Classlabels[n]]['Num']
        UC = ClassStats[Classlabels[n]]['Mean']
        covC = ClassStats[Classlabels[n]]['Cov']
        countC = NC*w*pdf(TestDataset[:,:D], UC, covC)
        CountC_all.append(countC)
    [resultlabel, resultprob]= ResultLPBayesClassifier(CountC_all, TestDataset, Classlabels)
    return np.array([resultlabel, resultprob])

def ResultLPBayesClassifier(CountC_all, TestDataset, Classlabels):
    ClassCounts_all = np.array(CountC_all)
    resultlabel = np.full(np.alen(TestDataset), "Indeterminate", dtype=object)
    resultprob = np.full(np.alen(TestDataset), 0 , dtype=float)
    for g in range(len(TestDataset)):
        CountXvalues = []
        for w in range(len(Classlabels)):
            count = ClassCounts_all[w][g]
            CountXvalues.append(count)
        max_value = max(CountXvalues)
        max_index = CountXvalues.index(max_value)
        label = Classlabels[max_index]
        resultlabel[g]=label
        #print sum(ClassCounts_all)
        resultprob = (ClassCounts_all[max_index][g]).astype('float')/sum(ClassCounts_all)
    return resultlabel, resultprob
        

## Classifier Evaluation Functions

In [52]:
def PerformanceMetrics(Resultlabels, Dataset, PositiveLabel,Classlabels):
    OutputCL = (Resultlabels).astype('str')
    GroundTruth = (Dataset[:,-1]).astype('str')
    Classcomps = OutputCL == GroundTruth
        
    TrueP=0
    FalseP=0
    TrueN=0
    FalseN=0
    Num=0
    for i in range(len(GroundTruth)):
        if Classcomps[i]== True:
            if OutputCL[i] == PositiveLabel:
                TrueP+=1
            else:
                TrueN+=1
        elif Classcomps[i]==False:
            if OutputCL[i] != PositiveLabel:
                FalseN+=1
            else:
                FalseP+=1
    Accuracy = float((TrueP+TrueN))/(TrueP+TrueN+FalseP+FalseN)
    Sensitivity = float((TrueP))/(TrueP+FalseN)
    Specificity = float((TrueN))/(FalseP+TrueN)
    PPV = float((TrueP))/(FalseP+TrueP)
    stringmeasures = ['TrueP', 'FalseP', 'TrueN', 'FalseN', 'Accuracy', 'Sensitivity', 'Specificity', 'PPV']
    measures_values = [TrueP, FalseP, TrueN, FalseN, Accuracy, Sensitivity, Specificity, PPV]
    #print 'Classifier Performance:'
    print '     Positive Class Label: '+ repr(PositiveLabel)
    for i in range(len(stringmeasures)):
        print '         '+stringmeasures[i]+ ': '+repr(measures_values[i])
    
    return [TrueP, FalseP, TrueN, FalseN, Accuracy, Sensitivity, Specificity, PPV]


def BayesPCAperformance(TestDataset, TrainDataset, Classlabels, dimensions):
    BayesPCAperformance =[]
    DResults=EvaluateBayesPCA(TrainDataset, TestDataset, dimensions, Classlabels)
    for i in range(len(DResults)):
        print repr(i+2)+' Principal Components Bayes Classifier Performance:'
        labelstring = []
        reallabelstring = []
        OutputCL = (DResults[i][0]).astype('str')
        realCL=(TrainDataset[:,-1]).astype('str')
        for j in range(len(Classlabels)):
            labelnum = OutputCL[OutputCL == Classlabels[j]]
            lstring = repr(Classlabels[j])+ ': '+ repr(len(labelnum))
            labelstring.append(lstring)
            reallabelnum = realCL[realCL == Classlabels[j]]
            reallstring = repr(Classlabels[j])+ ': '+ repr(len(reallabelnum))
            reallabelstring.append(reallstring)
        print '  Training Output ->' + repr(reallabelstring) + ' Total: ' +repr(len(realCL))
        print '  Testing Output -->' + repr(labelstring)+ ' Total: ' +repr(len(OutputCL))
        for cl in range(len(Classlabels)):
            PositiveLabel=Classlabels[cl]
            nDBayesPerformance = PerformanceMetrics(DResults[i][0], TestDataset, PositiveLabel, Classlabels)
            BayesPCAperformance.append(nDBayesPerformance)
        print '\n'
    return np.array(BayesPCAperformance)

def EvaluateBayesPCA(TrainDataset, TestDataset, dimensions, Classlabels):
    DBayesPCAResults=[]
    for d in range(dimensions-1):
        D=d+2
        nBResults = ApplyNDBayesianClassifier(TrainDataset, TestDataset, D, Classlabels)
        DBayesPCAResults.append(nBResults)
    return np.array(DBayesPCAResults)

        

## Load Data and Subset Training and Testing Sets

In [53]:
#2C data path
v2C=r"/Volumes/Macintosh HD/Users/louisecabansay/Dropbox (Personal)/UBX - Machine Learning w: Python/FinalProject/vertebral_2C.xlsx"
#3C data path
v3C =r"/Volumes/Macintosh HD/Users/louisecabansay/Dropbox (Personal)/UBX - Machine Learning w: Python/FinalProject/vertebral_3C.xlsx"
#import matrices with class labels
vdata2C=np.array(readExcel(v2C))
vdata3C=np.array(readExcel(v3C))

#set random seed to maintain results of single run
random.seed(678)

#call function to split given dataset into training and testing sets
#note: not used for K-means as training/testing sets not needed
[V2TrainDataset, V2TestDataset]=TestTrainDataSplit(vdata2C, 0.90)
[V3TrainDataset, V3TestDataset]=TestTrainDataSplit(vdata3C, 0.90)

#check training and test set size
print V2TrainDataset.shape, V2TestDataset.shape
print V3TrainDataset.shape, V3TestDataset.shape


(281, 7) (29, 7)
(287, 7) (23, 7)


## Run XUZCVPR and find principal components

In [54]:
V2data = vdata2C[:,:-1]
[V2X, V2U, V2Z, V2C, V2V, Evals, V2P, V2R, V2Xrec] = XUZCVPR(V2data)

X-shape: (310, 6)
U-shape: (1, 6)
Z-shape: (310, 6)
C-shape: (6, 6)
V-shape: (6, 6)
P-shape: (310, 6)
R-shape: (310, 6)
Xrec-shape: (310, 6)
meanZround: [-0.0, -0.0, -0.0, 0.0, -0.0, -0.0]
emptymeanZ: []
C equals C.T : True
Rows are eigenvectors if values are 1: array([ 1.,  1.,  1.,  1.,  1.,  1.])
Note: Eigenvectors and values returned in order most to least importance


In [55]:
Xdiffavg= DimensionReduction(V2X, V2P, V2V, V2U)

Using 1 principle component(s) the average difference between X and Xrec is 23.444807014459712
Using 2 principle component(s) the average difference between X and Xrec is 16.725855662046349
Using 3 principle component(s) the average difference between X and Xrec is 11.808577465741214
Using 4 principle component(s) the average difference between X and Xrec is 6.9252607750171045
Using 5 principle component(s) the average difference between X and Xrec is 0.0018012649481673707
Using 6 principle component(s) the average difference between X and Xrec is 2.0453270783353266e-14


## Run and Evaluate Binary Bayesian Classifier with d Principal Components

In [56]:
V2Classlabels = ['NO','AB']
dimensions = 6
V2BayesPCAperformance = BayesPCAperformance(V2TestDataset, V2TrainDataset, V2Classlabels, dimensions)

2 Principal Components Bayes Classifier Performance:
  Training Output ->["'NO': 90", "'AB': 191"] Total: 281
  Testing Output -->["'NO': 11", "'AB': 18"] Total: 29
     Positive Class Label: 'NO'
         TrueP: 6
         FalseP: 5
         TrueN: 14
         FalseN: 4
         Accuracy: 0.6896551724137931
         Sensitivity: 0.6
         Specificity: 0.7368421052631579
         PPV: 0.5454545454545454
     Positive Class Label: 'AB'
         TrueP: 14
         FalseP: 4
         TrueN: 6
         FalseN: 5
         Accuracy: 0.6896551724137931
         Sensitivity: 0.7368421052631579
         Specificity: 0.6
         PPV: 0.7777777777777778


3 Principal Components Bayes Classifier Performance:
  Training Output ->["'NO': 90", "'AB': 191"] Total: 281
  Testing Output -->["'NO': 12", "'AB': 17"] Total: 29
     Positive Class Label: 'NO'
         TrueP: 7
         FalseP: 5
         TrueN: 14
         FalseN: 3
         Accuracy: 0.7241379310344828
         Sensitivity: 0.7
       

In [57]:
V3Classlabels = ['DH','SL','NO']
dimensions = 6
V2BayesPCAperformance = BayesPCAperformance(V3TestDataset, V3TrainDataset, V3Classlabels, dimensions)

2 Principal Components Bayes Classifier Performance:
  Training Output ->["'DH': 58", "'SL': 137", "'NO': 92"] Total: 287
  Testing Output -->["'DH': 4", "'SL': 12", "'NO': 7"] Total: 23
     Positive Class Label: 'DH'
         TrueP: 1
         FalseP: 3
         TrueN: 17
         FalseN: 2
         Accuracy: 0.782608695652174
         Sensitivity: 0.3333333333333333
         Specificity: 0.85
         PPV: 0.25
     Positive Class Label: 'SL'
         TrueP: 12
         FalseP: 0
         TrueN: 6
         FalseN: 5
         Accuracy: 0.782608695652174
         Sensitivity: 0.7058823529411765
         Specificity: 1.0
         PPV: 1.0
     Positive Class Label: 'NO'
         TrueP: 5
         FalseP: 2
         TrueN: 13
         FalseN: 3
         Accuracy: 0.782608695652174
         Sensitivity: 0.625
         Specificity: 0.8666666666666667
         PPV: 0.7142857142857143


3 Principal Components Bayes Classifier Performance:
  Training Output ->["'DH': 58", "'SL': 137", "'NO':