# General Functions

In [26]:
import os, struct
import matplotlib as plt
from array import array as pyarray
import numpy as np
import numpy.linalg as LA
import pandas as pd
from pylab import *
import scipy.sparse as sparse
import scipy.linalg as linalg
import random

def readExcelSheet1(excelfile):
    """Read a workbook with pandas' default settings and return its cells.

    Parameters
    ----------
    excelfile : path or file-like object handed straight to ``pandas.read_excel``.

    Returns
    -------
    The ``.values`` array of the resulting DataFrame.  Because no ``header``
    argument is given, pandas applies its default header handling.
    """
    import pandas
    frame = pandas.read_excel(excelfile)
    return frame.values

# Helper for readExcel(...) defined further below.
def readExcelRange(excelfile,sheetname="Sheet1",startrow=1,endrow=1,startcol=1,endcol=1):
    """Return a rectangular range of cells from one sheet of a workbook.

    The sheet is read without a header row (``header=None``), so every row
    of the sheet is data.  ``startrow``/``endrow`` and ``startcol``/``endcol``
    are 1-based and inclusive; with the defaults only the top-left cell is
    returned (as a 1x1 array).
    """
    import pandas
    # The sheet name is passed positionally, exactly as the second argument.
    frame = pandas.read_excel(excelfile, sheetname, header=None)
    # Convert the 1-based inclusive bounds to Python's 0-based half-open slices.
    row_span = slice(startrow - 1, endrow)
    col_span = slice(startcol - 1, endcol)
    return frame.values[row_span, col_span]

# Public entry point for reading Excel data in the rest of the program.
# Usage: readExcel(path) or readExcel(path, sheetname=..., startrow=..., ...);
# see the data-loading section further below for an example.

def readExcel(excelfile,**args):
    """Read Excel data and unwrap trivially small results.

    With keyword arguments the call is forwarded to ``readExcelRange``;
    without any, ``readExcelSheet1`` reads the workbook with default settings.

    Returns
    -------
    * the single cell value when the result is 1x1,
    * a 1-D row when the result has exactly one row,
    * the full 2-D array otherwise.
    """
    data = readExcelRange(excelfile, **args) if args else readExcelSheet1(excelfile)

    # Single cell: hand back the scalar itself.
    if data.shape == (1, 1):
        return data[0, 0]
    # Single row: drop the outer dimension.
    if data.shape[0] == 1:
        return data[0]
    return data

def writeExcelData(x,excelfile,sheetname,startrow,startcol):
    """Write ``x`` into a sheet of an existing workbook at a given position.

    Parameters
    ----------
    x : data accepted by ``pandas.DataFrame`` (it is wrapped as ``DataFrame(x)``).
    excelfile : path of the workbook; it is opened with ``load_workbook``
        first, so it has to exist already.
    sheetname : name of the target sheet.
    startrow, startcol : 1-based position of the top-left cell to write
        (converted to pandas' 0-based offsets below).

    Neither column headers nor the index are written.

    NOTE(review): this relies on the legacy ``ExcelWriter`` interface
    (assignable ``book``/``sheets`` attributes and a ``save()`` method).
    Confirm it is still supported by the pandas version in use.
    """
    from pandas import DataFrame, ExcelWriter
    from openpyxl import load_workbook
    df=DataFrame(x)
    # Load the current workbook before a writer is created for the same path.
    book = load_workbook(excelfile)
    writer = ExcelWriter(excelfile, engine='openpyxl') 
    # Attach the loaded workbook and its sheets to the writer -- presumably so
    # the existing content is kept rather than replaced (TODO confirm).
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    # Shift the 1-based row/column arguments to 0-based offsets.
    df.to_excel(writer, sheet_name=sheetname,startrow=startrow-1, startcol=startcol-1, header=False, index=False)
    # NOTE(review): save() followed by close() may write the file twice -- confirm.
    writer.save()
    writer.close()

def getSheetNames(excelfile):
    """Return the list of sheet names contained in the workbook ``excelfile``.

    The workbook is opened through a context manager so that the underlying
    file handle is released as soon as the names have been read, instead of
    staying open until the ``ExcelFile`` object is garbage-collected.
    """
    from pandas import ExcelFile
    with ExcelFile(excelfile) as workbook:
        return workbook.sheet_names
# Output-target settings.  The names mirror parameters of writeExcelData,
# so they are presumably meant to be passed to it (TODO confirm).
# The path is specific to the original author's machine.
excelfile = r"/Volumes/Macintosh HD/Users/louisecabansay/Dropbox (Personal)/UBX - Machine Learning w: Python/FinalProject/KmeansResults.xlsx"
sheetname = 'Results'
startcol = 2


# Functions for Implementing Kmeans

In [27]:
#K-means functions
def cluster_points(X, mu):
    """Group the rows of ``X`` by their nearest center.

    Parameters
    ----------
    X : iterable of rows; the last entry of every row is excluded from the
        distance computation (the callers store the class label there).
    mu : indexable collection of centers, each comparable with ``row[:-1]``.

    Returns
    -------
    dict mapping a center's index in ``mu`` to the list of complete rows
    (last entry included) assigned to it.  Centers that attract no rows
    have no key.  Ties go to the center with the lowest index.
    """
    assignments = {}
    for row in X:
        features = row[:-1]
        # Euclidean distance from this row to every center, in center order.
        distances = [np.linalg.norm(features - mu[idx]) for idx in range(len(mu))]
        nearest = distances.index(min(distances))
        assignments.setdefault(nearest, []).append(row)
    return assignments
 
def reevaluate_centers(mu, clusters):
    """Recompute every center as the mean of the rows assigned to it.

    Parameters
    ----------
    mu : current centers, indexed consistently with the keys of ``clusters``.
    clusters : dict mapping a center index to the list of rows assigned to
        it; the last entry of each row is ignored when averaging.

    Returns
    -------
    list of centers.  Position ``i`` holds the mean of ``clusters[i]``, or
    the previous center ``mu[i]`` when that cluster received no rows.

    Keeping the old center for an empty cluster preserves both the number of
    centers and their indices.  Previously an empty cluster was dropped,
    which shifted every later center to a different index.
    """
    # Cover every current center plus any extra key present in `clusters`.
    indices = sorted(set(range(len(mu))).union(clusters))
    newmu = []
    for idx in indices:
        members = clusters.get(idx)
        if members is None or len(members) == 0:
            # Empty cluster: nothing to average, so the center stays put.
            newmu.append(mu[idx])
        else:
            newmu.append(np.mean(np.array(members)[:, :-1], axis=0))
    return newmu
 
def has_converged(mu, oldmu):
    """Tell whether two collections of centers contain the same points.

    Each center is turned into a tuple and the two collections are compared
    as sets, so neither order nor duplicates matter.  Coordinates must match
    exactly.
    """
    current = frozenset(map(tuple, mu))
    previous = frozenset(map(tuple, oldmu))
    return current == previous
 
def find_centers(X, K, seed=4564):
    """Run K-means on ``X`` and return the final cluster assignment.

    Parameters
    ----------
    X : 2-D array whose last column is excluded from the clustering (the
        callers store the class label there).
    K : number of clusters; the initial centers are ``K`` rows of ``X``.
    seed : value passed to ``random.seed`` so that runs are repeatable.
        Note that this reseeds the module-level ``random`` generator.

    Returns
    -------
    dict as produced by ``cluster_points``: center index -> list of rows.

    Raises
    ------
    ValueError : from ``random.sample`` if ``K`` exceeds the number of rows.
    """
    random.seed(seed)
    # random.sample needs a real sequence on Python 3; a list of row views
    # satisfies that and still works on Python 2.
    candidates = list(X[:, :-1])
    # The first draw is discarded.  It is kept only so the second draw -- the
    # actual starting centers -- consumes the seeded stream as before.
    random.sample(candidates, K)
    mu = random.sample(candidates, K)

    # Assign first, then test for convergence, so `clusters` always exists
    # even if the starting centers are already stable.
    while True:
        clusters = cluster_points(X, mu)
        newmu = reevaluate_centers(mu, clusters)
        if has_converged(newmu, mu):
            return clusters
        mu = newmu

def resultKmeansClassLabels(Dataset, k, classlabels):
    """Cluster ``Dataset`` with K-means and print the class mix per cluster.

    Parameters
    ----------
    Dataset : 2-D array whose last column holds each row's class label.
    k : number of clusters to form.
    classlabels : labels to count inside every cluster.

    Returns
    -------
    ``np.array`` wrapped around the cluster dict returned by
    ``find_centers`` (unchanged from the original interface).

    Every ``print`` takes a single parenthesised argument, so the function
    produces the same text on Python 2 and Python 3.
    """
    DataClusters = find_centers(Dataset, k)
    print('Numer of Clusters = ' + repr(k))
    print('Class labels: ' + repr(classlabels))
    for i in range(k):
        # A cluster that attracted no rows has no key in the dict; report it
        # as empty instead of failing with a KeyError.
        Cluster = np.array(DataClusters.get(i, []))
        print('Cluster' + repr(i + 1) + ':')
        for label in classlabels:
            if len(Cluster):
                count = len(Cluster[Cluster[:, -1] == label])
            else:
                count = 0
            print(repr(label) + ': ' + repr(count))
        print('Total Cluster Size: ' + repr(len(Cluster)))

    return np.array(DataClusters)


## Load data and subset to training and testing sets
**Note:** training/testing subsets are not needed for K-means.

In [28]:
# Workbook locations (the paths are specific to the original author's machine).
# 2-class data
v2C=r"/Volumes/Macintosh HD/Users/louisecabansay/Dropbox (Personal)/UBX - Machine Learning w: Python/FinalProject/vertebral_2C.xlsx"
# 3-class data
v3C =r"/Volumes/Macintosh HD/Users/louisecabansay/Dropbox (Personal)/UBX - Machine Learning w: Python/FinalProject/vertebral_3C.xlsx"
# Load both workbooks as matrices that include the class labels.
vdata2C = np.array(readExcel(v2C))
vdata3C = np.array(readExcel(v3C))

# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(vdata2C.shape)
print(vdata3C.shape)

(310, 7)
(310, 7)


### Run K-means on V2Class Data, k=2

In [33]:
# Cluster the 2-class data into two groups and report how the class
# labels are distributed across them.
V2Classlabels = ['AB', 'NO']
V2Clusters = resultKmeansClassLabels(
    Dataset=vdata2C,
    k=2,
    classlabels=V2Classlabels
)

Numer of Clusters = 2
Class labels: ['AB', 'NO']
Cluster1:
'AB': 102
'NO': 99
Total Cluster Size: 201
Cluster2:
'AB': 108
'NO': 1
Total Cluster Size: 109


### Run K-means on V3Class Data, k=3

In [30]:
# Cluster the 3-class data into three groups and report how the class
# labels are distributed across them.
V3Classlabels = ['DH','SL','NO']
# Stored under its own name: the original assigned this to V2Clusters, which
# overwrote the 2-class result.
V3Clusters = resultKmeansClassLabels(vdata3C, 3, V3Classlabels)

Numer of Clusters = 3
Class labels: ['DH', 'SL', 'NO']
Cluster1:
'DH': 0
'SL': 56
'NO': 0
Total Cluster Size: 56
Cluster2:
'DH': 58
'SL': 13
'NO': 89
Total Cluster Size: 160
Cluster3:
'DH': 2
'SL': 81
'NO': 11
Total Cluster Size: 94
