In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import scipy.stats as stats
from sklearn import svm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
% matplotlib inline

In [2]:
# Load the Wisconsin breast cancer data set (UCI ML repository) and build the
# feature matrix X and the binary label column Y used by the cells below.
data = np.genfromtxt('breast_cancer_wisconsin_dataset.txt', dtype='float', delimiter=",")
print(np.shape(data))

#  The Wisconsin breast cancer data set is from the UCI ML repository.
#  Its 9 features are as follows:
#  Clump Thickness, Uniformity of Cell Size, Uniformity of Cell Shape,
#  Marginal Adhesion, Single Epithelial Cell Size, Bare Nuclei,
#  Bland Chromatin, Normal Nucleoli, and Mitoses.
#  Each is an integer between 1 and 10.
#  The label is whether the tumor is malignant or benign.
# https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names

# Column 0 is skipped (presumably the sample id -- confirm against the .names
# file), columns 1-9 are the features and column 10 is the class label.
# The row count is taken from the file instead of being hard-coded.
X = data[:, 1:10]
Y = data[:, 10].reshape(-1, 1)
# Encode the label column as 1 where the raw class is 2 and 0 otherwise.
Y = np.where(Y == 2, 1, 0)
posLabel = '2: benign'
negLabel = '4: malignant'
dataset_name = 'breast_cancer_wisconsin_dataset'


(699, 11)


In [3]:
# Load the UCI wine data set and build the feature matrix X and the binary
# label column Y used by the cells below.
data = np.genfromtxt('wine.txt', dtype='float', delimiter=",")
print(np.shape(data))
# Print floats in fixed-point rather than scientific notation from here on.
np.set_printoptions(suppress=True)

#      "These data are the results of a chemical analysis of
#      wines grown in the same region in Italy but derived from three
#      different cultivars.
#      The analysis determined the quantities of 13 constituents
#      found in each of the three types of wines." - UCI repository
#      The features are as follows:
#      Alcohol, Malic acid, Ash, Alcalinity of ash, Magnesium
#      Total phenols, Flavanoids, Nonflavanoid phenols, Proanthocyanins
#      Color intensity, Hue, OD280/OD315 of diluted wines, Proline
#      https://archive.ics.uci.edu/ml/datasets/wine

# Column 0 is the class label; every remaining column is a feature.
# Shapes are derived from the loaded file instead of being hard-coded.
X = data[:, 1:]
Y = data[:, 0].reshape(-1, 1)

# One-vs-rest encoding: class 1 becomes 1, every other class becomes 0.
Y = np.where(Y == 1, 1, 0)
posLabel = '1: location1'
negLabel = '2 & 3: location2 and location3'
dataset_name = 'wine_location'

print(np.shape(X))
print(np.shape(Y))

(178, 14)
(178, 13)
(178, 1)


In [4]:
# Load the UCI white wine quality data set and build the feature matrix X and
# the binary label column Y used by the cells below.
data = np.genfromtxt('winequality_white_tab.txt', dtype='float', delimiter=";")
print(np.shape(data))

# "[This dataset contains] white vinho verde wine samples,
# from the north of Portugal. The goal is to model wine quality based
# on physicochemical tests (see [Cortez et al., 2009])" - UCI repository
# Its 11 features are as follows:
# fixed acidity, volatile acidity, citric acid, residual sugar, chlorides
# free sulfur dioxide, total sulfur dioxide, density
# pH, sulphates, alcohol.
# http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names

# Columns 0-10 are the features; column 11 is the quality score.
# The row count is taken from the file instead of being hard-coded.
X = data[:, 0:11]
Y = data[:, 11].reshape(-1, 1)
# Quality labels scale from 0 to 10; scores of 5 and above form the positive class.
Y = np.where(Y >= 5, 1, 0)
posLabel = 'high quality'
negLabel = 'low quality'
dataset_name = 'winequality_white'

print(np.shape(X))
print(np.shape(Y))

(4898, 12)
(4898, 11)
(4898, 1)


In [6]:
# Impute missing values in place: every NaN entry of X is replaced by the mean
# of the non-NaN entries in the same column.
# np.nanmean is used because scipy.stats.nanmean was deprecated in favour of
# the NumPy function and is no longer available in current SciPy releases.
col_mean = np.nanmean(X, axis=0)
# inds is a (row_indices, column_indices) pair locating each NaN entry.
inds = np.where(np.isnan(X))
# Look up the column mean matching each NaN's column index and write it back.
X[inds] = np.take(col_mean, inds[1])

Linear SVM on full training set

In [143]:
# Linear-kernel SVC: estimate the k-fold cross-validation error for each
# candidate value of the penalty parameter C and print a summary table.
# Reads X, Y, dataset_name, posLabel and negLabel set by an earlier cell.
# NOTE(review): no random seed is set in this cell, so the shuffles (and hence
# the reported errors) vary from run to run.

# Candidate C values. Other ranges explored earlier:
# [150, 200, 250, 300] and [350, 400, 450, 500].
C_list = np.array([1, 10, 20])

# Cross-validation settings. They are defined before the loop so that the
# summary prints at the bottom work even when C_list is empty.
kfold = 5
number_of_error_Calculations = 1  # independent shuffled CV repetitions per C

# Result table, one row per C: [C, mean validation error, mean validation accuracy].
C_error_list = np.zeros((len(C_list), 3))
C_error_list[:, 0] = C_list

# Loop-invariant dataset geometry.
datasetLength, datasetWidth = np.shape(X)
# Floor division keeps the fold size an integer on both Python 2 and Python 3;
# the rows left over by the division end up in the last fold.
batchSize = datasetLength // kfold

for diter, C in enumerate(C_list):

    error = np.zeros((1, kfold))
    errorCalcAverage = np.zeros((1, number_of_error_Calculations))

    for m in range(number_of_error_Calculations):

        # Fresh [features | label] matrix, with rows shuffled, for each repetition.
        CV_set = np.hstack((X, Y))
        np.random.shuffle(CV_set)

        for i in range(kfold):

            index1 = batchSize * i
            # Slices are half-open, so the upper bound is the first row of the
            # next fold (no "+1" needed); the last fold runs to the end of the data.
            index2 = datasetLength if i == kfold - 1 else batchSize * (i + 1)

            # Held-out fold, and everything else as the training portion.
            CV_set_test = CV_set[index1:index2, :]
            CV_set_train = np.delete(CV_set, np.s_[index1:index2], 0)

            # The label is the last column appended by hstack.
            CV_X = CV_set_train[:, 0:datasetWidth]
            CV_y = CV_set_train[:, datasetWidth]

            CV_X_test = CV_set_test[:, 0:datasetWidth]
            CV_y_test = CV_set_test[:, datasetWidth]

            clf = svm.SVC(C=C, kernel='linear')
            clf.fit(CV_X, CV_y)

            Fx = clf.predict(CV_X_test)

            # Fraction of held-out samples that were misclassified.
            error[0, i] = np.mean(Fx != CV_y_test)

        # Mean error over the k folds of this repetition.
        errorCalcAverage[0, m] = np.mean(error)

    # Mean error over all repetitions for this C.
    errorCalcAverageTot = np.mean(errorCalcAverage)

    C_error_list[diter, 1] = errorCalcAverageTot
    C_error_list[diter, 2] = 1 - errorCalcAverageTot

print("Dataset: {0}".format(dataset_name))
print("This is SVC with linear kernel and kfold CV with k = {0}".format(kfold))
print("The positive label is {0} and the negative label is {1}".format(posLabel, negLabel))
print("The number of error calculations per parameter set is {0}".format(number_of_error_Calculations))
print("The first column is C")
print("The second column is the average validation error")
print("The third column is the average validation accuracy")
np.set_printoptions(suppress=True)
print(np.around(C_error_list, 3))

Dataset: winequality_white
This is SVC with linear kernel and kfold CV with k = 5
The positive label is high quality and the negative label is low quality
The number of error calculations per parameter set is 1
The first column is C
The second column is the average validation error
The third column is the average validation accuracy
[[  1.      0.037   0.963]
 [ 10.      0.037   0.963]
 [ 20.      0.037   0.963]]


Radial Basis Function (RBF) SVM on full training set. Selecting parameters C and gamma

In [8]:
# RBF-kernel SVC: estimate the k-fold cross-validation error for every
# (C, gamma) pair on a small grid and print the resulting table.
# Reads X, Y, dataset_name, posLabel and negLabel set by an earlier cell.
# NOTE(review): no random seed is set in this cell, so the shuffles (and hence
# the reported errors) vary from run to run.

# Candidate C values. Larger values explored earlier: 20, 50, 100, 150, 200, 250, 300.
C_list = np.array([1, 10])

# Candidate gamma values. Other ranges explored earlier:
#   [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
#   [1, 10, 20, 50, 100, 150, 200, 250, 300]
#   [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07] (also 0.08, 0.09)
gamma_list = np.array([0.8, 0.9, 1, 1.1])

# Cross-validation settings. They are defined before the loops so that the
# summary prints at the bottom work even when a parameter list is empty.
kfold = 5
number_of_error_Calculations = 5  # independent shuffled CV repetitions per pair

# Result grid with a header row and a header column:
#   row 0 (from column 1 on) holds the gamma values,
#   column 0 (from row 1 on) holds the C values,
#   cell [r + 1, g + 1] holds the mean CV error for C_list[r], gamma_list[g],
#   and the corner cell [0, 0] is left at 0.
C_error_list = np.zeros((len(C_list) + 1, len(gamma_list) + 1))
C_error_list[0, 1:] = gamma_list
C_error_list[1:, 0] = C_list

# Loop-invariant dataset geometry.
datasetLength, datasetWidth = np.shape(X)
# Floor division keeps the fold size an integer on both Python 2 and Python 3;
# the rows left over by the division end up in the last fold.
batchSize = datasetLength // kfold

for gamma_iter, gamma in enumerate(gamma_list):

    for riter, C in enumerate(C_list):

        error = np.zeros((1, kfold))
        errorCalcAverage = np.zeros((1, number_of_error_Calculations))

        for m in range(number_of_error_Calculations):

            # Fresh [features | label] matrix, with rows shuffled, for each repetition.
            CV_set = np.hstack((X, Y))
            np.random.shuffle(CV_set)

            for i in range(kfold):

                index1 = batchSize * i
                # Half-open slice bounds; the last fold runs to the end of the data.
                index2 = datasetLength if i == kfold - 1 else batchSize * (i + 1)

                # Held-out fold, and everything else as the training portion.
                CV_set_test = CV_set[index1:index2, :]
                CV_set_train = np.delete(CV_set, np.s_[index1:index2], 0)

                # The label is the last column appended by hstack.
                CV_X = CV_set_train[:, 0:datasetWidth]
                CV_y = CV_set_train[:, datasetWidth]

                CV_X_test = CV_set_test[:, 0:datasetWidth]
                CV_y_test = CV_set_test[:, datasetWidth]

                clf = svm.SVC(C=C, kernel='rbf', gamma=gamma)
                clf.fit(CV_X, CV_y)

                Fx = clf.predict(CV_X_test)

                # Fraction of held-out samples that were misclassified.
                error[0, i] = np.mean(Fx != CV_y_test)

            # Mean error over the k folds of this repetition.
            errorCalcAverage[0, m] = np.mean(error)

        # Mean error over all repetitions for this (C, gamma) pair.
        errorCalcAverageTot = np.mean(errorCalcAverage)

        C_error_list[(riter + 1), (gamma_iter + 1)] = errorCalcAverageTot

print("Dataset: {0}".format(dataset_name))
print("This is SVC with rbf kernel and kfold CV with k = {0}".format(kfold))
print("The positive label is {0} and the negative label is {1}".format(posLabel, negLabel))
print("The number of error calculations per parameter set is {0}".format(number_of_error_Calculations))
# NOTE(review): the legend text below is garbled but is kept verbatim so that it
# matches previously recorded output. The actual layout is: left column = C,
# top row = gamma, remaining cells = average CV error for that (C, gamma) pair.
print("The left column is C and the top row is right column is the average error for that C")
np.set_printoptions(suppress=True)
parameter_output = np.around(C_error_list, 3)
print(parameter_output)

Dataset: winequality_white
This is SVC with rbf kernel and kfold CV with k = 5
The positive label is high quality and the negative label is low quality
The number of error calculations per parameter set is 5
The left column is C and the top row is right column is the average error for that C
[[  0.      0.8     0.9     1.      1.1  ]
 [  1.      0.034   0.035   0.034   0.034]
 [ 10.      0.034   0.034   0.034   0.034]]


Part d)

Experimenting with different sizes of training and testing sets to see how they affect the classification results.

In [8]:
# Hold-out experiment: for each training fraction, repeatedly shuffle the data,
# split it into a training and a testing part, fit an SVC on the training part
# and record the average misclassification rate on the testing part.
# Reads X, Y, dataset_name, posLabel and negLabel set by an earlier cell.
# NOTE(review): no random seed is set in this cell, so the shuffles (and hence
# the reported errors) vary from run to run.

# Training fractions to compare. Also tried earlier: 0.1, 0.2, 0.3, 0.4, 0.5, 0.9, 0.95.
ratio_list = np.array([0.6, 0.7, 0.8])

# Result table, one row per ratio: [training ratio, mean test error, mean test accuracy].
testing_error_for_ratios = np.zeros((len(ratio_list), 3))
testing_error_for_ratios[:, 0] = ratio_list

C = 1
# gamma is kept for experimenting with other kernels, but it is NOT passed to
# SVC below, so changing it currently has no effect.
gamma = 0.7
kernel = 'linear'

# Number of independent shuffle/split repetitions per ratio. Defined before the
# loop so that the summary prints work even when ratio_list is empty.
number_of_error_Calculations = 5

# Loop-invariant dataset geometry.
datasetLength, datasetWidth = np.shape(X)

for q_iter, trainingRatio in enumerate(ratio_list):

    # Number of leading (shuffled) rows used for training; the rest are tested on.
    trainingSetLength = int(datasetLength * trainingRatio)

    error = np.zeros((1, number_of_error_Calculations))

    for i in range(number_of_error_Calculations):

        # Fresh [features | label] matrix, with rows shuffled, for each repetition.
        dataset_with_labels = np.hstack((X, Y))
        np.random.shuffle(dataset_with_labels)

        Xy_train = dataset_with_labels[0:trainingSetLength, :]
        Xy_test = dataset_with_labels[trainingSetLength:, :]

        # The label is the last column appended by hstack.
        X_train = Xy_train[:, 0:datasetWidth]
        y_train = Xy_train[:, datasetWidth]

        X_test = Xy_test[:, 0:datasetWidth]
        y_test = Xy_test[:, datasetWidth]

        clf = svm.SVC(C=C, kernel=kernel)
        clf.fit(X_train, y_train)

        Fx = clf.predict(X_test)

        # Fraction of test samples that were misclassified.
        error[0, i] = np.mean(Fx != y_test)

    # Mean test error over all repetitions for this ratio.
    average_of_errors = np.mean(error)

    testing_error_for_ratios[q_iter, 1] = average_of_errors
    testing_error_for_ratios[q_iter, 2] = 1 - average_of_errors

print("Dataset: {0}".format(dataset_name))
print("This is SVC with {0} kernel".format(kernel))
print("The positive label is {0} and the negative label is {1}".format(posLabel, negLabel))
print("C = {0}".format(C))
print("The number of error calculations per parameter set is {0}".format(number_of_error_Calculations))
print("The training/testing ratio is the first column")
print("The testing error average is the second column")
print("The testing accuracy average is the third column")
print(np.around(testing_error_for_ratios, 3))

Dataset: winequality_white
This is SVC with linear kernel
The positive label is high quality and the negative label is low quality
C = 1
The number of error calculations per parameter set is 5
The training/testing ratio is the first column
The testing error average is the second column
The testing accuracy average is the third column
[[ 0.6    0.034  0.966]
 [ 0.7    0.042  0.958]
 [ 0.8    0.036  0.964]]
