In [1]:
import SequenceSegmentsImport as ss
import numpy as np
from sklearn import metrics
from sklearn.cross_validation import StratifiedKFold
from sklearn import datasets
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
import collections
import operator


In [2]:
def naieveBayes(fileName):
    """Cross-validate a Gaussian Naive Bayes classifier on one data file.

    The file is loaded with ``ss.importSeqSegements``, turned into a feature
    matrix and label list by ``ss.getAverageMatix`` and the feature matrix is
    then passed through ``ss.descretizeAvgMatrix``.  A 5-fold stratified
    cross-validation is run on the result.

    Parameters
    ----------
    fileName : path of the data file handed to ``ss.importSeqSegements``.

    Returns
    -------
    The mean, over the five folds, of the area under the ROC curve.

    Notes
    -----
    The ROC curve of each fold is built from the hard class predictions of
    ``model.predict`` (not from class probabilities).
    """
    # Load and prepare the data; only the feature values and labels are used.
    segments = ss.importSeqSegements(fileName)
    _segmentList, _featureNames, rawFeatures, labels = ss.getAverageMatix(segments)
    features = ss.descretizeAvgMatrix(rawFeatures)
    labels = np.array(labels)

    def foldRocAuc(trainIdx, testIdx):
        # Train on four folds, then score the held-out fold.
        classifier = GaussianNB()
        classifier.fit(features[trainIdx], labels[trainIdx])
        predictions = classifier.predict(features[testIdx])
        fpr, tpr, _thresholds = metrics.roc_curve(labels[testIdx], predictions)
        return metrics.auc(fpr, tpr)

    # Index pairs for stratified 5-fold cross-validation.
    folds = StratifiedKFold(labels, n_folds=5)
    aucPerFold = [foldRocAuc(trainIdx, testIdx) for trainIdx, testIdx in folds]

    # The reported score is the average ROC AUC of the five folds.
    return np.mean(aucPerFold)

In [3]:
def featureSelection(seqSegFeaturesDiscreteVals, classLabel, modelFactory=None):
    """Greedy wrapper-based forward feature selection.

    Starting from an empty set, each round tries every not-yet-selected
    feature together with the already selected ones, fits a fresh classifier
    on that column subset and measures its accuracy on the same data it was
    fitted on.  The candidate with the highest accuracy is kept if it strictly
    improves on the best accuracy reached so far; otherwise the search stops.
    The search also stops once every feature has been selected.

    Parameters
    ----------
    seqSegFeaturesDiscreteVals : 2-D array-like, one row per sample and one
        column per feature.  Values are converted to float.
    classLabel : 1-D array-like with one class label per sample.
    modelFactory : optional zero-argument callable returning an object with
        ``fit(X, y)`` and ``predict(X)``.  Defaults to ``GaussianNB``.

    Returns
    -------
    list of int
        Column indices of the selected features, in the order they were added.
        Empty when no feature yields an accuracy above zero or when the input
        has no columns.

    Raises
    ------
    ValueError
        If ``seqSegFeaturesDiscreteVals`` is not two-dimensional.
    """
    if modelFactory is None:
        modelFactory = GaussianNB

    features = np.asarray(seqSegFeaturesDiscreteVals, dtype=float)
    labels = np.asarray(classLabel)
    _rowCount, featureCount = features.shape

    selectedFeatures = []
    bestAccuracySoFar = 0
    # Bounding the loop by the number of features avoids evaluating an empty
    # candidate set once everything has been selected.
    while len(selectedFeatures) < featureCount:
        bestCandidate = None
        bestCandidateAccuracy = -1.0
        for candidate in range(featureCount):
            if candidate in selectedFeatures:
                continue
            # Fancy indexing returns a fresh array, so trials never share state.
            trialData = features[:, selectedFeatures + [candidate]]
            model = modelFactory()
            model.fit(trialData, labels)
            predicted = np.asarray(model.predict(trialData))
            # Fraction of correctly classified samples; valid for any number
            # of classes.
            accuracy = float(np.mean(predicted == labels))
            # ">=" resolves ties in favour of the later (higher-index) feature.
            if accuracy >= bestCandidateAccuracy:
                bestCandidate = candidate
                bestCandidateAccuracy = accuracy

        # Stop as soon as the best candidate brings no strict improvement.
        if bestCandidateAccuracy <= bestAccuracySoFar:
            break
        bestAccuracySoFar = bestCandidateAccuracy
        selectedFeatures.append(bestCandidate)

    return selectedFeatures
        
            

In [4]:
def FeatureSelectionNaiveBayse(fileName):
    """Cross-validate Gaussian Naive Bayes on forward-selected features.

    The file is loaded with ``ss.importSeqSegements``, converted into a feature
    matrix and label list by ``ss.getAverageMatix`` and the feature matrix is
    passed through ``ss.descretizeAvgMatrix``.  ``featureSelection`` then picks
    a subset of columns, and a 5-fold stratified cross-validation is run on
    those columns only.

    Parameters
    ----------
    fileName : path of the data file handed to ``ss.importSeqSegements``.

    Returns
    -------
    The mean, over the five folds, of the area under the ROC curve.

    Notes
    -----
    The function runs without user interaction; it does not wait for keyboard
    input before starting the feature selection.
    """
    # Set up the data; only the feature values and labels are used.
    dataSet = ss.importSeqSegements(fileName)
    _segmentList, _featureNames, seqSegFeaturesValues, classLabel = ss.getAverageMatix(dataSet)
    seqSegFeaturesDiscreteVals = ss.descretizeAvgMatrix(seqSegFeaturesValues)
    classLabel = np.array(classLabel)

    # NOTE(review): the features are chosen using the full data set, i.e.
    # including the samples later used as test folds, so the cross-validated
    # score below is optimistically biased.
    selectedFeatures = featureSelection(seqSegFeaturesDiscreteVals, classLabel)

    # Keep only the selected columns, in selection order, as a float matrix.
    selectedFeatures_data = np.asarray(seqSegFeaturesDiscreteVals, dtype=float)[:, selectedFeatures]

    # Index pairs for stratified 5-fold cross-validation.
    skf = StratifiedKFold(classLabel, n_folds=5)

    roc_auc = []
    for train_index, test_index in skf:
        # Train on four folds and test on the remaining one.
        X_train, X_test = selectedFeatures_data[train_index], selectedFeatures_data[test_index]
        y_train, y_test = classLabel[train_index], classLabel[test_index]

        model = GaussianNB()
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        # ROC curve (from the hard predictions) and the area under it.
        false_positive_rate, true_positive_rate, _thresholds = metrics.roc_curve(y_test, predicted)
        roc_auc.append(metrics.auc(false_positive_rate, true_positive_rate))

    # The reported score is the average ROC AUC of the five folds.
    return np.mean(roc_auc)
    

In [5]:
def main():
    """Evaluate Set1.txt .. Set6.txt and print the list of mean ROC AUC scores.

    Each file is scored with ``FeatureSelectionNaiveBayse``.  To get the
    baseline without feature selection, map ``naieveBayes`` over the same
    file names instead.
    """
    fileNames = ["Set" + str(setNumber) + ".txt" for setNumber in range(1, 7)]
    FSNB_rocAucAvgList = [FeatureSelectionNaiveBayse(fileName) for fileName in fileNames]
    # Parenthesised single-argument form prints the same text under Python 2.
    print(FSNB_rocAucAvgList)
    

In [None]:
# Entry point: run the evaluation only when this code is executed directly
# (its module name is '__main__'), not when it is imported.
if __name__ == '__main__':
    main()




