-
Notifications
You must be signed in to change notification settings - Fork 0
/
FS_Select.py
96 lines (84 loc) · 4.61 KB
/
FS_Select.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""
http://sebastianraschka.com/Articles/2014_python_lda.html
Load the Enhanced Dataset, G = F U M, Expected to have size 88x(1004004+(1004004/2)) = 88x(1004004+502002) = 88x1506006
Feature Selection:
Step 1: Generate all possible subsets of indexes;
heuristics:
0. Limit around the reported length of subsets i.e. less than 1000 or even below.
1. use metagenes first
2. know which genes are used to generate a meta gene so we dont try that combination
3. also use biological information first
4. Try different other FS techniques here e.g. mRMR, Correlation as heuristics
Training DataSet:
1. Split the dataset of 88 patients into Training+Validation and Testing Dataset
2. Use LOOCV i.e. pick n-1 for training and 1 for validation
Step 2: Apply LDA to each subset, calculate the error rate and reliability factor for each subset
Step 3: Apply thining algo to setup the ensemble of classifiers
Step 4: Save Feature Selector Ensemble Configurations
"""
import numpy
from DataSetLoaderLib import DataSetLoader
from itertools import *
from Supervised_LDA import *
import random
#from Combinator import *
from GlobalUtils import *
def getNextCombination():
logWarning('HARDCODED COMBINATION');
return [1,4,7,34,2,6,8,900];
""" MAIN CODE FOR TREELET CLUSTERING """
def main():
"""
for each of the subsets s of indexes from 0-1004003 of length between 1 and 10
#use biological info of known genes and mutual information and top down level of tree nodes [top down means ok interpretation and vague idea of root cause. bottom up means poor interpretation but pin pointed root cause identification]
#check which subset is the best one by sorting them on desc order of error and then reliability
create d as vertical projection of dataset using s indexes only
for partition = 1 to length-2
create trainingSet of size partition
create testSet of size length-partition
calculate Error Rate & Reliability using CV10
calculate avg Error Rate and Avg Reliability
pick the best
"""
print("\n\n\n\n\n");
datasetLoader = DataSetLoader();
setSize = 3;
CVSetting = 2;
classLabels = [];
enhancedGeneSet = [];
classLabels.extend(datasetLoader.GetClassLabels("A"));
enhancedGeneSet.extend(datasetLoader.LoadEnhancedDataSet("A"));
enhancedGeneSet = np.array(enhancedGeneSet);
logInfo('Loaded the datasets');
for s in range(1, 1+setSize):
for i in range(0, np.array(enhancedGeneSet).shape[1]):
allCombinations = combinations(range(1,1+enhancedGeneSet.shape[1]-1),s);#TODO: go from 1 to setSize and for the selected top X from amongst one level, make sure the next level subset contains them as prefix so we
logInfo("allCombinations generated...")
for aCombination in allCombinations:
logDebug('aCombination');
logDebug(aCombination);
#on this combination, perform LOOCV (Leave one out cross validation)
tempDataSet = enhancedGeneSet[:, aCombination[:]];
logDebug ('temp Data Set ');
logDebug (tempDataSet.shape);
logInfo('going to partition the tempDataSet');
logDebug(tempDataSet.shape[0]);
for partition in range(CVSetting, 1+tempDataSet.shape[0]): #Using CV1
logDebug("Partition");
logDebug(partition);
trainingLabels = classLabels[0:partition];
trainingSet = tempDataSet[0:partition, :];
logDebug('training set');
logDebug(trainingSet.shape);
testSet = tempDataSet[partition:tempDataSet.shape[0], :];
testLabels = classLabels[partition:tempDataSet.shape[0]];
logDebug('test set');
logDebug(testSet.shape);
print (trainingLabels);
print (trainingSet);
classifier = Train(trainingSet, trainingLabels);
errorRate, reliability, jScore = Evaluate(classifier, tempDataSet, testSet, testLabels, 1);
print errorRate, ";", reliability, ";", jScore;
return;
if __name__=="__main__":
main();