In [173]:
import pandas, numpy
from sklearn import svm
from sklearn.qda import QDA
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors.nearest_centroid import NearestCentroid

In [174]:
results = pandas.read_csv('results-most.csv')

# drop some qctests
results = results.drop('CSIRO_gradient', 1)
results = results.drop('CoTeDe_anomaly_detection', 1)

# shuffle the dataframe
results.iloc[numpy.random.permutation(len(results))]

# slice up for svm
truth = results.ix[:,1].tolist()
qctests = results.ix[:,2:].values.tolist()

trainingSize = 1000

In [175]:
def performance(truth, qcresult):
    # check the performance of a single column qcresult

    TT = 0.
    FF = 0.
    TF = 0.
    FT = 0.
    T = 0.
    F = 0.
    
    for i in range(len(qcresult)):
        if truth[i]:
            T += 1.
        else:
            F += 1.
        
        if qcresult[i] and truth[i]:
            TT += 1.
        elif qcresult[i] and not truth[i]:
            TF += 1.
        elif not qcresult[i] and truth[i]:
            FT += 1.
        elif not qcresult[i] and not truth[i]:
            FF += 1.
    
    return (TT/T, FT/T, FF/F, TF/F)

In [176]:
for i in range(2,len(qctests[0])):
    print results.columns[i], performance(truth, results.ix[:,i].tolist())

Argo_global_range_check (0.1596244131455399, 0.8403755868544601, 1.0, 0.0)
Argo_gradient_test (0.27386541471048514, 0.7261345852895149, 0.9990945011524531, 0.0009054988475469213)
Argo_impossible_date_test (0.0, 1.0, 1.0, 0.0)
Argo_impossible_location_test (0.0, 1.0, 1.0, 0.0)
Argo_pressure_increasing_test (0.11580594679186229, 0.8841940532081377, 0.866480079025354, 0.13351992097464604)
Argo_regional_range_test (0.006259780907668232, 0.9937402190923318, 0.9999176819229503, 8.231807704972011e-05)
Argo_spike_test (0.0297339593114241, 0.9702660406885759, 1.0, 0.0)
CSIRO_depth (0.837245696400626, 0.162754303599374, 0.2642410273296016, 0.7357589726703985)
CSIRO_wire_break (0.2519561815336463, 0.7480438184663537, 1.0, 0.0)
CoTeDe_Argo_density_inversion (0.3489827856025039, 0.651017214397496, 0.5221435627263747, 0.4778564372736253)
CoTeDe_GTSPP_WOA_normbias (0.8497652582159625, 0.15023474178403756, 0.8016134343101745, 0.1983865656898255)
CoTeDe_GTSPP_global_range (0.16118935837245696, 0.838810

In [177]:
# create and train an svm

svm_classifier = svm.SVC(kernel='linear', class_weight = {0: 1, 1: 100})
svm_classifier.fit(qctests[0:trainingSize], truth[0:trainingSize])

SVC(C=1.0, cache_size=200, class_weight={0: 1, 1: 100}, coef0=0.0, degree=3,
  gamma=0.0, kernel='linear', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)

In [178]:
# create and train a qda

qda_classifier = QDA()
qda_classifier.fit(qctests[0:trainingSize], truth[0:trainingSize])

QDA(priors=None, reg_param=0.0)

In [179]:
# create and train a kernel ridge

kr_classifier = KernelRidge(kernel='linear')
kr_classifier.fit(qctests[0:trainingSize], truth[0:trainingSize])

KernelRidge(alpha=1, coef0=1, degree=3, gamma=None, kernel='linear',
      kernel_params=None)

In [180]:
# create and train a nearest centroid

nc_classifier = NearestCentroid()
nc_classifier.fit(qctests[0:trainingSize], truth[0:trainingSize])

NearestCentroid(metric='euclidean', shrink_threshold=None)

In [185]:
svm_prediction = svm_classifier.predict(qctests[trainingSize:])
qda_prediction = qda_classifier.predict(qctests[trainingSize:])
kr_prediction = kr_classifier.predict(qctests[trainingSize:])
nc_prediction = nc_classifier.predict(qctests[trainingSize:])
final_prediction = [ svm_prediction[i] or qda_prediction[i] for i in range(0,len(svm_prediction))]
performance(truth[trainingSize:], final_prediction)

(0.9036544850498339,
 0.09634551495016612,
 0.6796602592758159,
 0.3203397407241842)