In [14]:
import pandas as pd
from scipy.io import arff
import numpy as np
import math
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
import os
import time
from tqdm.notebook import trange, tqdm

In [16]:
import warnings 
warnings.filterwarnings('ignore')

In [17]:
# Folder that holds the .arff data sets, resolved against the working directory.
# os.path.join is used instead of a hand-written "\Data" suffix: "\D" is an
# invalid escape sequence and a backslash is only a separator on Windows.
file_pwd = os.path.join(os.getcwd(), "Data")

# Only regular .arff files directly inside the folder are kept, sorted so the
# processing order does not depend on the file system's listing order.
# os.listdir raises FileNotFoundError with the offending path if the folder is missing.
file_list = sorted(
    name
    for name in os.listdir(file_pwd)
    if name.lower().endswith(".arff") and os.path.isfile(os.path.join(file_pwd, name))
)
file_list

['CM1.arff',
 'JM1.arff',
 'KC1.arff',
 'KC3.arff',
 'MC1.arff',
 'MC2.arff',
 'MW1.arff',
 'PC1.arff',
 'PC2.arff',
 'PC3.arff',
 'PC4.arff',
 'PC5.arff']

In [18]:
# Input: a defect data set; output: its top-ranked feature columns
# (ceil(log2(d)) of them unless n_features says otherwise).
def preprocess(df, n_features=None):
    """Rank the feature columns of ``df`` and return the best ones.

    For every defective row (``Defective == b'Y'``) the ``k`` nearest clean
    rows (``Defective == b'N'``) are located by Euclidean distance on the
    z-score normalised features, where ``k = max(1, n_clean // n_defective)``.
    For each such pair the absolute per-feature differences of the *raw*
    values are ranked (ties share the lowest rank) and the ranks are summed
    per feature. Features are then ordered by descending summed rank; ties
    keep their original column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature columns plus a ``Defective`` column holding ``b'Y'`` /
        ``b'N'``. The ``Defective`` column may sit at any position.
    n_features : int, optional
        Number of top-ranked columns to return. ``None`` (default) keeps
        ``ceil(log2(d))`` columns, with a minimum of one, where ``d`` is the
        number of feature columns.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected feature columns of ``df`` (raw values, without
        ``Defective``), ordered from highest to lowest rank.

    Raises
    ------
    ValueError
        If ``df`` has no feature columns, or lacks rows of either class.
    """
    # Resolve feature names directly so the ranking never depends on where
    # the 'Defective' column happens to be located.
    feature_names = [name for name in df.columns if name != "Defective"]
    if not feature_names:
        raise ValueError("preprocess needs at least one feature column besides 'Defective'")

    features = df[feature_names]
    defective_mask = (df["Defective"] == b'Y').values
    clean_mask = (df["Defective"] == b'N').values
    n_defective = int(defective_mask.sum())
    n_clean = int(clean_mask.sum())
    if n_defective == 0 or n_clean == 0:
        raise ValueError("preprocess needs at least one b'Y' row and one b'N' row")

    # Z-score normalisation. A constant column has std == 0; dividing by it
    # would turn the column (and every distance) into NaN, so use 1 instead,
    # which leaves such a column at 0 and out of the distance computation.
    spread = features.std().replace(0, 1)
    normalized = ((features - features.mean()) / spread).values
    raw = features.values

    raw_defective, raw_clean = raw[defective_mask], raw[clean_mask]
    norm_defective, norm_clean = normalized[defective_mask], normalized[clean_mask]

    # At least one neighbour, even when defective rows outnumber clean rows.
    k = max(1, n_clean // n_defective)

    weights = np.zeros(len(feature_names))
    for raw_point, norm_point in zip(raw_defective, norm_defective):
        # Distances from this defective row to every clean row, computed one
        # row at a time so the full distance matrix is never materialised.
        distances = np.sqrt(np.square(norm_clean - norm_point).sum(axis=1))
        nearest = np.argsort(distances)[:k]
        # Rank the raw per-feature gaps within each (defective, clean) pair.
        gaps = np.abs(raw_point - raw_clean[nearest])
        weights += pd.DataFrame(gaps).rank(axis=1, method='min').values.sum(axis=0)

    # Stable sort on the negated weights: descending weight, ties by column order.
    order = np.argsort(-weights, kind='stable')
    ranked = [feature_names[i] for i in order]

    if n_features is None:
        n_features = max(1, math.ceil(math.log2(len(ranked))))
    return df.loc[:, ranked[:n_features]].copy()

In [19]:
# Return the mean AUC over ten repetitions of 10-fold cross-validation.
def SVM(data, label, random_state=None):
    """Score an RBF ``SVC(gamma='auto')`` with repeated 10-fold cross-validation.

    The rows are reshuffled before each of the ten repetitions so that the
    fold assignment differs between repetitions; the ROC-AUC of every
    repetition is averaged over its folds and then over the repetitions.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix. It is not modified.
    label : pandas.Series or array-like
        Binary target. A Series is aligned to ``data`` by index, anything
        else is matched by position.
    random_state : int, optional
        Seed for the row shuffles. ``None`` (default) draws from NumPy's
        global random state, as before.

    Returns
    -------
    float
        Mean ROC-AUC over the ten repetitions.
    """
    clf = SVC(gamma='auto')
    # Keep the target separate instead of appending a "label" column: this
    # leaves the caller's frame untouched and cannot clash with a feature
    # that is itself called "label".
    target = pd.Series(label, index=data.index)
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    auc_list = []
    for _ in tqdm(range(10)):
        order = rng.permutation(len(data))
        scores = cross_val_score(clf, data.iloc[order], target.iloc[order], cv=10, scoring="roc_auc")
        auc_list.append(scores.mean())
    return np.mean(auc_list)

In [20]:
# Naive Bayes classification.
def NB(data, label, random_state=None):
    """Score ``MultinomialNB`` with ten repetitions of 10-fold cross-validation.

    The rows are reshuffled before each repetition; the ROC-AUC of every
    repetition is averaged over its folds and then over the repetitions.

    NOTE(review): scikit-learn documents MultinomialNB for non-negative,
    count-like features - confirm it is the intended model for these metrics.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix. It is not modified.
    label : pandas.Series or array-like
        Binary target. A Series is aligned to ``data`` by index, anything
        else is matched by position.
    random_state : int, optional
        Seed for the row shuffles. ``None`` (default) draws from NumPy's
        global random state, as before.

    Returns
    -------
    float
        Mean ROC-AUC over the ten repetitions.
    """
    clf = MultinomialNB()
    # Keep the target separate instead of appending a "label" column: this
    # leaves the caller's frame untouched and cannot clash with a feature
    # that is itself called "label".
    target = pd.Series(label, index=data.index)
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    auc_list = []
    for _ in tqdm(range(10)):
        order = rng.permutation(len(data))
        scores = cross_val_score(clf, data.iloc[order], target.iloc[order], cv=10, scoring="roc_auc")
        auc_list.append(scores.mean())
    return np.mean(auc_list)

In [21]:
# Decision tree classification.
def DT(data, label, random_state=None):
    """Score a ``DecisionTreeClassifier`` with ten repetitions of 10-fold cross-validation.

    The rows are reshuffled before each repetition; the ROC-AUC of every
    repetition is averaged over its folds and then over the repetitions.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix. It is not modified.
    label : pandas.Series or array-like
        Binary target. A Series is aligned to ``data`` by index, anything
        else is matched by position.
    random_state : int, optional
        Seed for both the row shuffles and the tree itself. ``None``
        (default) keeps the previous unseeded behaviour.

    Returns
    -------
    float
        Mean ROC-AUC over the ten repetitions.
    """
    clf = DecisionTreeClassifier(random_state=random_state)
    # Keep the target separate instead of appending a "label" column: this
    # leaves the caller's frame untouched and cannot clash with a feature
    # that is itself called "label".
    target = pd.Series(label, index=data.index)
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    auc_list = []
    for _ in tqdm(range(10)):
        order = rng.permutation(len(data))
        scores = cross_val_score(clf, data.iloc[order], target.iloc[order], cv=10, scoring="roc_auc")
        auc_list.append(scores.mean())
    return np.mean(auc_list)


In [36]:
# Evaluate SVM / decision tree / naive Bayes on growing prefixes of the ranked
# feature list of every data set and write one result table per data set.
# time.perf_counter replaces time.clock, which was removed in Python 3.8.
real_start = time.perf_counter()
for each in tqdm(file_list):
    res_list = []
    # Read from the same folder file_list was built from.
    records, _meta = arff.loadarff(os.path.join(file_pwd, each))
    df = pd.DataFrame(records)
    # Some data sets name the target column "label" instead of "Defective".
    if df.columns[-1] == "label":
        df = df.rename(columns={'label': 'Defective'})
    # b'N' -> 0, b'Y' -> 1; computed once per data set.
    label = df.Defective.map({b'N': 0, b'Y': 1}).astype(int)

    # Feature columns ordered by rank.
    # NOTE(review): the recorded run iterates over all 37 CM1 features, while
    # preprocess as defined in this notebook keeps ceil(log2(d)) columns -
    # confirm which behaviour is intended here.
    data = preprocess(df)
    head_list = data.columns

    for position, every_feature in enumerate(tqdm(head_list), start=1):
        start = time.perf_counter()
        # First `position` ranked features, i.e. head_list[0] .. every_feature.
        X = data.iloc[:, :position]
        svm_auc = SVM(X.copy(), label)
        destree_auc = DT(X.copy(), label)
        nb_auc = NB(X.copy(), label)
        print("*"*20)
        print("数据尺寸:{}".format(X.shape))
        # Report the data set actually being processed (was hard-coded to "CM1").
        print("文件名:{}".format(each))
        print("feature:{}:{}".format(head_list[0], every_feature))
        print("SVM--->{}:".format(svm_auc))
        print("决策树--->{}:".format(destree_auc))
        print("贝叶斯--->{}".format(nb_auc))
        spend = (time.perf_counter()-start)
        print("use time:{}".format(spend))
        print("="*20)
        res_list.append({
            "size": X.shape,
            "feature": every_feature,
            "SVM": svm_auc,
            "tree": destree_auc,
            "nb": nb_auc
        })
    print(res_list)

    # One row per feature prefix; columns follow the key order of the records.
    info = pd.DataFrame(res_list)
    info.to_csv("{}.csv".format(each))
print("总共耗时:", (time.perf_counter()-real_start))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=37), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 1)
文件名:CM1
feature:HALSTEAD_EFFORT:HALSTEAD_EFFORT
SVM--->0.5715940860215053:
决策树--->0.6030698924731184:
贝叶斯--->0.5
use time:4.678998700000193


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 2)
文件名:CM1
feature:HALSTEAD_EFFORT:HALSTEAD_PROG_TIME
SVM--->0.5818266129032258:
决策树--->0.6052204301075268:
贝叶斯--->0.3868225806451613
use time:4.443179099999725


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 3)
文件名:CM1
feature:HALSTEAD_EFFORT:HALSTEAD_VOLUME
SVM--->0.5454811827956989:
决策树--->0.55147311827957:
贝叶斯--->0.4737970430107527
use time:4.638903299999583


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 4)
文件名:CM1
feature:HALSTEAD_EFFORT:HALSTEAD_LENGTH
SVM--->0.5420094086021505:
决策树--->0.5230215053763441:
贝叶斯--->0.395497311827957
use time:4.65213279999989


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 5)
文件名:CM1
feature:HALSTEAD_EFFORT:NUM_OPERATORS
SVM--->0.5499153225806451:
决策树--->0.5103037634408603:
贝叶斯--->0.3794166666666667
use time:4.684788200000185


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 6)
文件名:CM1
feature:HALSTEAD_EFFORT:NUMBER_OF_LINES
SVM--->0.5556129032258064:
决策树--->0.5110188172043011:
贝叶斯--->0.3469690860215054
use time:5.460412599999472


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 7)
文件名:CM1
feature:HALSTEAD_EFFORT:NUM_OPERANDS
SVM--->0.5594193548387095:
决策树--->0.4838709677419356:
贝叶斯--->0.3796518817204301
use time:5.5346245999999155


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 8)
文件名:CM1
feature:HALSTEAD_EFFORT:HALSTEAD_CONTENT
SVM--->0.5599193548387096:
决策树--->0.5064569892473119:
贝叶斯--->0.37229301075268817
use time:4.23585009999988


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 9)
文件名:CM1
feature:HALSTEAD_EFFORT:LOC_TOTAL
SVM--->0.5616236559139784:
决策树--->0.5195591397849462:
贝叶斯--->0.39951881720430105
use time:4.866699299999709


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 10)
文件名:CM1
feature:HALSTEAD_EFFORT:LOC_COMMENTS
SVM--->0.5620913978494623:
决策树--->0.5572607526881721:
贝叶斯--->0.41699731182795696
use time:4.646312599999874


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 11)
文件名:CM1
feature:HALSTEAD_EFFORT:LOC_EXECUTABLE
SVM--->0.5630698924731182:
决策树--->0.5592795698924732:
贝叶斯--->0.42759543010752693
use time:5.243080699999155


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 12)
文件名:CM1
feature:HALSTEAD_EFFORT:NUM_UNIQUE_OPERANDS
SVM--->0.5674838709677419:
决策树--->0.5336263440860215:
贝叶斯--->0.4147768817204301
use time:4.7343511999997645


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 13)
文件名:CM1
feature:HALSTEAD_EFFORT:LOC_BLANK
SVM--->0.5672204301075268:
决策树--->0.5438091397849464:
贝叶斯--->0.41775672043010753
use time:4.9492356000000655


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 14)
文件名:CM1
feature:HALSTEAD_EFFORT:PERCENT_COMMENTS
SVM--->0.5555268817204302:
决策树--->0.5186209677419356:
贝叶斯--->0.40426209677419356
use time:4.026756000000205


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 15)
文件名:CM1
feature:HALSTEAD_EFFORT:EDGE_COUNT
SVM--->0.5586451612903225:
决策树--->0.5616720430107527:
贝叶斯--->0.3938279569892473
use time:4.971417300000212


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 16)
文件名:CM1
feature:HALSTEAD_EFFORT:HALSTEAD_DIFFICULTY
SVM--->0.5602526881720429:
决策树--->0.5373037634408602:
贝叶斯--->0.4394462365591398
use time:6.390108499999769


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 17)
文件名:CM1
feature:HALSTEAD_EFFORT:NODE_COUNT
SVM--->0.5596505376344086:
决策树--->0.5300483870967743:
贝叶斯--->0.43703897849462364
use time:3.824643500000093


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 18)
文件名:CM1
feature:HALSTEAD_EFFORT:CONDITION_COUNT
SVM--->0.5605591397849463:
决策树--->0.5276424731182796:
贝叶斯--->0.4328615591397849
use time:3.6418773000004876


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 19)
文件名:CM1
feature:HALSTEAD_EFFORT:NUM_UNIQUE_OPERATORS
SVM--->0.5612311827956988:
决策树--->0.5728682795698925:
贝叶斯--->0.4565618279569893
use time:4.222361600000113


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 20)
文件名:CM1
feature:HALSTEAD_EFFORT:LOC_CODE_AND_COMMENT
SVM--->0.5597956989247311:
决策树--->0.5824220430107527:
贝叶斯--->0.4687836021505377
use time:4.694853900000453


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 21)
文件名:CM1
feature:HALSTEAD_EFFORT:BRANCH_COUNT
SVM--->0.562268817204301:
决策树--->0.5836827956989248:
贝叶斯--->0.45629973118279576
use time:3.901716600000327


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 22)
文件名:CM1
feature:HALSTEAD_EFFORT:MULTIPLE_CONDITION_COUNT
SVM--->0.5678548387096773:
决策树--->0.5711989247311828:
贝叶斯--->0.45596774193548384
use time:3.8676532999998017


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 23)
文件名:CM1
feature:HALSTEAD_EFFORT:DECISION_COUNT
SVM--->0.5664516129032258:
决策树--->0.5854596774193548:
贝叶斯--->0.42581989247311836
use time:3.8001407000001564


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 24)
文件名:CM1
feature:HALSTEAD_EFFORT:CALL_PAIRS
SVM--->0.5562217741935482:
决策树--->0.5748924731182796:
贝叶斯--->0.4564596774193549
use time:4.158018799999809


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 25)
文件名:CM1
feature:HALSTEAD_EFFORT:CYCLOMATIC_COMPLEXITY
SVM--->0.5571720430107526:
决策树--->0.5799731182795699:
贝叶斯--->0.4472016129032258
use time:4.177073799999562


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 26)
文件名:CM1
feature:HALSTEAD_EFFORT:DESIGN_COMPLEXITY
SVM--->0.5551814516129031:
决策树--->0.5919327956989247:
贝叶斯--->0.44689919354838714
use time:3.843826600000284


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 27)
文件名:CM1
feature:HALSTEAD_EFFORT:MODIFIED_CONDITION_COUNT
SVM--->0.5612244623655913:
决策树--->0.5730483870967742:
贝叶斯--->0.4572620967741936
use time:4.187062399999377


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 28)
文件名:CM1
feature:HALSTEAD_EFFORT:PARAMETER_COUNT
SVM--->0.5587513440860215:
决策树--->0.5892768817204301:
贝叶斯--->0.47746370967741936
use time:4.260549500000707


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 29)
文件名:CM1
feature:HALSTEAD_EFFORT:HALSTEAD_ERROR_EST
SVM--->0.5604045698924731:
决策树--->0.5801935483870968:
贝叶斯--->0.4741182795698925
use time:5.456365199999709


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 30)
文件名:CM1
feature:HALSTEAD_EFFORT:CYCLOMATIC_DENSITY
SVM--->0.5571102150537633:
决策树--->0.5782177419354839:
贝叶斯--->0.4651908602150538
use time:5.084665100000166


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 31)
文件名:CM1
feature:HALSTEAD_EFFORT:MAINTENANCE_SEVERITY
SVM--->0.5640645161290321:
决策树--->0.5996639784946237:
贝叶斯--->0.4544018817204301
use time:4.676584199999525


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 32)
文件名:CM1
feature:HALSTEAD_EFFORT:NORMALIZED_CYLOMATIC_COMPLEXITY
SVM--->0.5645591397849461:
决策树--->0.5719354838709678:
贝叶斯--->0.454008064516129
use time:4.596061900000677


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 33)
文件名:CM1
feature:HALSTEAD_EFFORT:DESIGN_DENSITY
SVM--->0.5713360215053763:
决策树--->0.5858279569892473:
贝叶斯--->0.4350456989247312
use time:4.033516800000143


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 34)
文件名:CM1
feature:HALSTEAD_EFFORT:ESSENTIAL_COMPLEXITY
SVM--->0.5710362903225806:
决策树--->0.5589731182795699:
贝叶斯--->0.44969220430107526
use time:4.265424599999278


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 35)
文件名:CM1
feature:HALSTEAD_EFFORT:HALSTEAD_LEVEL
SVM--->0.5738387096774193:
决策树--->0.5515672043010753:
贝叶斯--->0.45949327956989244
use time:5.651568000000225


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 36)
文件名:CM1
feature:HALSTEAD_EFFORT:ESSENTIAL_DENSITY
SVM--->0.5772836021505376:
决策树--->0.5797069892473118:
贝叶斯--->0.4699825268817204
use time:5.377030800000284


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 37)
文件名:CM1
feature:HALSTEAD_EFFORT:DECISION_DENSITY
SVM--->0.5848494623655914:
决策树--->0.554483870967742:
贝叶斯--->0.46698790322580647
use time:4.37923580000006

[{'size': (344, 1), 'feature': 'HALSTEAD_EFFORT', 'SVM': 0.5715940860215053, 'tree': 0.6030698924731184, 'nb': 0.5}, {'size': (344, 2), 'feature': 'HALSTEAD_PROG_TIME', 'SVM': 0.5818266129032258, 'tree': 0.6052204301075268, 'nb': 0.3868225806451613}, {'size': (344, 3), 'feature': 'HALSTEAD_VOLUME', 'SVM': 0.5454811827956989, 'tree': 0.55147311827957, 'nb': 0.4737970430107527}, {'size': (344, 4), 'feature': 'HALSTEAD_LENGTH', 'SVM': 0.5420094086021505, 'tree': 0.5230215053763441, 'nb': 0.395497311827957}, {'size': (344, 5), 'feature': 'NUM_OPERATORS', 'SVM': 0.5499153225806451, 'tree': 0.5103037634408603, 'nb': 0.3794166666666667}, {'size': (344, 6), 'feature': 'NUMBER_OF_LINES', 'SVM': 0.5556129032258064, 'tree': 0.5110188172043011, 'nb': 0.3469690860215054}, {'size': (344, 7), 'feature': 'NUM_OP

HBox(children=(IntProgress(value=0, max=21), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 1)
文件名:CM1
feature:HALSTEAD_EFFORT:HALSTEAD_EFFORT
SVM--->0.6624207296564786:
决策树--->0.5382278183714491:
贝叶斯--->0.5
use time:654.8166375000001


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 2)
文件名:CM1
feature:HALSTEAD_EFFORT:HALSTEAD_PROG_TIME
SVM--->0.6603639792282726:
决策树--->0.5367824456001488:
贝叶斯--->0.5051073951473004
use time:558.1368480000001


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 3)
文件名:CM1
feature:HALSTEAD_EFFORT:HALSTEAD_VOLUME
SVM--->0.6666604221367883:
决策树--->0.5561701196962587:
贝叶斯--->0.36657754796602554
use time:533.6731163000004


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 4)
文件名:CM1
feature:HALSTEAD_EFFORT:HALSTEAD_LENGTH
SVM--->0.6665233180995535:
决策树--->0.5630090831157872:
贝叶斯--->0.3700393051457129
use time:535.3116823


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 5)
文件名:CM1
feature:HALSTEAD_EFFORT:NUM_OPERATORS
SVM--->0.668511457554003:
决策树--->0.5627766327211832:
贝叶斯--->0.37093744339210777
use time:571.7236788


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 6)
文件名:CM1
feature:HALSTEAD_EFFORT:LOC_TOTAL
SVM--->0.6818610029915179:
决策树--->0.5803164952676692:
贝叶斯--->0.36945006488804594
use time:760.9629019000004


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 7)
文件名:CM1
feature:HALSTEAD_EFFORT:NUM_OPERANDS
SVM--->0.6806728171678371:
决策树--->0.5794926336046217:
贝叶斯--->0.37088697109384067
use time:761.9914984000006


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 8)
文件名:CM1
feature:HALSTEAD_EFFORT:HALSTEAD_CONTENT
SVM--->0.6824150023113346:
决策树--->0.5760106750082044:
贝叶斯--->0.37667929207591694
use time:785.8741030000001


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 9)
文件名:CM1
feature:HALSTEAD_EFFORT:LOC_EXECUTABLE
SVM--->0.684683731889547:
决策树--->0.576867768099272:
贝叶斯--->0.376642261749266
use time:826.1846818000013


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 10)
文件名:CM1
feature:HALSTEAD_EFFORT:NUM_UNIQUE_OPERANDS
SVM--->0.6872470924609394:
决策树--->0.5803886944265254:
贝叶斯--->0.37800836180388214
use time:841.0048803999998


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 11)
文件名:CM1
feature:HALSTEAD_EFFORT:HALSTEAD_DIFFICULTY
SVM--->0.6873320359278333:
决策树--->0.5793185534495494:
贝叶斯--->0.37860445157948236
use time:866.0563951000004


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 12)
文件名:CM1
feature:HALSTEAD_EFFORT:BRANCH_COUNT
SVM--->0.6878812245949407:
决策树--->0.580905053137373:
贝叶斯--->0.3784825077188878
use time:870.8615461999998


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 13)
文件名:CM1
feature:HALSTEAD_EFFORT:LOC_BLANK
SVM--->0.6879147396807033:
决策树--->0.588593629307457:
贝叶斯--->0.37798735552769963
use time:1158.8131246000012


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 14)
文件名:CM1
feature:HALSTEAD_EFFORT:NUM_UNIQUE_OPERATORS
SVM--->0.6895774628190924:
决策树--->0.5894946711610889:
贝叶斯--->0.3794362659960797
use time:1035.5713605


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 15)
文件名:CM1
feature:HALSTEAD_EFFORT:LOC_COMMENTS
SVM--->0.6900369149857565:
决策树--->0.5914623832985171:
贝叶斯--->0.37900125261529205
use time:943.8055820999998


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

KeyboardInterrupt: 

In [35]:
# real_start = time.clock()
# res_list = []
# for each in file_list:
#     data = arff.loadarff('./data/{}'.format(each))
#     df = pd.DataFrame(data[0])
#     if df.columns[-1] == "label":
#         df.rename(columns={'label':'Defective'},inplace=True) 
#     defective = df.Defective.copy()
#     defective[defective==b'N'] = 0
#     defective[defective==b'Y'] = 1
    
#     start = time.clock()
#     #分为数据和标签
#     data = preprocess(df)
#     label = defective.astype(int)
#     svm_auc = SVM(data,label)
#     destree_auc = DT(data,label)
#     nb_auc = NB(data,label)
#     print("*"*20)
#     print("数据尺寸:{}".format(data.shape))
#     print("文件名:{}".format(each))
#     print("log2D:{}".format(data.shape[1]))
#     print("SVM--->{}:".format(svm_auc))
#     print("决策树--->{}:".format(destree_auc))
#     print("贝叶斯--->{}".format(nb_auc))
#     spend = (time.clock()-start)
#     print("use time:{}".format(spend))
#     print("="*20)
#     make_dic = {
#         "size":data.shape,
#         "name":each,
#         "log2D":data.shape[1],
#         "SVM":svm_auc,
#         "tree":destree_auc,
#         "nb":nb_auc
#     }
#     res_list.append(make_dic)
# print("总共耗时:",(time.clock()-real_start))
# print(res_list)


In [11]:
# data = {key:[]for key in res_list[0].keys()}
# for one in res_list:
#     for key,value in one.items():
#         data[key].append(value)
# data.pop("size")
# data = pd.DataFrame(data,index=range(1,13))
# data.to_csv("log2D.csv")