In [12]:
import pandas as pd
from scipy.io import arff
import numpy as np
import math
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split,cross_val_score,cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
import os
import time
from tqdm.notebook import trange, tqdm

In [13]:
import warnings

# Silence every warning category for the rest of the session so that library
# warnings do not flood the cell outputs during the long cross-validation runs.
warnings.simplefilter("ignore")

In [14]:
# Locate the ARFF data sets in the "Data" folder of the working directory.
# os.path.join replaces the hand-written "\Data" suffix, which relied on an
# invalid "\D" escape sequence and only worked with the Windows separator.
file_pwd = os.path.join(os.getcwd(), "Data")
# List only the top-level regular .arff files (no recursive walk needed) and
# sort them so the processing order is the same on every run and platform.
# A missing folder surfaces as FileNotFoundError naming the path.
file_list = sorted(
    name
    for name in os.listdir(file_pwd)
    if name.lower().endswith(".arff")
    and os.path.isfile(os.path.join(file_pwd, name))
)
file_list

['CM1.arff',
 'JM1.arff',
 'KC1.arff',
 'KC3.arff',
 'MC1.arff',
 'MC2.arff',
 'MW1.arff',
 'PC1.arff',
 'PC2.arff',
 'PC3.arff',
 'PC4.arff',
 'PC5.arff']

In [15]:
# Return the mean AUC over repeated k-fold cross-validation (10 x 10 by default)
def SVM(data, label, n_repeats=10, cv=10, random_state=None):
    """Score an SVC (gamma='auto') with repeated, reshuffled cross-validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns only. The frame is not modified.
    label : array-like or pandas.Series
        Class labels; attached to the features by column assignment.
    n_repeats : int, default 10
        How many times the rows are reshuffled and cross-validated.
    cv : int or CV splitter, default 10
        Forwarded to ``cross_val_score``.
    random_state : int, numpy.random.RandomState or None, default None
        Seed for the row shuffles. ``None`` keeps pandas' default behaviour.

    Returns
    -------
    float
        Mean over the repetitions of each repetition's mean ROC-AUC.
    """
    clf = SVC(gamma='auto')
    # Work on a private copy so the caller's frame never gains a "label" column.
    frame = data.copy()
    frame["label"] = label
    # One generator shared by all repetitions: each shuffle differs from the
    # previous one, yet the whole sequence is reproducible for a given seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    auc_list = []
    for _ in tqdm(range(n_repeats)):
        # Shuffle features and labels together before the folds are cut.
        frame = frame.sample(frac=1, random_state=rng)
        scores = cross_val_score(
            clf, frame.iloc[:, :-1], frame["label"], cv=cv, scoring="roc_auc"
        )
        auc_list.append(scores.mean())
    return np.mean(auc_list)

In [16]:
# Naive Bayes classification
def NB(data, label, n_repeats=10, cv=10, random_state=None):
    """Score a MultinomialNB with repeated, reshuffled cross-validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns only. The frame is not modified.
    label : array-like or pandas.Series
        Class labels; attached to the features by column assignment.
    n_repeats : int, default 10
        How many times the rows are reshuffled and cross-validated.
    cv : int or CV splitter, default 10
        Forwarded to ``cross_val_score``.
    random_state : int, numpy.random.RandomState or None, default None
        Seed for the row shuffles. ``None`` keeps pandas' default behaviour.

    Returns
    -------
    float
        Mean over the repetitions of each repetition's mean ROC-AUC.
    """
    clf = MultinomialNB()
    # Work on a private copy so the caller's frame never gains a "label" column.
    frame = data.copy()
    frame["label"] = label
    # One generator shared by all repetitions: each shuffle differs from the
    # previous one, yet the whole sequence is reproducible for a given seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    auc_list = []
    for _ in tqdm(range(n_repeats)):
        # Shuffle features and labels together before the folds are cut.
        frame = frame.sample(frac=1, random_state=rng)
        scores = cross_val_score(
            clf, frame.iloc[:, :-1], frame["label"], cv=cv, scoring="roc_auc"
        )
        auc_list.append(scores.mean())
    return np.mean(auc_list)

In [17]:
# Decision tree classification
def DT(data, label, n_repeats=10, cv=10, random_state=None):
    """Score a DecisionTreeClassifier with repeated, reshuffled cross-validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns only. The frame is not modified.
    label : array-like or pandas.Series
        Class labels; attached to the features by column assignment.
    n_repeats : int, default 10
        How many times the rows are reshuffled and cross-validated.
    cv : int or CV splitter, default 10
        Forwarded to ``cross_val_score``.
    random_state : int, numpy.random.RandomState or None, default None
        Seed for the row shuffles and for the tree itself. ``None`` keeps the
        libraries' default (unseeded) behaviour.

    Returns
    -------
    float
        Mean over the repetitions of each repetition's mean ROC-AUC.
    """
    # The tree breaks ties randomly, so it receives the seed as well.
    clf = DecisionTreeClassifier(random_state=random_state)
    # Work on a private copy so the caller's frame never gains a "label" column.
    frame = data.copy()
    frame["label"] = label
    # One generator shared by all repetitions: each shuffle differs from the
    # previous one, yet the whole sequence is reproducible for a given seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    auc_list = []
    for _ in tqdm(range(n_repeats)):
        # Shuffle features and labels together before the folds are cut.
        frame = frame.sample(frac=1, random_state=rng)
        scores = cross_val_score(
            clf, frame.iloc[:, :-1], frame["label"], cv=cv, scoring="roc_auc"
        )
        auc_list.append(scores.mean())
    return np.mean(auc_list)


In [18]:
# Evaluate SVM, decision tree and naive Bayes on every data set and collect
# one result record per file in res_list.
# time.perf_counter() replaces time.clock(), which was removed in Python 3.8.
real_start = time.perf_counter()
res_list = []
for each in file_list:
    # Load from the same folder that was listed to build file_list.
    records, _meta = arff.loadarff(os.path.join(file_pwd, each))
    df = pd.DataFrame(records)
    # Some files name the class column "label"; normalise it to "Defective".
    if df.columns[-1] == "label":
        df = df.rename(columns={'label': 'Defective'})

    # loadarff returns nominal values as bytes; map them to 0/1 and fail
    # loudly on anything else instead of producing a confusing cast error.
    label = df["Defective"].map({b'N': 0, b'Y': 1})
    if label.isna().any():
        raise ValueError(
            "Unexpected class value in {}: expected only b'N' or b'Y'".format(each)
        )
    label = label.astype(int)

    start = time.perf_counter()
    # Split into features and labels; the class column is assumed to be last.
    data = df.iloc[:, :-1]

    # Each classifier gets its own copy of the features.
    svm_auc = SVM(data.copy(), label)
    destree_auc = DT(data.copy(), label)
    nb_auc = NB(data.copy(), label)
    print("*" * 20)
    print("数据尺寸:{}".format(data.shape))
    print("文件名:{}".format(each))
    print("SVM--->{}:".format(svm_auc))
    print("决策树--->{}:".format(destree_auc))
    print("贝叶斯--->{}".format(nb_auc))
    spend = (time.perf_counter() - start)
    print("use time:{}".format(spend))
    print("=" * 20)
    make_dic = {
        "size": data.shape,
        "name": each,
        "SVM": svm_auc,
        "tree": destree_auc,
        "nb": nb_auc
    }
    res_list.append(make_dic)
print("总共耗时:", (time.perf_counter() - real_start))
print(res_list)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 37)
文件名:CM1.arff
SVM--->0.5864274193548387:
决策树--->0.5657876344086022:
贝叶斯--->0.44510080645161293
use time:4.759242799999811


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 21)
文件名:JM1.arff
SVM--->0.6903996352548769:
决策树--->0.5949248742094146:
贝叶斯--->0.3791161399818766
use time:1145.7284157999998


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(2096, 21)
文件名:KC1.arff
SVM--->0.8114974089831855:
决策树--->0.6210018585324777:
贝叶斯--->0.44940302015585293
use time:33.40991310000027


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(200, 39)
文件名:KC3.arff
SVM--->0.5381112132352941:
决策树--->0.6256066176470588:
贝叶斯--->0.4711917892156863
use time:3.0980022000003373


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9277, 38)
文件名:MC1.arff
SVM--->0.9171829506421366:
决策树--->0.8045451827948288:
贝叶斯--->0.3041561729362944
use time:283.0249772999996


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(127, 39)
文件名:MC2.arff
SVM--->0.5342777777777777:
决策树--->0.6195416666666667:
贝叶斯--->0.5764201388888889
use time:2.5601404000008188


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(264, 37)
文件名:MW1.arff
SVM--->0.523052536231884:
决策树--->0.589453502415459:
贝叶斯--->0.4882820048309179
use time:3.3880380000000514


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(759, 37)
文件名:PC1.arff
SVM--->0.6272077541161392:
决策树--->0.6252457359755497:
贝叶斯--->0.34929791481810113
use time:10.601287499999671


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(1585, 36)
文件名:PC2.arff
SVM--->0.8965047770700636:
决策树--->0.5322111301649518:
贝叶斯--->0.4852966274701943
use time:17.9797451000004


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(1125, 37)
文件名:PC3.arff
SVM--->0.7005988455988457:
决策树--->0.6468022425420384:
贝叶斯--->0.49431435344700647
use time:21.839447199999995


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(1399, 37)
文件名:PC4.arff
SVM--->0.6732729130569994:
决策树--->0.7281487851482501:
贝叶斯--->0.48318126453122445
use time:32.1379567999993


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(17001, 38)
文件名:PC5.arff
SVM--->0.9700597660902537:
决策树--->0.7348987016679838:
贝叶斯--->0.20430969628363688
use time:452.0760570000002
总共耗时: 2013.5443666999995
[{'size': (344, 37), 'name': 'CM1.arff', 'SVM': 0.5864274193548387, 'tree': 0.5657876344086022, 'nb': 0.44510080645161293}, {'size': (9593, 21), 'name': 'JM1.arff', 'SVM': 0.6903996352548769, 'tree': 0.5949248742094146, 'nb': 0.3791161399818766}, {'size': (2096, 21), 'name': 'KC1.arff', 'SVM': 0.8114974089831855, 'tree': 0.6210018585324777, 'nb': 0.44940302015585293}, {'size': (200, 39), 'name': 'KC3.arff', 'SVM': 0.5381112132352941, 'tree': 0.6256066176470588, 'nb': 0.4711917892156863}, {'size': (9277, 38), 'name': 'MC1.arff', 'SVM': 0.9171829506421366, 'tree': 0.8045451827948288, 'nb': 0.3041561729362944}, {'size': (127, 39), 'name': 'MC2.arff', 'SVM': 0.5342777777777777, 'tree': 0.6195416666666667, 'nb': 0.5764201388888889}, {'size': (264, 37), 'name': 'MW1.arff', 'SVM': 0.523052536231884, 'tree': 0.5

In [19]:
# Turn the per-file result records into a table and save it as CSV.
# Building the frame straight from the list of dicts keeps the key order
# (name, SVM, tree, nb) and also copes with an empty res_list.
data = pd.DataFrame(res_list).drop(columns="size", errors="ignore")
# Number the rows from 1; the length follows the number of data sets that were
# actually processed instead of assuming exactly twelve.
data.index = pd.RangeIndex(1, len(data) + 1)
data.to_csv("all_feature.csv")