In [1]:
import pandas as pd
from scipy.io import arff
import numpy as np
import math
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split,cross_val_score,cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
import os
import time
from tqdm.notebook import trange, tqdm

In [2]:
# Suppress every warning for the rest of the session to keep the cell output compact.
# NOTE(review): this also hides warnings raised by the libraries used in later cells,
# so comment it out when investigating unexpected scores.
import warnings 
warnings.filterwarnings('ignore')

In [3]:
# Build the dataset folder path with os.path.join instead of the literal "\Data":
# "\D" is an invalid escape sequence and a hard-coded backslash only works on Windows.
file_pwd = os.path.join(os.getcwd(), "Data")
# Only the files directly inside the folder are needed, so list that one level
# rather than walking the whole tree. A missing folder now raises FileNotFoundError,
# and sorting makes the processing order deterministic.
file_list = sorted(
    entry for entry in os.listdir(file_pwd)
    if os.path.isfile(os.path.join(file_pwd, entry))
)
file_list

['CM1.arff',
 'JM1.arff',
 'KC1.arff',
 'KC3.arff',
 'MC1.arff',
 'MC2.arff',
 'MW1.arff',
 'PC1.arff',
 'PC2.arff',
 'PC3.arff',
 'PC4.arff',
 'PC5.arff']

In [4]:
# Input: the full dataset (features plus label); output: the top ceil(log2(D)) ranked feature columns, without the label
def preprocess(df):
    """Rank the features by how strongly they separate the two classes and keep the best ones.

    For every defective sample (``Defective == b'Y'``) the ``k`` nearest clean
    samples (``Defective == b'N'``) are located by Euclidean distance on
    z-score-normalised features, where ``k`` is the clean-to-defective ratio
    (at least 1). For each such pair the absolute per-feature differences of
    the *raw* values are ranked (ties share the lowest rank) and the ranks are
    accumulated per feature. Features are ordered by descending accumulated
    rank (ties keep their original column order) and the first
    ``ceil(log2(D))`` are returned, ``D`` being the number of features.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature columns plus a ``Defective`` column holding
        ``b'Y'`` / ``b'N'``. The label column may sit at any position.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected feature columns (all rows, no label column),
        best-ranked first. At least one column is returned whenever the input
        has at least one feature. If either class is empty no ranking is
        possible and the leading columns are returned in their original order.
    """
    features = df.drop("Defective", axis=1)
    # Take the names from the feature frame itself so that ranked positions map
    # to the right columns wherever the label column is located.
    feature_names = features.columns.tolist()

    # z-score normalisation. A constant column has zero spread; divide it by 1
    # instead so it contributes 0 to every distance rather than turning them all into NaN.
    spread = features.std().replace(0, 1)
    normalized = ((features - features.mean()) / spread).values
    raw = features.values

    defective_mask = (df.Defective == b'Y').values
    clean_mask = (df.Defective == b'N').values
    yes_raw, no_raw = raw[defective_mask], raw[clean_mask]
    yes_norm, no_norm = normalized[defective_mask], normalized[clean_mask]

    weights = np.zeros(len(feature_names))
    if len(yes_raw) and len(no_raw):
        # Neighbours per defective sample: the class ratio, but never fewer than
        # one (a ratio below 1 would otherwise leave every feature unranked).
        k = max(1, len(no_raw) // len(yes_raw))
        for i in range(len(yes_raw)):
            # Distances from this defective sample to all clean samples at once
            distances = np.sqrt(np.square(no_norm - yes_norm[i]).sum(axis=1))
            nearest = np.argsort(distances)[:k]
            # One row per neighbour: raw-value gap of every feature
            gaps = np.abs(yes_raw[i] - no_raw[nearest])
            # Rank the gaps within each pair (row) and add them to the feature totals
            weights += pd.DataFrame(gaps).rank(axis=1, method='min').values.sum(axis=0)

    # sorted() is stable, so equally weighted features keep their original order
    order = sorted(range(len(feature_names)), key=lambda j: weights[j], reverse=True)
    ranked_columns = [feature_names[j] for j in order]

    n_features = len(ranked_columns)
    # log2(1) == 0 would select nothing, so keep at least one feature
    n_keep = max(1, math.ceil(math.log2(n_features))) if n_features else 0
    return df.loc[:, ranked_columns[:n_keep]].copy()

In [5]:
# Return the mean AUC over ten repetitions of 10-fold cross-validation
def SVM(data,label,random_state=None):
    """Score ``SVC(gamma='auto')`` with ten rounds of shuffled 10-fold cross-validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns, one row per sample. The frame is not modified.
    label : array-like or pandas.Series
        Class labels for the rows of ``data`` (a Series is aligned on the index).
    random_state : int, optional
        Seed for the row shuffles. ``None`` (default) keeps the shuffles unseeded.

    Returns
    -------
    float
        Mean over the ten rounds of the per-round mean ROC-AUC.
    """
    clf = SVC(gamma='auto')
    # Work on a private copy so the caller's frame never gains a "label" column
    shuffled = data.copy()
    shuffled["label"] = label
    # One generator shared by all rounds makes the whole sequence of shuffles
    # repeatable for a given seed
    rng = None if random_state is None else np.random.RandomState(random_state)
    auc_list = []
    for _ in tqdm(range(10)):
        # Reshuffle the rows so every round splits the data into different folds
        shuffled = shuffled.sample(frac=1, random_state=rng)
        scores = cross_val_score(
            clf,
            shuffled.drop(columns="label"),  # drop by name rather than assuming the label is last
            shuffled["label"],
            cv=10,
            scoring="roc_auc",
        )
        auc_list.append(scores.mean())
    return np.mean(auc_list)

In [6]:
# Naive Bayes classification
def NB(data,label,random_state=None):
    """Score ``MultinomialNB()`` with ten rounds of shuffled 10-fold cross-validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns, one row per sample. The frame is not modified.
    label : array-like or pandas.Series
        Class labels for the rows of ``data`` (a Series is aligned on the index).
    random_state : int, optional
        Seed for the row shuffles. ``None`` (default) keeps the shuffles unseeded.

    Returns
    -------
    float
        Mean over the ten rounds of the per-round mean ROC-AUC.
    """
    clf = MultinomialNB()
    # Work on a private copy so the caller's frame never gains a "label" column
    shuffled = data.copy()
    shuffled["label"] = label
    # One generator shared by all rounds makes the whole sequence of shuffles
    # repeatable for a given seed
    rng = None if random_state is None else np.random.RandomState(random_state)
    auc_list = []
    for _ in tqdm(range(10)):
        # Reshuffle the rows so every round splits the data into different folds
        shuffled = shuffled.sample(frac=1, random_state=rng)
        scores = cross_val_score(
            clf,
            shuffled.drop(columns="label"),  # drop by name rather than assuming the label is last
            shuffled["label"],
            cv=10,
            scoring="roc_auc",
        )
        auc_list.append(scores.mean())
    return np.mean(auc_list)

In [7]:
# Decision tree classification
def DT(data,label,random_state=None):
    """Score a ``DecisionTreeClassifier`` with ten rounds of shuffled 10-fold cross-validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns, one row per sample. The frame is not modified.
    label : array-like or pandas.Series
        Class labels for the rows of ``data`` (a Series is aligned on the index).
    random_state : int, optional
        Seed for the row shuffles and for the classifier. ``None`` (default)
        keeps both unseeded.

    Returns
    -------
    float
        Mean over the ten rounds of the per-round mean ROC-AUC.
    """
    clf = DecisionTreeClassifier(random_state=random_state)
    # Work on a private copy so the caller's frame never gains a "label" column
    shuffled = data.copy()
    shuffled["label"] = label
    # One generator shared by all rounds makes the whole sequence of shuffles
    # repeatable for a given seed
    rng = None if random_state is None else np.random.RandomState(random_state)
    auc_list = []
    for _ in tqdm(range(10)):
        # Reshuffle the rows so every round splits the data into different folds
        shuffled = shuffled.sample(frac=1, random_state=rng)
        scores = cross_val_score(
            clf,
            shuffled.drop(columns="label"),  # drop by name rather than assuming the label is last
            shuffled["label"],
            cv=10,
            scoring="roc_auc",
        )
        auc_list.append(scores.mean())
    return np.mean(auc_list)


In [8]:
# Evaluate every dataset: select features, then score the three classifiers.
# time.clock() was removed in Python 3.8; time.perf_counter() is the replacement
# for measuring elapsed time.
real_start = time.perf_counter()
res_list = []
for each in file_list:
    # Load from the same folder that was scanned to build file_list
    raw_records, _meta = arff.loadarff(os.path.join(file_pwd, each))
    df = pd.DataFrame(raw_records)
    # Some files name the class column "label"; normalise it to "Defective"
    if df.columns[-1] == "label":
        df = df.rename(columns={'label':'Defective'})

    start = time.perf_counter()
    # Split into feature data and labels (b'N' -> 0, b'Y' -> 1)
    data = preprocess(df)
    label = df.Defective.map({b'N': 0, b'Y': 1}).astype(int)
    # Train and score; each classifier gets its own copy of the features
    svm_auc = SVM(data.copy(),label)
    destree_auc = DT(data.copy(),label)
    nb_auc = NB(data.copy(),label)
    print("*"*20)
    print("数据尺寸:{}".format(data.shape))
    print("文件名:{}".format(each))
    print("log2D:{}".format(data.shape[1]))
    print("SVM--->{}:".format(svm_auc))
    print("决策树--->{}:".format(destree_auc))
    print("贝叶斯--->{}".format(nb_auc))
    spend = (time.perf_counter()-start)
    print("use time:{}".format(spend))
    print("="*20)
    make_dic = {
        "size":data.shape,
        "name":each,
        "log2D":data.shape[1],
        "SVM":svm_auc,
        "tree":destree_auc,
        "nb":nb_auc
    }
    res_list.append(make_dic)
print("总共耗时:",(time.perf_counter()-real_start))
print(res_list)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(344, 6)
文件名:CM1.arff
log2D:6
SVM--->0.5576451612903226:
决策树--->0.4961478494623656:
贝叶斯--->0.38238844086021506
use time:3.6591262999999996


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9593, 5)
文件名:JM1.arff
log2D:5
SVM--->0.6675134583042334:
决策树--->0.5662707816031978:
贝叶斯--->0.3712714770120866
use time:799.9691903


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(2096, 5)
文件名:KC1.arff
log2D:5
SVM--->0.7981115664115925:
决策树--->0.5271977737165022:
贝叶斯--->0.4407297194113099
use time:31.112286499999982


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(200, 6)
文件名:KC3.arff
log2D:6
SVM--->0.5157904411764707:
决策树--->0.5330989583333333:
贝叶斯--->0.47273897058823533
use time:2.6175397999999177


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(9277, 6)
文件名:MC1.arff
log2D:6
SVM--->0.8804181680390609:
决策树--->0.8112091376901513:
贝叶斯--->0.6709705630694425
use time:147.92433070000004


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(127, 6)
文件名:MC2.arff
log2D:6
SVM--->0.5338194444444444:
决策树--->0.5520694444444445:
贝叶斯--->0.6011180555555556
use time:2.5809107999999696


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(264, 6)
文件名:MW1.arff
log2D:6
SVM--->0.5131853864734299:
决策树--->0.6145833333333334:
贝叶斯--->0.4178109903381643
use time:3.2987233999999717


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(759, 6)
文件名:PC1.arff
log2D:6
SVM--->0.560062629399586:
决策树--->0.567280045351474:
贝叶斯--->0.31155237602287295
use time:9.505013900000108


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(1585, 6)
文件名:PC2.arff
log2D:6
SVM--->0.8837767434264251:
决策树--->0.5144543116119549:
贝叶斯--->0.3258476237138658
use time:14.621963599999958


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(1125, 6)
文件名:PC3.arff
log2D:6
SVM--->0.6306164413817477:
决策树--->0.6037578039873959:
贝叶斯--->0.5068995715169183
use time:18.335934199999997


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(1399, 6)
文件名:PC4.arff
log2D:6
SVM--->0.6245920618680632:
决策树--->0.5579236889122348:
贝叶斯--->0.4201077412172386
use time:36.83919190000006


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


********************
数据尺寸:(17001, 6)
文件名:PC5.arff
log2D:6
SVM--->0.9510261653159858:
决策树--->0.6133271620143215:
贝叶斯--->0.19612663173885608
use time:476.00137889999996
总共耗时: 1549.4492114
[{'size': (344, 6), 'name': 'CM1.arff', 'log2D': 6, 'SVM': 0.5576451612903226, 'tree': 0.4961478494623656, 'nb': 0.38238844086021506}, {'size': (9593, 5), 'name': 'JM1.arff', 'log2D': 5, 'SVM': 0.6675134583042334, 'tree': 0.5662707816031978, 'nb': 0.3712714770120866}, {'size': (2096, 5), 'name': 'KC1.arff', 'log2D': 5, 'SVM': 0.7981115664115925, 'tree': 0.5271977737165022, 'nb': 0.4407297194113099}, {'size': (200, 6), 'name': 'KC3.arff', 'log2D': 6, 'SVM': 0.5157904411764707, 'tree': 0.5330989583333333, 'nb': 0.47273897058823533}, {'size': (9277, 6), 'name': 'MC1.arff', 'log2D': 6, 'SVM': 0.8804181680390609, 'tree': 0.8112091376901513, 'nb': 0.6709705630694425}, {'size': (127, 6), 'name': 'MC2.arff', 'log2D': 6, 'SVM': 0.5338194444444444, 'tree': 0.5520694444444445, 'nb': 0.6011180555555556}, {'size': 

In [9]:
# Collect the per-dataset results into one table; the "size" tuple is not exported
data = pd.DataFrame(res_list).drop(columns="size", errors="ignore")
# Number the rows from 1 according to how many datasets were actually evaluated,
# instead of assuming there are exactly twelve.
data.index = range(1, len(data) + 1)
data.to_csv("log2D_2.csv")