In [2]:
import pandas as pd
from scipy.io import arff
import numpy as np
import math
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split,cross_val_score,cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
import os
import time
from tqdm import tqdm

In [3]:
import warnings 
warnings.filterwarnings('ignore')

In [4]:
file_pwd = os.getcwd() + "\Data"
res = os.walk(file_pwd)
file_list = [i[2] for i in res][0]
file_list

['CM1.arff',
 'JM1.arff',
 'KC1.arff',
 'KC3.arff',
 'MC1.arff',
 'MC2.arff',
 'MW1.arff',
 'PC1.arff',
 'PC2.arff',
 'PC3.arff',
 'PC4.arff',
 'PC5.arff']

In [5]:
#输入data,输出log2D列的数据
def preprocess(df):
    head_list = df.columns.values.tolist()
    #标准化
    data_without_YN = df.drop("Defective",axis = 1)
    data_normalize = (data_without_YN-data_without_YN.mean())/(data_without_YN.std())
    data_normalize['Defective'] = df.Defective
    
    row_yes_data = df[df.Defective == b'Y']
    row_yes_data = row_yes_data.drop("Defective",axis = 1).values
    row_no_data =  df[df.Defective == b'N']
    row_no_data = row_no_data.drop("Defective",axis = 1).values
    
    yes_samples = data_normalize[data_normalize.Defective == b"Y"]
    yes_samples = yes_samples.drop("Defective",axis = 1)
    no_samples = data_normalize[data_normalize.Defective == b"N"]
    no_samples = no_samples.drop("Defective",axis = 1)
    k = len(no_samples)//len(yes_samples)
    
    yes_samples_array = yes_samples.values
    no_samples_array = no_samples.values
    array = [[np.sqrt(np.sum(np.square(x-y))) for y in no_samples_array]for x in yes_samples_array]
    array = np.array(array).argsort()[:,:k]
    w = {i:0 for i in range(yes_samples.shape[1])}

    for i in range(array.shape[0]):
        for j in array[i]:
            ds = np.abs(row_yes_data[i,:] - row_no_data[j,:])
            ds = pd.Series(ds).rank(method='min')
            for index in range(len(ds)):
                w[index] += ds[index]
    
    a = sorted(w.items(),key=lambda x:x[1],reverse=True)
    b = [i[0] for i in a ]
    c = np.array(head_list)
    column = list(c[b])
    df2 = df.loc[:,column].copy()
    
    d = df2.shape[1]
    log2d = math.ceil(math.log2(d))
    df2 = df2.iloc[:,:log2d]
    return df2

In [6]:
#返回十次十折交叉验证的平均auc
def SVM(data,label):
    clf = SVC(gamma='auto')
    auc_list = []
    data["label"] = label
    for i in tqdm(range(10)):
        data = data.sample(frac=1)
        scores = cross_val_score(clf,data.iloc[:,:-1],data.label,cv=10,scoring="roc_auc")
        auc_list.append(scores.mean())
    return np.mean(auc_list)

In [7]:
#贝叶斯分类
def NB(data,label):
    clf = MultinomialNB()
    auc_list = []
    data["label"] = label
    for i in tqdm(range(10)):
        data = data.sample(frac=1)
        scores = cross_val_score(clf,data.iloc[:,:-1],data.label,cv=10,scoring="roc_auc")
        auc_list.append(scores.mean())
    return np.mean(auc_list)

In [8]:
#决策树分类
def DT(data,label):
    clf = DecisionTreeClassifier()
    auc_list = []
    data["label"] = label
    for i in tqdm(range(10)):
        data = data.sample(frac=1)
        scores = cross_val_score(clf,data.iloc[:,:-1],data.label,cv=10,scoring="roc_auc")
        auc_list.append(scores.mean())
    return np.mean(auc_list)


In [9]:
real_start = time.clock()
res_list = []
for each in file_list:
    data = arff.loadarff('./data/{}'.format(each))
    df = pd.DataFrame(data[0])
    if df.columns[-1] == "label":
        df.rename(columns={'label':'Defective'},inplace=True) 
    defective = df.Defective.copy()
    defective[defective==b'N'] = 0
    defective[defective==b'Y'] = 1
    
    start = time.clock()
    #分为数据和标签
    data = preprocess(df)
    label = defective.astype(int)
    svm_auc = SVM(data,label)
    destree_auc = DT(data,label)
    nb_auc = NB(data,label)
    print("*"*20)
    print("数据尺寸:{}".format(data.shape))
    print("文件名:{}".format(each))
    print("log2D:{}".format(data.shape[1]))
    print("SVM--->{}:".format(svm_auc))
    print("决策树--->{}:".format(destree_auc))
    print("贝叶斯--->{}".format(nb_auc))
    spend = (time.clock()-start)
    print("use time:{}".format(spend))
    print("="*20)
    make_dic = {
        "size":data.shape,
        "name":each,
        "log2D":data.shape[1],
        "SVM":svm_auc,
        "tree":destree_auc,
        "nb":nb_auc
    }
    res_list.append(make_dic)
print("总共耗时:",(time.clock()-real_start))
print(res_list)

100%|███████████████████████████████| 10/10 [00:01<00:00,  5.92it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 13.53it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 14.24it/s]


********************
数据尺寸:(344, 7)
文件名:CM1.arff
log2D:7
SVM--->0.555752688172043:
决策树--->0.5071263440860214:
贝叶斯--->0.3505497311827957
use time:3.7558976


100%|███████████████████████████████| 10/10 [09:39<00:00, 57.98s/it]
100%|███████████████████████████████| 10/10 [00:05<00:00,  1.77it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 10.01it/s]


********************
数据尺寸:(9593, 6)
文件名:JM1.arff
log2D:6
SVM--->0.6664491639691839:
决策树--->0.5637473672868878:
贝叶斯--->0.3707869263276976
use time:776.067861


100%|███████████████████████████████| 10/10 [00:18<00:00,  1.88s/it]
100%|███████████████████████████████| 10/10 [00:01<00:00,  6.91it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 13.75it/s]


********************
数据尺寸:(2096, 6)
文件名:KC1.arff
log2D:6
SVM--->0.796422445508215:
决策树--->0.521384014877397:
贝叶斯--->0.4403760890782708
use time:29.531004800000005


100%|███████████████████████████████| 10/10 [00:00<00:00, 10.69it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 14.81it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 14.39it/s]


********************
数据尺寸:(200, 7)
文件名:KC3.arff
log2D:7
SVM--->0.5135539215686274:
决策树--->0.5246001838235295:
贝叶斯--->0.4723268995098039
use time:2.6541952000000038


100%|███████████████████████████████| 10/10 [01:53<00:00, 11.31s/it]
100%|███████████████████████████████| 10/10 [00:01<00:00,  5.31it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 10.24it/s]


********************
数据尺寸:(9277, 7)
文件名:MC1.arff
log2D:7
SVM--->0.8835570262991663:
决策树--->0.8005816733375596:
贝叶斯--->0.6387463614972023
use time:133.29019879999998


100%|███████████████████████████████| 10/10 [00:00<00:00, 11.55it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 14.75it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 16.09it/s]


********************
数据尺寸:(127, 7)
文件名:MC2.arff
log2D:7
SVM--->0.5302777777777777:
决策树--->0.5499583333333333:
贝叶斯--->0.5976770833333334
use time:2.4511095999999952


100%|███████████████████████████████| 10/10 [00:01<00:00,  8.64it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 15.24it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 14.04it/s]


********************
数据尺寸:(264, 7)
文件名:MW1.arff
log2D:7
SVM--->0.5096195652173913:
决策树--->0.6037349033816425:
贝叶斯--->0.4241455314009661
use time:2.885296999999923


100%|███████████████████████████████| 10/10 [00:04<00:00,  2.14it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 12.94it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 16.02it/s]


********************
数据尺寸:(759, 7)
文件名:PC1.arff
log2D:7
SVM--->0.5583918712412501:
决策树--->0.5563629103815438:
贝叶斯--->0.3184344868382135
use time:7.1608764000000065


100%|███████████████████████████████| 10/10 [00:08<00:00,  1.14it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 12.84it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 14.97it/s]


********************
数据尺寸:(1585, 7)
文件名:PC2.arff
log2D:7
SVM--->0.8841944308345582:
决策树--->0.5088165523436224:
贝叶斯--->0.2988535031847134
use time:11.744186200000058


100%|███████████████████████████████| 10/10 [00:10<00:00,  1.05s/it]
100%|███████████████████████████████| 10/10 [00:01<00:00,  9.10it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 14.32it/s]


********************
数据尺寸:(1125, 7)
文件名:PC3.arff
log2D:7
SVM--->0.6355518008069029:
决策树--->0.6040268574962452:
贝叶斯--->0.509480593103042
use time:15.032965600000011


100%|███████████████████████████████| 10/10 [00:13<00:00,  1.37s/it]
100%|███████████████████████████████| 10/10 [00:01<00:00,  8.43it/s]
100%|███████████████████████████████| 10/10 [00:00<00:00, 14.62it/s]


********************
数据尺寸:(1399, 7)
文件名:PC4.arff
log2D:7
SVM--->0.6249232964766163:
决策树--->0.5518660640319035:
贝叶斯--->0.41187264636629006
use time:18.954724599999963


100%|███████████████████████████████| 10/10 [03:04<00:00, 18.50s/it]
100%|███████████████████████████████| 10/10 [00:02<00:00,  3.85it/s]
100%|███████████████████████████████| 10/10 [00:01<00:00,  9.03it/s]


********************
数据尺寸:(17001, 7)
文件名:PC5.arff
log2D:7
SVM--->0.9492571822775971:
决策树--->0.6204143409351244:
贝叶斯--->0.19727954325547778
use time:299.4676970999999
总共耗时: 1305.5131313
[{'size': (344, 7), 'name': 'CM1.arff', 'log2D': 7, 'SVM': 0.555752688172043, 'tree': 0.5071263440860214, 'nb': 0.3505497311827957}, {'size': (9593, 6), 'name': 'JM1.arff', 'log2D': 6, 'SVM': 0.6664491639691839, 'tree': 0.5637473672868878, 'nb': 0.3707869263276976}, {'size': (2096, 6), 'name': 'KC1.arff', 'log2D': 6, 'SVM': 0.796422445508215, 'tree': 0.521384014877397, 'nb': 0.4403760890782708}, {'size': (200, 7), 'name': 'KC3.arff', 'log2D': 7, 'SVM': 0.5135539215686274, 'tree': 0.5246001838235295, 'nb': 0.4723268995098039}, {'size': (9277, 7), 'name': 'MC1.arff', 'log2D': 7, 'SVM': 0.8835570262991663, 'tree': 0.8005816733375596, 'nb': 0.6387463614972023}, {'size': (127, 7), 'name': 'MC2.arff', 'log2D': 7, 'SVM': 0.5302777777777777, 'tree': 0.5499583333333333, 'nb': 0.5976770833333334}, {'size': (264, 7

In [10]:
data = {key:[]for key in res_list[0].keys()}
for one in res_list:
    for key,value in one.items():
        data[key].append(value)
data.pop("size")
data = pd.DataFrame(data,index=range(1,13))
data

Unnamed: 0,name,log2D,SVM,tree,nb
1,CM1.arff,7,0.555753,0.507126,0.35055
2,JM1.arff,6,0.666449,0.563747,0.370787
3,KC1.arff,6,0.796422,0.521384,0.440376
4,KC3.arff,7,0.513554,0.5246,0.472327
5,MC1.arff,7,0.883557,0.800582,0.638746
6,MC2.arff,7,0.530278,0.549958,0.597677
7,MW1.arff,7,0.50962,0.603735,0.424146
8,PC1.arff,7,0.558392,0.556363,0.318434
9,PC2.arff,7,0.884194,0.508817,0.298854
10,PC3.arff,7,0.635552,0.604027,0.509481
