# 数据预处理：异常值、过采样和特征排序

In [1]:
#Import the Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Importing the Dataset
Dataset = pd.read_csv("../code/Data/german/german.csv")

## 1.异常值处理：三倍标准差探测法

In [3]:
for i in ['A2','A5','A13']:
    std = np.std(Dataset[i])
    mean = np.mean(Dataset[i])
    b = 3
    lower_limit = mean-b*std
    upper_limit = mean+b*std
    Dataset=Dataset.drop(Dataset[(Dataset[i]>upper_limit)|(Dataset[i]<lower_limit)].index)
X = Dataset.iloc[:, :-1]
y = Dataset.iloc[:, [-1]]

## 2.字符型数据编码

In [4]:
#list all the string value and convert them in to int values!!
#Note we are doing only labelencoder 

from sklearn.preprocessing import LabelEncoder

X =X.values
y =y.values

labelencoder = LabelEncoder()
for i in range(20):
    if (i not in [1,4,7,10,12,15,17]):
        X[:,i]=labelencoder.fit_transform(X[:, i])

## 3.基于最近邻的SMOTE抽样

In [5]:
y=y.reshape(1,-1)
y=pd.Series(y[0,:])
X=pd.DataFrame(X)
new_Dataset=pd.concat([X,y],axis=1,ignore_index=True)

In [6]:
import random
from sklearn.neighbors import NearestNeighbors
import numpy as np
class Smote:
    def __init__(self,samples,N=10,k=5):
        self.n_samples,self.n_attrs=samples.shape
        self.N=N
        self.k=k
        self.samples=samples
        self.newindex=0
       # self.synthetic=np.zeros((self.n_samples*N,self.n_attrs))

    def over_sampling(self):
        N=int(self.N/100)
        self.synthetic = np.zeros((self.n_samples * N, self.n_attrs))
        neighbors=NearestNeighbors(n_neighbors=self.k).fit(self.samples)
        print('neighbors',neighbors)
        for i in range(len(self.samples)):
            nnarray=neighbors.kneighbors(self.samples[i].reshape(1,-1),return_distance=False)[0]
            #print nnarray
            self._populate(N,i,nnarray)
        return self.synthetic


    # for each minority class samples,choose N of the k nearest neighbors and generate N synthetic samples.
    def _populate(self,N,i,nnarray):
        for j in range(N):
            nn=random.randint(0,self.k-1)
            dif=self.samples[nnarray[nn]]-self.samples[i]
            gap=random.random()
            self.synthetic[self.newindex]=self.samples[i]+gap*dif
            self.newindex+=1

# print(new_Dataset.iloc[(new_Dataset[20]==2).index])
# print((new_Dataset[20]==[2]).index)
a=new_Dataset.iloc[new_Dataset[new_Dataset[20]==2].index]
# a=np.array([[1,2,3],[4,5,6],[2,3,1],[2,1,2],[2,3,4],[2,3,4]])
a=np.array(a.iloc[:,:-1])
s=pd.DataFrame(Smote(a,N=200).over_sampling())
z=pd.Series([2]*len(s))
s=pd.concat([pd.DataFrame(s),z],axis=1,ignore_index=True)
all_Dataset=new_Dataset.append(s)
all_Dataset.columns=Dataset.columns

X = all_Dataset.iloc[:, :-1]
y = all_Dataset.iloc[:, [-1]]

neighbors NearestNeighbors()


## 4.Relief F 特征选择算法

In [7]:
import numpy as np
from random import randrange


def relief(features, labels, times):   # 传入特征矩阵，标签矩阵和随机选择的次数，因为有可能样本有很多，所以我们随机选择若干个样本来计算;这里的矩阵都是np.array
    (n_samples, n_features) = np.shape(features)
    delta = []
    delta_features = []
    delta_index = []
    sample_distance = sap_distance(features)  # 计算每两个样本之间的距离
    new_features = normalize(features)        # 对特征值归一化

    """
    # 下面开始计算相关统计量，并对各个特征的相关统计量进行比较，最后返回各个特征值相关统计量从高到低的排名
    # 这是将随机选取的样本代入计算出来的delta
    for i in range(0, times):
        randnum = randrange(0, n_samples, 1)        # 生成一个随机数
        one_sample = features[randnum]        # 随机选择一个样本
        (nearhit, nearmiss, nearhit_index, nearmiss_index) = search_near(sample_distance, labels, randnum, features)  # 找出猜中近邻和猜错近邻,nearhit为猜中近邻样本的行向量
        delta.append(relevant_feature(nearhit_index, nearmiss_index, new_features, randnum))  # 计算相关统计量矩阵
    delta = np.asarray(delta)
    for j in range(0, n_features):
        delta_features.append(np.sum(delta[:, j]))
    midd = list(set(delta_features))
    midd.sort(reverse=True)
    for p in midd:
        for q in range(0, len(delta_features)):
            if delta_features[q] == p:
                delta_index.append(q)
    return delta_index
    """
    # 这是将所有样本都带入计算的delta
    for i in range(0, n_samples):
        (nearhit, nearmiss, nearhit_index, nearmiss_index) = search_near(sample_distance, labels, i,
                                                                         features)  # 找出猜中近邻和猜错近邻,nearhit为猜中近邻样本的行向量
        delta.append(relevant_feature(nearhit_index, nearmiss_index, new_features, i))  # 计算相关统计量矩阵
    delta = np.asarray(delta)
    for j in range(0, n_features):
        delta_features.append(np.sum(delta[:, j]))
    midd = list(set(delta_features))
    midd.sort(reverse=True)
    for p in midd:
        for q in range(0, len(delta_features)):
            if delta_features[q] == p:
                delta_index.append(q)
    return delta_index


def normalize(features):
    (n_samples, n_features) = np.shape(features)
    print("shape=", n_samples, n_features)
    fe_max = []
    fe_min = []
    n_deno = []
    new_features = np.zeros((n_samples, n_features))
    print("new_features=", new_features)
    for i in range(0, n_features):
        max_index = np.argmax(features[:, i])
        min_index = np.argmin(features[:, i])
        fe_max.append(features[max_index, i])  # 计算每一个特征的最大值
        fe_min.append(features[min_index, i])  # 计算每一个特征的最小值
    n_deno = np.asarray(fe_max) - np.asarray(fe_min)  # 求出归一化的分母
    for j in range(0, n_features):
        for k in range(0, n_samples):
            new_features[k, j] = (features[k, j]-fe_min[j]) / n_deno[j]  # 归一化
    return new_features

def sap_distance(features):
    (n_samples, n_features) = np.shape(features)
    distance = np.zeros((n_samples, n_samples))
    for i in range(0, n_samples):
        for j in range(0, n_samples):
            diff_distance = features[i]-features[j]
            if i == j:
                distance[i, j] = 9999
            else:
                distance[i, j] = euclid_distance(diff_distance)  # 使用欧几里德距离定义样本之间的距离
    print("距离：",distance)
    return distance

def euclid_distance(diff_distance):
    counter = np.power(diff_distance, 2)
    counter = np.sum(counter)
    counter = np.sqrt(counter)
    return counter

def search_near(sample_distance, labels, randnum, feartures):
    (n_samples, n_features) = np.shape(feartures)
    nearhit_list = []
    nearmiss_list = []
    hit_index = []
    miss_index = []
    for i in range(0, n_samples):
        if labels[i] == labels[randnum]:
            nearhit_list.append(sample_distance[i, randnum])  # 将距离放在一个列表里面
            hit_index.append(i)                                 # 将样本标号放在另一个列表里面
        else:
            nearmiss_list.append(sample_distance[i, randnum])
            miss_index.append(i)
    nearhit_dis_index = nearhit_list.index(min(nearhit_list))   # 算出猜中近邻
    nearhit_index = hit_index[nearhit_dis_index]                # 将猜中近邻的样本标号赋给nearhit_index

    nearmiss_dis_index = nearmiss_list.index(min(nearmiss_list))
    nearmiss_index = miss_index[nearmiss_dis_index]


    nearhit = feartures[nearhit_index]
    nearmiss = feartures[nearmiss_index]

    return nearhit, nearmiss, nearhit_index, nearmiss_index

def relevant_feature(nearhit_index, nearmiss_index, new_features, randnum):
    diff_hit = abs(new_features[nearhit_index]-new_features[randnum])
    diff_miss = abs(new_features[nearmiss_index]-new_features[randnum])
    delta = -np.power(diff_hit, 2)+np.power(diff_miss, 2)
    return delta



out_features = np.array(X)
labels = np.array(y)
times = 2
features_importance = relief(out_features, labels, times)
print("排序：", features_importance)

距离： [[9999.         4782.39992054  927.21788162 ... 7060.18597489
   670.90278512  659.23540895]
 [4782.39992054 9999.         3855.26549021 ... 2278.03797159
  4112.90101589 4124.51437241]
 [ 927.21788162 3855.26549021 9999.         ... 6133.09440658
   259.61288031  271.31671957]
 ...
 [7060.18597489 2278.03797159 6133.09440658 ... 9999.
  6390.85931442 6402.49385287]
 [ 670.90278512 4112.90101589  259.61288031 ... 6390.85931442
  9999.           18.0988994 ]
 [ 659.23540895 4124.51437241  271.31671957 ... 6402.49385287
    18.0988994  9999.        ]]
shape= 1503 20
new_features= [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
排序： [0, 18, 5, 13, 10, 17, 11, 7, 3, 2, 6, 14, 8, 9, 16, 12, 1, 15, 4, 19]


In [8]:
new_columns=[]
for num in features_importance[:20]:# 17的效果最好
    label='A'+str(num+1)
    new_columns.append(label)
new_X=X.loc[:,new_columns]

data_new=pd.concat([new_X,y],axis=1)

In [9]:
data_new.to_csv('../code/Data/german/data.csv')