In [2]:
"""
对向量之间的距离进行度量:
1.使用一阶范数进行度量;
2.使用欧式距离进行度量;
3.使用Mahalanobis距离进行度量;
"""

import numpy as np
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA

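# L1-norm (Manhattan) distance between vectors.
# Take the absolute difference of the two vectors in every dimension and sum the results to get the distance.
# data: data1 and data2, both of type np.ndarray.
# Shape is (n_samples, n_features): each row is one sample vector, each column is one feature across all samples.
# normalized: whether to apply Z-score standardisation to every feature of the samples.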
def abs_vec_dist(data1, data2, normalized=False):
    """Return the L1 (Manhattan) distance for each pair of aligned rows.

    Rows of ``data1`` and ``data2`` are paired in order with ``zip``, so
    any surplus rows in the longer input are ignored.

    Parameters
    ----------
    data1, data2 : np.ndarray
        Sample matrices whose rows support elementwise subtraction,
        typically of shape (n_samples, n_features).
    normalized : bool, default False
        When true, each input is passed through ``scale`` (imported from
        sklearn.preprocessing) before the rows are compared. The two
        inputs are scaled by two separate calls, not jointly.

    Returns
    -------
    list
        One entry per row pair: the sum of the absolute elementwise
        differences of the two rows.
    """
    if normalized:
        data1, data2 = scale(data1), scale(data2)

    # The builtin sum is kept on purpose so the accumulation order (and
    # therefore the floating-point result) is unchanged.
    return [sum(abs(row_a - row_b)) for row_a, row_b in zip(data1, data2)]
    
    
    
    
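# L2-norm of the difference vector, i.e. the Euclidean distance.
# Square the difference of the two vectors in every dimension, sum the squares and take the square root to get the distance.
# data: data1 and data2, both of type np.ndarray.
# Shape is (n_samples, n_features): each row is one sample vector, each column is one feature across all samples.
# normalized: whether to apply Z-score standardisation to every feature of the samples.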
def eculidean_vec_dist(data1, data2, normalized=False):
    """Return the Euclidean (L2) distance for each pair of aligned rows.

    Rows of ``data1`` and ``data2`` are paired in order with ``zip``, so
    any surplus rows in the longer input are ignored.

    Parameters
    ----------
    data1, data2 : np.ndarray
        Sample matrices whose rows support elementwise subtraction,
        typically of shape (n_samples, n_features).
    normalized : bool, default False
        When true, each input is passed through ``scale`` (imported from
        sklearn.preprocessing) before the rows are compared. The two
        inputs are scaled by two separate calls, not jointly.

    Returns
    -------
    list
        One entry per row pair: the square root of the summed squared
        elementwise differences.
    """
    if normalized:
        data1, data2 = scale(data1), scale(data2)

    return [
        np.sqrt(np.sum(np.power(row_a - row_b, 2)))
        for row_a, row_b in zip(data1, data2)
    ]

# # mahalanobis距离
# # 将需要比较的向量转换为各个维度是线性无关(使用PCA方法),并且各个维度进行了标准化后(使用sklearn.preprocessing中的scale方法)
# # 转换为新的向量之后使用欧式距离进行度量
# # data:data1,data2,且数据类型为np.adarray
# # 数据的shape为(n_samples,n_features):行向量为每一个向量数据样本,列向量为所有数据样本的某个特征字段
# # normalized为是否对数据样本中的每一个特征字段进行Z-SCORE标准化
# def mahalanobis_vec_dist(data1,data2,normalized=False):
#     dist_list = list()
#     if normalized:
#         data1 = scale(data1)
#         data2 = scale(data2)
        
#     pca_model = PCA()
#     #PCA使用训练数据data1进行     
#     pca_model.fit(X=data1)
#     #transform方法将数据转换为各个维度线性无关的数据
#     pca_data1 = pca_model.transform(data1)
#     pca_data2 = pca_model.transform(data2)
#     print("pca_data1.shape:{},pca_data2.shape:{}".format(pca_data1.shape,pca_data2.shape))
#     print("pca_data1\n:{}\npca_data2\n:{}".format(pca_data1,pca_data2))
    
    
#     #通过Z-SCORE标准化方法将线性无关数据进一步标准化,用以计算mahalanobis     
#     new_data1 = scale(pca_data1)
#     new_data2 = scale(pca_data2)
#     print("new_data1.shape:{},new_data2.shape:{}".format(new_data1.shape,new_data2.shape))
#     print("new_data1\n:{}\nnew_data2\n:{}".format(new_data1,new_data2))
    
    
#     #对于转化后数据的欧几里的距离就是mahalanobis距离
#     #ref:https://zhuanlan.zhihu.com/p/46626607
#     dist_list = eculidean_vec_dist(new_data1,new_data2)
    
#     return dist_list

# the data1 is training data
def mahalanobis_vec_dist(data1, data2, normalized=False):
    """Return the Mahalanobis distance for each pair of aligned rows.

    The covariance matrix is estimated from ``data1`` only (the training
    data). For every row pair the distance is
    ``sqrt(d . inv(cov) . d)`` where ``d = row1 - row2``.

    Rows of ``data1`` and ``data2`` are paired in order with ``zip``, so
    any surplus rows in the longer input are ignored.

    Parameters
    ----------
    data1 : array-like, shape (n_samples, n_features)
        Training samples; also the left-hand side of each row pair.
    data2 : array-like, shape (n_samples, n_features)
        Samples compared row by row against ``data1``.
    normalized : bool, default False
        When true, each input is passed through ``scale`` (imported from
        sklearn.preprocessing) before anything else. The two inputs are
        scaled by two separate calls, not jointly.

    Returns
    -------
    list
        One float per row pair.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the covariance matrix of ``data1`` is singular, which is
        always the case when n_samples <= n_features.
    """
    if normalized:
        data1 = scale(data1)
        data2 = scale(data2)

    # Work in float so integer inputs do not cause dtype clashes with the
    # floating-point inverse covariance.
    data1 = np.asarray(data1, dtype=float)
    data2 = np.asarray(data2, dtype=float)

    # np.cov treats each row as a variable, hence the transpose.
    # atleast_2d keeps the single-feature case (where np.cov returns a
    # 0-d array) usable with np.linalg.inv.
    train_cov_matrix = np.atleast_2d(np.cov(data1.T))
    inv_train_cov_matrix = np.linalg.inv(train_cov_matrix)

    dist_list = []
    for vec1, vec2 in zip(data1, data2):
        diff_vec = vec1 - vec2
        # Quadratic form d . inv(cov) . d as two chained products; the
        # third positional argument of np.dot is `out`, not an operand.
        squared_dist = np.dot(np.dot(diff_vec, inv_train_cov_matrix), diff_vec)
        dist_list.append(np.sqrt(squared_dist))

    return dist_list

In [3]:
# Two small 2x3 integer sample matrices used as inputs for the distance functions.
v1 = np.array([[2,5,7],[4,7,8]])
v2 = np.array([[2,4,9],[5,6,7]])

In [4]:
# Pass v1 through sklearn's `scale`; scale_ret is not referenced again in the visible cells.
scale_ret = scale(v1)

In [6]:
# Row-wise L1 and Euclidean distances between v1 and v2.
ret_abs_list = abs_vec_dist(v1,v2)
ret_ecu_list = eculidean_vec_dist(v1,v2)
# NOTE(review): the Mahalanobis call below is disabled. With v1 holding only
# 2 samples for 3 features, the sample covariance is rank-deficient and has
# no proper inverse, so this input would not work even if re-enabled.
# ret_mahala_list = mahalanobis_vec_dist(v1,v2)

In [8]:
# Print both distance lists (one value per row pair of v1/v2).
print(ret_abs_list)
print(ret_ecu_list)
# print(ret_mahala_list)

[3, 3]
[2.23606797749979, 1.7320508075688772]


##### 1.使用PCA将原始数据进行特征变换,变换为各个特征之间是线性无关
##### 2.将变换之后的数据进行各个特征维度标准化(Z-SCORE)
##### 3.对数据之间的距离使用欧式距离进行度量

In [9]:
import numpy as np

# Use a seeded generator so the 20x5 test matrix, and every value derived
# from it in the following cells, is reproducible on Restart & Run All.
rng = np.random.default_rng(42)
test_matrix = rng.random((20, 5))

In [10]:
# Display the matrix dimensions as (rows, columns).
test_matrix.shape

(20, 5)

In [11]:
# Covariance between the columns of test_matrix: rowvar=False tells np.cov
# that each column is a variable and each row an observation.
cov_ret = np.cov(test_matrix, rowvar=False)

In [12]:
# Display the shape of the covariance matrix.
cov_ret.shape

(5, 5)

In [13]:
# Invert the covariance matrix; np.linalg.inv raises LinAlgError if it is singular.
inv_cov_ret = np.linalg.inv(cov_ret)

In [14]:
# Display the shape of the inverse covariance matrix.
inv_cov_ret.shape

(5, 5)

In [15]:
# Display the covariance matrix values.
cov_ret

array([[ 0.07184009, -0.00015561,  0.02020436,  0.00861818, -0.02785314],
       [-0.00015561,  0.06933355, -0.02095867,  0.01431836,  0.01364422],
       [ 0.02020436, -0.02095867,  0.06664079, -0.00257308,  0.00351958],
       [ 0.00861818,  0.01431836, -0.00257308,  0.09476555,  0.01717466],
       [-0.02785314,  0.01364422,  0.00351958,  0.01717466,  0.06420095]])

In [16]:
# Display the inverse covariance matrix values.
inv_cov_ret

array([[21.157503  , -4.00447015, -8.41929014, -3.62483681, 11.46133421],
       [-4.00447015, 17.90009973,  7.09746144, -1.12752955, -5.62896923],
       [-8.41929014,  7.09746144, 20.19799579,  1.44793625, -6.65565317],
       [-3.62483681, -1.12752955,  1.44793625, 11.92584518, -4.60269404],
       [11.46133421, -5.62896923, -6.65565317, -4.60269404, 23.3409615 ]])

In [17]:
# Two 5-element vectors and their elementwise difference, used for the quadratic form computed further down.
arr1 = np.array([1,5,7,8,6])
arr2 = np.array([5,8,7,4,2])
vec_diff = arr1 - arr2

In [18]:
# Display the difference vector.
vec_diff

array([-4, -3,  0,  4,  4])

In [19]:
# Display the shape of the difference vector (it is 1-D).
vec_diff.shape

(5,)

In [22]:
# Quadratic form vec_diff . inv_cov_ret . vec_diff in two steps.
# Transposing a 1-D array is a no-op, so vec_diff is used directly.
a = vec_diff.dot(inv_cov_ret)
b = a.dot(vec_diff)

In [23]:
# Display the scalar result of the quadratic form.
b

731.8844135964315

In [2]:
import numpy as np

# Build a 1-D array from a plain Python list.
l = [1,5,7,8,9]
arr_l = np.array(l)

In [3]:
# Display the shape of the array built from the list.
arr_l.shape

(5,)