In [1]:
from numba import cuda,njit, types, typed
import pandas as pd
import numpy as np
from tqdm import tqdm
import numpy as np
import math

In [2]:
import json
# Load the road reference points; this object is later handed to
# createKDTree() as the collection of points to index.
with open('../roads_dict3.json') as f:
    road_data = json.load(f)

In [3]:
# Load the preprocessed trajectory records. Downstream cells unpack every
# record as a (line, target) pair: a list of points plus one target point.
with open('./processed_data/processed_data2.json') as f:
    line_data = json.load(f)

In [4]:
# Peek at the first record to see its [points, target] layout.
line_data[0]

[[[119.279725, 26.07737],
  [119.278915, 26.080015],
  [119.27827, 26.082518],
  [119.278243, 26.082715],
  [119.277613, 26.085935],
  [119.277358, 26.08976],
  [119.27741, 26.093143],
  [119.277602, 26.095508],
  [119.276643, 26.100297],
  [119.278622, 26.10405],
  [119.282238, 26.106752]],
 [119.282238, 26.106752]]

In [5]:
# def computeDist(A,B):
#     return np.sqrt(sum(np.power((A - B), 2)))    
def computeDist(pt1, pt2):
    """Return the Euclidean distance between two points.

    Coordinates are paired by index over ``len(pt1)`` entries, so ``pt2``
    must provide at least as many coordinates as ``pt1``.
    """
    squared_total = 0.0
    for axis in range(len(pt1)):
        delta = pt1[axis] - pt2[axis]
        squared_total = squared_total + delta * delta
    return math.sqrt(squared_total)

In [6]:
class KD_node:
    """A single node of a KD-tree.

    Attributes:
        point: the data point stored at this node.
        split: index of the coordinate this node splits on.
        left: left child node, or None.
        right: right child node, or None.
    """

    def __init__(self, point=None, split=None, LL=None, RR=None):
        """Store the point, its split index and the two children.

        LL and RR are the left and right child nodes respectively.
        """
        self.point, self.split = point, split
        self.left, self.right = LL, RR
def createKDTree(root, data_list):
    """Recursively build a KD-tree from a collection of points.

    Args:
        root: ignored; kept only so existing calls keep working. A new
            node is always created and returned.
        data_list: unordered sequence of points, each an indexable
            sequence of coordinates. It is NOT modified.

    Returns:
        The KD_node at the root of the (sub)tree, or None when
        ``data_list`` is empty.
    """
    count = len(data_list)
    if count == 0:
        return None

    # Number of coordinates per point, taken from the first point.
    dimension = len(data_list[0])

    # Split on the coordinate with the largest variance; index 0 is kept
    # when no coordinate has a strictly positive variance.
    max_var = 0
    split = 0
    for axis in range(dimension):
        var = computeVariance([pt[axis] for pt in data_list])
        if var > max_var:
            max_var = var
            split = axis

    # sorted() returns a new list, so the caller's data_list keeps its
    # original order (list.sort() would reorder it in place).
    ordered = sorted(data_list, key=lambda pt: pt[split])

    # The element at index len // 2 becomes this node; the points before
    # it go to the left subtree, the points after it to the right one.
    median = count // 2
    node = KD_node(ordered[median], split)
    node.left = createKDTree(None, ordered[:median])
    node.right = createKDTree(None, ordered[median + 1:])
    return node
  


In [7]:
def computeVariance(arrayList):
    """Return the population variance of a sequence of numbers.

    Args:
        arrayList: sequence of values convertible to float.

    Returns:
        The variance (mean squared deviation from the mean) as a float.
        NaN is returned for an empty sequence.
    """
    # Convert to a float array up front so integer input is not squared
    # with integer arithmetic.
    values = np.asarray(arrayList, dtype=float)
    if values.size == 0:
        return np.nan
    # ndarray.var() subtracts the mean before squaring. This stays accurate
    # and non-negative when the values are large compared with their spread,
    # unlike the E[x^2] - (E[x])^2 shortcut.
    return values.var()

In [8]:
def findNN(root, query):
    """Find the point stored in a KD-tree that is closest to ``query``.

    The search is an exact depth-first traversal: at every node the child
    on the query's side of the split is explored first, and the other
    child is explored only if the split boundary is closer to the query
    than the best point found so far.

    Args:
        root: root node of the tree (as returned by createKDTree). Nodes
            must expose ``point``, ``split``, ``left`` and ``right``.
        query: the query point, an indexable sequence of coordinates.

    Returns:
        A tuple ``(NN, min_dist)``: the nearest stored point and its
        Euclidean distance to ``query``.

    Raises:
        ValueError: if ``root`` is None (empty tree).
    """
    if root is None:
        raise ValueError("findNN() requires a non-empty KD-tree")

    def squared_dist(point):
        # Squared Euclidean distance from the query to point. Squares are
        # compared during the search; the root is taken once at the end.
        total = 0.0
        for axis in range(len(query)):
            delta = query[axis] - point[axis]
            total = total + delta * delta
        return total

    # The root is always visited first, so these initial values are
    # replaced on the first iteration.
    best_point = root.point
    best_sq = float("inf")

    # Each entry is (subtree, squared distance from the query to the split
    # boundary that had to be crossed to reach it); 0.0 means the subtree
    # lies on the query's own side.
    pending = [(root, 0.0)]
    while pending:
        node, boundary_sq = pending.pop()
        # Prune: nothing beyond this boundary can beat the current best.
        if boundary_sq >= best_sq:
            continue

        dist_sq = squared_dist(node.point)
        if dist_sq < best_sq:
            best_sq = dist_sq
            best_point = node.point

        axis = node.split
        if query[axis] <= node.point[axis]:
            near, far = node.left, node.right
        else:
            near, far = node.right, node.left

        # Push the far side first so the near side is popped, and can
        # tighten best_sq, before the far side's pruning test runs.
        if far is not None:
            offset = query[axis] - node.point[axis]
            pending.append((far, offset * offset))
        if near is not None:
            pending.append((near, 0.0))

    return best_point, math.sqrt(best_sq)


In [9]:
# Placeholder node; the next cell rebinds `root` to the tree returned by
# createKDTree().
root = KD_node()

In [10]:
import numpy 
# Build the KD-tree over the road points; the returned root node replaces
# the placeholder created above.
root = createKDTree(root,road_data)

In [11]:
# Spot check: look up one coordinate pair; the result is
# (nearest stored point, distance to it).
findNN(root,[119.326755, 26.073772])

([119.32657401, 26.07369501], 0.00019668462116161864)

In [12]:
# Snap every trajectory onto the road points: each point of a line, and the
# target, is replaced by its nearest neighbour from the KD-tree.
saver = []
for line, target in tqdm(line_data):
    # The last point of each line is not kept in the stored history, so it
    # is sliced off before the lookup instead of being looked up and then
    # discarded. (In the sample record shown above it equals the target.)
    snapped_line = [findNN(root, ptr)[0] for ptr in line[:-1]]
    saver.append([snapped_line, findNN(root, target)[0]])

100%|██████████| 3802221/3802221 [19:37<00:00, 3229.25it/s] 


In [13]:
# Flatten all snapped records into one list of points: the points of every
# line followed by that record's target.
reshape = []
for line, target in tqdm(saver):
    reshape.extend(line)
    reshape.append(target)

100%|██████████| 3802221/3802221 [00:01<00:00, 1957184.63it/s]


In [14]:
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the flattened list of snapped points so that it is
# fitted once over every point of every record (lines and targets alike).
scaler = MinMaxScaler()
scaler.fit(reshape)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [15]:
# Apply the fitted scaler to every record, keeping the [line, target]
# layout and converting the results back to plain Python lists.
new_saver = []
for line, target in tqdm(saver):
    scaled_line = scaler.transform(line).tolist()
    # transform() expects a 2-D input, so the single target point is
    # wrapped in a list and unwrapped again afterwards.
    scaled_target = scaler.transform([target])[0].tolist()
    new_saver.append([scaled_line, scaled_target])

100%|██████████| 3802221/3802221 [06:12<00:00, 10210.85it/s]


In [17]:
# Persist the scaled [line, target] records as JSON.
with open('processed_data/step2_data.json','w') as f:
    json.dump(new_saver,f)

In [18]:
import pickle
# Save the fitted scaler for later reuse.
# pickle.dump() returns None, so its result must not be assigned back to
# `scaler`: that would discard the fitted object and make a re-run of this
# cell overwrite the file with a pickled None.
with open('../data_preprocess/scaler', 'wb') as f:
    pickle.dump(scaler, f)

In [23]:
# Inspect one scaled record.
new_saver[110]

[[[0.42722955411750263, 0.5890929897629746],
  [0.42722955411750263, 0.5890929897629746],
  [0.42722955411750263, 0.5890929897629746],
  [0.4252370276706188, 0.5888461310905768],
  [0.4196867449622914, 0.5876848124792104],
  [0.4194998843555595, 0.5881870043111519],
  [0.41883411177400376, 0.5880561205649428],
  [0.41883411177400376, 0.5880561205649428],
  [0.41883411177400376, 0.5880561205649428],
  [0.4095875607461039, 0.5961122191590107]],
 [0.4165567209056462, 0.59860214903577]]