In [74]:
import pandas as pd
import numpy as np

In [75]:
# Load process_data_5.csv; later cells read its 'lat', 'lon', 'district',
# 'street' and 'ward' columns.
df = pd.read_csv('../data/process_v1/process_data_5.csv')

In [77]:
# One row per distinct (lat, lon) pair; the original row index of the first
# occurrence is kept, which is why the displayed index has gaps.
lat_lon_df = df.loc[:, ['lat', 'lon']].drop_duplicates()
lat_lon_df

Unnamed: 0,lat,lon
0,21.036403,105.727235
1,20.985272,105.843605
2,20.978546,105.796775
3,21.028933,105.855453
4,21.046094,105.800236
...,...,...
169898,10.761879,106.700807
170206,10.875113,106.652996
170571,10.745290,106.539112
170588,10.778074,106.685171


In [78]:
# Longitude extent (min, max) of the unique coordinate pairs.
(lat_lon_df['lon'].min(), lat_lon_df['lon'].max())

(105.3065795, 106.969481)

In [79]:
import quads

In [80]:
# Quadtree that will index the unique (lat, lon) pairs.
# The second center coordinate, 106.13803025, is the midpoint of the longitude
# range printed by the previous cell: (105.3065795 + 106.969481) / 2.
# NOTE(review): presumably 12 and 2 are the tree's extent along the first (lat)
# and second (lon) coordinate — confirm against the quads.QuadTree signature.
tree = quads.QuadTree((15.86914545, 106.13803025), 12 , 2)

In [81]:
# Materialise the unique coordinates as plain Python (lat, lon) tuples so they
# can be fed to the quadtree one by one.
lat_list = lat_lon_df['lat'].tolist()
lon_list = lat_lon_df['lon'].tolist()
lat_lon_tuple_list = list(zip(lat_list, lon_list))
print(len(lat_lon_tuple_list))

3848


In [82]:
# Index every unique (lat, lon) pair in the quadtree.
# Note: the loop variable `lat_lon_tuple` stays bound to the last pair after
# the loop finishes, and the next two cells read it.
for lat_lon_tuple in lat_lon_tuple_list:
    tree.insert(lat_lon_tuple)

In [83]:
# Sanity check: the pair left over from the insert loop (the last one inserted)
# is reported as a member of the tree.
lat_lon_tuple in tree

True

In [84]:
# Smoke-test the neighbour query on the pair left over from the insert loop
# and print the coordinates (.x, .y) of the first returned point.
found = tree.nearest_neighbors(lat_lon_tuple, count=4)
print(found[0].x, found[0].y)

10.728533 106.6538638


In [85]:
def get_address_by_lat_lon(lat, lon):
    """Return the most frequent (district, street, ward) at an exact coordinate.

    Rows of the module-level ``df`` whose 'lat' and 'lon' are *exactly* equal
    to the arguments are selected, and for each of the three address columns
    the first value returned by ``Series.mode()`` is taken.

    Parameters
    ----------
    lat, lon
        Coordinate to look up; compared with ``==`` against ``df['lat']`` and
        ``df['lon']``.

    Returns
    -------
    tuple
        ``(district, street, ward)``.

    Raises
    ------
    IndexError
        If no row matches the coordinate (or a column has no mode), because
        the first element of an empty list is requested.
    """
    same_point = (df['lat'] == lat) & (df['lon'] == lon)
    rows_here = df[same_point]
    # Evaluated in this order: district, street, ward.
    district, street, ward = (
        rows_here[column].mode().tolist()[0]
        for column in ('district', 'street', 'ward')
    )
    return district, street, ward

In [86]:
def find_nearest_neighbors(lat, lon, count = 10):
    """Query the module-level quadtree and flatten the hits into one array.

    Parameters
    ----------
    lat, lon
        Query coordinate, passed to ``tree.nearest_neighbors`` as ``(lat, lon)``.
    count : int, default 10
        Forwarded as the ``count`` argument of ``tree.nearest_neighbors``.

    Returns
    -------
    numpy.ndarray
        1-D array made of consecutive groups of five values
        ``[lat, lon, district, street, ward]``, one group per returned point,
        in the order the tree returned them.

    Notes
    -----
    NOTE(review): the cell that consumes these arrays treats the first group as
    the query point itself (it renames positions 0/1 to 'lat'/'lon' and drops
    positions 2-4) — confirm that ``nearest_neighbors`` always returns the
    query point first. The NaN-padded rows in the resulting frame also suggest
    the tree may return fewer than ``count`` points for some queries.
    """
    neighbors = tree.nearest_neighbors((lat, lon), count=count)
    frame = pd.DataFrame({
        'lat': [point.x for point in neighbors],
        'lon': [point.y for point in neighbors],
    })

    # One (district, street, ward) tuple per neighbour, looked up in `df`.
    addresses = frame.apply(
        lambda row: get_address_by_lat_lon(row['lat'], row['lon']),
        axis = 1,
    )
    frame['district'], frame['street'], frame['ward'] = zip(*addresses)

    # Row-wise flattening: [row0..., row1..., ...].
    return np.concatenate(frame.values)

In [87]:
# One flattened neighbour array per unique coordinate. Arrays can differ in
# length; pd.DataFrame pads the shorter rows with NaN.
arrs = [find_nearest_neighbors(lat, lon) for lat, lon in lat_lon_tuple_list]
nearest_df = pd.DataFrame(arrs)

In [88]:
# Rebuild the wide frame from the flattened arrays. Every returned point
# contributes five consecutive positions: lat, lon, district, street, ward.
nearest_df = pd.DataFrame(arrs).rename(columns = {0: 'lat', 1: 'lon'})
# Positions 0/1 become the merge keys; positions 2-4 (district/street/ward of
# that same first point) are dropped.
nearest_df = nearest_df.drop(columns = [2, 3, 4])
cols = [c for c in nearest_df.columns.tolist() if c not in ['lat', 'lon']]

# The remaining integer labels start at 5, so `label % 5` identifies the field
# (0 = lat, 1 = lon, 2 = district, 3 = street, 4 = ward) and the position
# within each filtered list is the neighbour rank.
lat_cols = [c for c in cols if c % 5 == 0]
format_lat_cols = [f'nearest_{i}_lat' for i in range(len(lat_cols))]

lon_cols = [c for c in cols if c % 5 == 1]
format_lon_cols = [f'nearest_{i}_lon' for i in range(len(lon_cols))]

district_cols = [c for c in cols if c % 5 == 2]
format_district_cols = [f'nearest_{i}_district' for i in range(len(district_cols))]

street_cols = [c for c in cols if c % 5 == 3]
format_street_cols = [f'nearest_{i}_street' for i in range(len(street_cols))]

ward_cols = [c for c in cols if c % 5 == 4]
format_ward_cols = [f'nearest_{i}_ward' for i in range(len(ward_cols))]

source_cols_list = [lat_cols, lon_cols, district_cols, street_cols, ward_cols]
target_cols_list = [format_lat_cols, format_lon_cols, format_district_cols, format_street_cols, format_ward_cols]


# Apply the five integer-label -> name mappings one field at a time.
for source_cols, target_cols in zip(source_cols_list, target_cols_list):
    rename_dict = {src: dst for src, dst in zip(source_cols, target_cols)}
    nearest_df = nearest_df.rename(columns = rename_dict)
nearest_df

Unnamed: 0,lat,lon,nearest_0_lat,nearest_0_lon,nearest_0_district,nearest_0_street,nearest_0_ward,nearest_1_lat,nearest_1_lon,nearest_1_district,...,nearest_7_lat,nearest_7_lon,nearest_7_district,nearest_7_street,nearest_7_ward,nearest_8_lat,nearest_8_lon,nearest_8_district,nearest_8_street,nearest_8_ward
0,21.036403,105.727235,21.033568,105.727680,0.0,203.0,0.0,21.033979,105.731007,0.0,...,21.036313,105.740984,15.0,1678.0,70.0,21.049022,105.721264,0.0,625.0,207.0
1,20.985272,105.843605,20.983726,105.842452,1.0,604.0,1.0,20.983726,105.845478,1.0,...,20.983778,105.849492,1.0,234.0,162.0,20.980834,105.847804,1.0,1288.0,162.0
2,20.978546,105.796775,20.978724,105.796025,2.0,190.0,2.0,20.981949,105.800781,20.0,...,,,,,,,,,,
3,21.028933,105.855453,21.029023,105.856556,3.0,2253.0,3.0,21.030715,105.856206,3.0,...,,,,,,,,,,
4,21.046094,105.800236,21.043727,105.799349,4.0,314.0,4.0,21.044754,105.797760,4.0,...,21.043878,105.794282,4.0,223.0,149.0,21.052238,105.797793,5.0,815.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3843,10.761879,106.700807,10.762198,106.700576,38.0,3961.0,155.0,10.761890,106.702011,38.0,...,10.764510,106.701012,38.0,4077.0,644.0,10.763022,106.703210,38.0,3259.0,644.0
3844,10.875113,106.652996,10.875485,106.651851,32.0,1264.0,400.0,10.875085,106.655459,32.0,...,10.881358,106.650859,32.0,2234.0,400.0,10.882365,106.652234,32.0,3824.0,400.0
3845,10.745290,106.539112,10.741033,106.537214,29.0,3139.0,208.0,10.733598,106.534765,29.0,...,,,,,,,,,,
3846,10.778074,106.685171,10.777181,106.684210,13.0,1520.0,302.0,10.779361,106.685437,13.0,...,10.775294,106.686401,13.0,1684.0,372.0,10.775858,106.682920,13.0,3390.0,21.0


In [89]:
# Attach the neighbour columns to every listing that shares the coordinate.
# Caution: `df` is rebound here, so re-running this cell merges a second time.
df = pd.merge(df, nearest_df, how = 'left', on = ['lat', 'lon'])
df

Unnamed: 0,numberOfFloors,numberOfBathRooms,numberOfBedRooms,certificateOfLandUseRight,ward,street,district,city,lat,lon,...,nearest_7_lat,nearest_7_lon,nearest_7_district,nearest_7_street,nearest_7_ward,nearest_8_lat,nearest_8_lon,nearest_8_district,nearest_8_street,nearest_8_ward
0,1,,,0,0,0,0,0,21.036403,105.727235,...,21.036313,105.740984,15.0,1678.0,70.0,21.049022,105.721264,0.0,625.0,207.0
1,5,5.0,4.0,1,1,1,1,0,20.985272,105.843605,...,20.983778,105.849492,1.0,234.0,162.0,20.980834,105.847804,1.0,1288.0,162.0
2,4,,,1,2,2,2,0,20.978546,105.796775,...,,,,,,,,,,
3,7,7.0,7.0,1,3,3,3,0,21.028933,105.855453,...,,,,,,,,,,
4,4,,,1,4,4,4,0,21.046094,105.800236,...,21.043878,105.794282,4.0,223.0,149.0,21.052238,105.797793,5.0,815.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170959,1,2.0,2.0,0,119,600,9,0,21.001092,105.869070,...,21.006290,105.870904,9.0,1287.0,241.0,20.995883,105.874019,9.0,154.0,119.0
170960,6,10.0,10.0,0,231,358,11,0,20.999584,105.817799,...,21.004261,105.816721,11.0,744.0,231.0,21.002390,105.821978,11.0,541.0,96.0
170961,8,22.0,22.0,1,164,574,10,0,21.029128,105.886118,...,21.038559,105.876178,10.0,112.0,79.0,21.041930,105.880878,10.0,1100.0,79.0
170962,5,,,1,35,125,15,0,21.017355,105.775199,...,21.010092,105.771774,15.0,2839.0,145.0,21.009664,105.771176,15.0,1592.0,174.0


In [90]:
# Re-display the merged frame (same content as the previous cell's output).
df

Unnamed: 0,numberOfFloors,numberOfBathRooms,numberOfBedRooms,certificateOfLandUseRight,ward,street,district,city,lat,lon,...,nearest_7_lat,nearest_7_lon,nearest_7_district,nearest_7_street,nearest_7_ward,nearest_8_lat,nearest_8_lon,nearest_8_district,nearest_8_street,nearest_8_ward
0,1,,,0,0,0,0,0,21.036403,105.727235,...,21.036313,105.740984,15.0,1678.0,70.0,21.049022,105.721264,0.0,625.0,207.0
1,5,5.0,4.0,1,1,1,1,0,20.985272,105.843605,...,20.983778,105.849492,1.0,234.0,162.0,20.980834,105.847804,1.0,1288.0,162.0
2,4,,,1,2,2,2,0,20.978546,105.796775,...,,,,,,,,,,
3,7,7.0,7.0,1,3,3,3,0,21.028933,105.855453,...,,,,,,,,,,
4,4,,,1,4,4,4,0,21.046094,105.800236,...,21.043878,105.794282,4.0,223.0,149.0,21.052238,105.797793,5.0,815.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170959,1,2.0,2.0,0,119,600,9,0,21.001092,105.869070,...,21.006290,105.870904,9.0,1287.0,241.0,20.995883,105.874019,9.0,154.0,119.0
170960,6,10.0,10.0,0,231,358,11,0,20.999584,105.817799,...,21.004261,105.816721,11.0,744.0,231.0,21.002390,105.821978,11.0,541.0,96.0
170961,8,22.0,22.0,1,164,574,10,0,21.029128,105.886118,...,21.038559,105.876178,10.0,112.0,79.0,21.041930,105.880878,10.0,1100.0,79.0
170962,5,,,1,35,125,15,0,21.017355,105.775199,...,21.010092,105.771774,15.0,2839.0,145.0,21.009664,105.771176,15.0,1592.0,174.0


In [94]:
import math
from tqdm import tqdm
def distance_func(lat1: float, lon1: float, lat2: float, lon2: float):
    """Haversine distance in metres between two points given in degrees.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float
        Distance in metres on a sphere of radius 6371 km. NaN inputs propagate
        to a NaN result. If the inputs cannot be processed (e.g. ``None``, a
        string, or an infinite value) a message is printed and NaN is returned,
        so downstream numeric code such as ``np.log`` keeps working.
    """
    R = 6371  # sphere radius in km; the result is converted to metres below
    try:
        dLat = (lat2 - lat1) * math.pi / 180
        dLon = (lon2 - lon1) * math.pi / 180
        # Separate names for the radian latitudes so the error message below
        # still reports the caller's original degree values.
        phi1 = lat1 * math.pi / 180
        phi2 = lat2 * math.pi / 180
        a = math.sin(dLat/2) * math.sin(dLat/2) + math.sin(dLon/2) * \
            math.sin(dLon/2) * math.cos(phi1) * math.cos(phi2)
        # Rounding can push `a` marginally above 1 for near-antipodal points,
        # which would make sqrt(1 - a) fail. The comparison is False for NaN,
        # so NaN still propagates instead of being clamped.
        if a > 1.0:
            a = 1.0
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
        return R * c * 1000
    except (TypeError, ValueError):
        # Only input-related failures are handled; KeyboardInterrupt and other
        # unrelated exceptions are no longer swallowed.
        print(f"Distance_Func error with: {(lat1, lon1, lat2, lon2)}")
        return float('nan')


# For each of the nine neighbour ranks (nearest_0 ... nearest_8), store the log
# of the distance in metres from the listing to that neighbour. Rows without
# that neighbour (NaN coordinates) end up NaN.
for i in tqdm(range(9)):
    neighbor_lat, neighbor_lon = f'nearest_{i}_lat', f'nearest_{i}_lon'
    metres = df.apply(
        lambda row: distance_func(row['lat'], row['lon'], row[neighbor_lat], row[neighbor_lon]),
        axis = 1,
    )
    df[f'distance_nearest_{i}'] = np.log(metres)

100%|██████████| 9/9 [00:37<00:00,  4.21s/it]


In [95]:
# Display the frame with the new distance_nearest_0 ... distance_nearest_8 columns.
df

Unnamed: 0,numberOfFloors,numberOfBathRooms,numberOfBedRooms,certificateOfLandUseRight,ward,street,district,city,lat,lon,...,nearest_8_ward,distance_nearest_0,distance_nearest_1,distance_nearest_2,distance_nearest_3,distance_nearest_4,distance_nearest_5,distance_nearest_6,distance_nearest_7,distance_nearest_8
0,1,,,0,0,0,0,0,21.036403,105.727235,...,207.0,5.763842,6.163906,6.625483,6.902591,7.068264,7.244181,7.234741,7.263303,7.335576
1,5,5.0,4.0,1,1,1,1,0,20.985272,105.843605,...,162.0,5.344622,5.558940,5.666665,5.678318,6.172367,6.381466,6.450677,6.451070,6.489909
2,4,,,1,2,2,2,0,20.978546,105.796775,...,,4.386321,6.332002,,,,,,,
3,7,7.0,7.0,1,3,3,3,0,21.028933,105.855453,...,,4.744202,5.361315,5.435177,5.625682,5.748205,5.755464,,,
4,4,,,1,4,4,4,0,21.046094,105.800236,...,8.0,5.630519,5.693591,6.114641,6.244196,6.311831,6.423785,6.490146,6.500025,6.591297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170959,1,2.0,2.0,0,119,600,9,0,21.001092,105.869070,...,119.0,5.476523,5.845523,5.958344,6.267626,6.313399,6.335890,6.338963,6.411076,6.651899
170960,6,10.0,10.0,0,231,358,11,0,20.999584,105.817799,...,96.0,5.500126,5.593771,5.674128,5.784006,5.991982,6.096160,6.057130,6.276572,6.281072
170961,8,22.0,22.0,1,164,574,10,0,21.029128,105.886118,...,79.0,4.963666,6.570718,6.683942,6.744535,7.010729,7.139913,7.087267,7.293691,7.328976
170962,5,,,1,35,125,15,0,21.017355,105.775199,...,174.0,5.233229,5.755187,6.184113,6.250902,6.252426,6.349551,6.788466,6.782641,6.858257


In [97]:
# Write the enriched frame to process_data_6.csv; index = False leaves the row
# index out of the file.
df.to_csv('../data/process_v1/process_data_6.csv', index = False)