In [98]:
import pandas as pd
import numpy as np

In [99]:
# Read the step-5 processed table; later cells read its lat, lon, district, street and ward columns.
df = pd.read_csv('../data/process_v1/process_data_5.csv')

In [100]:
# Keep one row per distinct (lat, lon) pair; the original row labels are kept (no index reset).
lat_lon_df = df[['lat', 'lon']].drop_duplicates()
lat_lon_df

Unnamed: 0,lat,lon
0,10.860736,106.739182
1,10.722855,106.714168
2,21.008299,105.774104
3,20.998322,105.867558
4,21.357386,105.823221
...,...,...
170131,10.817858,106.770901
170182,10.594044,106.728478
170244,10.786360,106.652100
170277,10.744984,106.705015


In [101]:
# Longitude extent (min, max) of the distinct coordinates.
(lat_lon_df['lon'].min(), lat_lon_df['lon'].max())

(105.3065795, 106.969481)

In [102]:
import quads

In [103]:
# Spatial index queried by the nearest-neighbour lookups in the following cells.
# Points are inserted below as (lat, lon) tuples, so the first axis is latitude.
# NOTE(review): the positional arguments are presumably (center, width, height) per the
# quads API, i.e. a hard-coded bounding box around the data — TODO confirm, and re-check
# the bounds whenever the input coordinates change.
tree = quads.QuadTree((15.86914545, 106.13803025), 12 , 2)

In [104]:
# Materialise the distinct coordinates as a list of (lat, lon) tuples and report how many there are.
lat_list = lat_lon_df['lat'].tolist()
lon_list = lat_lon_df['lon'].tolist()
lat_lon_tuple_list = list(zip(lat_list, lon_list))
print(len(lat_lon_tuple_list))

3848


In [105]:
# Insert every distinct coordinate into the quadtree.
# Note: `lat_lon_tuple` still holds the last tuple after the loop, and the
# following cells read it as a sample query point — do not rename it in isolation.
for lat_lon_tuple in lat_lon_tuple_list:
    tree.insert(lat_lon_tuple)

In [106]:
# Sanity check: membership test on the tree for the last coordinate inserted above.
lat_lon_tuple in tree

True

In [107]:
# Sample query with count=4 around the last inserted coordinate;
# prints the x/y (here: lat/lon) of the first returned item.
found = tree.nearest_neighbors(lat_lon_tuple, count=4)
print(found[0].x, found[0].y)

10.7408331 106.593981


In [108]:
def get_address_by_lat_lon(lat, lon):
    """Return the most frequent (district, street, ward) recorded at a coordinate.

    Rows of the global ``df`` whose ``lat`` and ``lon`` both equal the given
    values are selected; for each address column the first entry of
    ``Series.mode()`` is returned.

    Raises:
        IndexError: if no row of ``df`` matches the coordinate (the mode is empty).
    """
    same_location = (df['lat'] == lat) & (df['lon'] == lon)
    matches = df[same_location]
    district, street, ward = (
        matches[column].mode().tolist()[0]
        for column in ('district', 'street', 'ward')
    )
    return district, street, ward

In [109]:
def find_nearest_neighbors(lat, lon, count = 10):
    """Describe the neighbours of (lat, lon) as one flat array.

    Queries the global quadtree ``tree`` and, for every returned point, looks up
    its address via ``get_address_by_lat_lon``.

    Args:
        lat: latitude of the query point.
        lon: longitude of the query point.
        count: value passed as ``count`` to ``tree.nearest_neighbors``.

    Returns:
        A 1-D numpy array made of consecutive groups of five values,
        ``[lat, lon, district, street, ward]``, one group per returned point and
        in the order the tree returned them. The array is empty when the tree
        returns no points, and shorter than ``5 * count`` when it returns fewer
        than ``count`` points.

    NOTE(review): the cell that names these columns treats the first group as
    the query point itself; that presumably holds when the query point is in
    the tree — verify against the quads documentation.
    """
    found = tree.nearest_neighbors((lat, lon), count=count)
    if not found:
        # Previously this case failed on the zip(*...) unpacking with a ValueError;
        # an empty array is consistent with the "fewer neighbours than requested" case.
        return np.array([])

    # One record per neighbour; building the frame from records keeps the same
    # per-column dtype inference as assigning the columns one by one.
    records = [
        (item.x, item.y, *get_address_by_lat_lon(item.x, item.y))
        for item in found
    ]
    nearest_lat_lon_df = pd.DataFrame(
        records, columns=['lat', 'lon', 'district', 'street', 'ward']
    )

    # Concatenating the rows of the 2-D values array flattens it row by row.
    arr = np.concatenate(nearest_lat_lon_df.values)
    return arr

In [110]:
# One flat neighbour array per distinct coordinate, framed with one row per coordinate.
arrs = [find_nearest_neighbors(lat, lon) for lat, lon in lat_lon_tuple_list]
nearest_df = pd.DataFrame(arrs)

In [111]:
# Give the positional columns of the flat neighbour arrays readable names.
#
# Every row of `arrs` is a sequence of five-value groups
# (lat, lon, district, street, ward), so integer column c belongs to group
# c // 5 and holds field c % 5. Group 0 is treated as the query point itself
# (NOTE(review): this relies on the tree returning the query point first — confirm):
# its lat/lon become the merge keys and its address columns are dropped.
# Groups 1.. become nearest_0_*, nearest_1_*, ...
NEIGHBOR_FIELDS = ['lat', 'lon', 'district', 'street', 'ward']

# Rebuilt from `arrs` so that re-running this cell always starts from the raw columns.
nearest_df = pd.DataFrame(arrs)
nearest_df = nearest_df.rename(columns = {0: 'lat', 1: 'lon'})
nearest_df = nearest_df.drop(columns = [2, 3, 4])

rename_dict = {
    c: f'nearest_{c // 5 - 1}_{NEIGHBOR_FIELDS[c % 5]}'
    for c in nearest_df.columns
    if c not in ('lat', 'lon')
}
nearest_df = nearest_df.rename(columns = rename_dict)
nearest_df

Unnamed: 0,lat,lon,nearest_0_lat,nearest_0_lon,nearest_0_district,nearest_0_street,nearest_0_ward,nearest_1_lat,nearest_1_lon,nearest_1_district,...,nearest_7_lat,nearest_7_lon,nearest_7_district,nearest_7_street,nearest_7_ward,nearest_8_lat,nearest_8_lon,nearest_8_district,nearest_8_street,nearest_8_ward
0,10.860736,106.739182,10.863722,106.742199,0.0,695.0,0.0,10.858158,106.743695,0.0,...,,,,,,,,,,
1,10.722855,106.714168,10.723600,106.712030,1.0,2611.0,1.0,10.720753,106.715350,1.0,...,10.726797,106.720103,1.0,1661.0,116.0,10.730112,106.708597,1.0,2039.0,1.0
2,21.008299,105.774104,21.010092,105.771774,2.0,68.0,2.0,21.007402,105.776933,2.0,...,21.014919,105.776764,2.0,1818.0,47.0,21.015825,105.775956,2.0,1808.0,47.0
3,20.998322,105.867558,20.998952,105.869286,3.0,51.0,46.0,21.001092,105.869070,3.0,...,,,,,,,,,,
4,21.357386,105.823221,21.319113,105.869016,4.0,1683.0,522.0,21.291596,105.862015,4.0,...,21.270999,105.743261,4.0,2008.0,428.0,21.238048,105.849162,4.0,1387.0,495.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3843,10.817858,106.770901,10.816681,106.770816,15.0,2703.0,400.0,10.819090,106.771320,15.0,...,10.816404,106.776621,15.0,2470.0,345.0,,,,,
3844,10.594044,106.728478,10.598338,106.731020,42.0,2519.0,375.0,10.598637,106.744754,42.0,...,10.645557,106.735894,42.0,3012.0,472.0,10.658523,106.727698,42.0,1259.0,472.0
3845,10.786360,106.652100,10.786960,106.651977,10.0,3100.0,547.0,10.787188,106.652776,10.0,...,10.783506,106.652954,10.0,2669.0,547.0,10.785526,106.655723,10.0,3190.0,137.0
3846,10.744984,106.705015,10.745459,106.705690,1.0,3461.0,203.0,10.744307,106.705649,1.0,...,10.741208,106.705579,1.0,1954.0,5.0,10.743491,106.708758,1.0,2499.0,5.0


In [112]:
# Attach the neighbour columns to every row of df that shares the coordinate.
# validate='many_to_one' makes pandas raise a MergeError if nearest_df ever holds
# the same (lat, lon) key more than once, instead of silently duplicating df rows.
# Note: this cell overwrites df, so running it twice merges the neighbour columns again.
df = df.merge(nearest_df, how='left', on = ['lat', 'lon'], validate = 'many_to_one')
df

Unnamed: 0,numberOfFloors,numberOfBathRooms,numberOfBedRooms,certificateOfLandUseRight,ward,street,district,city,lat,lon,...,nearest_7_lat,nearest_7_lon,nearest_7_district,nearest_7_street,nearest_7_ward,nearest_8_lat,nearest_8_lon,nearest_8_district,nearest_8_street,nearest_8_ward
0,1,2.0,2.0,0,0,0,0,0,10.860736,106.739182,...,,,,,,,,,,
1,1,2.0,3.0,1,1,1,1,0,10.722855,106.714168,...,10.726797,106.720103,1.0,1661.0,116.0,10.730112,106.708597,1.0,2039.0,1.0
2,1,,,1,2,2,2,1,21.008299,105.774104,...,21.014919,105.776764,2.0,1818.0,47.0,21.015825,105.775956,2.0,1808.0,47.0
3,4,,,0,3,3,3,1,20.998322,105.867558,...,,,,,,,,,,
4,1,,,1,4,4,4,1,21.357386,105.823221,...,21.270999,105.743261,4.0,2008.0,428.0,21.238048,105.849162,4.0,1387.0,495.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170959,5,3.0,4.0,1,39,240,5,1,21.023187,105.794927,...,21.021594,105.800943,7.0,953.0,135.0,21.025196,105.788956,5.0,674.0,39.0
170960,1,1.0,1.0,0,44,411,5,1,21.008718,105.797279,...,21.006424,105.802052,5.0,2200.0,44.0,21.014320,105.796837,5.0,3509.0,44.0
170961,5,4.0,4.0,1,35,39,7,1,21.023378,105.832225,...,21.026245,105.833540,7.0,1207.0,335.0,21.025741,105.834756,7.0,3262.0,335.0
170962,1,,,0,45,50,19,1,21.089607,105.812688,...,21.077225,105.817190,19.0,584.0,59.0,21.073761,105.812858,19.0,128.0,45.0


In [113]:
# Display the merged frame again.
df

Unnamed: 0,numberOfFloors,numberOfBathRooms,numberOfBedRooms,certificateOfLandUseRight,ward,street,district,city,lat,lon,...,nearest_7_lat,nearest_7_lon,nearest_7_district,nearest_7_street,nearest_7_ward,nearest_8_lat,nearest_8_lon,nearest_8_district,nearest_8_street,nearest_8_ward
0,1,2.0,2.0,0,0,0,0,0,10.860736,106.739182,...,,,,,,,,,,
1,1,2.0,3.0,1,1,1,1,0,10.722855,106.714168,...,10.726797,106.720103,1.0,1661.0,116.0,10.730112,106.708597,1.0,2039.0,1.0
2,1,,,1,2,2,2,1,21.008299,105.774104,...,21.014919,105.776764,2.0,1818.0,47.0,21.015825,105.775956,2.0,1808.0,47.0
3,4,,,0,3,3,3,1,20.998322,105.867558,...,,,,,,,,,,
4,1,,,1,4,4,4,1,21.357386,105.823221,...,21.270999,105.743261,4.0,2008.0,428.0,21.238048,105.849162,4.0,1387.0,495.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170959,5,3.0,4.0,1,39,240,5,1,21.023187,105.794927,...,21.021594,105.800943,7.0,953.0,135.0,21.025196,105.788956,5.0,674.0,39.0
170960,1,1.0,1.0,0,44,411,5,1,21.008718,105.797279,...,21.006424,105.802052,5.0,2200.0,44.0,21.014320,105.796837,5.0,3509.0,44.0
170961,5,4.0,4.0,1,35,39,7,1,21.023378,105.832225,...,21.026245,105.833540,7.0,1207.0,335.0,21.025741,105.834756,7.0,3262.0,335.0
170962,1,,,0,45,50,19,1,21.089607,105.812688,...,21.077225,105.817190,19.0,584.0,59.0,21.073761,105.812858,19.0,128.0,45.0


In [114]:
import math
from tqdm import tqdm
def distance_func(lat1: float, lon1: float, lat2: float, lon2: float):
    """Haversine (great-circle) distance between two coordinates.

    Args:
        lat1, lon1: latitude and longitude of the first point, in degrees.
        lat2, lon2: latitude and longitude of the second point, in degrees.

    Returns:
        The distance as a float: a sphere radius of 6371 is used and the result
        is multiplied by 1000 (i.e. metres for a radius given in kilometres).
        NaN is returned when any input is NaN, and also — after printing a
        diagnostic line — when the inputs are not numeric or fall outside the
        domain of the math functions.
    """
    try:
        R = 6371
        half_dlat = (lat2 - lat1) * math.pi / 180 / 2
        half_dlon = (lon2 - lon1) * math.pi / 180 / 2
        # Separate names for the radian values keep the original degrees
        # available for the diagnostic message below.
        phi1 = lat1 * math.pi / 180
        phi2 = lat2 * math.pi / 180
        a = math.sin(half_dlat) * math.sin(half_dlat) + math.sin(half_dlon) * \
            math.sin(half_dlon) * math.cos(phi1) * math.cos(phi2)
        # Rounding can push `a` marginally above 1 for near-antipodal points, which
        # would make sqrt(1 - a) fail. A plain comparison (rather than min/max)
        # lets NaN pass through unchanged.
        if a > 1.0:
            a = 1.0
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
        d = R * c
        return d * 1000
    except (TypeError, ValueError):
        # TypeError: non-numeric input (e.g. None); ValueError: math domain error.
        print(f"Distance_Func error with: {(lat1, lon1, lat2, lon2)}")
        return float('nan')


# For each of the nine neighbour slots (nearest_0 .. nearest_8), compute the
# row-wise distance from the row's own coordinate to that neighbour and store
# its natural log. Rows without that neighbour stay NaN.
# NOTE(review): a neighbour at distance 0 would yield -inf after the log — confirm that cannot occur.
for i in tqdm(range(9)):
    target_col = f'distance_nearest_{i}'
    neighbor_lat_col = f'nearest_{i}_lat'
    neighbor_lon_col = f'nearest_{i}_lon'
    df[target_col] = df.apply(
        lambda row: distance_func(
            row['lat'], row['lon'], row[neighbor_lat_col], row[neighbor_lon_col]
        ),
        axis = 1,
    )
    df[target_col] = np.log(df[target_col])

100%|██████████| 9/9 [00:38<00:00,  4.28s/it]


In [115]:
df

Unnamed: 0,numberOfFloors,numberOfBathRooms,numberOfBedRooms,certificateOfLandUseRight,ward,street,district,city,lat,lon,...,nearest_8_ward,distance_nearest_0,distance_nearest_1,distance_nearest_2,distance_nearest_3,distance_nearest_4,distance_nearest_5,distance_nearest_6,distance_nearest_7,distance_nearest_8
0,1,2.0,2.0,0,0,0,0,0,10.860736,106.739182,...,,6.147933,6.345885,6.382910,6.400470,,,,,
1,1,2.0,3.0,1,1,1,1,0,10.722855,106.714168,...,1.0,5.512971,5.587561,6.214932,6.245667,6.319727,6.399320,6.635897,6.662696,6.918456
2,1,,,1,2,2,2,1,21.008299,105.774104,...,47.0,5.747616,5.736936,5.828142,6.225193,6.312889,6.303798,6.431241,6.667267,6.755361
3,4,,,0,3,3,3,1,20.998322,105.867558,...,,5.260499,5.845523,6.165166,6.420461,6.532105,6.563708,,,
4,1,,,1,4,4,4,1,21.357386,105.823221,...,495.0,8.759742,9.029592,9.096917,9.258116,9.341013,9.313155,9.459438,9.448080,9.513343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170959,5,3.0,4.0,1,39,240,5,1,21.023187,105.794927,...,39.0,5.270781,5.628583,5.719904,6.223957,6.229783,6.290400,6.509675,6.475533,6.490368
170960,1,1.0,1.0,0,44,411,5,1,21.008718,105.797279,...,44.0,5.595166,5.784801,5.938606,6.257163,6.286417,6.302558,6.355579,6.323106,6.436978
170961,5,4.0,4.0,1,35,39,7,1,21.023378,105.832225,...,335.0,5.004280,5.223238,5.288040,5.398323,5.517029,5.569863,5.857783,5.848703,5.917647
170962,1,,,0,45,50,19,1,21.089607,105.812688,...,45.0,5.302945,5.810216,5.825835,6.128071,6.134124,6.304772,7.162206,7.281993,7.474252


In [116]:
# Write the enriched frame to the step-6 file; the row index is not written.
df.to_csv('../data/process_v1/process_data_6.csv', index = False)

In [119]:
# Inspect the `time` column.
df['time']

0         2023-12-27T00:00:00
1         2023-12-25T00:00:00
2         2023-11-06T00:00:00
3         2023-10-22T00:00:00
4         2023-10-28T00:00:00
                 ...         
170959    2023-11-08T00:00:00
170960    2023-12-19T00:00:00
170961    2024-01-03T00:00:00
170962    2023-11-13T00:00:00
170963    2023-12-04T00:00:00
Name: time, Length: 170964, dtype: object