In [1]:
import warnings
# Ignore every FutureWarning raised from this point on.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter, MinuteLocator, SecondLocator
import numpy as np
import datetime
import os
from omegaconf import OmegaConf, DictConfig, ListConfig

import geopandas as gpd
from shapely.geometry import Point

from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS

from sklearn.metrics.pairwise import haversine_distances

from geopy.distance import great_circle

from shapely.geometry import MultiPoint

from tqdm import tqdm


from hydra import initialize, compose
from omegaconf import OmegaConf

# Compose the Hydra configuration `config.yaml` from the relative
# scripts/config directory and expose it as `cfg`.
# NOTE(review): the config path is spelled with Windows separators - confirm
# whether this notebook also has to run on other platforms.
with initialize(version_base=None, config_path="..\\scripts\\config\\"):
    cfg = compose(config_name='config.yaml')

In [3]:
def get_slot(x, slot_len):
    """Map Unix timestamps to the index of their time-of-day slot.

    Parameters
    ----------
    x : pandas.Series
        Timestamps in seconds since the epoch (parsed with
        ``pd.to_datetime(x, unit='s')``).
    slot_len : int
        Slot length in minutes.

    Returns
    -------
    pandas.Series
        ``(60 * hour + minute) // slot_len`` for every element of ``x``,
        with the same index as ``x``.
    """
    # Use the vectorised datetime accessors instead of formatting each
    # timestamp to text and parsing it back in a Python-level loop.
    moments = pd.to_datetime(x, unit='s').dt
    minute_of_day = (60 * moments.hour + moments.minute).astype('int64')
    return minute_of_day // slot_len


# Timestamp identifying this run.
launch_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Seeded generator so the id sample below is reproducible.
rng = np.random.default_rng(seed=cfg.seed)

# `bounds` is the [lower, upper] range later compared against an id's summed
# `cnt`: an explicit pair from the config, [0, N] for a non-negative scalar,
# or unbounded above for a negative scalar.
if isinstance(cfg.id_points_range, ListConfig):
    bounds = OmegaConf.to_container(cfg.id_points_range)
elif cfg.id_points_range >= 0:
    bounds = [0, cfg.id_points_range]
else:
    bounds = [0, np.inf]

# NOTE(review): absolute, machine-specific path - consider deriving it from a
# configurable data directory.
locs = pd.read_csv(rf"C:\Users\kirin\jupyter\CityTraffic\{cfg.locs_path}")

# Ids to process: all of them (-1), an explicit list from the config, or a
# random sample of `n_ids` distinct ids.
available_ids = pd.unique(locs['id'])
if cfg.n_ids == -1:
    ID_LIST = available_ids
elif isinstance(cfg.n_ids, ListConfig):
    ID_LIST = OmegaConf.to_container(cfg.n_ids)
else:
    # Sample without replacement so no id is picked (and later processed)
    # twice; cap the size because such a sample cannot exceed the population.
    ID_LIST = rng.choice(
        available_ids,
        size=min(cfg.n_ids, len(available_ids)),
        replace=False,
    )


In [5]:
# Row positions of every id, computed in a single pass over `locs`, so the
# loop does not have to rescan the whole frame for each id.
rows_by_id = locs.groupby('id').indices
no_rows = np.empty(0, dtype=np.intp)

# Collect the surviving per-id frames and concatenate once at the end;
# concatenating inside the loop copies the accumulated frame every iteration.
kept_frames = []
for cur_id in tqdm(ID_LIST):
    cur_id_df = locs.iloc[rows_by_id.get(cur_id, no_rows)].copy()

    # Skip ids whose summed `cnt` falls outside the configured bounds.
    total_cnt = cur_id_df['cnt'].sum()
    if total_cnt > bounds[1] or total_cnt < bounds[0]:
        continue

    # Span covered by each row, then the total span per calendar day.
    cur_id_df['length'] = cur_id_df['last_ts'] - cur_id_df['first_ts']
    days = cur_id_df.groupby('log_date')['length'].sum()
    days = days[days >= cfg.min_time_day_thresh]

    # Keep the id only if enough days pass the per-day threshold, and then
    # only the rows that belong to those days.
    if days.shape[0] >= cfg.min_days_thresh:
        kept_frames.append(cur_id_df[cur_id_df['log_date'].isin(days.index)])

if kept_frames:
    new_id_df = pd.concat(kept_frames)
else:
    # Nothing survived: fall back to an empty frame with the input columns.
    new_id_df = pd.DataFrame(columns=locs.columns)

100%|██████████| 1949/1949 [03:59<00:00,  8.14it/s]


In [5]:
# Persist the filtered rows. The DataFrame index is written too (to_csv
# default), so it comes back as an unnamed first column when the file is read.
new_id_df.to_csv('../data/cleaned_locs.csv')

In [16]:
# Shape of the array of distinct ids present in new_id_df.
new_id_df['id'].unique().shape

(1949,)

HOME/WORK (HW) CLASSIFICATION

In [4]:
# Pin the run whose artefacts are analysed below.
launch_time = "2023-05-12_01-05-42"

# Per-cluster table, loaded without its `home_place` column.
clusters_csv = f"C:\\Users\\kirin\\jupyter\\CityTraffic\\data\\{launch_time}\\clusters_{launch_time}.csv"
cluster_data = pd.read_csv(clusters_csv)
cluster_data = cluster_data.drop(columns='home_place')

# Point-level table; the first CSV column is used as the index.
points_csv = f"C:\\Users\\kirin\\jupyter\\CityTraffic\\data\\{launch_time}\\data_with_clusters_{launch_time}.csv"
df = pd.read_csv(points_csv, index_col=0)

# Restore pandas' default limit on displayed rows.
pd.reset_option('display.max_rows')

In [5]:
# Placeholder column; rows still holding None are removed by the dropna that
# follows the work-place classification.
cluster_data['work_place'] = None

In [7]:
# Display the point-level frame (pandas truncates the middle rows).
df

Unnamed: 0.1,Unnamed: 0,lat,lon,is_weekday,hour,cnt,first_ts,last_ts,log_date,source,dt,id,length
0,3,54.8435,38.1928,True,6,2,1676951940,1676951941,2023-02-21,2,2023-02-21,2,1.0
1,4,54.8435,38.1928,True,22,3,1674846483,1674849498,2023-01-27,1,2023-02-02,2,3015.0
2,25954,54.8436,38.1932,True,0,2,1677533114,1677533114,2023-02-28,2,2023-02-28,2,0.0
3,34400,54.8435,38.1928,True,14,1,1675424706,1675424706,2023-02-03,2,2023-02-03,2,0.0
4,34401,54.8435,38.1929,False,12,2,1677402452,1677402453,2023-02-26,2,2023-02-26,2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2196009,9424012,54.8855,38.0870,False,7,14,1675483287,1675485957,2023-02-04,2,2023-02-04,52006,2670.0
2196010,9523756,54.8855,38.0870,False,18,53,1675524275,1675526138,2023-02-04,2,2023-02-04,52006,1863.0
2196011,9716391,54.8855,38.0870,True,18,1,1675955544,1675955544,2023-02-09,2,2023-02-09,52006,0.0
2196012,10278311,54.8855,38.0870,False,15,2,1675512276,1675512276,2023-02-04,2,2023-02-04,52006,0.0


In [6]:
#only weekday
def sec_from_day_start(x):
    """Return the number of seconds elapsed since midnight for Unix timestamps.

    Parameters
    ----------
    x : pandas.Series
        Timestamps in seconds since the epoch (parsed with
        ``pd.to_datetime(x, unit='s')``).

    Returns
    -------
    pandas.Series
        ``3600 * hour + 60 * minute + second`` for every element of ``x``,
        with the same index as ``x``.
    """
    moments = pd.to_datetime(x, unit='s').dt
    # The last term must be the seconds component; adding the hour a second
    # time skews every result.
    seconds = 3600 * moments.hour + 60 * moments.minute + moments.second
    return seconds.astype('int64')

# Largest interquartile range of a cluster's time of day (seconds since
# midnight) for the cluster to be flagged as a work place; 31000 s is
# roughly 8.6 hours.
THRESH_VALUE = 31000
SECONDS_PER_DAY = 60 * 60 * 24

# Keep points that belong to a cluster (label -1 is excluded - presumably the
# noise label of the clustering step, confirm) and fall on Monday-Friday.
is_clustered = df['cluster'] != -1
is_mon_to_fri = pd.to_datetime(df['ts'], unit='s').dt.dayofweek < 5
weekday_points = df.loc[is_clustered & is_mon_to_fri, ['id', 'cluster', 'ts']]

# Interquartile range of the time of day for every (id, cluster) pair,
# computed for all ids at once instead of filtering the frame per id.
day_ts = (weekday_points['ts'] % SECONDS_PER_DAY).groupby(
    [weekday_points['id'], weekday_points['cluster']]
)
spread = day_ts.quantile(0.75) - day_ts.quantile(0.25)
is_work_place = spread < THRESH_VALUE

# Write the flag back aligned on the (id, cluster) key rather than by column
# position, leaving every other column of cluster_data untouched. Pairs with
# no weekday points get NaN here.
cluster_keys = pd.MultiIndex.from_frame(cluster_data[['id', 'cluster']])
cluster_data['work_place'] = is_work_place.reindex(cluster_keys).to_numpy()

# Drops every row that has a missing value in any column, which includes the
# clusters that could not be classified.
cluster_data.dropna(inplace=True)

  0%|          | 0/4158 [00:00<?, ?it/s]


KeyError: 'cluster'

In [14]:
# How many of the remaining clusters were flagged True vs. False.
cluster_data['work_place'].value_counts()

True     1485
False    1049
Name: work_place, dtype: int64

In [13]:
# One point per cluster row, longitude as x and latitude as y.
geometry = [
    Point(lon, lat)
    for lon, lat in zip(cluster_data['lon'], cluster_data['lat'])
]

# Attach the points to the cluster table (EPSG:4326) and draw an interactive
# map coloured by the work-place flag.
gdf = gpd.GeoDataFrame(cluster_data, geometry=geometry, crs=4326)
gdf.explore(column='work_place', categorical=True, cmap=['blue', 'green'])