## Focus on geolocations in specific cities

In [1]:
%load_ext autoreload
%autoreload 2
%cd D:\mobi-seg-net

D:\mobi-seg-net


In [2]:
import pandas as pd
import numpy as np
import os
os.environ['JAVA_HOME'] = "C:/Java/jdk-1.8"
from tqdm import tqdm
import sqlalchemy
from lib import workers as workers
import matplotlib.pyplot as plt

In [4]:
# Data location
user = workers.keys_manager['database']['user']
password = workers.keys_manager['database']['password']
port = workers.keys_manager['database']['port']
db_name = workers.keys_manager['database']['name']
engine = sqlalchemy.create_engine(f'postgresql://{user}:{password}@localhost:{port}/{db_name}?gssencmode=disable')

In [3]:
def within_box(lat, lng):
    if (lat >= workers.stockholm_box[1]) & (lat <= workers.stockholm_box[3]):
        if (lng >= workers.stockholm_box[0]) & (lng <= workers.stockholm_box[2]):
            return 1
    return 0

In [8]:
df_stops_list = []
for i in tqdm(range(0, 50), desc="Filtering data"):
    df_stops = pd.read_parquet(f"dbs/stops_pr/stops_pr_{i}.parquet", columns=['device_aid', 'h3_id', 'home', 'kind', 'latitude', 'longitude'])
    df_stops = df_stops[df_stops['home']!=1]
    df_stops.loc[:, 'Stockholm'] = df_stops.apply(lambda row: within_box(row['latitude'], row['longitude']), axis=1)
    df_stops = df_stops.loc[df_stops['Stockholm'] == 1, :].drop(columns=['Stockholm', 'latitude', 'longitude'])
    df_stops_list.append(df_stops)

Filtering data: 100%|██████████| 50/50 [11:28<00:00, 13.77s/it]


In [9]:
df_stops = pd.concat(df_stops_list)
df_stops.drop(columns=['home'], inplace=True)
print(len(df_stops), df_stops.device_aid.nunique())

24609483 941746


In [13]:
df_stops.to_parquet('dbs/cities/stockholm.parquet', index=False)

In [4]:
df = pd.read_parquet('dbs/cities/stockholm.parquet')
df.head()

Unnamed: 0,device_aid,h3_id,kind
0,00021c1c-fe37-4855-93f5-aba1b4b9ff56,8808866e53fffff,"[Retail, dining, and lifestyle, Community and ..."
1,00021c1c-fe37-4855-93f5-aba1b4b9ff56,8808866e53fffff,"[Community and recreational spaces, Retail, di..."
2,00021c1c-fe37-4855-93f5-aba1b4b9ff56,8808867547fffff,"[Business and industrial services, Health and ..."
3,00021c1c-fe37-4855-93f5-aba1b4b9ff56,8808866e53fffff,"[Retail, dining, and lifestyle, Community and ..."
4,00021c1c-fe37-4855-93f5-aba1b4b9ff56,8808866e57fffff,"[Community and recreational spaces, Health and..."


In [5]:
df = pd.read_parquet(f"dbs/stops_pr/stops_pr_{2}.parquet")
df.head()

Unnamed: 0,device_aid,h3_id,loc,latitude,longitude,size,batch,dur,localtime,l_localtime,date,home,h_s,year,weekday,week,seq,osm_id,primary,kind
4786732,000272d8-bdfe-4380-9b00-3ce7d184fb67,88088675a5fffff,10,59.406967,17.956165,4,2,393.083333,2024-02-02 05:33:54+01:00,2024-02-02 12:06:59+01:00,2024-02-02,0.0,5,2024,4,5,1,"[08f088675a504a5203227c38536a551b, 08f088675a5...","[indian_restaurant, industrial_equipment, fair...","[Retail, dining, and lifestyle, Business and i..."
1857569,000272d8-bdfe-4380-9b00-3ce7d184fb67,880813270dfffff,29,63.1833,14.65,7,2,187.25,2024-02-19 02:33:48+01:00,2024-02-19 05:41:03+01:00,2024-02-19,0.0,2,2024,0,8,2,"[08f0813270ccd49803203f47c33d695d, 08f0813270c...","[environmental_conservation_organization, inst...","[Community and recreational spaces, Business a..."
4787069,000272d8-bdfe-4380-9b00-3ce7d184fb67,880813270dfffff,29,63.1833,14.65,55,2,243.7,2024-02-19 09:33:09+01:00,2024-02-19 13:36:51+01:00,2024-02-19,0.0,9,2024,0,8,3,"[08f0813270ccd49803203f47c33d695d, 08f0813270c...","[environmental_conservation_organization, inst...","[Community and recreational spaces, Business a..."
2506265,000272d8-bdfe-4380-9b00-3ce7d184fb67,8808862935fffff,11,59.5,18.05,17,2,193.4,2024-02-26 08:03:30+01:00,2024-02-26 11:16:54+01:00,2024-02-26,0.0,8,2024,0,9,4,"[08f08862934dc2b203fdfb1e8d4d0c2f, 08f08862934...","[professional_services, beauty_and_spa, busine...","[Business and industrial services, Health and ..."
3482883,000272d8-bdfe-4380-9b00-3ce7d184fb67,8808855733fffff,1,60.661277,17.213521,2,2,97.55,2024-10-29 08:42:30+01:00,2024-10-29 10:20:03+01:00,2024-10-29,1.0,8,2024,1,44,5,"[08f08855732048b3039bd0d9dc347252, 08f08855733...","[accommodation, professional_services, accommo...","[Community and recreational spaces, Business a..."


In [6]:
df.columns

Index(['device_aid', 'h3_id', 'loc', 'latitude', 'longitude', 'size', 'batch',
       'dur', 'localtime', 'l_localtime', 'date', 'home', 'h_s', 'year',
       'weekday', 'week', 'seq', 'osm_id', 'primary', 'kind'],
      dtype='object')

## Preserve stops and time

In [None]:
df_stops_list = []
cols = ['device_aid', 'h3_id', 'loc', 'latitude', 'longitude',
       'dur', 'localtime', 'l_localtime', 'date', 'home', 'year',
       'weekday', 'week', 'seq']
for i in tqdm(range(0, 50), desc="Filtering data"):
    df_stops = pd.read_parquet(f"dbs/stops_pr/stops_pr_{i}.parquet",
                               columns=cols)
    df_stops.loc[:, 'Stockholm'] = df_stops.apply(lambda row: within_box(row['latitude'], row['longitude']), axis=1)
    df_stops = df_stops.loc[df_stops['Stockholm'] == 1, :].drop(columns=['Stockholm'])
    df_stops_list.append(df_stops)
df_stops = pd.concat(df_stops_list)

In [5]:
# Assuming your dataframe is called df
# Get the unique device_aid values:
unique_ids = df_stops['device_aid'].unique()

# (Optionally) Shuffle the unique IDs if you want random batches:
np.random.shuffle(unique_ids)

# Split the unique device IDs into 8 batches:
id_batches = np.array_split(unique_ids, 8)

# Create a dictionary mapping each device_id to its batch number (1-8):
batch_dict = {}
for batch_no, batch_ids in enumerate(id_batches, start=1):
    for device in batch_ids:
        batch_dict[device] = batch_no

# Map the batch to the original dataframe:
df_stops['batch'] = df_stops['device_aid'].map(batch_dict)

In [6]:
print(len(df_stops), df_stops.device_aid.nunique())
df_stops.groupby('batch').apply(lambda x: x.to_parquet(f'dbs/cities/stockholm_stops_{x.name}.parquet', index=False))

39953264 943616


  df_stops.groupby('batch').apply(lambda x: x.to_parquet(f'dbs/cities/stockholm_stops_{x.name}.parquet', index=False))


## Individual attributes

In [11]:
df = pd.read_parquet('dbs/cities/stockholm_stops.parquet', columns=['device_aid', 'home', 'latitude', 'longitude'])
df = df[df['home'] == 1]
df.drop_duplicates(subset=['device_aid'], inplace=True)
df = pd.merge(df,
              pd.read_sql("""SELECT device_aid, b_id FROM home_building;""", con=engine), on='device_aid', how='left')
df = pd.merge(df,
              pd.read_sql("""SELECT * FROM building_data;""", con=engine), on='b_id', how='left')

In [14]:
df.drop(columns=['home', 'b_id']).to_parquet('dbs/cities/stockholm_individuals.parquet', index=False)