In [1]:
from pathlib import Path
import pandas as pd
import time
import geopandas as gpd
from shapely import geometry
import dask.dataframe as dd
from dask import delayed
import swifter
import json
import msgspec

In [2]:
# Input file for every loading cell in this notebook; those cells read it with
# pd.read_csv(..., sep='\t').
# NOTE(review): hard-coded absolute NFS path — the notebook only runs on a machine
# where this mount exists. Consider a configurable data directory.
whole_path = Path('/nfs/projects/overwatch/maxar-segmentation/microsoft-roads/AfricaWest-Full.tsv')

In [81]:
# Micro-benchmark: time the stdlib JSON parse and the shapely conversion
# separately for the first row of region_road_df['geometry'].
#
# NOTE: json.loads is applied to the cell value, so the column must still hold
# the raw JSON strings. `region_road_df` is created by a loading cell further
# down in file order, so this cell does not run on a fresh kernel as-is.
#
# time.perf_counter() is used instead of time.time(): the intervals measured here
# are well under a millisecond, and perf_counter is monotonic with the highest
# available resolution.
t0 = time.perf_counter()
step1 = json.loads(region_road_df['geometry'].iloc[0])['geometry']
t1 = time.perf_counter()
res = geometry.shape(step1)
t2 = time.perf_counter()
print(f'json load: {t1 - t0}, shape: {t2 - t1}')



json load: 0.0014243125915527344, shape: 0.0004885196685791016


In [90]:
# Micro-benchmark: same measurement as the json.loads variant, but decoding the
# first row's raw 'geometry' string with msgspec.json.decode.
#
# `msgspec` is already imported in the notebook's import cell, so the duplicate
# import that used to sit here was dropped.
#
# NOTE: relies on `region_road_df` holding raw JSON strings; that frame is
# created by a loading cell further down in file order.
#
# time.perf_counter() replaces time.time() because these intervals are
# sub-millisecond and need a monotonic, high-resolution clock.
t0 = time.perf_counter()
step1 = msgspec.json.decode(region_road_df['geometry'].iloc[0])['geometry']
t1 = time.perf_counter()
res = geometry.shape(step1)
t2 = time.perf_counter()
print(f'json load: {t1 - t0}, shape: {t2 - t1}')

json load: 0.0003714561462402344, shape: 0.0017671585083007812


In [6]:
# Baseline load: read the TSV as raw strings, parse the 'geometry' column with a
# plain Series.apply, then wrap the frame in a GeoDataFrame. Each stage is timed
# separately and the total is printed at the end.
#
# NOTE: `custom_json_loads` is defined in a cell further down in file order; it
# must be executed before this cell.
#
# Elapsed times use time.perf_counter(), a monotonic clock intended for measuring
# durations (time.time() can jump if the system clock is adjusted).
start_t = time.perf_counter()

# Stage 1: read both columns as plain strings.
t0_i = time.perf_counter()
region_road_df = pd.read_csv(whole_path, names=['country', 'geometry'], sep='\t')
t0_f = time.perf_counter()
print(f'pd.read_csv time: {t0_f - t0_i}')

# Stage 2: convert every raw geometry string into a shapely object.
t1_i = time.perf_counter()
region_road_df['geometry'] = region_road_df['geometry'].apply(custom_json_loads)
t1_f = time.perf_counter()
print(f'apply time: {t1_f - t1_i}')

# Stage 3: build the GeoDataFrame with CRS 4326.
t2_i = time.perf_counter()
region_road_gdf = gpd.GeoDataFrame(region_road_df, crs=4326)
t2_f = time.perf_counter()
print(f'GeoDataFrame time: {t2_f - t2_i}')

end_t = time.perf_counter()
print(f'Total time: {end_t - start_t}')


pd.read_csv time: 9.33686375617981
apply time: 65.0588927268982
GeoDataFrame time: 10.049694538116455
Total time: 84.44595956802368


# Best so far — still need to check that the results are the same

In [5]:
import ujson


def custom_json_loads(s):
    """Convert one raw 'geometry' field into a shapely geometry.

    The value is decoded with ``ujson.loads``, its ``'geometry'`` member is
    extracted, and that member is passed to ``geometry.shape``.

    Args:
        s: Raw field value, as handed over by ``Series.apply`` or by a
            ``pd.read_csv`` converter.

    Returns:
        The object returned by ``geometry.shape``. If decoding, the
        ``'geometry'`` lookup, or the shape conversion raises, an empty
        ``geometry.LineString()`` is returned instead (best-effort: bad rows
        never abort a bulk load).
    """
    try:
        return geometry.shape(ujson.loads(s)['geometry'])
    except Exception:
        # Deliberately broad so a single malformed row cannot stop the load, but
        # not a bare `except:` — that would also swallow KeyboardInterrupt and
        # SystemExit, making a long-running apply impossible to interrupt and
        # silently producing empty geometries instead.
        return geometry.LineString()

In [8]:
# Converter variant: parse the 'geometry' column while pd.read_csv is reading
# the file (via `converters`), so no separate apply pass is needed. Only the
# read and the GeoDataFrame construction are timed.
#
# NOTE(review): `names` lists a single column here, while the other cells read
# the same file with names=['country', 'geometry']. Confirm how pandas maps the
# extra leading field in this case (it appears to be taken as the index) before
# relying on the resulting index.
#
# Elapsed times use time.perf_counter(), a monotonic clock intended for measuring
# durations.
start_t = time.perf_counter()

t0_i = time.perf_counter()
region_road_df = pd.read_csv(
    whole_path,
    names=['geometry'],
    sep='\t',
    usecols=['geometry'],
    converters={'geometry': custom_json_loads},
)
t0_f = time.perf_counter()
print(f'pd.read_csv time: {t0_f - t0_i}')

t2_i = time.perf_counter()
region_road_gdf = gpd.GeoDataFrame(region_road_df, crs=4326)
t2_f = time.perf_counter()
print(f'GeoDataFrame time: {t2_f - t2_i}')

end_t = time.perf_counter()
print(f'Total time: {end_t - start_t}')

pd.read_csv time: 38.029499530792236
GeoDataFrame time: 9.052536487579346
Total time: 47.082475900650024


In [22]:
# swifter variant: identical to the plain-apply load, except that the geometry
# parsing goes through the `.swifter.apply` accessor instead of Series.apply.
# Each stage is timed separately and the total is printed at the end.
#
# Elapsed times use time.perf_counter(), a monotonic clock intended for measuring
# durations (time.time() can jump if the system clock is adjusted).
start_t = time.perf_counter()

# Stage 1: read both columns as plain strings.
t0_i = time.perf_counter()
region_road_df = pd.read_csv(whole_path, names=['country', 'geometry'], sep='\t')
t0_f = time.perf_counter()
print(f'pd.read_csv time: {t0_f - t0_i}')

# Stage 2: parse the raw geometry strings through swifter.
t1_i = time.perf_counter()
region_road_df['geometry'] = region_road_df['geometry'].swifter.apply(custom_json_loads)
t1_f = time.perf_counter()
print(f'apply time: {t1_f - t1_i}')

# Stage 3: build the GeoDataFrame with CRS 4326.
t2_i = time.perf_counter()
region_road_gdf = gpd.GeoDataFrame(region_road_df, crs=4326)
t2_f = time.perf_counter()
print(f'GeoDataFrame time: {t2_f - t2_i}')

end_t = time.perf_counter()
print(f'Total time: {end_t - start_t}')

pd.read_csv time: 8.837228059768677


Pandas Apply:   0%|          | 0/4438294 [00:00<?, ?it/s]

apply time: 37.68620038032532
GeoDataFrame time: 9.239280462265015
Total time: 55.76539921760559


In [None]:
# Single-shot load: read both columns, parsing 'geometry' through the read_csv
# converter, then wrap the frame in a GeoDataFrame. Only the total time is
# reported.
#
# The whole file is read in one call (no `chunksize` is passed), so the unused,
# misspelled chunk-size variable and the commented-out apply line that used to
# sit in this cell were removed.
#
# Elapsed time uses time.perf_counter(), a monotonic clock intended for measuring
# durations.
start_t = time.perf_counter()
region_road_df = pd.read_csv(
    whole_path,
    names=['country', 'geometry'],
    sep='\t',
    converters={'geometry': custom_json_loads},
)
region_road_gdf = gpd.GeoDataFrame(region_road_df, crs=4326)
end_t = time.perf_counter()
print(f'Loading time: {end_t - start_t}')

Using chunks

In [12]:
# Chunked load: read the TSV in fixed-size chunks as raw strings, parse the
# 'geometry' column of each chunk with Series.apply, concatenate the chunks and
# wrap the result in a GeoDataFrame. Only the total time is reported.
#
# Elapsed time uses time.perf_counter(), a monotonic clock intended for measuring
# durations.
start_t = time.perf_counter()
chunk_size = 1000  # Adjust the chunk size based on available memory
chunks = []
for chunk in pd.read_csv(
    whole_path,
    names=['country', 'geometry'],
    sep='\t',
    usecols=[0, 1],
    dtype={'country': str, 'geometry': str},
    chunksize=chunk_size,
):
    chunk['geometry'] = chunk['geometry'].apply(custom_json_loads)
    chunks.append(chunk)
fast_region_road_df = pd.concat(chunks, ignore_index=True)
# Build the GeoDataFrame from the frame assembled in this cell. The previous
# version passed `region_road_df`, i.e. whatever another cell had left in the
# kernel, so the chunked result was never actually converted or timed.
fast_region_road_gdf = gpd.GeoDataFrame(fast_region_road_df, crs=4326)
end_t = time.perf_counter()
print(f'Loading time: {end_t - start_t}')

Loading time: 52.00859093666077


Using chunks with a converter

In [16]:
# Chunked load with a converter: read the TSV in fixed-size chunks and let
# pd.read_csv parse the 'geometry' column through `custom_json_loads` while
# reading, then concatenate the chunks and wrap the result in a GeoDataFrame.
# Only the total time is reported.
#
# Elapsed time uses time.perf_counter(), a monotonic clock intended for measuring
# durations.
start_t = time.perf_counter()
chunk_size = 10000  # Adjust the chunk size based on available memory
chunks = []
for chunk in pd.read_csv(
    whole_path,
    names=['country', 'geometry'],
    sep='\t',
    usecols=[0, 1],
    dtype={'country': str},
    converters={'geometry': custom_json_loads},
    chunksize=chunk_size,
):
    # Geometry is already parsed by the converter; just collect the chunk.
    chunks.append(chunk)
fast_region_road_df = pd.concat(chunks, ignore_index=True)
fast_region_road_gdf = gpd.GeoDataFrame(fast_region_road_df, crs=4326)
end_t = time.perf_counter()
print(f'Loading time: {end_t - start_t}')

Loading time: 44.09066653251648
