In [14]:
from utilities.download import download_and_read_json_xz

# Example usage
url = "http://dadosabertos.c3sl.ufpr.br/curitibaurbs/2024_06_23_shapeLinha.json.xz"
df = download_and_read_json_xz(url)

In [15]:
from utilities.coordinates import format_coord

df['coordinates'] = df.apply(lambda row: format_coord(row['LAT'], row['LON']), axis=1)
df.drop(['LAT', 'LON'], axis=1, inplace=True)

In [16]:
df

Unnamed: 0,SHP,COD,DATE,coordinates
0,2612,602,2024-06-23,"(-49.29564508098565, -25.51231568263223)"
1,2612,602,2024-06-23,"(-49.29563592791851, -25.51259077193037)"
2,2612,602,2024-06-23,"(-49.29563168666107, -25.512727106234163)"
3,2612,602,2024-06-23,"(-49.29562744540368, -25.512859204361412)"
4,2612,602,2024-06-23,"(-49.2956182923366, -25.513149725321043)"
...,...,...,...,...
244192,4734,229,2024-06-23,"(-49.242376168208494, -25.392743170666368)"
244193,4734,229,2024-06-23,"(-49.241878021929125, -25.3931414256338)"
244194,4734,229,2024-06-23,"(-49.24162961934169, -25.39334267329631)"
244195,4734,229,2024-06-23,"(-49.241500388906076, -25.39344208559015)"


In [17]:
import pandas as pd
from geopy.distance import geodesic

def calculate_distances(group):
    group = group.reset_index()

    previous_shifted = group['coordinates'].shift(1)
    next_shifted = group['coordinates'].shift(-1)

    group['distance_to_prev'] = [geodesic(c1, c2).m if not pd.isna(c2) else 0 for c1, c2 in zip(group['coordinates'], previous_shifted)]

    group['distance_to_next'] = [geodesic(c1, c2).m if not pd.isna(c2) else 0 for c1, c2 in zip(group['coordinates'], next_shifted)]

    group = group.drop(columns='index')
    return group

# Group by SHP, COD, and DATE, then apply the distance calculation
df = df.groupby(['SHP', 'COD', 'DATE'], group_keys=False).apply(calculate_distances)

  df = df.groupby(['SHP', 'COD', 'DATE'], group_keys=False).apply(calculate_distances)


In [18]:
mean_distance_prev = df['distance_to_prev'].mean()
std_distance_prev = df['distance_to_prev'].std()
mean_distance_next = df['distance_to_next'].mean()
std_distance_next = df['distance_to_next'].std()


threshold_prev = mean_distance_prev + 3 * std_distance_prev
threshold_next = mean_distance_next + 3 * std_distance_next

df_filtered = df[(df['distance_to_prev'] <= threshold_prev) & (df['distance_to_next'] <= threshold_next)]


In [19]:
df_filtered = df_filtered.groupby(['SHP', 'COD', 'DATE'], group_keys=False).apply(calculate_distances)

df_filtered

  df_filtered = df_filtered.groupby(['SHP', 'COD', 'DATE'], group_keys=False).apply(calculate_distances)


Unnamed: 0,SHP,COD,DATE,coordinates,distance_to_prev,distance_to_next
0,1708,010,2024-06-23,"(-49.268802216214, -25.416432732579345)",0.000000,22.161966
1,1708,010,2024-06-23,"(-49.26860454212067, -25.41647121277954)",22.161966,20.191416
2,1708,010,2024-06-23,"(-49.26842497293825, -25.416512115585526)",20.191416,4.428712
3,1708,010,2024-06-23,"(-49.26842097976231, -25.416572661387978)",4.428712,5.292711
4,1708,010,2024-06-23,"(-49.26838868014249, -25.416626070744645)",5.292711,4.754216
...,...,...,...,...,...,...
149,5144,177,2024-06-23,"(-49.25274801480769, -25.406506896676614)",23.912895,56.657972
150,5144,177,2024-06-23,"(-49.25315945450075, -25.406965840924453)",56.657972,13.194205
151,5144,177,2024-06-23,"(-49.25310495045443, -25.40712682058987)",13.194205,18.225529
152,5144,177,2024-06-23,"(-49.252944017913535, -25.407174065003964)",18.225529,35.876812


In [20]:
df_filtered.drop(['distance_to_prev', 'distance_to_next'], axis=1, inplace=True)

In [21]:
from utilities.elastic import get_elastic_client

es = get_elastic_client()

In [22]:
# join the cordinates into a list
df_filtered = df_filtered.groupby(['SHP', 'COD', 'DATE'], group_keys=False).agg({'coordinates': lambda x: list(x)})

In [23]:
df_filtered['coordinates'] = df_filtered['coordinates'].apply(lambda x: {'type': 'LineString', 'coordinates': x})


In [41]:
df_filtered['CODIGOLINHA'] = df_filtered['COD']
df_filtered.drop(['COD'], axis=1, inplace=True)

In [43]:
index_name = 'shape_linha_without_date'

index_settings = {
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 1
  },
  "mappings": {
    "properties": {
      "SHP": {
        "type": "integer" 
      },
      "CODIGOLINHA": { 
        "type": "keyword" 
      },
      "DATE": { 
        "type": "date", 
        "format": "yyyy-MM-dd"
      },
      "coordinates": { 
        "type": "geo_shape"
      }
    }
  }
}

if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_settings)

In [44]:
df_filtered

Unnamed: 0,SHP,DATE,coordinates,CODIGOLINHA
0,1708,2024-06-23,"{'type': 'LineString', 'coordinates': [(-49.26...",010
1,1709,2024-06-23,"{'type': 'LineString', 'coordinates': [(-49.26...",011
2,1713,2024-06-23,"{'type': 'LineString', 'coordinates': [(-49.25...",023
3,1714,2024-06-23,"{'type': 'LineString', 'coordinates': [(-49.29...",024
4,1717,2024-06-23,"{'type': 'LineString', 'coordinates': [(-49.33...",040
...,...,...,...,...
673,5137,2024-06-23,"{'type': 'LineString', 'coordinates': [(-49.27...",183
674,5139,2024-06-23,"{'type': 'LineString', 'coordinates': [(-49.28...",184
675,5142,2024-06-23,"{'type': 'LineString', 'coordinates': [(-49.28...",176
676,5143,2024-06-23,"{'type': 'LineString', 'coordinates': [(-49.27...",176


In [45]:
from utilities.elastic import bulk_insert

bulk_insert(es, df_filtered, index_name)

[(True, {'index': {'_index': 'shape_linha_without_date', '_id': 'JJyrRpAB64SVXitPf6tV', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1, 'status': 201}}), (True, {'index': {'_index': 'shape_linha_without_date', '_id': 'JZyrRpAB64SVXitPf6tV', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1, 'status': 201}}), (True, {'index': {'_index': 'shape_linha_without_date', '_id': 'JpyrRpAB64SVXitPf6tV', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1, 'status': 201}}), (True, {'index': {'_index': 'shape_linha_without_date', '_id': 'J5yrRpAB64SVXitPf6tV', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1, 'status': 201}}), (True, {'index': {'_index': 'shape_linha_without_date', '_id': 'KJyrRpAB64SVXit