In [1]:
from utilities.download import download_and_read_json_xz

# Example usage
url = "http://dadosabertos.c3sl.ufpr.br/curitibaurbs/2023_11_25_shapeLinha.json.xz"
df = download_and_read_json_xz(url)

In [2]:
from utilities.coordinates import format_coord

df['coordinates'] = df.apply(lambda row: format_coord(row['LAT'], row['LON']), axis=1)
df.drop(['LAT', 'LON'], axis=1, inplace=True)

In [3]:
df

Unnamed: 0,SHP,COD,DATE,coordinates
0,2612,602,2023-11-25,"(-49.29564508098565, -25.51231568263223)"
1,2612,602,2023-11-25,"(-49.29563592791851, -25.51259077193037)"
2,2612,602,2023-11-25,"(-49.29563168666107, -25.512727106234163)"
3,2612,602,2023-11-25,"(-49.29562744540368, -25.512859204361412)"
4,2612,602,2023-11-25,"(-49.2956182923366, -25.513149725321043)"
...,...,...,...,...
248791,4720,X50,2023-11-25,"(-49.30822725263312, -25.436443131455214)"
248792,4720,X50,2023-11-25,"(-49.30809518523198, -25.436366860559378)"
248793,4720,X50,2023-11-25,"(-49.3080087153841, -25.4362566788145)"
248794,4720,X50,2023-11-25,"(-49.307813584678264, -25.436163423917385)"


In [4]:
import pandas as pd
from geopy.distance import geodesic

def calculate_distances(group):
    group = group.reset_index()

    previous_shifted = group['coordinates'].shift(1)
    next_shifted = group['coordinates'].shift(-1)

    group['distance_to_prev'] = [geodesic(c1, c2).m if not pd.isna(c2) else 0 for c1, c2 in zip(group['coordinates'], previous_shifted)]

    group['distance_to_next'] = [geodesic(c1, c2).m if not pd.isna(c2) else 0 for c1, c2 in zip(group['coordinates'], next_shifted)]

    group = group.drop(columns='index')
    return group

# Group by SHP, COD, and DATE, then apply the distance calculation
df = df.groupby(['SHP', 'COD', 'DATE'], group_keys=False).apply(calculate_distances)

In [18]:
mean_distance_prev = df['distance_to_prev'].mean()
std_distance_prev = df['distance_to_prev'].std()
mean_distance_next = df['distance_to_next'].mean()
std_distance_next = df['distance_to_next'].std()


threshold_prev = mean_distance_prev + 3 * std_distance_prev
threshold_next = mean_distance_next + 3 * std_distance_next

df_filtered = df[(df['distance_to_prev'] <= threshold_prev) & (df['distance_to_next'] <= threshold_next)]


In [19]:
df_filtered = df_filtered.groupby(['SHP', 'COD', 'DATE'], group_keys=False).apply(calculate_distances)

df_filtered

Unnamed: 0,SHP,COD,DATE,coordinates,distance_to_prev,distance_to_next
0,1708,010,2023-11-25,"(-49.268802216214, -25.416432732579345)",0.000000,22.161966
1,1708,010,2023-11-25,"(-49.26860454212067, -25.41647121277954)",22.161966,20.191416
2,1708,010,2023-11-25,"(-49.26842497293825, -25.416512115585526)",20.191416,4.428712
3,1708,010,2023-11-25,"(-49.26842097976231, -25.416572661387978)",4.428712,5.292711
4,1708,010,2023-11-25,"(-49.26838868014249, -25.416626070744645)",5.292711,4.754216
...,...,...,...,...,...,...
148,5071,615,2023-11-25,"(-49.2922926532725, -25.476659902061197)",4.557484,7.369490
149,5071,615,2023-11-25,"(-49.292231020343735, -25.476622696473505)",7.369490,2.139475
150,5071,615,2023-11-25,"(-49.29221330858759, -25.476611218189845)",2.139475,12.905262
151,5071,615,2023-11-25,"(-49.29225817089336, -25.476447603807276)",12.905262,12.908449


In [26]:
df_filtered.drop(['distance_to_prev', 'distance_to_next'], axis=1, inplace=True)

In [23]:
from utilities.elastic import get_elastic_client

es = get_elastic_client()

  _transport = transport_class(


In [32]:
# join the cordinates into a list
df_filtered = df_filtered.groupby(['SHP', 'COD', 'DATE'], group_keys=False).agg({'coordinates': lambda x: list(x)})

In [33]:
df_filtered['coordinates'] = df_filtered['coordinates'].apply(lambda x: {'type': 'LineString', 'coordinates': x})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,coordinates
SHP,COD,DATE,Unnamed: 3_level_1
1708,010,2023-11-25,"[(-49.268802216214, -25.416432732579345), (-49..."
1709,011,2023-11-25,"[(-49.269052324946884, -25.416382197583655), (..."
1713,023,2023-11-25,"[(-49.253077237434354, -25.406438820371367), (..."
1714,024,2023-11-25,"[(-49.29302796932893, -25.491786543041144), (-..."
1717,040,2023-11-25,"[(-49.330736871146605, -25.400520726342098), (..."
...,...,...,...
5064,779,2023-11-25,"[(-49.302343526880684, -25.45911587157857), (-..."
5067,779,2023-11-25,"[(-49.30234013848576, -25.459102632894158), (-..."
5068,722,2023-11-25,"[(-49.35446317075619, -25.47108871712033), (-4..."
5069,722,2023-11-25,"[(-49.35017499847165, -25.48378150344178), (-4..."


In [34]:
index_name = 'shape_linha'

index_settings = {
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 1
  },
  "mappings": {
    "properties": {
      "SHP": {
        "type": "integer" 
      },
      "COD": { 
        "type": "keyword" 
      },
      "DATE": { 
        "type": "date", 
        "format": "yyyy-MM-dd"
      },
      "coordinates": { 
        "type": "geo_shape"
      }
    }
  }
}

if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_settings)



In [35]:
from utilities.elastic import bulk_insert

bulk_insert(es, df_filtered, index_name)



BulkIndexError: 500 document(s) failed to index.