In [67]:
import psycopg2
import pandas as pd
import matplotlib.pyplot as plt
import folium
import numpy as np
from shapely.wkt import loads
from shapely.geometry import Point, Polygon
from sqlalchemy import create_engine
from shapely.geometry import Point, Polygon, LineString
import geopandas as gpd
import mplleaflet
import seaborn as sns

## Conect to database and import data

In [71]:
dias_da_semana = {
 'segunda': ['20240429', '20240506'],
 'terça': ['20240430', '20240507'],
 'quarta': ['20240424', '20240501', '20240508'],
 'quinta': ['20240425', '20240502', '20240509'],
 'sexta': ['20240426', '20240503', '20240510'],
 'sábado': ['20240427', '20240504', '20240511'],
 'domingo': ['20240428', '20240505']
}
linha_id = 422
dia = 'segunda'

In [178]:
# Conectar ao banco de dados PostgreSQL
conn = psycopg2.connect(
    dbname="postgres",
    user="postgres",
    password="postgres",
    host="localhost",
    port="5432"
)

between_clauses = " OR ".join([f"(datahora BETWEEN '{data} 00:00:00' AND '{data} 23:59:59')" for data in dias_da_semana[dia]])

query = f'''select 
                *, 
                ST_Transform(geom::geometry, 4326) AS geometry 
            from vehicle_tracking_filtered where 
            linha = '{linha_id}' and ({between_clauses})
        '''

## Track the trajectory with Grids (422 example)

#### Generate grid for 422

In [179]:

# Load data into a GeoDataFrame
gdf = gpd.read_postgis(query, conn, geom_col='geometry', crs='EPSG:4326')

conn.close()


  df = pd.read_sql(


In [181]:
def create_grid(gdf=None, bounds=None, n_cells=10, overlap=False, crs="EPSG:4326"):

    import geopandas as gpd
    import shapely

    if bounds != None:
        xmin, ymin, xmax, ymax= bounds
    else:
        xmin, ymin, xmax, ymax= gdf.total_bounds

    # get cell size
    cell_size = (xmax-xmin)/n_cells
    # create the cells in a loop
    grid_cells = []
    for x0 in np.arange(xmin, xmax+cell_size, cell_size ):
        for y0 in np.arange(ymin, ymax+cell_size, cell_size):
            x1 = x0-cell_size
            y1 = y0+cell_size
            poly = shapely.geometry.box(x0, y0, x1, y1)
            #print (gdf.overlay(poly, how='intersection'))
            grid_cells.append( poly )

    cells = gpd.GeoDataFrame(grid_cells, columns=['geometry'],
                                     crs=crs)
    if overlap == True:
        cols = ['grid_id','geometry','grid_area']
        cells = cells.sjoin(gdf, how='inner').drop_duplicates('geometry')
    return cells

In [220]:
rio_minx, rio_miny = -43.7955, -23.0824
rio_maxx, rio_maxy = -43.1039, -22.7448

grid = create_grid(bounds=(rio_minx, rio_miny, rio_maxx, rio_maxy), n_cells=2000)
grid = grid.reset_index(names='grid_id')

In [183]:
grid.head()

Unnamed: 0,grid_id,geometry
0,0,"POLYGON ((-43.79619 -23.08240, -43.79619 -23.0..."
1,1,"POLYGON ((-43.79619 -23.08171, -43.79619 -23.0..."
2,2,"POLYGON ((-43.79619 -23.08102, -43.79619 -23.0..."
3,3,"POLYGON ((-43.79619 -23.08033, -43.79619 -23.0..."
4,4,"POLYGON ((-43.79619 -23.07963, -43.79619 -23.0..."


In [222]:
grid.shape

(1956978, 2)

In [184]:
gdf = gdf.dropna()

In [221]:
gdf.shape

(283014, 14)

## Search for outliers and Garage paths

In [223]:
def stat_day_per_line(gdf, grid):
    gdf.loc[:, 'datahora'] = pd.to_datetime(gdf['datahora'])
    gdf.loc[:, 'hour'] = gdf.loc[:, 'datahora'].dt.hour
    gdf = gdf.set_geometry('geometry')
    grid = grid.set_geometry('geometry')
    grid_joined = grid.sjoin(gdf, how='inner', predicate='contains')
    aggregated = grid_joined.groupby(['grid_id', 'geometry']).agg(
        count=('geometry', 'size'),
        median_time=('hour', 'median'),
        median_velocidade=('velocidade', 'median'),
        centroid=('geometry', lambda x: x.unary_union.centroid)
    ).reset_index()
 
    return aggregated


In [224]:
grids_stats = stat_day_per_line(gdf, grid)
grids_stats.shape

(2767, 6)

In [294]:
grid_filtered = grids_stats[(grids_stats['count'] > grids_stats['count'].quantile(0.5))   & (grids_stats['median_time'] > 7) & (grids_stats['median_time'] < 20) & (grids_stats['median_velocidade'] > 0)]

In [229]:
grid_filtered.shape

(1259, 6)

In [230]:
map_center = [gdf['geometry'].y.mean(), gdf['geometry'].x.mean()]
m = folium.Map(location=map_center, zoom_start=15)

In [231]:

# Add grid cells to the map
for _, row in grid_filtered.iterrows():
    folium.GeoJson(row.geometry).add_to(m)
    folium.Marker(location=[row['geometry'].centroid.y, row['geometry'].centroid.x],
                      icon=folium.DivIcon(html=f'<div style="font-size: 5pt">{row["grid_id"]}</div>')).add_to(m)
    folium.Circle(location=[row.centroid.y, row.centroid.x],
                            radius=3,
                            color='red',
                            fill=True,
                            fill_color='red').add_to(m)
    

In [232]:
m

## Drawing Trajectories

In [338]:
from collections import defaultdict

def create_trajectory_for_line(gdf, grids, buffer_size=0.005):
    gdf = gdf.set_geometry('geometry')
    grids = grids.set_geometry('geometry')
    # Realizar um join entre gdf e all_point_counts
    joined = gpd.sjoin(grids, gdf, how='inner', predicate='contains')
    # Inicializar uma lista para armazenar as linhas da trajetória
    lines = []
    grids = grids.set_index('grid_id')
   
    unique_grids = joined['grid_id'].unique()

    # Inicializar um DataFrame para armazenar as médias das diferenças de tempo
    time_diffs_df = pd.DataFrame(index=unique_grids, columns=unique_grids)

    # Calcular as distâncias entre todos os pares de centroides
    dist_matrix = grids.apply(lambda row: grids.distance(row['centroid']), axis=1)

        # Filtrar os grids próximos com base no buffer
    nearby_grids_dict = {grid_id: dist_matrix[grid_id][dist_matrix[grid_id] < buffer_size].index.tolist() for grid_id in unique_grids}
    time_diffs_list = []
    # Iterar sobre cada grid e seus grids próximos
    for grid_a_id in unique_grids:
        print(grid_a_id)
        print(grid_a_id)
        grid_a_points = joined[joined['grid_id'] == grid_a_id]
        grid_a_points = grid_a_points[['ordem', 'datahora']].set_index('ordem')
        
        for grid_b_id in nearby_grids_dict[grid_a_id]:
            grid_b_points = joined[joined['grid_id'] == grid_b_id]
            grid_b_points = grid_b_points[['ordem', 'datahora']].set_index('ordem')
            
            # Encontrar os ônibus que passaram por ambas as grids
            common_buses = grid_a_points.index.intersection(grid_b_points.index)
            
            if not common_buses.empty:
                time_diffs = (grid_b_points.loc[common_buses] - grid_a_points.loc[common_buses]).dropna()
                time_diffs = time_diffs[time_diffs['datahora'] > pd.Timedelta(0)]
                if not time_diffs.empty:
                    min_time_diff = time_diffs.groupby('ordem').min()
                    for ordem, diff in min_time_diff.iterrows():
                        time_diffs_list.append((grid_a_id, grid_b_id, diff['datahora'].total_seconds()))

    # Converter a lista para DataFrame
    time_diffs_df_intermediate = pd.DataFrame(time_diffs_list, columns=['grid_id_a', 'grid_id_b', 'time_diff_seconds'])

    # Calcular a mediana das diferenças de tempo
    median_time_diffs = time_diffs_df_intermediate.groupby(['grid_id_a', 'grid_id_b'])['time_diff_seconds'].median().reset_index()

    # Preencher o DataFrame final usando pivot_table
    time_diffs_df = median_time_diffs.pivot_table(index='grid_id_a', columns='grid_id_b', values='time_diff_seconds')

    # Criar as conexões entre as grids com a menor média das diferenças de tempo
    connections = time_diffs_df.idxmin(axis=1).reset_index()
    connections.columns = ['grid_a_id', 'best_grid_b_id']
    # Criar as linhas da sequência de grids
    lines = []
    for _, row in connections.iterrows():
        grid_a_centroid = grids.loc[row['grid_a_id'], 'centroid']
        grid_b_centroid = grids.loc[row['best_grid_b_id'], 'centroid']
        line = LineString([grid_a_centroid, grid_b_centroid])
        lines.append(line)

    return lines

In [302]:
traj = create_trajectory_for_line(gdf, grid_filtered)


  dist_matrix = grids.apply(lambda row: grids.distance(row['centroid']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_merged['time_diff'] = filtered_merged['datahora_b'] - filtered_merged['datahora_a']


In [307]:
trajectory = LineString([point for line in traj for point in line.coords])

In [304]:
map_center = [gdf['geometry'].y.mean(), gdf['geometry'].x.mean()]
m = folium.Map(location=map_center, zoom_start=15)

In [308]:

# Add grid cells to the map
for _, row in grid_filtered.iterrows():
    folium.GeoJson(row.geometry).add_to(m)
    folium.GeoJson(trajectory).add_to(m)
    folium.Marker(location=[row['geometry'].centroid.y, row['geometry'].centroid.x],
                      icon=folium.DivIcon(html=f'<div style="font-size: 5pt">{row["grid_id"]}</div>')).add_to(m)
    folium.Circle(location=[row.centroid.y, row.centroid.x],
                            radius=3,
                            color='red',
                            fill=True,
                            fill_color='red').add_to(m)
    

In [332]:
gdf = gdf.set_geometry('geometry')
grids = grid_filtered.set_geometry('geometry')
# Realizar um join entre gdf e all_point_counts
joined = gpd.sjoin(grids, gdf, how='inner', predicate='contains')
# Inicializar uma lista para armazenar as linhas da trajetória
lines = []
grids = grids.set_index('grid_id')

unique_grids = joined['grid_id'].unique()

# Inicializar um DataFrame para armazenar as médias das diferenças de tempo
time_diffs_df = pd.DataFrame(index=unique_grids, columns=unique_grids)

# Calcular as distâncias entre todos os pares de centroides
dist_matrix = grids.apply(lambda row: grids.distance(row['centroid']), axis=1)

    # Filtrar os grids próximos com base no buffer
nearby_grids_dict = {grid_id: dist_matrix[grid_id][dist_matrix[grid_id] < 0.01].index.tolist() for grid_id in unique_grids}

# Agrupar os dados por 'grid_id' e 'ordem'





  dist_matrix = grids.apply(lambda row: grids.distance(row['centroid']), axis=1)
