In [50]:
import pandas as pd
import plotly.express as px
from lib.utils import df_from_jsonl, print_neighbours

In [51]:
# Read the preprocessed measurements into the DataFrame `df`.
# NOTE(review): 'pre_bigdata' is presumably a JSON-lines file name/path that
# lib.utils.df_from_jsonl knows how to resolve -- confirm against that helper.
data_file = 'pre_bigdata'
df = df_from_jsonl(data_file)

In [52]:
# Order the measurements per vehicle chronologically, then keep a single row per
# (vehicle, timestamp) pair -- the feed may contain duplicate measurements.
# The frame is reassigned instead of mutated with `inplace=True`; the resulting
# rows and index are the same (renumbered by the sort, gaps left by the dedup).
df = (
    df.sort_values(by=['VehicleNumber', 'Time'], ascending=[True, True], ignore_index=True)
      .drop_duplicates(['VehicleNumber', 'Time'])  # possible duplicate measurements
)
# todo: could drop vehicles with too few measurements, learn groupby

# value_counts sorts by frequency (descending), so index[0] is the vehicle with
# the most measurements.
vc = df.value_counts('VehicleNumber')
# print(vc[0:20])
# print(vc)
pop_nr = vc.index[0]
pop_df = df[df['VehicleNumber'] == pop_nr]

# The line on which that vehicle was recorded most often.
pop_vc = pop_df.value_counts('Lines')
print(pop_vc)
line = pop_vc.index[0]

# Every measurement (any vehicle) recorded on that line.
small_df = df[(df['Lines'] == line)]
print(small_df)
# lons = small_df.value_counts('Lon')
# lon_pop = lons.index[0]
# print(small_df[small_df['Lon'] == lon_pop])

# Uncomment to run the rest of the notebook on the single-line subset only:
# full_df = df
# df = small_df

Lines
190    1218
Name: count, dtype: int64
        Lines        Lon  VehicleNumber                Time        Lat Brigade
532253    190  21.047438           3403 2024-02-22 09:05:50  52.264145       1
532301    190  21.047263           3403 2024-02-22 09:37:55  52.264097       1
532302    190  21.047279           3403 2024-02-22 09:40:25  52.264075       1
532304    190  21.047279           3403 2024-02-22 09:40:40  52.264072       1
532305    190  21.047313           3403 2024-02-22 09:40:50  52.264053       1
...       ...        ...            ...                 ...        ...     ...
1838924   190  21.084403           8593 2024-02-22 14:27:42  52.297762     017
1838925   190  21.086327           8593 2024-02-22 14:27:57  52.300085     017
1838927   190  21.087035           8593 2024-02-22 14:28:12  52.300892     017
1838928   190  21.087035           8593 2024-02-22 14:28:23  52.300892     017
1838930   190  21.087910           8593 2024-02-22 14:29:01  52.302201     017

[19410 

In [53]:
# drop unupdated gps positions
def drop_gps_fails_v1(df):
    """Drop rows whose GPS position did not change since the previous row.

    A row is removed when its ``Lon``, ``Lat`` and ``VehicleNumber`` all equal
    those of the nearest preceding row that is kept. Every removed row is
    identical (in those three columns) to the row directly above it, so the
    check is equivalent to comparing each row with its immediate predecessor.
    That lets the whole frame be processed at once instead of calling
    ``df.drop`` once per stale row, which copies the frame every time.

    The frame is modified in place: its index is first reset to ``0..n-1`` and
    the stale rows are then removed, which leaves gaps in the index. Rows are
    compared in their current order, so the caller is expected to have sorted
    them beforehand. Missing values never compare equal, so a row with a
    missing coordinate is never treated as stale.

    Args:
        df: DataFrame with ``Lon``, ``Lat`` and ``VehicleNumber`` columns.

    Returns:
        None. A one-line summary of the number of dropped rows is printed.
    """
    df.reset_index(drop=True, inplace=True)
    # todo dropping duplicates above becomes unnecessary
    all_count = len(df)
    dropped_count = 0
    # With fewer than two rows there is nothing to compare (and, as before,
    # the columns are not even looked up).
    if all_count > 1:
        current = df[['Lon', 'Lat', 'VehicleNumber']]
        # shift(1) aligns every row with the values of the row above it; the
        # first row is compared against missing values and is always kept.
        stale = (current == current.shift(1)).all(axis=1).to_numpy()
        dropped_count = int(stale.sum())
        df.drop(index=df.index[stale], inplace=True)
    print(f'Dropped {dropped_count} out of {all_count} rows')

def drop_gps_fails_v2(df):
    """Drop rows whose GPS position did not change, plus rows without ``Lon``.

    A row counts as stale when its ``Lon``, ``Lat`` and ``VehicleNumber`` all
    equal those of the nearest preceding row that is kept. Every stale row is
    identical (in those three columns) to the row directly above it, so the
    check is done for the whole frame at once by comparing each row with its
    immediate predecessor, instead of visiting rows one by one with ``.at``.

    The frame is modified in place: its index is first reset to ``0..n-1``,
    stale rows are removed, and finally every row whose ``Lon`` is missing is
    removed as well (these are not included in the printed count). The
    resulting index has gaps. Rows are compared in their current order, so the
    caller is expected to have sorted them beforehand.

    Args:
        df: DataFrame with ``Lon``, ``Lat`` and ``VehicleNumber`` columns.

    Returns:
        None. A one-line summary of the number of stale rows is printed.
    """
    df.reset_index(drop=True, inplace=True)
    # todo dropping duplicates above becomes unnecessary
    all_count = len(df)
    dropped_count = 0
    # With fewer than two rows there is nothing to compare.
    if all_count > 1:
        current = df[['Lon', 'Lat', 'VehicleNumber']]
        # shift(1) aligns every row with the values of the row above it; the
        # first row is compared against missing values and is always kept.
        stale = (current == current.shift(1)).all(axis=1).to_numpy()
        dropped_count = int(stale.sum())
        df.drop(index=df.index[stale], inplace=True)
    print(f'Dropped {dropped_count} out of {all_count} rows')
    # Same final step as before: rows that never had a longitude go too.
    df.dropna(subset=['Lon'], inplace=True)

# drop_gps_fails_v2(df)
# df.reset_index(drop=True, inplace=True)

In [54]:
# Build the deltas frame: each measurement side by side with the row directly
# above it (the `_prev` columns).
# note: df itself is left untouched
df_prev = df.shift(1)
# todo: could guarantee vehicles are == while merging
deltas_df = (
    df.merge(df_prev, how='outer', left_index=True, right_index=True, suffixes=('', '_prev'))
    # keep only pairs that belong to the same vehicle and have distinct timestamps
    .loc[lambda pairs: (pairs['VehicleNumber'] == pairs['VehicleNumber_prev'])
                       & (pairs['Time'] != pairs['Time_prev'])]
    # identical to VehicleNumber after the filter above, so it carries no information
    .drop(columns=['VehicleNumber_prev'])
)

In [55]:
# calculate distances and time differences
from lib.distance import earth_distance_km, warsaw_distance_km, warsaw_numbers
# NOTE(review): presumably kilometres per degree of longitude / latitude in the
# Warsaw area -- confirm in lib.distance.
km_lon, km_lat = warsaw_numbers()

# Seconds elapsed between a measurement and its predecessor; only pairs more
# than 9 s apart are kept.
deltas_df = (
    deltas_df
    .assign(Time_diff=lambda pairs: (pairs['Time'] - pairs['Time_prev']).dt.total_seconds())
    .loc[lambda pairs: pairs['Time_diff'] > 9]
)

In [56]:
# Flat (Pythagorean) distance between consecutive positions: the lat/lon
# differences are scaled by km_lat / km_lon and combined as a Euclidean norm.
# velocity_p is that distance per second of Time_diff, multiplied by 3600
# (per hour).
deltas_df = deltas_df.assign(
    Dist_pythagoras=lambda pairs: (
        ((pairs['Lat'] - pairs['Lat_prev'])*km_lat)**2
        + ((pairs['Lon'] - pairs['Lon_prev'])*km_lon)**2
    )**0.5,
    velocity_p=lambda pairs: (pairs['Dist_pythagoras'] / pairs['Time_diff']) * 3600,
)
# slow pythagoras:
# deltas_df['Dist_pythagoras'] = deltas_df.apply(lambda row: warsaw_distance_km((row['Lon'], row['Lat']), (row['Lon_prev'], row['Lat_prev'])), axis=1)

# Keep only pairs slower than 90; rows with an undefined velocity fail the
# comparison and are removed as well.
deltas_df = deltas_df.loc[deltas_df['velocity_p'] < 90]

In [57]:
# haversine is soo slow
# deltas_df['Dist_haversine'] = deltas_df.apply(lambda row: earth_distance_km((row['Lon'], row['Lat']), (row['Lon_prev'], row['Lat_prev'])), axis=1)
# deltas_df['velocity_h'] = deltas_df['Dist_haversine'] / deltas_df['Time_diff'] * 3600

In [58]:
# before calculating velocities:
# TODO! filter out too big time diffs
# TODO! filter out stale positions
# deltas_df.describe()

In [59]:
# fig = px.line(deltas_df, x='Time', y='velocity_p', title='Velocity over time')
# fig.show()
# deltas_df['smoothed'] = deltas_df['velocity_p'].rolling(window=7).mean()
# fig = px.line(deltas_df, x='Time', y='smoothed', title='Smoothed velocity over time')
# fig.show()
# deltas_df['expon_smoothed'] = deltas_df['velocity_p'].ewm(span=7).mean()
# fig = px.line(deltas_df, x='Time', y='expon_smoothed', title='Velocity over time')
# fig.show()
# deltas_df['expon_smoothed2'] = deltas_df['velocity_p'].ewm(span=3).mean()
# fig = px.line(deltas_df, x='Time', y='expon_smoothed', title='Velocity over time')
# fig.show()

In [60]:
import plotly.graph_objects as go

# # scattergeo plot
# fig = go.Figure()

# fig.add_trace(go.Scattergeo(
#     lon = deltas_df['Lon'],
#     lat = deltas_df['Lat'],
#     mode = 'markers',
#     marker=dict(
#         size=8,
#         opacity=0.8,
#         reversescale=True,
#         autocolorscale=False,
#         colorscale='RdYlBu',  # nice colorscale
#         cmin=min(deltas_df['smoothed']),
#         color=deltas_df['smoothed'],
#         cmax=max(deltas_df['smoothed']),
#         colorbar_title='Velocity'
#     )
# ))

# fig.update_layout(
#     title='Velocity Map',
#     geo=dict(
#         showland=True,
#         landcolor="rgb(250, 250, 250)",
#         center=dict(lon=21.0122, lat=52.2297),
#         showcountries=True,
#         countrycolor="rgb(100, 100, 100)",
#         projection_type="equirectangular",
#         fitbounds="locations",
#     )
# )

# fig.show()

In [63]:
# Grid resolution in degrees; `factor` scales both axes together.
factor = 1
lon_step = 0.0025 * factor
lat_step = 0.0025 * factor

# Snap every measurement down to a multiple of the grid step. `assign` works on
# a copy, so deltas_df does not gain the grid columns.
dd = deltas_df.assign(
    Lon_grid=lambda pairs: (pairs['Lon'] // lon_step) * lon_step,
    Lat_grid=lambda pairs: (pairs['Lat'] // lat_step) * lat_step,
)
# Mean velocity per grid cell, one row per (Lon_grid, Lat_grid).
averages = dd.groupby(['Lon_grid', 'Lat_grid'])['velocity_p'].mean().reset_index()
# print(averages.head(2))

# One marker per grid cell, coloured by the cell's mean velocity.
fig = go.Figure(
    data=[
        go.Scattergeo(
            lon=averages['Lon_grid'],
            lat=averages['Lat_grid'],
            text=averages['velocity_p'].astype(str) + 'km/h',
            mode='markers',
            marker={
                'size': 5,  # could be proportional to something!
                'opacity': 0.8,
                'reversescale': True,
                'autocolorscale': False,
                'colorscale': 'RdYlBu',
                'cmin': averages['velocity_p'].min(),
                'color': averages['velocity_p'],
                'cmax': averages['velocity_p'].max(),
                'colorbar_title': 'Average velocity',
            },
        )
    ]
)

fig.update_layout(
    title='Average Velocity Map',
    geo={
        'showland': True,
        'landcolor': "rgb(250, 250, 250)",
        'showcountries': True,
        'countrycolor': "rgb(100, 100, 100)",
        'projection_type': "equirectangular",
        'fitbounds': "locations",  # zoom the map to the plotted points
    },
)

fig.show()

In [62]:
# Raising here halts a top-to-bottom "Run All" at this cell.
# NOTE(review): presumably deliberate, to keep the exploratory cells that
# follow from running automatically -- confirm before removing.
raise UserWarning('stop here')

UserWarning: stop here

In [None]:
# Independent copy of the deltas whose 'Lines' value is the string '188'.
df188 = deltas_df.loc[deltas_df['Lines'].eq('188')].copy()
# Distinct vehicle numbers in that subset (first occurrence of each is kept).
df188['VehicleNumber'].drop_duplicates()
# df188[df188['VehicleNumber'] == 2226]

Series([], Name: VehicleNumber, dtype: int64)

In [None]:
# Show the rows of the line-188 subset that belong to vehicle 8842.
df188.loc[df188['VehicleNumber'].eq(8842)]

Unnamed: 0,Lines,Lon,VehicleNumber,Time,Lat,Brigade,Lines_prev,Lon_prev,Time_prev,Lat_prev,Brigade_prev,Time_diff,Dist_pythagoras,velocity_p
900637,188,20.979631,8842,2024-02-22 09:29:54,52.173820,07,188,20.979025,2024-02-22 09:28:56,52.169480,07,58.0,0.483526,30.011967
900638,188,20.978821,8842,2024-02-22 09:30:29,52.177735,07,188,20.979615,2024-02-22 09:29:59,52.174328,07,30.0,0.382107,45.852835
900639,188,20.978623,8842,2024-02-22 09:30:39,52.178550,07,188,20.978821,2024-02-22 09:30:29,52.177735,07,10.0,0.091437,32.917151
900640,188,20.978521,8842,2024-02-22 09:30:54,52.178989,07,188,20.978623,2024-02-22 09:30:39,52.178550,07,15.0,0.049162,11.798971
900641,188,20.978430,8842,2024-02-22 09:31:04,52.179356,07,188,20.978521,2024-02-22 09:30:54,52.178989,07,10.0,0.041220,14.839283
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900924,188,21.115807,8842,2024-02-22 10:50:08,52.233854,07,188,21.115800,2024-02-22 10:49:58,52.233858,07,10.0,0.000702,0.252587
900925,188,21.115807,8842,2024-02-22 10:50:23,52.233849,07,188,21.115807,2024-02-22 10:50:08,52.233854,07,15.0,0.000466,0.111936
900926,188,21.115784,8842,2024-02-22 13:05:03,52.234031,07,188,21.115804,2024-02-22 10:50:28,52.233849,07,8075.0,0.020249,0.009028
900927,188,21.115799,8842,2024-02-22 13:05:13,52.233871,07,188,21.115784,2024-02-22 13:05:03,52.234031,07,10.0,0.017823,6.416288


In [None]:
# filter out too big velocities
# NOTE(review): an earlier cell already keeps only velocity_p < 90, so on a
# fresh top-to-bottom run this filter should remove nothing.
threshold = 120
# TODO! filter out stale positions instead
count = len(deltas_df)
# Rows with an undefined velocity fail the comparison and are counted as
# removed too.
deltas_df = deltas_df[deltas_df['velocity_p'] < threshold]
count2 = len(deltas_df)
removed = count - count2
# An empty deltas_df would otherwise raise ZeroDivisionError here.
removed_percent = (removed)/count * 100 if count else 0.0
print(f"Removed {removed} out of {count} entries ({removed_percent}%).")
print(f" (Too fast -- over {threshold} km/h)")

Removed 246 out of 1242788 entries (0.01979420464310888%).
 (Too fast -- over 120 km/h)


In [None]:
# Renumber the rows 0..n-1 so that neighbouring rows have consecutive labels.
# Reassigned instead of mutated with `inplace=True`; the resulting frame is the
# same.
deltas_df = deltas_df.reset_index(drop=True)
# no rows are removed from deltas_df below

In [None]:
# Index label of the row with the highest planar velocity, then print that row
# together with the rows around it for context.
velo_max_idx = deltas_df['velocity_p'].idxmax()
print_neighbours(deltas_df, velo_max_idx)

Neighbours of 469327 from 469322 to 469332
Neighbor 469322
Lines                              500
Lon                          21.000551
VehicleNumber                     5449
Time               2024-02-22 10:13:13
Lat                          52.258583
Brigade                              6
Lines_prev                         500
Lon_prev                     20.997425
Time_prev          2024-02-22 10:12:58
Lat_prev                     52.258392
Brigade_prev                         6
Time_diff                         15.0
Dist_pythagoras               0.214049
velocity_p                   51.371721
Name: 469322, dtype: object
Neighbor 469323
Lines                              500
Lon                          21.002169
VehicleNumber                     5449
Time               2024-02-22 10:13:23
Lat                          52.258484
Brigade                              6
Lines_prev                         500
Lon_prev                     21.000551
Time_prev          2024-02-22 10:13:13


In [None]:
print(deltas_df.columns)
print(deltas_df[['Dist_pythagoras', 'Time_diff', 'velocity_p']])

# 'Dist_haversine' / 'velocity_h' are only produced by the haversine cell, which
# is currently commented out. Skip the comparison instead of failing with a
# KeyError when they are absent.
has_haversine = {'Dist_haversine', 'velocity_h'}.issubset(deltas_df.columns)

if has_haversine:
    print(deltas_df[['Dist_haversine', 'Time_diff', 'velocity_h']])

    # Relative error of the planar distance with respect to the haversine one.
    deltas_df['relative_diff'] = abs(deltas_df['Dist_haversine'] - deltas_df['Dist_pythagoras']) / deltas_df['Dist_haversine']
    index_max = deltas_df['relative_diff'].idxmax()
    print("Max relative difference between haversine and pythagoras: ", deltas_df.loc[index_max]['relative_diff'])
    print("The rest:")
    print(deltas_df.loc[index_max])
else:
    print("Haversine columns not present -- skipping the haversine comparison.")

# Row with the highest planar velocity, shown with its surrounding rows.
index_velo_max = deltas_df['velocity_p'].idxmax()
print("Max velocity:", deltas_df.loc[index_velo_max]['velocity_p'])
print("The rest:")
print(deltas_df.loc[index_velo_max])
print("Neighbours:")
print_neighbours(deltas_df, index_velo_max)


# Summary statistics of the derived columns.
print()
print(deltas_df['velocity_p'].describe())
if has_haversine:
    print()
    print(deltas_df['velocity_h'].describe())
print()
print(deltas_df['Time'].describe())
print()
print(deltas_df['Time_diff'].describe())


Index(['Lines', 'Lon', 'VehicleNumber', 'Time', 'Lat', 'Brigade', 'Lines_prev',
       'Lon_prev', 'Time_prev', 'Lat_prev', 'Brigade_prev', 'Time_diff',
       'Dist_pythagoras', 'velocity_p', 'Dist_haversine', 'velocity_h'],
      dtype='object')
         Dist_pythagoras  Time_diff  velocity_p
0               0.019910       18.0    3.981946
1               0.260158       27.0   34.687691
2               0.033994       10.0   12.237983
3               0.062408       14.0   16.047746
4               0.072192       20.0   12.994493
...                  ...        ...         ...
1339685         0.337392      120.0   10.121773
1339686         1.006629      120.0   30.198871
1339687         1.051773      120.0   31.553182
1339688         0.448634      120.0   13.459029
1339689         0.207074      120.0    6.212214

[1339690 rows x 3 columns]
         Dist_haversine  Time_diff  velocity_h
0              0.019921       18.0    3.984240
1              0.260402       27.0   34.720270
2      