In [None]:
import pandas as pd
from lib.utils import df_from_jsonl, print_neighbours

In [None]:
# Load the preprocessed measurements into `df`.
# `df_from_jsonl` is a project helper from lib.utils; presumably it parses a
# JSON Lines file into a DataFrame -- TODO confirm how `data_file` is resolved
# to an actual path/extension.
data_file = 'pre_bigdata'
df = df_from_jsonl(data_file)

In [None]:
# Order the measurements per vehicle chronologically, then keep only the first
# row for every (vehicle, timestamp) pair -- the feed may repeat a measurement.
sort_keys = ['VehicleNumber', 'Time']
df.sort_values(by=sort_keys, ascending=True, inplace=True, ignore_index=True)
df.drop_duplicates(subset=sort_keys, inplace=True)
# todo: could drop vehicles with too few measurements, learn groupby

In [None]:
# drop stale GPS positions (readings that were not updated since the previous measurement)
def drop_gps_fails_v1(df):
    """Drop, in place, rows that repeat the previous row's GPS position.

    A row counts as a stale reading when its 'Lon', 'Lat' and 'VehicleNumber'
    all equal those of the row directly before it, so from every run of
    identical consecutive readings only the first row survives. Rows are
    compared in their current order; the caller is expected to have the
    measurements of one vehicle adjacent to each other.

    Side effects:
        * the index of ``df`` is reset to 0..n-1 before filtering, and the
          surviving rows keep those labels (gaps are left where rows were
          dropped);
        * a one-line summary is printed.

    Parameters:
        df: DataFrame with 'Lon', 'Lat' and 'VehicleNumber' columns; mutated
            in place.

    Returns:
        None.
    """
    df.reset_index(drop=True, inplace=True)
    # todo dropping duplicates above becomes unnecessary
    all_count = len(df)
    dropped_count = 0
    if all_count > 1:  # with fewer than two rows there is nothing to compare
        # Comparing with the directly preceding row gives the same answer as
        # comparing with the last *kept* row: every dropped row carries exactly
        # the position and vehicle of the row it was dropped against.
        is_stale = (df['Lon'].eq(df['Lon'].shift(1))
                    & df['Lat'].eq(df['Lat'].shift(1))
                    & df['VehicleNumber'].eq(df['VehicleNumber'].shift(1)))
        dropped_count = int(is_stale.sum())
        # One bulk drop instead of one DataFrame.drop call per stale row,
        # which made the original loop quadratic in the number of rows.
        df.drop(index=df.index[is_stale], inplace=True)
    print(f'Dropped {dropped_count} out of {all_count} rows')

def drop_gps_fails_v2(df):
    """Drop, in place, rows that repeat the previous row's GPS position.

    A row counts as a stale reading when its 'Lon', 'Lat' and 'VehicleNumber'
    all equal those of the row directly before it, so from every run of
    identical consecutive readings only the first row survives.

    Unlike the earlier implementation, the 'Lon' column is no longer
    overwritten with None to mark rows for a later ``dropna``. That marker
    also removed (without counting them) rows whose 'Lon' was already missing
    and could change the column's dtype; now only the stale rows are removed
    and the remaining data is left untouched.

    Side effects:
        * the index of ``df`` is reset to 0..n-1 before filtering, and the
          surviving rows keep those labels (gaps are left where rows were
          dropped);
        * a one-line summary is printed.

    Parameters:
        df: DataFrame with 'Lon', 'Lat' and 'VehicleNumber' columns; mutated
            in place.

    Returns:
        None.
    """
    df.reset_index(drop=True, inplace=True)
    # todo dropping duplicates above becomes unnecessary
    all_count = len(df)
    dropped_count = 0
    if all_count > 1:  # with fewer than two rows there is nothing to compare
        # Comparing with the directly preceding row gives the same answer as
        # comparing with the last *kept* row: every dropped row carries exactly
        # the position and vehicle of the row it was dropped against.
        # NaN never equals NaN, so rows with a missing coordinate are kept.
        is_stale = (df['Lon'].eq(df['Lon'].shift(1))
                    & df['Lat'].eq(df['Lat'].shift(1))
                    & df['VehicleNumber'].eq(df['VehicleNumber'].shift(1)))
        dropped_count = int(is_stale.sum())
        df.drop(index=df.index[is_stale], inplace=True)
    print(f'Dropped {dropped_count} out of {all_count} rows')

# drop_gps_fails_v2(df)
# df.reset_index(drop=True, inplace=True)

In [None]:
# Build the deltas dataframe: every measurement side by side with the row that
# precedes it (columns of the preceding row carry the '_prev' suffix).
# note: df itself is left untouched
df_prev = df.shift(1)
# todo: could guarantee vehicles are == while merging
deltas_df = df.merge(df_prev, how='outer', left_index=True, right_index=True, suffixes=('', '_prev'))
# Keep a pair only when both rows belong to the same vehicle and the timestamp
# actually changed between them.
same_vehicle = deltas_df['VehicleNumber'] == deltas_df['VehicleNumber_prev']
time_changed = deltas_df['Time'] != deltas_df['Time_prev']
deltas_df = deltas_df[same_vehicle & time_changed].drop(columns=['VehicleNumber_prev'])

In [None]:
# calculate distances and time differences
from lib.distance import earth_distance_km, warsaw_distance_km, warsaw_numbers
# presumably kilometres per degree of longitude / latitude around Warsaw --
# confirm against lib.distance.warsaw_numbers
km_lon, km_lat = warsaw_numbers()

# Seconds elapsed between a measurement and the previous one of the same vehicle.
time_gap = deltas_df['Time'] - deltas_df['Time_prev']
deltas_df['Time_diff'] = time_gap.dt.total_seconds()
# Only pairs more than 9 seconds apart are kept.
is_far_enough_apart = deltas_df['Time_diff'] > 9
deltas_df = deltas_df[is_far_enough_apart]

In [None]:
# Flat-earth (pythagorean) distance: scale the coordinate differences by the
# km_lat / km_lon factors and take the length of the resulting vector.
lat_term = (deltas_df['Lat'] - deltas_df['Lat_prev']) * km_lat
lon_term = (deltas_df['Lon'] - deltas_df['Lon_prev']) * km_lon
deltas_df['Dist_pythagoras'] = (lat_term**2 + lon_term**2) ** 0.5
# slow pythagoras:
# deltas_df['Dist_pythagoras'] = deltas_df.apply(lambda row: warsaw_distance_km((row['Lon'], row['Lat']), (row['Lon_prev'], row['Lat_prev'])), axis=1)
# distance per second of Time_diff, times 3600 -> distance per hour
distance_per_second = deltas_df['Dist_pythagoras'] / deltas_df['Time_diff']
deltas_df['velocity_p'] = distance_per_second * 3600

In [None]:
# Distance via earth_distance_km (presumably the haversine formula -- see
# lib.distance) for every pair of consecutive positions.
# The helper takes one pair of (lon, lat) tuples at a time, so it still has to
# be called once per row; iterating over the four columns with zip yields the
# same values as DataFrame.apply(axis=1) without constructing a Series for
# every row, which is what made the apply version so slow.
deltas_df['Dist_haversine'] = [
    earth_distance_km((lon, lat), (lon_prev, lat_prev))
    for lon, lat, lon_prev, lat_prev in zip(
        deltas_df['Lon'], deltas_df['Lat'],
        deltas_df['Lon_prev'], deltas_df['Lat_prev'])
]
# distance per second of Time_diff, times 3600 -> distance per hour
deltas_df['velocity_h'] = deltas_df['Dist_haversine'] / deltas_df['Time_diff'] * 3600

In [None]:
# before calculating velocities:
# TODO! filter out too big time diffs
# TODO! filter out stale positions

In [None]:
# Independent copy of the delta rows whose 'Lines' value is '188', followed by
# the distinct vehicle numbers found among them (shown as the cell output).
is_line_188 = deltas_df['Lines'] == '188'
df188 = deltas_df[is_line_188].copy()
df188['VehicleNumber'].drop_duplicates()
# df188[df188['VehicleNumber'] == 2226]

In [None]:
# Show the delta rows in df188 that belong to the single vehicle number 8842
# (the filtered frame is the cell's displayed output; nothing is stored).
df188[df188['VehicleNumber'] == 8842]

In [None]:
# filter out too big velocities
# Rows are kept only when velocity_p is strictly below the threshold (km/h);
# rows whose velocity_p is NaN fail the comparison and are removed as well.
threshold = 120
# TODO! filter out stale positions instead
count = len(deltas_df)
deltas_df = deltas_df[deltas_df['velocity_p'] < threshold]
count2 = len(deltas_df)
removed = count - count2
# An empty input frame would otherwise raise ZeroDivisionError here.
removed_percent = removed / count * 100 if count else 0.0
print(f"Removed {removed} out of {count} entries ({removed_percent}%).")
print(f" (Too fast -- over {threshold} km/h)")

In [None]:
# Renumber the rows 0..n-1, discarding the old index labels.
# Note: no rows are removed from deltas_df below this point.
deltas_df = deltas_df.reset_index(drop=True)

In [None]:
# Index label of the row with the largest velocity_p that survived filtering,
# passed to print_neighbours (a lib.utils helper; presumably it prints the rows
# around the given index -- confirm in lib.utils).
velo_max_idx = deltas_df['velocity_p'].idxmax()
print_neighbours(deltas_df, velo_max_idx)

In [None]:
# Overview: column names, then distance / time / velocity for both distance methods.
print(deltas_df.columns)
for dist_col, velo_col in (('Dist_pythagoras', 'velocity_p'), ('Dist_haversine', 'velocity_h')):
    print(deltas_df[[dist_col, 'Time_diff', velo_col]])

# How far the two distance estimates disagree, relative to the haversine value.
distance_gap = (deltas_df['Dist_haversine'] - deltas_df['Dist_pythagoras']).abs()
deltas_df['relative_diff'] = distance_gap / deltas_df['Dist_haversine']
index_max = deltas_df['relative_diff'].idxmax()
worst_row = deltas_df.loc[index_max]
print("Max relative difference between haversine and pythagoras: ", worst_row['relative_diff'])
print("The rest:")
print(worst_row)

# The fastest remaining measurement, together with the rows around it.
index_velo_max = deltas_df['velocity_p'].idxmax()
fastest_row = deltas_df.loc[index_velo_max]
print("Max velocity:", fastest_row['velocity_p'])
print("The rest:")
print(fastest_row)
print("Neighbours:")
print_neighbours(deltas_df, index_velo_max)


# Summary statistics, each preceded by an empty line.
for summary_col in ('velocity_p', 'velocity_h', 'Time', 'Time_diff'):
    print()
    print(deltas_df[summary_col].describe())


In [None]:
import plotly.graph_objects as go

# Drawing one marker per row of deltas_df is too heavy for the full data set,
# so the map is opt-in. An unconditional `raise` in this cell would abort
# "Restart & Run All" and leave the plotting code unreachable; flip the flag
# below to True to actually draw the figure.
DRAW_VELOCITY_MAP = False

if not DRAW_VELOCITY_MAP:
    print(f"Skipping the velocity map ({len(deltas_df)} points); "
          "set DRAW_VELOCITY_MAP = True to draw it.")
else:
    velocity = deltas_df['velocity_p']

    # Create a Plotly scattergeo plot
    fig = go.Figure()

    # Add scattergeo trace: one marker per measurement, coloured by velocity_p
    fig.add_trace(go.Scattergeo(
        lon=deltas_df['Lon'],
        lat=deltas_df['Lat'],
        mode='markers',
        marker=dict(
            size=10,  # TODO: smaller
            opacity=0.8,
            reversescale=True,
            autocolorscale=False,
            colorscale='RdYlBu',  # Choose a suitable colorscale
            # Series.min()/.max() are vectorised, unlike the builtin min()/max()
            cmin=velocity.min(),
            color=velocity,
            cmax=velocity.max(),
            colorbar_title='Velocity'
        )
    ))

    # Update layout
    fig.update_layout(
        title='Velocity Map',
        geo=dict(
            showland=True,
            landcolor="rgb(250, 250, 250)",
            showcountries=True,
            countrycolor="rgb(100, 100, 100)",
            projection_type="equirectangular"
        )
    )

    fig.show()