In [None]:
import pandas as pd
import plotly.express as px
from lib.utils import df_from_jsonl, print_neighbours

In [None]:
# Load the preprocessed measurements into `df` through the project helper.
# NOTE(review): `data_file` carries no directory or extension here - presumably
# df_from_jsonl resolves the actual path; confirm in lib.utils.
data_file = 'pre_bigdata'
df = df_from_jsonl(data_file)

In [None]:
# Order the measurements per vehicle chronologically, then keep one row per
# (VehicleNumber, Time) pair, since a measurement may be reported twice.
# The index is renumbered by the sort only, so it has gaps after deduplication.
df = (
    df.sort_values(by=['VehicleNumber', 'Time'], ascending=[True, True], ignore_index=True)
      .drop_duplicates(['VehicleNumber', 'Time'])
)
# todo: could drop vehicles with too few measurements, learn groupby
vc = df.value_counts('VehicleNumber')
# print(vc[0:20])

# value_counts sorts by frequency (descending), so index[0] is the vehicle
# with the most measurements and, below, the line it reported most often.
pop_nr = vc.index[0]
pop_df = df.loc[df['VehicleNumber'] == pop_nr]
pop_vc = pop_df.value_counts('Lines')
print(pop_vc)
line = pop_vc.index[0]

# only one line:
small_df = df.loc[df['Lines'] == line]
# only one vehicle (each comparison needs its own parentheses: `&` binds tighter than `==`)
# small_df = df[(df['Lines'] == line) & (df['VehicleNumber'] == pop_nr)]

# print(small_df)

# if want to see only one line, swap df
# full_df = df
# df = small_df

In [None]:
# drop rows whose GPS position was not updated since the same vehicle's previous row
def drop_gps_fails_v1(df):
    """Drop, in place, rows that repeat the previous position of the same vehicle.

    A row is removed when its 'Lon', 'Lat' and 'VehicleNumber' all equal those
    of the row directly before it, so only the first row of every run of
    identical consecutive positions survives. This is the same set of rows the
    earlier row-by-row "compare with the last kept row" loop removed, computed
    with a single vectorised comparison instead of one DataFrame.drop call per
    removed row.

    Side effects on ``df``: the index is first reset to 0..n-1 (old index
    discarded); it is not renumbered afterwards, so the surviving rows keep
    gaps where rows were removed. A summary line is printed.

    Returns None.
    """
    df.reset_index(drop=True, inplace=True)
    # todo dropping duplicates above becomes unnecessary
    all_count = len(df)
    dropped_count = 0
    if all_count > 1:  # with fewer than two rows there is nothing to compare
        key_cols = df[['Lon', 'Lat', 'VehicleNumber']]
        # The first row is compared against the NaN fill of shift() and is
        # therefore never flagged; missing values never compare equal either.
        stale = key_cols.eq(key_cols.shift(1)).all(axis=1).to_numpy()
        dropped_count = int(stale.sum())
        df.drop(index=df.index[stale], inplace=True)
    print(f'Dropped {dropped_count} out of {all_count} rows')

def drop_gps_fails_v2(df):
    """Drop, in place, rows that repeat the previous position of the same vehicle.

    A row counts as a failed GPS update when its 'Lon', 'Lat' and
    'VehicleNumber' all equal those of the row directly before it; only the
    first row of each run of identical consecutive positions is kept. The
    rows are found with one vectorised comparison rather than a Python loop
    that overwrote 'Lon' with None as a deletion marker.

    Rows whose 'Lon' is already missing are removed as well. The earlier
    implementation did this implicitly through its final dropna on 'Lon';
    the behaviour is kept, and such rows are not included in the printed count.

    Side effects on ``df``: the index is first reset to 0..n-1 (old index
    discarded) and is not renumbered after rows are removed. A summary line
    is printed.

    Returns None.
    """
    df.reset_index(drop=True, inplace=True)
    # todo dropping duplicates above becomes unnecessary
    all_count = len(df)
    key_cols = df[['Lon', 'Lat', 'VehicleNumber']]
    # The first row is compared against the NaN fill of shift() and is
    # therefore never flagged; missing values never compare equal either.
    stale = key_cols.eq(key_cols.shift(1)).all(axis=1).to_numpy()
    dropped_count = int(stale.sum())
    print(f'Dropped {dropped_count} out of {all_count} rows')
    missing_lon = df['Lon'].isna().to_numpy()
    df.drop(index=df.index[stale | missing_lon], inplace=True)

# drop_gps_fails_v2(df)
# df.reset_index(drop=True, inplace=True)

In [None]:
# Build the deltas dataframe: every measurement next to the row preceding it.
# note: df isn't changed in the process
df_prev = df.shift(1)
# todo: could guarantee vehicles are == while merging
# Both frames share the same index, so the index merge pairs row i with the
# shifted copy of row i-1; every shifted column receives the '_prev' suffix.
deltas_df = df.merge(df_prev, how='outer', left_index=True, right_index=True, suffixes=('', '_prev'))
# Keep only pairs that belong to one vehicle and whose timestamp changed.
same_vehicle = deltas_df['VehicleNumber'] == deltas_df['VehicleNumber_prev']
time_changed = deltas_df['Time'] != deltas_df['Time_prev']
deltas_df = deltas_df.loc[same_vehicle & time_changed].drop(columns=['VehicleNumber_prev'])

In [None]:
# Time elapsed between each measurement and the previous one of that vehicle.
from lib.distance import earth_distance_km, warsaw_distance_km, warsaw_numbers
# NOTE(review): presumably kilometres per degree of longitude / latitude
# around Warsaw - confirm in lib.distance.
km_lon, km_lat = warsaw_numbers()

elapsed = deltas_df['Time'] - deltas_df['Time_prev']
deltas_df['Time_diff'] = elapsed.dt.total_seconds()
# Only keep pairs more than 9 seconds apart.
deltas_df = deltas_df.loc[deltas_df['Time_diff'] > 9]

In [None]:
# Planar distance: scale the latitude / longitude differences by km_lat /
# km_lon and take the Euclidean norm of the two components.
lat_part = (deltas_df['Lat'] - deltas_df['Lat_prev']) * km_lat
lon_part = (deltas_df['Lon'] - deltas_df['Lon_prev']) * km_lon
deltas_df['Dist_pythagoras'] = (lat_part**2 + lon_part**2) ** 0.5
# slower pythagoras:
# deltas_df['Dist_pythagoras'] = deltas_df.apply(lambda row: warsaw_distance_km((row['Lon'], row['Lat']), (row['Lon_prev'], row['Lat_prev'])), axis=1)
# Time_diff is in seconds, so the factor 3600 converts "per second" to "per hour".
deltas_df['velocity_p'] = (deltas_df['Dist_pythagoras'] / deltas_df['Time_diff']) * 3600
# Discard pairs whose computed velocity is 90 or more.
deltas_df = deltas_df.loc[deltas_df['velocity_p'] < 90]

In [None]:
# haversine is very slow when applied row by row, so it is left disabled
# deltas_df['Dist_haversine'] = deltas_df.apply(lambda row: earth_distance_km((row['Lon'], row['Lat']), (row['Lon_prev'], row['Lat_prev'])), axis=1)
# deltas_df['velocity_h'] = deltas_df['Dist_haversine'] / deltas_df['Time_diff'] * 3600

In [None]:
# note: the line plots only make sense for a single bus; with several vehicles
# the trace jumps from one vehicle's last sample to the next vehicle's first
fig = px.line(deltas_df, x='Time', y='velocity_p', title='Velocity over time')
fig.show()
# Smooth within each vehicle: deltas_df is ordered vehicle by vehicle, so a
# rolling window over the whole column would average the last samples of one
# bus with the first samples of the next. The first 6 rows of every vehicle
# are NaN because the 7-sample window is not full yet.
deltas_df['smoothed'] = (
    deltas_df.groupby('VehicleNumber')['velocity_p']
    .transform(lambda speeds: speeds.rolling(window=7).mean())
)
fig = px.line(deltas_df, x='Time', y='smoothed', title='Smoothed velocity over time')
fig.show()
# deltas_df['expon_smoothed'] = deltas_df['velocity_p'].ewm(span=7).mean()
# fig = px.line(deltas_df, x='Time', y='expon_smoothed', title='Velocity over time')
# fig.show()
# deltas_df['expon_smoothed2'] = deltas_df['velocity_p'].ewm(span=3).mean()
# fig = px.line(deltas_df, x='Time', y='expon_smoothed2', title='Velocity over time')
# fig.show()

In [None]:
import plotly.graph_objects as go

# # scattergeo plot
# fig = go.Figure()

# fig.add_trace(go.Scattergeo(
#     lon = deltas_df['Lon'],
#     lat = deltas_df['Lat'],
#     mode = 'markers',
#     marker=dict(
#         size=8,
#         opacity=0.8,
#         reversescale=True,
#         autocolorscale=False,
#         colorscale='RdYlBu',  # nice colorscale
#         cmin=min(deltas_df['smoothed']),
#         color=deltas_df['smoothed'],
#         cmax=max(deltas_df['smoothed']),
#         colorbar_title='Velocity'
#     )
# ))

# fig.update_layout(
#     title='Velocity Map',
#     geo=dict(
#         showland=True,
#         landcolor="rgb(250, 250, 250)",
#         center=dict(lon=21.0122, lat=52.2297),
#         showcountries=True,
#         countrycolor="rgb(100, 100, 100)",
#         projection_type="equirectangular",
#         fitbounds="locations",
#     )
# )

# fig.show()

In [None]:
# Grid resolution in degrees; `factor` scales both axes together.
factor = 1
lon_step = 0.0025 * factor
lat_step = 0.0025 * factor

# Snap every measurement down to the lower edge of its grid cell.
dd = deltas_df.assign(
    Lon_grid=lambda frame: (frame['Lon'] // lon_step) * lon_step,
    Lat_grid=lambda frame: (frame['Lat'] // lat_step) * lat_step,
)
# averages = dd.groupby(['Lon_grid', 'Lat_grid'])['velocity_p'].mean().reset_index()
# Per cell: mean velocity and the number of measurements behind that mean.
counts = (
    dd.groupby(['Lon_grid', 'Lat_grid'])
      .agg(velocity_p=('velocity_p', 'mean'), count=('velocity_p', 'count'))
      .reset_index()
)
averages = counts
print(averages['count'].describe())

# Keep only cells backed by more than 10 measurements.
averages = averages.loc[averages['count'] > 10]

# Map of the per-cell average velocity: one marker per grid cell, coloured
# on a scale spanning the smallest to the largest cell average.
cell_velocity = averages['velocity_p']

marker_style = dict(
    size=5,  # could be proportional to something!
    opacity=0.8,
    reversescale=True,
    autocolorscale=False,
    colorscale='RdYlBu',
    cmin=cell_velocity.min(),
    color=cell_velocity,
    cmax=cell_velocity.max(),
    colorbar_title='Average velocity',
)

geo_style = dict(
    showland=True,
    landcolor="rgb(250, 250, 250)",
    showcountries=True,
    countrycolor="rgb(100, 100, 100)",
    projection_type="equirectangular",
    fitbounds="locations",  # zoom the map to the plotted markers
)

fig = go.Figure()
fig.add_trace(
    go.Scattergeo(
        lon=averages['Lon_grid'],
        lat=averages['Lat_grid'],
        text=cell_velocity.astype(str) + 'km/h',
        mode='markers',
        marker=marker_style,
    )
)
fig.update_layout(title='Average Velocity Map', geo=geo_style)

fig.show()