In [78]:
from src.subpipe.validate import ValidateBusData
import logging, os, sys, json
import pandas as pd

from src.utils.utils import (
    DATA_MONTH_DAY,
    SUBSCRIBER_DATA_PATH_JSON,
    SUBSCRIBER_FOLDER,
    curr_time_micro,
    sub_logger,
    lat_long_filler
)

# Route root-logger records to logs/notebook-<DATA_MONTH_DAY>.log (append mode).
# logging's file handler does not create missing parent directories, so make
# sure the folder exists before basicConfig tries to open the file.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(
    # NOTE(review): an empty format string appears to fall back to the
    # Formatter default of "%(message)s" — confirm this is intended.
    format="",
    filename=f"logs/notebook-{DATA_MONTH_DAY}.log",
    encoding="utf-8",
    filemode="a",
    level=logging.INFO,
)

In [95]:
# Read the 04-12 subscriber dump and order the rows by vehicle, then by
# activity time (ascending is the sort_values default).
raw_json_path = os.path.join(SUBSCRIBER_FOLDER, "04-12.json")
df = pd.read_json(raw_json_path).sort_values(by=["VEHICLE_ID", "ACT_TIME"])

In [80]:
# Range check: every recorded longitude must lie in (-124, -122].
# Series.min()/max() skip NaN, so rows without a longitude do not move the bounds.
longitude_min = df['GPS_LONGITUDE'].min()
longitude_max = df['GPS_LONGITUDE'].max()
longitude_in_range = bool(longitude_min > -124) and bool(longitude_max <= -122)

# A plain if/else replaces the previous assert-inside-bare-except pattern:
# asserts are stripped under `python -O`, and a bare `except:` would also hide
# unrelated errors (including KeyboardInterrupt) behind a "BAD" log line.
if longitude_in_range:
    sub_logger(
        f"{curr_time_micro()} LONGITUDE GOOD! Longitude sits within -122 and -124! Min and max vals are: "
        + f"{longitude_min}, {longitude_max}."
    )
else:
    sub_logger(
        f"{curr_time_micro()} LONGITUDE BAD!!!!! Longitude had the following min and max values: "
        + f"{longitude_min}, {longitude_max}."
    )

[05-08-2024-14:32:26.021] LONGITUDE GOOD! Longitude sits within -122 and -124! Min and max vals are: -123.115868, -122.372643.


In [81]:
# Range check: every recorded latitude must lie in [45, 46).
# Series.min()/max() skip NaN, so rows without a latitude do not move the bounds.
latitude_min = df['GPS_LATITUDE'].min()
latitude_max = df['GPS_LATITUDE'].max()
latitude_in_range = bool(latitude_min >= 45) and bool(latitude_max < 46)

# A plain if/else replaces the previous assert-inside-bare-except pattern:
# asserts are stripped under `python -O`, and a bare `except:` would also hide
# unrelated errors behind a "BAD" log line.
if latitude_in_range:
    sub_logger(
        f"{curr_time_micro()} LATITUDE GOOD! Latitude sits within 45 and 46! Min and max vals are: "
        + f"{latitude_min}, {latitude_max}."
    )
else:
    sub_logger(
        f"{curr_time_micro()} LATITUDE BAD!!!!! Latitude had the following min and max values: "
        + f"{latitude_min}, {latitude_max}."
    )

[05-08-2024-14:32:26.042] LATITUDE GOOD! Latitude sits within 45 and 46! Min and max vals are: 45.318728, 45.639137.


In [82]:
# HDOP check: rows whose GPS_HDOP lies in [4, 23.1) must have no position,
# i.e. both GPS_LONGITUDE and GPS_LATITUDE are NaN.
gathered_HDOPs = df[(df['GPS_HDOP'] >= 4) & (df['GPS_HDOP'] < 23.1)]
these_HDOPs_nan = (gathered_HDOPs['GPS_LONGITUDE'].isna()
                   & gathered_HDOPs['GPS_LATITUDE'].isna())

# .all() on an empty selection is True, so "no rows in that HDOP band" passes.
if these_HDOPs_nan.all():
    sub_logger(
        f"{curr_time_micro()} HDOP GOOD! All HDOP values 4 upto (not including) 23.1 are NaN on lat and long: "
        + f"."
    )
else:
    # Report exactly the rows in the HDOP band that still carry a latitude
    # and/or longitude. The previous message indexed `df` with a boolean frame
    # built from `gathered_HDOPs`, which did not select the offending rows.
    offending_rows = gathered_HDOPs[~these_HDOPs_nan]
    sub_logger(
        f"{curr_time_micro()} HDOP BAD!!!!! There were some HDOPs with non-nan values on lat and long: "
        + f"\n{offending_rows}"
    )

[05-08-2024-14:32:26.065] HDOP GOOD! All HDOP values 4 upto (not including) 23.1 are NaN on lat and long: .


In [83]:
# Passes only when the smallest GPS_SATELLITES value in the data is exactly 0.
sat_min = df['GPS_SATELLITES'].min()

# Plain if/else instead of assert + bare `except:`: asserts disappear under
# `python -O`, and a bare except would log "BAD" for any unrelated error.
if sat_min == 0:
    sub_logger(
        f"{curr_time_micro()} GPS Min Sats GOOD! Minimum number of satellites was {sat_min}!"
    )
else:
    sub_logger(
        f"{curr_time_micro()} GPS Min Sats BAD!!!!! The minimum number of satellites were: "
        + f"\n{sat_min}"
    )

[05-08-2024-14:32:26.084] GPS Min Sats GOOD! Minimum number of satellites was 0!


In [84]:
# Passes only when the largest GPS_SATELLITES value in the data is exactly 12.
sat_max = df['GPS_SATELLITES'].max()

# Plain if/else instead of assert + bare `except:`. The log text now says
# "maximum"; it previously said "minimum" while reporting the maximum value.
if sat_max == 12:
    sub_logger(
        f"{curr_time_micro()} GPS Max Sats GOOD! Maximum number of satellites was {sat_max}!"
    )
else:
    sub_logger(
        f"{curr_time_micro()} GPS Max Sats BAD!!!!! The maximum number of satellites were: "
        + f"\n{sat_max}"
    )

[05-08-2024-14:32:26.134] GPS Max Sats GOOD! Minimum number of satellites was 12!


In [85]:
# Among rows reporting 0 satellites, check that at least one row has a latitude
# and at least one row has a longitude.
zero_sat_rows = df[df['GPS_SATELLITES'] == 0]
some_have_lat = zero_sat_rows['GPS_LATITUDE'].notna().any()
some_have_long = zero_sat_rows['GPS_LONGITUDE'].notna().any()

# .any() on an empty selection is False, so "no zero-satellite rows" logs BAD,
# the same outcome the original assert produced for that case.
if some_have_lat and some_have_long:
    sub_logger(
        f"{curr_time_micro()} ZERO SATELLITES ASSERT GOOD! It seems that SOME "
        + f"0 GPS satellite vehicles HAVE a lat and long"
    )
else:
    sub_logger(
        f"{curr_time_micro()} ZERO SATELLITES ASSERT BAD!!!! It seems that all "
        + f"0 GPS satellite vehicles are missing lat and long"
    )

[05-08-2024-14:32:26.164] ZERO SATELLITES ASSERT GOOD! It seems that SOME 0 GPS satellite vehicles HAVE a lat and long


In [86]:
# Every row reporting 12 satellites must carry both a latitude and a longitude.
twelve_sat_rows = df[df['GPS_SATELLITES'] == 12]

# Test the unfiltered 12-satellite rows. The previous version first kept only
# the non-null rows and then asserted they were non-null, which is always true
# and therefore could never report a problem.
all_have_lat = twelve_sat_rows['GPS_LATITUDE'].notna().all()
all_have_long = twelve_sat_rows['GPS_LONGITUDE'].notna().all()

if all_have_lat and all_have_long:
    sub_logger(
        f"{curr_time_micro()} TWELVE SATELLITES ASSERT GOOD! It seems that ALL "
        + f"12 GPS satellite vehicles HAVE a lat and long"
    )
else:
    sub_logger(
        f"{curr_time_micro()} TWELVE SATELLITES ASSERT BAD!!!! It seems that some "
        + f"12 GPS satellite vehicles are missing lat and long"
    )

[05-08-2024-14:32:26.219] TWELVE SATELLITES ASSERT GOOD! It seems that ALL 12 GPS satellite vehicles HAVE a lat and long


In [87]:
# Every record must have an ACT_TIME value.
# The previous test (`isna().all() == False`) only proved that not *every*
# value was missing, so a partially empty column was still reported as GOOD.
act_time_all_present = df['ACT_TIME'].notna().all()

if act_time_all_present:
    sub_logger(
        f"{curr_time_micro()} ACTIVITY RECORD ASSERT GOOD! It seems that ALL "
        + f"records HAVE an event activity time."
    )
else:
    sub_logger(
        f"{curr_time_micro()} ACTIVITY RECORD ASSERT BAD!!!! It seems that some "
        + f"records are missing an event activity time."
    )

[05-08-2024-14:32:26.243] ACTIVITY RECORD ASSERT GOOD! It seems that ALL records HAVE an event activity time.


In [88]:
# Every record must have a METERS value.
# The previous test (`isna().all() == False`) only proved that not *every*
# value was missing, so a partially empty column was still reported as GOOD.
meters_all_present = df['METERS'].notna().all()

if meters_all_present:
    sub_logger(
        f"{curr_time_micro()} METERS RECORD ASSERT GOOD! It seems that ALL "
        + f"records HAVE a meters metric."
    )
else:
    sub_logger(
        f"{curr_time_micro()} METERS RECORD ASSERT BAD!!!! It seems that some "
        + f"records are missing a meters metric."
    )

[05-08-2024-14:32:26.261] METERS RECORD ASSERT GOOD! It seems that ALL records HAVE an a meters metric.


In [89]:
# Reserve a TIMESTAMP column at position 5 with a placeholder value of 0.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to re-run without restarting the kernel.
if 'TIMESTAMP' not in df.columns:
    df.insert(5, 'TIMESTAMP', 0)
df.head()

Unnamed: 0,EVENT_NO_TRIP,EVENT_NO_STOP,OPD_DATE,VEHICLE_ID,METERS,TIMESTAMP,ACT_TIME,GPS_LONGITUDE,GPS_LATITUDE,GPS_SATELLITES,GPS_HDOP
237139,219000623,219000624,13DEC2022:00:00:00,2905,37,0,22680,-122.843617,45.50406,12,0.8
237138,219000623,219000624,13DEC2022:00:00:00,2905,47,0,22685,-122.843743,45.504095,12,0.8
237137,219000623,219000624,13DEC2022:00:00:00,2905,54,0,22690,-122.843828,45.504118,12,0.8
237136,219000623,219000626,13DEC2022:00:00:00,2905,181,0,22740,-122.843408,45.50517,12,0.8
237135,219000623,219000626,13DEC2022:00:00:00,2905,196,0,22745,-122.843285,45.505283,11,0.9


In [90]:
# Build TIMESTAMP for every row: OPD_DATE parsed as a datetime plus ACT_TIME
# interpreted as a number of seconds.
date_format = "%d%b%Y:%H:%M:%S"

# Both intermediates are Series (one value per row), not scalar
# Timestamp/Timedelta objects. A single to_datetime call is enough; wrapping an
# already-parsed datetime Series in a second to_datetime added nothing.
opd_sec: pd.Series = pd.to_datetime(df['OPD_DATE'], format=date_format)
td: pd.Series = pd.to_timedelta(df['ACT_TIME'], unit='s')

df['TIMESTAMP'] = opd_sec + td

In [91]:
# Work on a copy so that later changes to new_df do not modify df.
new_df = df.copy()

In [92]:
# Replace the index left over from sorting with a fresh 0..n-1 range.
# drop=True discards the old index directly, instead of materialising it as a
# column and then deleting it by the hard-coded name 'index'.
new_df = new_df.reset_index(drop=True)

In [94]:
# Show new_df as the cell output to inspect the rebuilt index and TIMESTAMP column.
new_df

Unnamed: 0,EVENT_NO_TRIP,EVENT_NO_STOP,OPD_DATE,VEHICLE_ID,METERS,TIMESTAMP,ACT_TIME,GPS_LONGITUDE,GPS_LATITUDE,GPS_SATELLITES,GPS_HDOP
0,219000623,219000624,13DEC2022:00:00:00,2905,37,2022-12-13 06:18:00,22680,-122.843617,45.504060,12,0.8
1,219000623,219000624,13DEC2022:00:00:00,2905,47,2022-12-13 06:18:05,22685,-122.843743,45.504095,12,0.8
2,219000623,219000624,13DEC2022:00:00:00,2905,54,2022-12-13 06:18:10,22690,-122.843828,45.504118,12,0.8
3,219000623,219000626,13DEC2022:00:00:00,2905,181,2022-12-13 06:19:00,22740,-122.843408,45.505170,12,0.8
4,219000623,219000626,13DEC2022:00:00:00,2905,196,2022-12-13 06:19:05,22745,-122.843285,45.505283,11,0.9
...,...,...,...,...,...,...,...,...,...,...,...
295349,218570609,218570626,13DEC2022:00:00:00,4305,190705,2022-12-13 18:15:03,65703,-122.844028,45.504288,12,0.8
295350,218570609,218570626,13DEC2022:00:00:00,4305,190736,2022-12-13 18:15:08,65708,-122.844162,45.504048,12,0.8
295351,218570609,218570626,13DEC2022:00:00:00,4305,190761,2022-12-13 18:15:13,65713,-122.844325,45.503868,12,0.8
295352,218570609,218570626,13DEC2022:00:00:00,4305,190784,2022-12-13 18:15:18,65718,-122.844518,45.503735,12,0.8


# Skip this section for now; it is saved as a later challenge. It tries to find every row with a missing latitude or longitude and approximate the missing values.

In [127]:
# na_gps_rows = df[(df['GPS_LONGITUDE'].isna()) | (df['GPS_LATITUDE'].isna())]
# vid_list_nans = na_gps_rows['VEHICLE_ID'].drop_duplicates(keep='first').tolist()

In [129]:
# def fill_lat_long(row):
#     if pd.isna(row['GPS_LATITUDE']) or pd.isna(row['GPS_LONGITUDE']):
#         prev_lat = df.at[row.name - 1, 'GPS_LATITUDE']
#         prev_long = df.at[row.name - 1, 'GPS_LONGITUDE']
#         a_distance = df.at[row.name - 1, 'METERS']
#         b_distance = row['METERS']
        
#         filled_lat, filled_long = lat_long_filler((prev_lat, prev_long), a_distance, b_distance)
#         return pd.Series({'GPS_LATITUDE': filled_lat, 'GPS_LONGITUDE': filled_long})
#     else:
#         return row[['GPS_LATITUDE', 'GPS_LONGITUDE']]

In [None]:
# for id in vid_list_nans:
#     cur_vid_rows = df[df['VEHICLE_ID'] == id]
#     if (cur_vid_rows['GPS_LATITUDE'].isna().any()) or (cur_vid_rows['GPS_LONGITUDE'].isna().any()):
#         caught_series = cur_vid_rows.apply(fill_lat_long, axis=1)
#         df[df['VEHICLE_ID'] == id] = caught_series