In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [None]:
# Render every float in DataFrame/Series output with exactly three decimals.
pd.set_option('display.float_format', '{:.3f}'.format)

In [None]:
# Apply seaborn's default theme; the rc override makes 16x9 the default figure size.
sns.set(rc={'figure.figsize':(16,9)})

# Intro and basic datasets' stats

These are unique MAIDs seen in a Link viewshed with site id = mn-06-138318 

In [None]:
# Analysis window; both bounds carry an explicit -0400 UTC offset.
start_date = '2020-08-01 15:00:00-0400'
end_date = '2020-08-07 15:00:00-0400'

# Parse both bounds with the same format into timezone-aware datetimes.
begin_dt, end_dt = (
    datetime.strptime(bound, '%Y-%m-%d %H:%M:%S%z')
    for bound in (start_date, end_date)
)

In [None]:
# File-name stem of the datasets: fixed site/location prefix followed by the
# start and end of the analysis window.
ds_spec = 'mn-06-138318_165_{}_{}'.format(
    begin_dt.strftime('%Y%m%d_%H%M%S%z'),
    end_dt.strftime('%Y%m%d_%H%M%S%z'),
)

In [None]:
# Directory containing the exported CSV datasets (relative path).
ds_dir = '../../analytics/data'

These are people/cars/trucks counts from camera with locationId=165 (cameraId=4) ***seen over a period of 60secs***

In [None]:
# Load the camera counts CSV for the selected site and window.
# The path is built from ds_dir (instead of repeating the literal directory) so
# the camera and observation datasets are always read from the same place.
camera_ts = pd.read_csv(f'{ds_dir}/camera_{ds_spec}.csv')

In [None]:
camera_ts.info()

In [None]:
camera_ts.describe()

In [None]:
camera_ts.head()

In [None]:
# (rows, columns) of the camera records in which no people were counted.
camera_ts.loc[camera_ts['people'].eq(0)].shape

In [None]:
# (rows, columns) of the camera records in which at least one person was counted.
camera_ts.loc[camera_ts['people'].gt(0)].shape

Let's make camera data continuous

In [None]:
# Dense one-second grid covering the whole analysis window (both ends inclusive).
# The step is given as a Timedelta rather than the 'S' alias: the upper-case
# alias is deprecated in recent pandas releases, while a Timedelta is accepted
# by every version.
seconds_range = pd.date_range(begin_dt, end_dt, freq=pd.Timedelta(seconds=1))

In [None]:
# One row per second of the window; this is the dense axis the camera rows get aligned to.
seconds_df = pd.DataFrame({'full_idx': seconds_range})

In [None]:
seconds_df

In [None]:
# Convert each epoch 'timestamp' into a timezone-aware datetime that uses the
# same UTC offset as the window bounds, so it can be joined to the dense grid.
camera_ts['idx'] = camera_ts['timestamp'].map(
    lambda epoch_s: datetime.fromtimestamp(epoch_s, tz=begin_dt.tzinfo)
)

In [None]:
camera_ts['idx']

In [None]:
# Left-join the camera rows onto the dense per-second grid; seconds without a
# matching camera row keep NaN in every camera column.
dataset_df = pd.merge_ordered(
    left=seconds_df,
    right=camera_ts,
    left_on='full_idx',
    right_on='idx',
    how='left',
)

In [None]:
dataset_df

In [None]:
# Seconds with no camera row came out of the left join as NaN; treat them as
# "nothing detected". A single fillna on the frame replaces the chained
# `df[col][mask] = 0` assignments, which raise SettingWithCopyWarning and do not
# modify the frame at all when pandas Copy-on-Write is enabled.
count_cols = ['people', 'cars', 'trucks']
dataset_df[count_cols] = dataset_df[count_cols].fillna(0)

In [None]:
dataset_df

The idea is to align aggregated camera data (new people detected as a change in the number of people between two consecutive timestamped frames) to observational data (# of unique people's devices in 60 sec time cells).

To do that, let's consider the difference between the people counts in the current and the previous frame. A positive difference means new people entered the frame; a negative difference means people left it. (A difference of 0 could also mean that people "swapped", i.e. as many entered as left, but at one-second resolution that is unlikely.)

So if `d` is the difference array and `v` is the people counts array for each of consecutive frames, then `d` over a sequence of frames is calculated as follows:

```
d[0] = v[0] - 0      # initial "previous" value is 0 
d[1] = v[1] - v[0]
d[2] = v[2] - v[1]
d[3] = v[3] - v[2]
```

In [None]:
# Combined people + cars count per second, and its change from the previous second
# (the value before the first row is taken as 0).
# The change is stored in its own column: the following cell assigns
# 'ppl_delta_cnt' from the people counts alone, so writing the combined delta
# into 'ppl_delta_cnt' here was overwritten before it could ever be used.
dataset_df['total'] = dataset_df['people'] + dataset_df['cars']
dataset_df['total_delta_cnt'] = dataset_df['total'] - dataset_df['total'].shift(1, fill_value=0)

In [None]:
# d[i] = v[i] - v[i-1] on the people counts, with 0 used as the value before the first row.
dataset_df['ppl_delta_cnt'] = dataset_df['people'].sub(
    dataset_df['people'].shift(periods=1, fill_value=0)
)

In [None]:
dataset_df

In [None]:
# Whole epoch seconds for every row of the dense grid; this replaces the
# 'timestamp' column from the camera data, which is NaN on the filled-in rows.
dataset_df['timestamp'] = dataset_df['full_idx'].map(
    lambda moment: int(moment.timestamp())
)

In [None]:
# Start of the 60-second cell each row falls into (timestamp rounded down to a multiple of 60).
dataset_df['ts_cell'] = dataset_df['timestamp'] - dataset_df['timestamp'] % 60

In [None]:
dataset_df

In [None]:
# Bucket the per-second rows by their 60-second cell.
grouped_df = dataset_df.groupby('ts_cell')

# Within each cell add up only the positive deltas (arrivals); zero and
# negative deltas (no change / departures) are ignored.
sum_cnt = grouped_df['ppl_delta_cnt'].agg(lambda deltas: deltas[deltas > 0].sum())

# Series -> single-column frame, still indexed by ts_cell.
ppl_sum_df = sum_cnt.to_frame()

In [None]:
# Copy the ts_cell index into a regular column and derive a datetime for plotting.
# NOTE(review): fromtimestamp() without a tz argument yields naive datetimes in
# the kernel's local time zone, unlike the tz-aware 'idx' column built from
# begin_dt.tzinfo - confirm this is intended.
ppl_sum_df['ts_cell'] = ppl_sum_df.index.to_numpy()
ppl_sum_df['t'] = ppl_sum_df['ts_cell'].map(datetime.fromtimestamp)

In [None]:
# Smooth the per-cell arrivals with an exponentially weighted moving average.
# span=20 corresponds to a smoothing factor alpha = 2 / (20 + 1); adjust=False
# selects the recursive form y[t] = (1 - alpha) * y[t-1] + alpha * x[t].
ppl_sum_df['ma_cnt_cam'] = ppl_sum_df['ppl_delta_cnt'].ewm(span=20, adjust=False).mean()

In [None]:
ppl_sum_df.head()

In [None]:
# Zoom in on the first 60 * 48 cells; with 60-second cells that is the first 48 hours.
ppl_sum_df.head(60 * 48).plot(x='t', y='ma_cnt_cam', figsize=(16, 9))

In [None]:
# Smoothed camera arrivals over the whole window.
ppl_sum_df.plot.line(x='t', y='ma_cnt_cam', figsize=(16, 9))

In [None]:
# Load the observations CSV for the same site and window from ds_dir.
observation_ts = pd.read_csv(f'{ds_dir}/observations_{ds_spec}.csv')

In [None]:
observation_ts.head()

In [None]:
observation_ts.info()

In [None]:
observation_ts.describe()

In [None]:
# Datetime version of TS_CELL for plotting.
# NOTE(review): fromtimestamp() without a tz argument yields naive datetimes in
# the kernel's local time zone - confirm this matches the -0400 window bounds.
observation_ts['t'] = observation_ts['TS_CELL'].map(datetime.fromtimestamp)

In [None]:
# Smooth the observation counts with the same exponentially weighted moving
# average used for the camera series (span=20, recursive form) so the two
# curves are comparable.
observation_ts['ma_cnt_obs'] = observation_ts['CNT'].ewm(span=20, adjust=False).mean()

In [None]:
# Smoothed observation counts over the whole window.
observation_ts.plot.line(x='t', y='ma_cnt_obs', figsize=(16, 9), color='goldenrod')

In [None]:
# Outer-join observations (keyed by their TS_CELL column) with the camera
# aggregates (keyed by their ts_cell index) so cells present in only one
# dataset are kept. The clashing 't' columns come out as 't_x' (observations)
# and 't_y' (camera).
obs_cam_df = observation_ts.merge(
    ppl_sum_df,
    how='outer',
    left_on='TS_CELL',
    right_index=True,
)

In [None]:
obs_cam_df.head()

In [None]:
# Pearson correlation between the two smoothed series.
obs_cam_df[['ma_cnt_obs', 'ma_cnt_cam']].corr(method='pearson')

In [None]:
# Reshape to long form (one row per timestamp and dataset) so seaborn can colour by dataset.
plot_df = pd.melt(
    obs_cam_df,
    id_vars='t_x',
    value_vars=['ma_cnt_cam', 'ma_cnt_obs'],
    var_name='dataset',
    value_name='people_cnt',
)

In [None]:
# Both smoothed series on one time axis, one line per dataset.
sns.lineplot(x='t_x', y='people_cnt', hue='dataset', data=plot_df)

In [None]:
# Scale each smoothed series by its own maximum so both peak at 1 and their
# shapes can be compared regardless of absolute level.
obs_cam_df['ma_cnt_cam_norm'] = obs_cam_df['ma_cnt_cam'].div(obs_cam_df['ma_cnt_cam'].max())
obs_cam_df['ma_cnt_obs_norm'] = obs_cam_df['ma_cnt_obs'].div(obs_cam_df['ma_cnt_obs'].max())

In [None]:
# Long-form version of the normalised series, in the same layout as plot_df.
plot_df_norm = pd.melt(
    obs_cam_df,
    id_vars='t_x',
    value_vars=['ma_cnt_cam_norm', 'ma_cnt_obs_norm'],
    var_name='dataset',
    value_name='people_cnt',
)