In [403]:
import os
import warnings

import matplotlib as plt
import numpy as np
import pandas as pd

from pymongo import MongoClient
from bson.son import SON
from dotenv import load_dotenv

# Suppress every warning category for the rest of the kernel session.
# NOTE(review): a blanket "ignore" also hides pandas warnings (e.g. chained-assignment
# warnings) — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [404]:
# One-time setup: uncomment and run these two lines once, then comment them out again.
#!pip install -r requirements.txt
#!dotenv set CONNECTION_STRING "your_connection_string"

In [405]:
# MongoDB connection string read from the process environment; None when the variable is unset.
CONNECTION_STRING = os.getenv('CONNECTION_STRING')

def get_database(connection_str=None):
    """
    Open a MongoDB client and return a handle to its 'user_shopping_list' database.

    Input:
    connection_str(str, optional): MongoDB connection URI. When omitted, the module-level
        CONNECTION_STRING is used; if that is empty, the local .env file is loaded and the
        CONNECTION_STRING environment variable is read again.
    Output:
    database handle for 'user_shopping_list'; its `.client` attribute gives access to the
        underlying client (and therefore to other databases on the same server)
    Raises:
    ValueError: if no connection string can be resolved from any source
    """
    if connection_str is None:
        connection_str = CONNECTION_STRING
    if not connection_str:
        # `dotenv set` stores the value in a .env file, which is only visible to
        # os.getenv after load_dotenv() has copied it into the environment.
        load_dotenv()
        connection_str = os.getenv('CONNECTION_STRING')
    if not connection_str:
        raise ValueError(
            "No MongoDB connection string found: pass connection_str or set CONNECTION_STRING"
        )
    client = MongoClient(connection_str)
    return client['user_shopping_list']

if __name__ == "__main__":
    # Database handle used below to reach the image collection
    animl_db = get_database()

# Pull every document of the 'images' collection in the 'animl-prod' database into a frame,
# add an empty burstId column, parse the capture timestamps and order rows chronologically.
# Local Test Read
# images_df = pd.read_csv("ab7217c1ec872366f2832d763a4fd8f5.csv")
images_df = (
    pd.DataFrame([doc for doc in animl_db.client['animl-prod']['images'].find()])
    .assign(burstId=None)
    .assign(dateTimeOriginal=lambda frame: frame['dateTimeOriginal'].apply(pd.to_datetime))
    .sort_values('dateTimeOriginal')
)

In [406]:
# Distinct deployment IDs present in the image metadata (np.unique returns them sorted)
deploymentIds = np.unique(images_df.loc[:, 'deploymentId'].values)

In [None]:
def eval_burst(burst, seconds_gap):
    """
    Returns True if every pair of consecutive timestamps in `burst` is at most
    `seconds_gap` seconds apart, False otherwise.
    Input:
    burst(list[datetime64] or ndarray): image timestamps in chronological order
    seconds_gap(int or float): largest allowed gap between consecutive images (seconds)
    Output:
    boolean: images are likely a burst (True) or not (False). Fewer than two
    timestamps, or any missing timestamp (NaT), yields False.
    """
    # Use the argument itself rather than any notebook-level variable
    timestamps = np.asarray(burst, dtype='datetime64[ns]')
    if timestamps.size < 2:
        return False

    # Dividing by a one-second timedelta gives float seconds for any datetime resolution
    gaps_seconds = np.diff(timestamps) / np.timedelta64(1, 's')
    return bool(gaps_seconds.max() <= seconds_gap) #seconds max between images

In [None]:
#Iterate over deployments and cluster
for deploymentId in deploymentIds:
    # sort_values returns a new frame, so the slice of images_df is never mutated in place
    dep_df = images_df.loc[images_df['deploymentId'] == deploymentId].sort_values('dateTimeOriginal', ascending=True)
    dep_df_indices = dep_df.index.to_list()
    n_images = dep_df.shape[0]

    #A window holds 3 images, so deployments with fewer than 3 cannot form a burst
    if n_images < 3:
        print(f"{deploymentId} does not contain enough images to burst cluster... skipping...")
        continue

    #For each deployment, estimate the burst gap: ceil(mean(gap lengths shorter than 1 minute))
    #NOTE: despite its name, dep_median_gap holds a mean, not a median
    gap_seconds = dep_df['dateTimeOriginal'].diff().dt.total_seconds()
    short_gaps = gap_seconds[gap_seconds < 60]
    dep_median_gap = np.ceil(short_gaps.mean()) if len(short_gaps) else np.nan
    #Threshold never drops below 2 seconds; fall back to 2 when there are no short gaps
    burst_threshold = 2 if np.isnan(dep_median_gap) else max(2, dep_median_gap)

    #sliding window of 3 consecutive images; `end` is inclusive and may reach the last image
    start, end = 0, 2

    while end < n_images:
        window = dep_df.iloc[start:end+1]

        if eval_burst(window['dateTimeOriginal'].values, burst_threshold):
            #Record Burst ID in Metadata Dataframe
            #NOTE(review): built-in hash() of a str is salted per Python process, so burst IDs
            #differ between kernel restarts — confirm whether stable IDs are required
            burstId = hash("".join(window['_id'].values))
            #Single .loc call on images_df: chained indexing may write to a temporary copy
            images_df.loc[dep_df_indices[start:end+1], 'burstId'] = burstId

            start, end = start + 3, end + 3 #shift window past the recorded burst
        else:
            start, end = start + 1, end + 1 #slide by one image

In [None]:
#Assess clustered proportion and image count per deployment / camera / project
#burst_proportion: share of images that were assigned a burstId (1 - fraction of nulls)
#image_count: number of images in the group, clustered or not
burst_clustered_proportions = (
    images_df
    .groupby(['deploymentId', 'cameraId', 'projectId'])['burstId']
    .agg(
        burst_proportion=lambda ids: 1 - (ids.isnull().sum() / ids.shape[0]),
        image_count='size',
    )
    .reset_index()
)
#Named aggregation stores the readable column names on the frame itself; display it
burst_clustered_proportions