In [1]:
# This notebook computes summary statistics for the files in our dataset and stores them in a folder 'Outputs/Summary_statistics'

In [2]:
# We start by importing all the necessary dependencies

import pandas as pd  #The main package we will be using for data analysis. Used to read CSVs/TSVs, work with dataframe objects, and many other things.
import matplotlib.pyplot as plt  #The standard plotting package, used for making nice pretty pictures.


import os  #We might need this to deal with filenames and paths.
import random  #Used to generate random numbers
import numpy as np  #We might need this, it's a pretty standard package. 
import seaborn as sns
from scipy import stats
import glob
import pickle


# This gives us a progress bar for longer computations. 
from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Put the location of the data folder on your computer.
# input_data_location and subfolder are glued together with '+' when the input
# directory is built in the main loop, so each of them must end with '/'.
input_data_location = 'D:/Data/Hangar-Locust/Cleaned/'
subfolder = 'cleaned_2K-locust_16112022/'


# Folder that receives one '.pkl' summary and one '.png' histogram per input file.
output_data_location = 'C:/Users/Mark/Code/Locust-Physics/Outputs/Summary_statistics/'

In [4]:
# Functions needed in the main loop

# This function computes and returns various important metadata we might be interested in: the totalframes, framerate, video length, number of markers
def video_metadata(df):
    """Summarise timing, marker count and coordinate extents of one recording.

    Parameters
    ----------
    df : pandas.DataFrame
        Recording whose index is the time stamp of each row (in seconds) and
        which has a 'Frame' column plus coordinate columns whose names contain
        'X', 'Y' or 'Z'.

    Returns
    -------
    tuple
        (totalframes, framerate, videolength, n_markers,
         x_min, x_max, y_min, y_max, z_min, z_max)
        framerate is in frames per second and videolength in seconds.

    Raises
    ------
    ValueError
        If the 'Frame' column holds fewer than two distinct frame numbers, so
        no frame duration can be derived.
    """
    frames = df['Frame']

    # First, last, and total number of frames
    firstframe = frames.min()
    lastframe = frames.max()
    totalframes = lastframe - firstframe + 1

    # Duration of a single frame in seconds. It is measured between the two
    # earliest distinct frame numbers instead of the literal frames 1 and 2,
    # so recordings whose numbering does not start at 1 are handled as well.
    distinct_frames = np.sort(frames.dropna().unique())
    if len(distinct_frames) < 2:
        raise ValueError(
            "video_metadata needs at least two distinct values in 'Frame' "
            "to compute the frame duration"
        )
    frame_a, frame_b = distinct_frames[0], distinct_frames[1]
    # Time is the index of the dataframe; take the first row of each frame.
    time_a = df.index[(frames == frame_a).to_numpy()][0]
    time_b = df.index[(frames == frame_b).to_numpy()][0]
    # Dividing by the frame gap keeps the result per-frame even if the two
    # earliest frame numbers are not consecutive.
    frameduration = float(time_b - time_a) / float(frame_b - frame_a)

    # Framerate, in frames per second
    framerate = 1/frameduration

    # Total length of the video in seconds
    videolength = totalframes*frameduration

    # Number of markers: every column except one ('Frame') is taken to be a
    # coordinate column, three per marker.
    n_markers = int((len(df.columns)-1)/3)

    def _extent(axis_letter):
        # Smallest and largest non-NaN value over all columns whose name
        # contains the given axis letter.
        axis_columns = [column for column in df.columns if axis_letter in column]
        values = df[axis_columns].to_numpy()
        return np.nanmin(values), np.nanmax(values)

    x_min, x_max = _extent('X')
    y_min, y_max = _extent('Y')
    z_min, z_max = _extent('Z')

    return totalframes, framerate, videolength, n_markers, x_min, x_max, y_min, y_max, z_min, z_max


#This function computes how much of the average marker is null
def null_metadata(df):
    """Return the mean percentage of missing values per marker coordinate column.

    Parameters
    ----------
    df : pandas.DataFrame
        Recording with one column per marker coordinate and, optionally, a
        'Frame' column.

    Returns
    -------
    float
        Number of null entries per column, averaged over the coordinate
        columns and expressed as a percentage of the number of rows.
    """
    # 'Frame' is not a marker coordinate; leaving it in would pull the
    # average down by counting a column that is not a marker.
    marker_columns = df.drop(columns='Frame', errors='ignore')

    mean_null = marker_columns.isnull().sum().mean()*100/len(df)

    return mean_null

In [5]:
# Main loop: for each .tsv file in the subfolder, load the dataframe, run the functions
# above, and save the results as a pickle file. A histogram of the per-column null
# percentages is saved next to it as a .png.

input_dir = input_data_location + subfolder

# The working directory is switched to the input folder so the glob pattern below
# can stay relative.
os.chdir(input_dir)

# Make sure the output folder exists before the first file is written. This runs after
# the chdir so that a relative output path resolves the same way as the saves below.
os.makedirs(output_data_location, exist_ok=True)

# Keys for the tuple returned by video_metadata, in the order it returns them.
metadata_keys = ('totalframes', 'framerate', 'videolength', 'n_markers',
                 'x_min', 'x_max', 'y_min', 'y_max', 'z_min', 'z_max')

# Sorted so the files are always processed in the same order.
for file in tqdm(sorted(glob.glob("*.tsv"))):
    df = pd.read_csv(os.path.join(input_dir, file), sep='\t', index_col='Time')

    # Dictionary holding all the summary statistics for this file.
    # video_metadata scans the whole dataframe, so it is called once and its
    # tuple is paired with the keys instead of being recomputed for every entry.
    summary_stats = dict(zip(metadata_keys, video_metadata(df)))

    # Store the mean null
    summary_stats['mean_null'] = null_metadata(df)

    # Output path without extension; splitext removes whatever extension the file has.
    output_stem = os.path.join(output_data_location, os.path.splitext(file)[0])

    # Save the data as a pickle file
    with open(output_stem + '.pkl', 'wb') as f:
        pickle.dump(summary_stats, f)

    # To open pickle file
    #with open(output_stem + '.pkl', 'rb') as f:
    #    summary_stats = pickle.load(f)

    # Save the plot showing the distribution of columns based on what percentage of their data is null
    fig, ax = plt.subplots()
    (df.isnull().sum()*100/len(df)).plot.hist(ax=ax)
    ax.set_title(f'Distribution of markers based on what percentage of data is null \n File: {file}')
    ax.set(xlabel="Percentage of data which is null", ylabel="Count")
    fig.savefig(output_stem + '.png')
    # Close this figure explicitly so figures do not pile up across files.
    plt.close(fig)

  0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
#print(f'The average marker is missing {null_metadata(df):.1f}% of its data')

In [7]:
# Save dictionary as pickle file (this is already done for each file inside the main loop above, so this cell is empty)

    
