# Metadata Tracker

This notebook tracks which mp4 files have been processed and how far each one has progressed through the processing stages (Stage 1: music features, Stage 2: image features).

Copyright (c) 2022 Harshith Mohan Kumar

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

import os
import sys
import glob
import warnings

# Reload changed modules every time
%load_ext autoreload
%autoreload 2

In [2]:
# Put the project's `mtvss` directory at the front of the module search path
# so that the `constants` module imported below can be resolved
# (presumably it lives in that directory -- confirm).
sys.path.insert(0, '/mnt/rds/redhen/gallina/home/hxm471/'\
                'RedHenLab-Multimodal_TV_Show_Segmentation/mtvss/')
import constants as const

In [3]:
# Display the hxm471 gallina home path constant as a sanity check that
# the `constants` module was imported correctly
const.H_GAL_HOME_PATH

'/mnt/rds/redhen/gallina/home/hxm471/'

## Populate DataFrame

In [4]:
# Column-oriented status record: each key is a column name mapped to its own
# (initially empty) list of values.
data = {column: [] for column in ('File_Name', 'Stage-1-Music', 'Stage-2-Images')}

# Location of the splits folder, relative to the gallina home path
splits_folder = 'splits/tmp/'

In [5]:
# Every '<base>_feats.csv' file in the splits folder contributes one tracked
# entry, identified by '<base>'.
feats_suffix = '_feats.csv'

# The columns are rebuilt from scratch rather than appended to, so re-running
# this cell does not duplicate rows.
data['File_Name'] = [
    # Strip the suffix by its actual length instead of a hard-coded 10.
    os.path.basename(file)[:-len(feats_suffix)]
    for file in glob.glob(const.H_GAL_HOME_PATH+splits_folder+'*'+feats_suffix)
]

# Both stages start as 'Not-Done' for every file; later cells update them.
data['Stage-1-Music'] = ['Not-Done'] * len(data['File_Name'])
data['Stage-2-Images'] = ['Not-Done'] * len(data['File_Name'])

len(data['File_Name'])

5676

### Stage-1 Check

In [6]:
# Checking if stage one is complete by inspecting the '*_loge.npy' feature files.
#
# Build a name -> row index lookup once instead of calling list.index() for
# every file. setdefault keeps the FIRST occurrence, matching list.index().
row_of = {}
for row, name in enumerate(data['File_Name']):
    row_of.setdefault(name, row)

for feats in glob.glob(const.H_GAL_HOME_PATH+splits_folder+'*_loge.npy'):
    # Drop '.npy' and split on '_'; the last three tokens are
    # <token a>, <token b>, 'loge' and everything before them is the base name.
    split_f = feats[:-4].split('_')
    file_basename = os.path.basename('_'.join(split_f[:-3]))

    # A feature file without a tracked entry must not touch any row.
    # (Previously the lookup failure was caught by a broad `except` that then
    # wrote 'None' to whatever row a stale `idx` happened to point at.)
    idx = row_of.get(file_basename)
    if idx is None:
        print("\nNo entry in File_Name for:", feats)
        continue

    try:
        # Stage one counts as done when the two trailing tokens are equal integers
        # (presumably a processed-count and an expected total -- TODO confirm).
        if int(split_f[-3]) == int(split_f[-2]):
            data['Stage-1-Music'][idx] = 'Done'
    except ValueError as e:
        # A token is not an integer (e.g. the literal string 'None');
        # report it and flag the row as 'None'.
        print(e)
        print("\nOn file:",split_f)
        data['Stage-1-Music'][idx]='None'

len(data['Stage-1-Music'])

invalid literal for int() with base 10: 'None'

On file: ['/mnt/rds/redhen/gallina/home/hxm471/splits/tmp/2003-01-31', '0000', 'US', '00009298', 'V2', 'VHSP11', 'MB6', 'H8', 'DJ', '0', 'None', 'loge']
invalid literal for int() with base 10: 'None'

On file: ['/mnt/rds/redhen/gallina/home/hxm471/splits/tmp/2005-10-07', '0000', 'US', '00001279', 'V3', 'VHSP8', 'MB2', 'H9', 'KE', 'BE', '0', 'None', 'loge']
invalid literal for int() with base 10: 'None'

On file: ['/mnt/rds/redhen/gallina/home/hxm471/splits/tmp/2003-05-07', '0000', 'US', '00009610', 'V3', 'VHS15', 'MB4', 'H12', 'GG', '0', 'None', 'loge']


5676

### Stage-2 Check

In [13]:
# Checking if stage two is complete by checking if an image_features.npy file exists.
#
# Build a name -> row index lookup once instead of calling list.index() for
# every file. setdefault keeps the FIRST occurrence, matching list.index().
row_of = {}
for row, name in enumerate(data['File_Name']):
    row_of.setdefault(name, row)

for feats in glob.glob(const.H_GAL_HOME_PATH+splits_folder+'*image_features.npy'):
    # Drop '.npy' and split on '_'; the last two tokens are 'image', 'features'
    # and everything before them is the base name.
    split_f = feats[:-4].split('_')
    file_basename = os.path.basename('_'.join(split_f[:-2]))

    # A feature file without a tracked entry is reported and skipped.
    # (Previously the lookup failure was caught by a broad `except` that then
    # wrote 'None' to whatever row a stale `idx` happened to point at.)
    idx = row_of.get(file_basename)
    if idx is None:
        print("\nNo entry in File_Name for:", feats)
        continue

    data['Stage-2-Images'][idx]='Done'

len(data['Stage-2-Images'])

5676

## Create DataFrame to hold status

In [14]:
# Build the status table: each key of `data` becomes a column and each
# position in the lists becomes a row.
meta_status_df = pd.DataFrame(data)
meta_status_df

Unnamed: 0,File_Name,Stage-1-Music,Stage-2-Images
0,1996-10-02_0000_US_00012227_V2_VHSP19_MB26_E12_SC,Done,Done
1,2006-06-05_0000_US_00002734_V1_MB3_VHS3_H9_JK,Done,Done
2,2006-08-07_0000_US_00003418_V3_M1_VHS9_H3_JN_2,Done,Done
3,2000-02-29_0000_US_00012349_V2_VHS14_MB13_H1_PS,Done,Done
4,2004-10-27_0000_US_00005295_V3_VHSP11_M2_H4_MS_BE,Done,Done
...,...,...,...
5671,2003-11-04_0000_US_00015075_V3_VHS28_MB3_E3_CS,Done,Done
5672,2002-05-03_0000_US_00012092_V3_VHS1_MB3_E3_AA,Done,Not-Done
5673,2004-09-03_0000_US_00004889_V1_VHS7_MB7_H11_JN,Done,Done
5674,1997-09-24_0000_US_00017772_V1_VHSP21_MM1_H6_JS,Done,Not-Done


In [15]:
# Tally how many files fall into each Stage-1 status value
meta_status_df['Stage-1-Music'].value_counts()

Done        5613
Not-Done      60
None           3
Name: Stage-1-Music, dtype: int64

In [16]:
# Tally how many files fall into each Stage-2 status value
meta_status_df['Stage-2-Images'].value_counts()

Done        4279
Not-Done    1397
Name: Stage-2-Images, dtype: int64

In [17]:
# Display the status table once more before it is written to disk
meta_status_df

Unnamed: 0,File_Name,Stage-1-Music,Stage-2-Images
0,1996-10-02_0000_US_00012227_V2_VHSP19_MB26_E12_SC,Done,Done
1,2006-06-05_0000_US_00002734_V1_MB3_VHS3_H9_JK,Done,Done
2,2006-08-07_0000_US_00003418_V3_M1_VHS9_H3_JN_2,Done,Done
3,2000-02-29_0000_US_00012349_V2_VHS14_MB13_H1_PS,Done,Done
4,2004-10-27_0000_US_00005295_V3_VHSP11_M2_H4_MS_BE,Done,Done
...,...,...,...
5671,2003-11-04_0000_US_00015075_V3_VHS28_MB3_E3_CS,Done,Done
5672,2002-05-03_0000_US_00012092_V3_VHS1_MB3_E3_AA,Done,Not-Done
5673,2004-09-03_0000_US_00004889_V1_VHS7_MB7_H11_JN,Done,Done
5674,1997-09-24_0000_US_00017772_V1_VHSP21_MM1_H6_JS,Done,Not-Done


## Store DataFrame

In [18]:
# Destination CSV file for the status table
store_path = (
    '/mnt/rds/redhen/gallina/home/hxm471/'
    'RedHenLab-Multimodal_TV_Show_Segmentation/mtvss/data/tmp/metadata_tracker.csv'
)

In [19]:
# Write the status table to CSV. With pandas' default arguments the row index
# is written as the first (unnamed) column, so readers of this file should
# expect it.
meta_status_df.to_csv(store_path)