# Metadata Tracker

This notebook keeps track of what mp4 files have been processed and the degree to which they have been processed.

Copyright (c) 2022 Harshith Mohan Kumar

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

import os
import sys
import glob
import warnings

# Reload changed modules every time
%load_ext autoreload
%autoreload 2

In [2]:
sys.path.insert(0, '/mnt/rds/redhen/gallina/home/hxm471/'\
                'RedHenLab-Multimodal_TV_Show_Segmentation/mtvss/')
import constants as const

In [3]:
# Print out hxm471 gallina home path
const.H_GAL_HOME_PATH

'/mnt/rds/redhen/gallina/home/hxm471/'

## Populate DataFrame

In [4]:
# Crate dictonary to store values
data = {'File_Name':[],'Stage-1-Music':[],'Stage-2-Images':[]}
# Get path of splits within gallina home
splits_folder = 'splits/tmp/'

In [5]:
for file in glob.glob(const.H_GAL_HOME_PATH+splits_folder+'*_feats.csv'):
    file_basename = os.path.basename(file)[:-10]
    data['File_Name'].append(file_basename)
    data['Stage-1-Music'].append('Not-Done')
    data['Stage-2-Images'].append('Not-Done')
    
len(data['File_Name'])

4404

### Stage-1 Check

In [6]:
# Checking if stage one is complete by determing features files
for feats in glob.glob(const.H_GAL_HOME_PATH+splits_folder+'*_loge.npy'):
    split_f = feats[:-4].split('_')
    
    file_basename = os.path.basename('_'.join(split_f[:-3]))

    try:
        idx = data['File_Name'].index(file_basename)
        if(split_f[-3] is not None and split_f[-2] is not None):
            if(int(split_f[-3])==int(split_f[-2])):
                data['Stage-1-Music'][idx]='Done'
                continue
    except Exception as e:
        print(e)
        print("\nOn file:",split_f)
        data['Stage-1-Music'][idx]='None'
            
len(data['Stage-1-Music'])

invalid literal for int() with base 10: 'None'

On file: ['/mnt/rds/redhen/gallina/home/hxm471/splits/tmp/2003-01-31', '0000', 'US', '00009298', 'V2', 'VHSP11', 'MB6', 'H8', 'DJ', '0', 'None', 'loge']
invalid literal for int() with base 10: 'None'

On file: ['/mnt/rds/redhen/gallina/home/hxm471/splits/tmp/2005-10-07', '0000', 'US', '00001279', 'V3', 'VHSP8', 'MB2', 'H9', 'KE', 'BE', '0', 'None', 'loge']
invalid literal for int() with base 10: 'None'

On file: ['/mnt/rds/redhen/gallina/home/hxm471/splits/tmp/2003-05-07', '0000', 'US', '00009610', 'V3', 'VHS15', 'MB4', 'H12', 'GG', '0', 'None', 'loge']


4404

### Stage-2 Check

In [7]:
image_folder = 'image_dataset/'

In [8]:
# Checking if stage two is complete by determing jpeg files
for feats in glob.glob(const.H_GAL_HOME_PATH+image_folder+'/title/*'):
    split_f = feats[:-4].split('_')
    
    file_basename = os.path.basename('_'.join(split_f[:-4]))

    try:
        idx = data['File_Name'].index(file_basename)
        data['Stage-2-Images'][idx]='Done'
    except Exception as e:
        print(e)
        print("\nOn file:",split_f)

for feats in glob.glob(const.H_GAL_HOME_PATH+image_folder+'/commercial/*'):
    split_f = feats[:-4].split('_')
    
    file_basename = os.path.basename('_'.join(split_f[:-4]))

    try:
        idx = data['File_Name'].index(file_basename)
        data['Stage-2-Images'][idx]='Done'
    except Exception as e:
        print(e)
        print("\nOn file:",split_f)
            
len(data['Stage-2-Images'])


'1989-08-04_0000_US_NA024558_V5_VHS40_MB16_H35_JM' is not in list

On file: ['/mnt/rds/redhen/gallina/home/hxm471/image', 'dataset//commercial/1989-08-04', '0000', 'US', 'NA024558', 'V5', 'VHS40', 'MB16', 'H35', 'JM', 'keyframes', '198173', '6611', '6640.']
'1989-08-04_0000_US_NA024558_V5_VHS40_MB16_H35_JM' is not in list

On file: ['/mnt/rds/redhen/gallina/home/hxm471/image', 'dataset//commercial/1989-08-04', '0000', 'US', 'NA024558', 'V5', 'VHS40', 'MB16', 'H35', 'JM', 'keyframes', '55648', '1852', '1861.']
'1989 08 09 0000 US NA024656 V8 VHS35 MB5 H9 LA' is not in list

On file: ['/mnt/rds/redhen/gallina/home/hxm471/image', 'dataset//commercial/1989 08 09 0000 US NA024656 V8 VHS35 MB5 H9 LA', 'keyframes', '21132', '690', '718.']
'1989-08-24_0000_US_NA024901_V9_VHS24_MB2_H24_CM' is not in list

On file: ['/mnt/rds/redhen/gallina/home/hxm471/image', 'dataset//commercial/1989-08-24', '0000', 'US', 'NA024901', 'V9', 'VHS24', 'MB2', 'H24', 'CM', 'keyframes', '12354', '401', '429.']
'1989

4404

## Create DataFrame to hold status

In [9]:
meta_status_df = pd.DataFrame.from_dict(data=data)
meta_status_df

Unnamed: 0,File_Name,Stage-1-Music,Stage-2-Images
0,2006-06-05_0000_US_00002734_V1_MB3_VHS3_H9_JK,Done,Not-Done
1,2006-08-07_0000_US_00003418_V3_M1_VHS9_H3_JN_2,Done,Not-Done
2,2004-10-27_0000_US_00005295_V3_VHSP11_M2_H4_MS_BE,Done,Not-Done
3,2004-09-15_0000_US_00004969_V3_VHS15_M6_H9_CV,Done,Not-Done
4,1997-05-19_0000_US_00016832_V1_VHS25_MB16_H2_NV,Done,Not-Done
...,...,...,...
4399,2003-11-04_0000_US_00015075_V3_VHS28_MB3_E3_CS,Done,Not-Done
4400,2002-05-03_0000_US_00012092_V3_VHS1_MB3_E3_AA,Done,Not-Done
4401,2004-09-03_0000_US_00004889_V1_VHS7_MB7_H11_JN,Done,Not-Done
4402,1997-09-24_0000_US_00017772_V1_VHSP21_MM1_H6_JS,Done,Not-Done


In [10]:
meta_status_df['Stage-1-Music'].value_counts()

Done        4347
Not-Done      54
None           3
Name: Stage-1-Music, dtype: int64

In [11]:
meta_status_df['Stage-2-Images'].value_counts()

Not-Done    4336
Done          68
Name: Stage-2-Images, dtype: int64

In [12]:
meta_status_df

Unnamed: 0,File_Name,Stage-1-Music,Stage-2-Images
0,2006-06-05_0000_US_00002734_V1_MB3_VHS3_H9_JK,Done,Not-Done
1,2006-08-07_0000_US_00003418_V3_M1_VHS9_H3_JN_2,Done,Not-Done
2,2004-10-27_0000_US_00005295_V3_VHSP11_M2_H4_MS_BE,Done,Not-Done
3,2004-09-15_0000_US_00004969_V3_VHS15_M6_H9_CV,Done,Not-Done
4,1997-05-19_0000_US_00016832_V1_VHS25_MB16_H2_NV,Done,Not-Done
...,...,...,...
4399,2003-11-04_0000_US_00015075_V3_VHS28_MB3_E3_CS,Done,Not-Done
4400,2002-05-03_0000_US_00012092_V3_VHS1_MB3_E3_AA,Done,Not-Done
4401,2004-09-03_0000_US_00004889_V1_VHS7_MB7_H11_JN,Done,Not-Done
4402,1997-09-24_0000_US_00017772_V1_VHSP21_MM1_H6_JS,Done,Not-Done


## Store DataFrame

In [13]:
store_path = '/mnt/rds/redhen/gallina/home/hxm471/'\
                'RedHenLab-Multimodal_TV_Show_Segmentation/mtvss/data/tmp/metadata_tracker.csv'

In [14]:
meta_status_df.to_csv(store_path)