In [34]:
import pandas as pd
import os
import json
import pprint
from feature.feature_concat import *
from google.cloud import storage
from feature.setup import *
from dotenv import load_dotenv
from google.cloud import storage
import json

# Read settings from amia.env (override=True is passed to load_dotenv)
load_dotenv("amia.env", override=True)
service_account_file = os.environ.get("SERVICE_ACCOUNT_PATH")
# Point the Google client at the service-account file, resolved against the cwd
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(os.getcwd(), service_account_file)

# Read the COVID-19 video classification set and collect its 'video id' column
covid = pd.read_csv(
    "input/Yawen-Colonoscopy-Covid Data files/covid-19/complete_covid19_video_classification_set.csv"
)
covid_list = covid['video id'].values.tolist()
len(covid_list)

def check_video_exists(bucket_name, video_id):
    """Report whether the blob ``<video_id>.mp4`` exists in ``bucket_name``.

    Args:
        bucket_name: Name of the storage bucket to look in.
        video_id: Video id; ``'.mp4'`` is appended to form the blob name.

    Returns:
        Whatever ``blob.exists()`` returns for that blob.
    """
    blob_name = video_id + '.mp4'
    return storage.Client().bucket(bucket_name).blob(blob_name).exists()

# extract covid dataset
def run_pipeline(id):
    """Extract features for one video and record the outcome in ``status_dict``.

    A video whose entry in the module-level ``status_dict`` is ``'SUCCESS'`` is
    skipped. Otherwise ``pipeline_setup`` is called when the video is not found
    in the bucket named by the ``VIDEO_BUCKET_NAME`` environment variable, the
    result of ``feature_extraction`` is written to
    ``temp/covid-feature/<id>.json``, and the status is set to ``'SUCCESS'``.

    Args:
        id: Video id. The name shadows the builtin ``id`` inside this function;
            it is kept so existing keyword callers keep working.

    Returns:
        None. The outcome is reported through ``status_dict[id]``, which ends up
        as ``'SUCCESS'``, ``'SETUP ERROR'`` or ``'FEATURE ERROR'``.

    Raises:
        Any exception other than ``SetupError`` / ``FeatureError`` propagates to
        the caller and leaves ``status_dict`` unchanged.
    """
    print("\n\n\nRunning pipeline on: ", id)
    if status_dict.get(id) == 'SUCCESS':
        # This branch used to be `pass`, which fell through and re-ran the
        # whole extraction for videos that were already done.
        print("ALREADY EXTRACTED, SKIPPING : ", id)
        return
    try:
        # Only run the setup step when the video is not in the bucket yet
        if not check_video_exists(os.environ.get("VIDEO_BUCKET_NAME"), id):
            pipeline_setup(id)
        df_test = feature_extraction(id)
        with open("temp/covid-feature/" + id + ".json", "w") as outfile:
            json.dump(df_test, outfile)
        status_dict[id] = 'SUCCESS'
        print("SUCCESSFULLY EXTRACT : ", id)

    except SetupError as e:
        print(e.message)
        status_dict[id] = 'SETUP ERROR'

    except FeatureError as e:
        print(e.message)
        status_dict[id] = 'FEATURE ERROR'
            
# Restore the per-video status map from temp/status.json
with open("temp/status.json", "r") as status_file:
    status_dict = json.load(status_file)

print('total video numbers: ', len(covid_list))
print('already extracted: ', len(status_dict))

total video numbers:  304
already extracted:  250


In [35]:
# Position in covid_list of every video id that has a recorded status
extracted_index = [covid_list.index(video_id) for video_id in status_dict]
print(max(extracted_index))

249


In [36]:
from collections import Counter
# Tally how many videos ended in each status
status_counts = Counter(status_dict.values())
print(status_counts)

Counter({'SUCCESS': 185, 'FEATURE ERROR': 51, 'SETUP ERROR': 14})


In [37]:
# Persist the current status map to temp/status.json
with open("temp/status.json", "w") as status_file:
    json.dump(status_dict, status_file)

In [38]:
for video_index in range(215, 250):
    # Re-read the status file at the start of every round
    with open("temp/status.json", "r") as status_file:
        status_dict = json.load(status_file)

    # Run the extraction for the video at this position in covid_list
    print()
    print("extracting: ", video_index)
    run_pipeline(covid_list[video_index])
    print("finishing: ", video_index)
    print()

    # Write the status map back after each video
    with open("temp/status.json", "w") as status_file:
        json.dump(status_dict, status_file)


extracting:  215



Running pipeline on:  Imz6M1-X11U

STARTING FEATURE EXTRACTION ON:  Imz6M1-X11U
EXTRACTING RAW METADATA FEATURES!
EXTRACTING VIDEO AND AUDIO FEATURES!
Processing video for shot change annotations:
Finished processing.
Processing video for object annotations.
Finished processing.

Processing video for text detection.
EXTRACTING NLP FEATURES!
HAVING TROBULE WITH NLP FEATURESImz6M1-X11U
finishing:  215


extracting:  216



Running pipeline on:  6xeuAavcCGQ

STARTING FEATURE EXTRACTION ON:  6xeuAavcCGQ
EXTRACTING RAW METADATA FEATURES!
EXTRACTING VIDEO AND AUDIO FEATURES!
Processing video for shot change annotations:
Finished processing.
Processing video for object annotations.
Finished processing.

Processing video for text detection.
EXTRACTING NLP FEATURES!
CONCATENATE ALL FEATURES!
SUCCESSFULLY EXTRACT :  6xeuAavcCGQ
finishing:  216


extracting:  217



Running pipeline on:  QjEfy54R3mc

STARTING FEATURE EXTRACTION ON:  QjEfy54R3mc
EXTRACTING RAW METADATA FEATURE

# Test

first test:
- index: 0 - 100, 101 videos
- unsuccessfully downloaded: 

In [None]:
# Re-run the pipeline for every video id listed in temp/feature_error.txt
# (one id per line). Blank lines are dropped so that an empty string is never
# passed to run_pipeline, and the loop variable no longer overwrites the
# builtin `id` in the notebook namespace.
with open('temp/feature_error.txt', 'r') as f:
    lines = [line.strip() for line in f if line.strip()]
for video_id in lines:
    run_pipeline(video_id)

In [120]:
# Append every id in dif1 to temp/feature_error.txt, one per line.
# The file is opened once for the whole batch instead of once per id, and the
# loop variable no longer overwrites the builtin `id`.
with open('temp/feature_error.txt', 'a') as f:
    for video_id in dif1:
        f.write(video_id + '\n')

In [102]:
import os

folder_path = 'temp/covid-feature'  # Replace with the path to your folder

# Keep the part of each directory entry's name that precedes its first "."
file_names = [entry.split(".")[0] for entry in os.listdir(folder_path)]

print(len(file_names))

83


In [103]:
from google.cloud import storage
def list_blobs(bucket_name):
    """Return the name of every blob in the bucket, cut at the first ".".

    Args:
        bucket_name: Name of the bucket to list, e.g. "your-bucket-name".

    Returns:
        A list with one entry per blob: the text of ``blob.name`` before its
        first ``"."``.
    """
    client = storage.Client()
    # Note: Client.list_blobs requires at least package version 1.17.0.
    # The request is only issued once the returned iterator is consumed,
    # which the comprehension below does.
    return [blob.name.split(".")[0] for blob in client.list_blobs(bucket_name)]
# Names (without extension) of everything stored in the audio bucket
audio_bucket_name = os.environ.get("AUDIO_BUCKET_NAME")
gcp_list = list_blobs(audio_bucket_name)
len(gcp_list)


94

In [110]:
# Videos that could not be downloaded: ids in iterate_list absent from gcp_list
dif = [video_id for video_id in iterate_list if video_id not in gcp_list]
len(dif)

7

In [111]:
# Videos whose metadata could not be extracted: names in gcp_list that have
# no matching entry in file_names
dif1 = [blob_name for blob_name in gcp_list if blob_name not in file_names]
len(dif1)

11

In [71]:
# test 1, 4-5 minutes
# Extract features for one video and save them as JSON. The id is held in
# `video_id` rather than `id`, so the builtin `id` is not overwritten in the
# notebook namespace.
video_id = "d554SXwdTRA"
#keyword = "test"
#pipeline_setup(video_id)
df_test = feature_extraction(video_id)
pprint.pprint(df_test)
with open("temp/covid-feature/" + video_id + ".json", "w") as outfile:
    json.dump(df_test, outfile)

EXTRACTING METADATA & VIDEO & TRANSCRIPTION FEATURES!
extracting the id:  d554SXwdTRA
Retrieving accTag for:  https://www.youtube.com/watch?v=d554SXwdTRA
error in extracting accTag

Processing video for shot change annotations:

Finished processing.

Processing video for object annotations.

Finished processing.


Processing video for text detection.
EXTRACTING NLP FEATURES!
CONCATENATE ALL FEATURES!
{'accreditationTag': 0,
 'channel_subscribers': '7100',
 'desc_act': 1,
 'desc_ari': 21.214000000000006,
 'desc_mer': 3,
 'desc_sen': 1,
 'desc_sum': 0,
 'desc_trans': 0,
 'desc_uni': 24,
 'desc_words': 25,
 'duration': 136.0,
 'hasDescription': 1,
 'hasTags': 0,
 'id': 'd554SXwdTRA',
 'num_of_objects': 35,
 'num_of_shots': 3,
 'num_of_tags': 0,
 'publish_days': 1090,
 'tags': '',
 'text_confidence': 0.9253388831887064,
 'tran_act': 49,
 'tran_ari': 0,
 'tran_mer': 12,
 'tran_sen': 0,
 'tran_sum': 0,
 'tran_trans': 1,
 'tran_uni': 153,
 'tran_words': 318,
 'transcription confidence': 0.967