In [36]:
import pandas as pd
import os
import json
import pprint
from feature.feature_concat import *
from google.cloud import storage
from feature.setup import *
from dotenv import load_dotenv
from google.cloud import storage
import json

# Load environment variables from the .env file
load_dotenv("amia.env", override=True)
# Indexing (rather than .get) makes a missing SERVICE_ACCOUNT_PATH fail with a
# KeyError naming the variable instead of a TypeError raised inside os.path.join.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(
    os.getcwd(), os.environ["SERVICE_ACCOUNT_PATH"]
)

# load the covid dataset and keep its "video id" column as a plain list
covid = pd.read_csv("input/Yawen-Colonoscopy-Covid Data files/covid-19/complete_covid19_video_classification_set.csv")
covid_list = covid['video id'].values.tolist()

def check_video_exists(bucket_name, video_id):
    """Return whether the object ``<video_id>.mp4`` exists in the given bucket.

    Args:
        bucket_name: Name of the Cloud Storage bucket to look in.
        video_id: Video id; ".mp4" is appended to form the object name.

    Returns:
        The result of ``Blob.exists()`` for that object.
    """
    mp4_name = video_id + '.mp4'
    client = storage.Client()
    return client.bucket(bucket_name).blob(mp4_name).exists()

# extract covid dataset
def run_pipeline(id):
    """Run setup and feature extraction for one video and record the outcome.

    Videos already marked 'SUCCESS' in the module-level ``status_dict`` are
    skipped. Otherwise ``pipeline_setup(id)`` is called first when
    ``<id>.mp4`` is not found in the bucket named by the ``VIDEO_BUCKET_NAME``
    environment variable, then ``feature_extraction(id)`` is run and its result
    is dumped as JSON to ``temp/covid-feature/<id>.json``.

    Args:
        id: Video id; also used as the output file stem. The name shadows the
            builtin ``id`` but is kept so existing callers keep working.

    Returns:
        None. The outcome is stored in ``status_dict[id]`` as 'SUCCESS',
        'SETUP ERROR' or 'FEATURE ERROR'. Exceptions other than ``SetupError``
        and ``FeatureError`` propagate to the caller.
    """
    print("\n\n\nRunning pipeline on: ", id)
    # This branch used to be a bare `pass`, so finished videos were processed
    # again; return early so they are actually skipped.
    if status_dict.get(id) == 'SUCCESS':
        print("ALREADY EXTRACTED, SKIPPING : ", id)
        return
    try:
        if not check_video_exists(os.environ.get("VIDEO_BUCKET_NAME"), id):
            pipeline_setup(id)
        df_test = feature_extraction(id)
        with open("temp/covid-feature/" + id + ".json", "w") as outfile:
            json.dump(df_test, outfile)
        status_dict[id] = 'SUCCESS'
        print("SUCCESSFULLY EXTRACT : ", id)

    except SetupError as e:
        print(e.message)
        status_dict[id] = 'SETUP ERROR'

    except FeatureError as e:
        print(e.message)
        status_dict[id] = 'FEATURE ERROR'
            
# Restore the per-video status map written by earlier runs
with open("temp/status.json", "r") as status_file:
    status_dict = json.load(status_file)

# Note: the second figure counts every recorded status, not only 'SUCCESS'
print("total video numbers: ", len(covid_list))
print("already extracted: ", len(status_dict))

total video numbers:  304
already extracted:  304


In [2]:
# Position in covid_list of every video that has a recorded status
extracted_index = [covid_list.index(video_id) for video_id in status_dict]
# Highest position that has been processed so far
print(max(extracted_index))

252


In [45]:
from collections import Counter

# Tally how many videos ended in each status
status_counts = Counter(status_dict.values())
print(status_counts)

Counter({'SUCCESS': 292, 'SETUP ERROR': 12})


In [23]:
# Stems (text before the first ".") from success_list that are absent from success_list_1
dif = [
    stem
    for stem in (item.split(".")[0] for item in success_list)
    if stem not in success_list_1
]
# Record each of them as successfully extracted
status_dict.update(dict.fromkeys(dif, 'SUCCESS'))

In [4]:
# Persist the status map so later cells and runs can reload it
with open("temp/status.json", "w") as status_file:
    json.dump(status_dict, status_file)

In [None]:
# Process covid_list positions 250-303, reloading the status file before and
# rewriting it after every video so progress is saved as the loop advances.
for i in range(250, 304):
    # load status
    with open("temp/status.json", "r") as status_in:
        status_dict = json.load(status_in)

    # extraction (run_pipeline records its outcome in status_dict)
    print("\nextracting: ", i)
    run_pipeline(covid_list[i])
    print("finishing: ", i, end="\n\n")

    # save status
    with open("temp/status.json", "w") as status_out:
        json.dump(status_dict, status_out)

In [46]:
# Videos whose recorded status is a setup failure
problem_list = [video_id for video_id, status in status_dict.items() if status == "SETUP ERROR"]
# Their positions in covid_list, used to re-run only those videos
id_list = [covid_list.index(video_id) for video_id in problem_list]
id_list

[0, 2, 28, 39, 94, 96, 99, 134, 171, 175, 218, 241]

In [None]:
# Re-run the pipeline for the covid_list positions collected in id_list,
# reloading the status file before and rewriting it after every video.
for i in id_list:
    # load status
    with open("temp/status.json", "r") as status_in:
        status_dict = json.load(status_in)

    # extraction (run_pipeline records its outcome in status_dict)
    print("\nextracting: ", i)
    run_pipeline(covid_list[i])
    print("finishing: ", i, end="\n\n")

    # save status
    with open("temp/status.json", "w") as status_out:
        json.dump(status_dict, status_out)

# Test

first test:
- index: 0 - 100, 101 videos
- unsuccessfully downloaded: 

In [None]:
# Re-run the pipeline for every id listed (one per line) in the feature-error log
with open('temp/feature_error.txt', 'r') as error_log:
    lines = [raw_line.strip() for raw_line in error_log]
for video_id in lines:
    run_pipeline(video_id)

In [120]:
# Append every id in dif1 to the feature-error log, one per line.
# The file is opened once instead of once per id, and the loop variable no
# longer shadows the builtin `id` for the rest of the kernel session.
# Note: append mode means re-running this cell writes the ids again.
with open('temp/feature_error.txt', 'a') as error_log:
    error_log.writelines(video_id + '\n' for video_id in dif1)

In [102]:
import os

folder_path = 'temp/covid-feature'  # Replace with the path to your folder

# Stem (text before the first ".") of every entry found in the folder
file_names = [entry.split(".")[0] for entry in os.listdir(folder_path)]

print(len(file_names))

83


In [103]:
from google.cloud import storage
def list_blobs(bucket_name):
    """Return the name, cut at its first ".", of every blob in the bucket.

    Args:
        bucket_name: Name of the Cloud Storage bucket to list.

    Returns:
        A list of blob-name stems in listing order.
    """
    client = storage.Client()
    # Note: Client.list_blobs requires at least package version 1.17.0, and the
    # call only returns a response once the iterator is consumed (done below).
    return [blob.name.split(".")[0] for blob in client.list_blobs(bucket_name)]
gcp_list = list_blobs(os.environ.get("AUDIO_BUCKET_NAME"))
len(gcp_list)


94

In [110]:
# those which cannot be downloaded: ids in iterate_list that have no entry in gcp_list
dif = [video_id for video_id in iterate_list if video_id not in gcp_list]
len(dif)

7

In [111]:
# those whose metadata could not be extracted: entries in gcp_list with no file in file_names
dif1 = [blob_stem for blob_stem in gcp_list if blob_stem not in file_names]
len(dif1)

11

In [None]:
# test 1, 4-5 minutes
# Feature extraction for a single hard-coded video; the pipeline_setup step is
# left disabled here.
id = "d554SXwdTRA"
#keyword = "test"
#pipeline_setup(id)
df_test = feature_extraction(id)
pprint.pprint(df_test)
# Store the extracted features next to the other per-video JSON files
with open(f"temp/covid-feature/{id}.json", "w") as outfile:
    json.dump(df_test, outfile)

In [16]:
# refactor acc tag function
import requests
import json
import os
from bs4 import BeautifulSoup
def get_acc_tag(videoID, timeout=10):
    """Report whether a YouTube watch page shows the accredited-hospital banner.

    Args:
        videoID: YouTube video id appended to ``https://www.youtube.com/watch?v=``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        1 if the page text contains "From an accredited hospital", otherwise 0.
        0 is also returned when the request itself fails, so 0 can mean either
        "no banner" or "could not check".
    """
    video_url = f"https://www.youtube.com/watch?v={videoID}"
    try:
        # Without a timeout a stalled connection would block the notebook indefinitely.
        response = requests.get(video_url, timeout=timeout)
        page_text = response.text
    except requests.RequestException:
        # Best effort: a failed request counts as "no tag". Catching only request
        # errors (not a bare except) lets Ctrl-C and programming errors surface.
        return 0

    return 1 if "From an accredited hospital" in page_text else 0

folder_path = 'temp/covid-feature'
# Add or refresh the "accreditationTag" field in every extracted feature file.
for filename in os.listdir(folder_path):
    # Only touch the per-video JSON files; any other entry in the folder
    # (hidden files, sub-directories) would make open()/json.load fail.
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join(folder_path, filename)
    # The file stem is the video id (named video_id to avoid shadowing builtin id)
    video_id = filename.split(".")[0]
    with open(file_path, "r") as json_file:
        json_data = json.load(json_file)
    json_data["accreditationTag"] = get_acc_tag(video_id)
    # Save the updated JSON data back to the file
    with open(file_path, "w") as json_file:
        json.dump(json_data, json_file)

In [15]:
import requests
# Manual check of the string match that get_acc_tag relies on, run against one
# known watch-page URL.
# YouTube URL
url = "https://www.youtube.com/watch?v=eX9E5gHmdGE&ab_channel=MayoClinic"
# Send GET request
response = requests.get(url)
# Page body as text
text = response.text
# Cell output: True when the banner text appears in the page
"From an accredited hospital" in text

True

In [None]:
# get_acc_tag requires a video id; calling it with no argument raises TypeError.
# NOTE(review): using the id from the manual banner-check cell above — confirm
# this is the video that was meant to be checked.
get_acc_tag("eX9E5gHmdGE")