# Revised AMIA project, based on colonoscopy data

In [9]:
import pandas as pd
import os
import json
import pprint
from google.cloud import storage
from dotenv import load_dotenv 
import redis

# from feature.feature_concat import *
from feature.video import *
from feature.audio import *
from feature.metadata import *
from feature.setup import *
from feature.nlp import *

# Load environment variables from the .env file
load_dotenv("ytbvideoanalytics2022.env", override=True)
# Point GOOGLE_APPLICATION_CREDENTIALS at the service-account file, resolved
# against the current working directory.
# NOTE(review): os.environ.get returns None when SERVICE_ACCOUNT_PATH is unset,
# which makes os.path.join raise TypeError — confirm the .env file always defines it.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(os.getcwd(), os.environ.get("SERVICE_ACCOUNT_PATH"))

# Load the colonoscopy classification set and collect the values of its 'id' column
colon = pd.read_csv("input/Yawen-Colonoscopy-Covid Data files/colonoscopy/complete_colonoscopy_classification_set.csv")
colon_list = colon['id'].values.tolist()
# Total number of ids vs. number of distinct ids: equal counts mean no duplicates
print(len(colon_list))
print(len(set(colon_list)))

285
285


In [10]:
# Create the client for the Redis server on localhost:6379 (database 0).
# The cells below keep one hash per video id in it, holding the per-stage
# status flags and the extracted feature values.
redis_client = redis.Redis(host='localhost', port=6379, db=0)

In [11]:
# Define a function to check whether setup process has completed successfully for a specific video
def check_setup_status(video_id):
    """Return True when the setup (download) step has succeeded for `video_id`.

    The outcome is cached in the `setup_status` field of the video's Redis
    hash. A cached value is returned as-is, so a video whose download failed
    earlier is not retried. When nothing is cached, `download(video_id)` is run
    and its outcome is stored as 'success' or 'error'.

    Args:
        video_id: Key of the video's Redis hash; also passed to `download`.

    Returns:
        bool: True if the cached or freshly computed setup status is 'success'.
    """
    # A single HGET replaces the HEXISTS + HGET pair: a missing field comes back
    # as None, so the field cannot disappear between the check and the read
    # (which would have ended in `None.decode`).
    cached_status = redis_client.hget(video_id, 'setup_status')
    if cached_status is not None:
        return cached_status.decode('utf-8') == 'success'

    # Perform the setup process and cache the setup status in Redis
    try:
        download(video_id)
    except DownloadError:
        redis_client.hset(video_id, 'setup_status', 'error')
        return False
    redis_client.hset(video_id, 'setup_status', 'success')
    return True

In [14]:
def _cached_text(video_id, field):
    """Return `field` of the video's hash decoded as UTF-8, or None when the field is absent."""
    raw = redis_client.hget(video_id, field)
    return None if raw is None else raw.decode('utf8')


def _gcs_uri(bucket_env_var, object_name):
    """Build 'gs://<bucket>/<object_name>' with the bucket name read from `bucket_env_var`."""
    bucket = os.environ.get(bucket_env_var)
    if not bucket:
        raise ValueError(f'environment variable {bucket_env_var} is not set')
    # Plain string formatting instead of os.path.join: the URI separator must be
    # '/' regardless of the operating system's path separator.
    return f"gs://{bucket}/{object_name}"


def _store_features(video_id, status_field, features):
    """Write every item of `features` to the video's hash, then set `status_field` to 'success'."""
    for name, value in features.items():
        redis_client.hset(video_id, name, value)
    # The flag is written last so 'success' is never recorded for a hash whose
    # feature fields were only partially written.
    redis_client.hset(video_id, status_field, 'success')


def _mark_failed(video_id, status_field, message):
    """Set `status_field` to 'fail' in the video's hash and print `message`."""
    redis_client.hset(video_id, status_field, 'fail')
    print(message)


# Define a function to extract features for a single video and store it to redis
def extract_features(video_id):
    """Extract metadata, video, audio and NLP features for one video into Redis.

    Each stage records its outcome in the video's hash under
    `metadata_status`, `video_status`, `audio_status` and `nlp_status`
    ('success' or 'fail'). A stage already marked 'success' is skipped, so the
    function can be re-run to retry only the stages that failed.

    The function returns early (after printing a message) when YouTube
    authentication fails or when `check_setup_status(video_id)` is false.
    The NLP stage needs both the description (metadata stage) and the
    transcription (audio stage); if either is unavailable it is marked 'fail'.

    Args:
        video_id: Key of the video's Redis hash; also used to build the
            `<video_id>.mp4` / `<video_id>.wav` object names in the buckets
            named by VIDEO_BUCKET_NAME / AUDIO_BUCKET_NAME.

    Returns:
        None. All results are written to Redis.
    """
    try:
        youtube = youtube_authenticate(os.environ.get("OAUTH_CREDENTIAL_PATH"))
    except AuthenticateError:
        print('Authenticate Error!')
        return

    # Check whether the setup process has completed successfully for this video
    if not check_setup_status(video_id):
        print(f'Error: setup process failed for video {video_id}')
        return

    # Inputs of the NLP stage; they stay None when the producing stage fails,
    # instead of being left unbound.
    description = None
    transcription = None

    # collect metadata
    if redis_client.hget(video_id, 'metadata_status') == b'success':
        description = _cached_text(video_id, 'description')
    else:
        try:
            metadata = metadata_features(youtube=youtube, video_id=video_id)
            description = metadata['description']
            _store_features(video_id, 'metadata_status', metadata)
        except MetadataError as e:
            _mark_failed(video_id, 'metadata_status', e.message)

    # collect video features
    if redis_client.hget(video_id, 'video_status') != b'success':
        try:
            gcs_video_path = _gcs_uri("VIDEO_BUCKET_NAME", video_id + ".mp4")
            video = analyze_by_path(gcs_video_path)
            _store_features(video_id, 'video_status', video)
        except Exception:
            # `Exception` rather than a bare except, so KeyboardInterrupt and
            # SystemExit still stop a long-running batch.
            _mark_failed(video_id, 'video_status', 'video fails')

    # collect audio features
    if redis_client.hget(video_id, 'audio_status') == b'success':
        transcription = _cached_text(video_id, 'transcription')
    else:
        try:
            gcs_audio_path = _gcs_uri("AUDIO_BUCKET_NAME", video_id + ".wav")
            audio = transcribe_gcs(gcs_audio_path)
            transcription = audio['transcription']
            _store_features(video_id, 'audio_status', audio)
        except Exception:
            _mark_failed(video_id, 'audio_status', 'audio fails')

    # collect nlp features
    if redis_client.hget(video_id, 'nlp_status') != b'success':
        if description is None or transcription is None:
            # An upstream stage failed, so there is no text to analyse.
            _mark_failed(video_id, 'nlp_status', 'nlp fails')
        else:
            try:
                nlp = dict()
                desc_nlp = desc_nlp_feature(description)
                trans_nlp = trans_nlp_feature(transcription)
                nlp.update(desc_nlp)
                nlp.update(trans_nlp)
                _store_features(video_id, 'nlp_status', nlp)
            except Exception:
                _mark_failed(video_id, 'nlp_status', 'nlp fails')

In [16]:
# Run the extraction pipeline for a single video id
extract_features('CjQoe2FtBwg')

Processing video for shot change annotations:
Finished processing.
Processing video for object annotations.
Finished processing.

Processing video for text detection.


In [17]:
def get_video_info(video_id):
    """Return everything stored in Redis for `video_id` as a ``str -> str`` dict.

    The whole hash kept under `video_id` is fetched, and both field names and
    values are decoded from UTF-8 bytes. Values are left as strings; no
    further parsing is applied.
    """
    raw_fields = redis_client.hgetall(video_id)
    return {
        field.decode('utf-8'): raw.decode('utf-8')
        for field, raw in raw_fields.items()
    }

In [18]:
# Display every field stored in Redis for video 'CjQoe2FtBwg'
get_video_info('CjQoe2FtBwg')

{'desc_words': '650',
 'tran_uni': '191',
 'num_of_objects': '349',
 'tags': '',
 'desc_ari': '12.127900233100235',
 'publish_days': '3211',
 'desc_sum': '0',
 'duration': '176.0',
 'tran_sen': '0',
 'audio_status': 'success',
 'title': 'Suspicious Polyp Removed with CELS Procedure | UCLA Colorectal Surgery',
 'tran_trans': '0',
 'tran_act': '57',
 'setup_status': 'success',
 'tran_mer': '2',
 'desc_uni': '298',
 'metadata_status': 'success',
 'channel_subscribers': '510000',
 'desc_act': '81',
 'desc_trans': '3',
 'last_time_used': '0.1816272735595703',
 'num_of_tags': '9',
 'text_confidence': '0.8630676534440782',
 'tran_ari': '0',
 'num_of_shots': '18',
 'transcription': "58 year old Robert Greenwald has a large polyp in a difficult to accept part of his colon that doctors cannot remove a routine colonoscopy for I went to a local surgeon who wanted to remove a part of my colon and that's when I went home and I started researching on the internet and I came across the paper talking a

In [30]:
# SCAN is cursor-based: one call returns only a single page of keys (COUNT is
# a hint, not a limit), so walk the cursor to completion with scan_iter
# instead of reading just the first page.
extracted_keys = [item.decode('utf-8') for item in redis_client.scan_iter(match='*', count=1000)]
# Set for constant-time membership tests; it also collapses any key that the
# scan reported more than once.
extracted_key_set = set(extracted_keys)
# Ids from the dataset that do not have a key in Redis yet
unextracted_keys = [item for item in colon_list if item not in extracted_key_set]
print(len(unextracted_keys))

0


In [29]:
# Retrieve every key in the database. A single SCAN call returns only one page
# of the cursor (COUNT is a hint, not a limit), so iterate the whole cursor.
keys = list(redis_client.scan_iter(match='*', count=1000))

# Loop through the keys and print the five pipeline status fields of each hash
i = 0  # number of hashes seen
for key in keys:
    # Check if key is a hash
    if redis_client.type(key) == b'hash':
        i += 1
        # Retrieve the five status fields (None for a field that is not set)
        values = redis_client.hmget(key, 'setup_status', 'metadata_status', 'video_status', 'audio_status', 'nlp_status',)
        # The printed label reads 'fields 1-4' although five values are listed;
        # the text is kept as-is.
        print('Hash', key, 'fields 1-4:', values)
print(i)


Hash b'CjQoe2FtBwg' fields 1-4: [b'success', b'success', b'success', b'success', b'success']
Hash b'_BTcU2SRT2k' fields 1-4: [b'success', b'success', b'success', b'success', b'success']
Hash b'2cexFOCrGVU' fields 1-4: [b'success', b'success', b'success', b'success', b'success']
Hash b'HqbAYQKrCTs' fields 1-4: [b'success', b'success', b'fail', b'success', b'success']
Hash b'j9vHsuAxTOU' fields 1-4: [b'success', b'success', b'fail', b'fail', b'fail']
Hash b'yc1AxrrXLJ0' fields 1-4: [b'success', b'success', b'fail', b'success', b'success']
Hash b'Eqs2HLQdLEY' fields 1-4: [b'error', None, None, None, None]
Hash b'WWbeQCUh418' fields 1-4: [b'success', b'success', b'success', b'success', b'success']
Hash b'sgDUxxBUPGI' fields 1-4: [b'success', b'success', b'fail', b'success', b'success']
Hash b'FKtB8Uyu22w' fields 1-4: [b'success', b'success', b'success', b'success', b'success']
Hash b'SNS4uE3e_0o' fields 1-4: [b'success', b'success', b'fail', b'success', b'success']
Hash b'7bOzURk7wp4' fiel