In [6]:
!pip install boto3
!pip install yt_dlp
!pip install youtube-transcript-api
!pip install --upgrade google-api-python-client
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [132]:
import boto3
import hashlib
import json
import os
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import NoTranscriptFound
from decimal import Decimal
import yt_dlp
from datetime import datetime
from dotenv import load_dotenv


def current_timestamp():
    """Returns the current timestamp formatted for readability."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

def get_env_variables():
    """Fetch all necessary configurations from environment variables."""
    return {
        'DEVELOPER_KEY': os.getenv('DEVELOPER_KEY'),
        'AWS_ACCESS_KEY_ID': os.getenv('AWS_ACCESS_KEY_ID'),
        'AWS_SECRET_ACCESS_KEY': os.getenv('AWS_SECRET_ACCESS_KEY'),
        'SEARCH_CACHE_TABLE': os.getenv('SEARCH_CACHE_TABLE'),
        'RESULTS_TABLE_NAME': os.getenv('RESULTS_TABLE_NAME'),
        'SEARCH_QUERY': os.getenv('SEARCH_QUERY'),
        'MAX_RESULTS': int(os.getenv('MAX_RESULTS', 1)),
        'ORDER': os.getenv('ORDER', 'viewCount'),
        'VIDEO_DURATION': os.getenv('VIDEO_DURATION', 'medium'),
        'PUBLISHED_AFTER': os.getenv('PUBLISHED_AFTER', '2010-01-01T00:00:00Z'),
        'PUBLISHED_BEFORE': os.getenv('PUBLISHED_BEFORE', '2024-12-31T23:59:59Z'),
        'RELEVANCE_LANGUAGE': os.getenv('RELEVANCE_LANGUAGE', 'en'),
        # 'VIDEO_CATEGORY_ID': os.getenv('VIDEO_CATEGORY_ID', '10'),
        'VECTARA_KEY':os.getenv('VECTARA_KEY'),
        'AWS_REGION': os.getenv('AWS_REGION')  # Add AWS region to the environment variables
    }



def youtube_search_all_videos(options):
    """Perform a search on YouTube Data APIand return all videos based on the options, with caching and paging."""
    print(f"Performing YouTube search at {current_timestamp()}\n")

    developer_key = options['DEVELOPER_KEY']
    # CREATE YOUTUBE OBJECT calling the youtbue data api 
    youtube = build('youtube', 'v3', developerKey=developer_key)
    
    all_videos = []
    page_token = None
    max_iterations = 2  # Adjust based on how many pages you want to retrieve (50 results max with 25 per page)

    try:
        for _ in range(max_iterations):
            search_response = youtube.search().list(
                q=options['SEARCH_QUERY'],
                part='id,snippet',
                maxResults=options['MAX_RESULTS'],
                order=options['ORDER'],
                type='video',
                videoDuration=options['VIDEO_DURATION'],
                publishedAfter=options['PUBLISHED_AFTER'],
                publishedBefore=options['PUBLISHED_BEFORE'],
                relevanceLanguage=options['RELEVANCE_LANGUAGE'],
                # videoCategoryId=options['VIDEO_CATEGORY_ID'], # When there is a category ID assigned, if number 10, then results are mostly music.
                pageToken=page_token
            ).execute()

            all_videos.extend(search_response.get('items', []))
    
            video_ids = [item['id']['videoId'] for item in search_response.get('items', [])]
    
            # Retrieve additional details for each video by their IDs, excluding restricted parts
            if video_ids:
                details_response = youtube.videos().list(
                    id=','.join(video_ids),
                    part='contentDetails,statistics,status,topicDetails,recordingDetails'
                ).execute()
    
                all_videos.extend(details_response.get('items', []))
    
    except HttpError as e:
        print(f"An HTTP error occurred: {e.resp.status} {e.content} at {current_timestamp()}\n")
    return all_videos

def download_subtitles_dlp(video_id):
    """Download subtitles for a given YouTube video ID."""
    video_url = f'https://www.youtube.com/watch?v={video_id}'
    ydl_opts = {
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'skip_download': True,
        'outtmpl': f'subtitles/{video_id}.%(ext)s',
        'quiet': True
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_url])
            subtitle_file = f'subtitles/{video_id}.en.vtt'
            if os.path.exists(subtitle_file):
                with open(subtitle_file, 'r', encoding='utf-8') as file:
                    subtitle_text = file.read()
                    print(f"Downloaded video subtitles for video ID: {video_id} into the file {subtitle_file} at {current_timestamp()}\n")
                return subtitle_file, subtitle_text
            return None, None
    except Exception as e:
        print(f"Failed to download subtitles for video ID: {video_id}: {e} at {current_timestamp()}\n")
        return None, None

def get_transcript_ytapi(video_id):
    try:
        # Attempt to fetch the transcript for the video in English
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        print("Transcript found in English.")
        return transcript
    except NoTranscriptFound as e:
        print("English transcript not found. Attempting to list available languages...")
        try:
            # Fetching the list of available transcript languages for the video
            available_langs = YouTubeTranscriptApi.list_transcripts(video_id)
            manually_created = [lang.language_code for lang in available_langs.manually_created()]
            generated = [lang.language_code for lang in available_langs.generated()]
            print("Manually created transcripts available in: ", manually_created)
            print("Auto-generated transcripts available in: ", generated)
        except Exception as ex:
            print("Failed to fetch available languages due to: ", ex)

# def send_to_vectara(options, vectara, video_details):
#     """Send Youtube Data API results on the video and subtitle transcripts to vectara."""
#     try:
#         # Convert float values to decimals
#         video_details_decimal = convert_floats_to_decimals(video_details)

#         results_table_name = options['RESULTS_TABLE_NAME']
#         results_table = vectara.Table(results_table_name)
#         response = results_table.put_item(Item=video_details_decimal)
#         print(f"Successfully inserted API results and converted transcripts into vectara at {current_timestamp()}\n", response)
#     except Exception as e:
#         print(f"Error inserting into vectara: {e}")

def process_videos_and_store(data, parent_key='', result_dict={}):
    """Process each video from the search results and store them."""

    if isinstance(data, dict):
        for k, v in data.items():
            # Only pass the current key, not the entire path
            extract_last_key_values_to_dict(v, k, result_dict)
    elif isinstance(data, list):
        for i, item in enumerate(data):
            # For list items, keep the index in the key but also indicate it's an item in a list
            current_key = f"{parent_key}[{i}]"
            extract_last_key_values_to_dict(item, parent_key, result_dict)
    else:
        # Directly use parent_key as the final key
        result_dict[parent_key] = data
    
    # send_to_vectara(options, vectara, result_dict)
    print(f"\nFunction 'process_videos_and_store' was run at {current_timestamp()} \n")
    return result_dict

def upload_video_metadata(video_id, api_key, doc_metadata=None, extract_document=False):
    """
    Upload video metadata to the specified endpoint with additional metadata.
    """
    url = 'https://api.vectara.io/v1/upload'
    params = {
        'c': 2441028590,  # Using video_id as customer_id
        'o': 2,  # Using video_id as corpus_id
    }
    if extract_document:
        params['d'] = 'true'
    
    headers = {
        'Accept': 'application/txt',
        'x-api-key': api_key,
        # 'Authorization': f'Bearer {jwt_token}',
    }
    
    data = {}
    if doc_metadata:
        data['doc_metadata'] = json.dumps(doc_metadata)
    
    file_path = f'{video_id}.json'  # File path using video_id with .json extension
    
    # Assuming the metadata file already exists, otherwise, you need to create it
    files = {
        'file': open(file_path, 'rb'),
    }
    
    try:
        response = requests.post(url, params=params, headers=headers, files=files, data=data)
        print(response.text)
    except Exception as e:
        print(f"Failed to upload video metadata for video ID: {video_id}: {e}")
    finally:
        files['file'].close()

def main():
    # Load the .env file
    load_dotenv()
    
    # Ensure the subtitles directory exists
    os.makedirs('subtitles', exist_ok=True)
    
    # Get environment variables
    options = get_env_variables()
        
    # Only proceed if vectara was successfully configured
    if options:
        all_videos = youtube_search_all_videos(options)
        
        # Only proceed if videos were successfully retrieved
        if all_videos:
            result_dict = process_videos_and_store(all_videos, parent_key='', result_dict={})

            video_id = result_dict["videoId"]
            subtitle_file, subtitle_text = download_subtitles_dlp(video_id)
            
            # Initialize attempt counter
            attempts = 0
            max_attempts = 10
            subtitle_text = None
            
            # Loop up to 10 times until subtitles are found
            # while attempts < max_attempts and not subtitle_text:
            if not subtitle_text:
                try:
                    video_id = 'uT6ASPy2Dbs'
                    subtitle_text = get_transcript_ytapi(video_id)
                    if subtitle_text:
                        result_dict['subtitles'] = subtitle_text
                        # Save to a file named "{video_id}.txt"
                        file_path = os.path.join('subtitles', f"{video_id}.txt")
                        with open(file_path, 'w') as file:
                            # Format and write the result_dict as a string if necessary
                            file.write(json.dumps(result_dict, indent=4))
                        print(f"Saved subtitles to {file_path}")

                        api_key = options['VECTARA_KEY']
                        doc_metadata = {"title": result_dict["title"], "description": result_dict["description"]}
                        # Assuming the upload_video_metadata function is defined elsewhere
                        upload_video_metadata(video_id, api_key, doc_metadata=doc_metadata, extract_document=True)
                        # break  # Exit the loop if subtitles are found
                except Exception as e:
                    print(f"Attempt {attempts + 1}: Cannot get transcript for video ID: {video_id}")
                attempts += 1

            if not subtitle_text:
                print(f"Failed to retrieve subtitles after {max_attempts} attempts.")
        else:
            print(f"Failed to retrieve videos at {current_timestamp()}. Exiting...\n")
    else:
        print(f"Failed to retrieve environment variables at {current_timestamp()}. Exiting...\n")


if __name__ == "__main__":
    main()


Performing YouTube search at 2024-02-24 18:05:41


Function 'process_videos_and_store' was run at 2024-02-24 18:05:42 



ERROR: [youtube] 96iaZxKRmKg: 96iaZxKRmKg: Failed to parse JSON (caused by JSONDecodeError("Expecting value in '': line 1 column 1 (char 0)")); please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U


Failed to download subtitles for video ID: 96iaZxKRmKg: ERROR: [youtube] 96iaZxKRmKg: 96iaZxKRmKg: Failed to parse JSON (caused by JSONDecodeError("Expecting value in '': line 1 column 1 (char 0)")); please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U at 2024-02-24 18:05:43

Transcript found in English.
Saved subtitles to subtitles/uT6ASPy2Dbs.txt
Attempt 1: Cannot get transcript for video ID: uT6ASPy2Dbs


In [122]:
import requests
import json

def upload_video_metadata(video_id, api_key, doc_metadata=None, extract_document=False):
    """
    Upload video metadata to the specified endpoint with additional metadata.
    """
    url = 'https://api.vectara.io/v1/upload'
    params = {
        'c': 2441028590,  # Using video_id as customer_id
        'o': 2,  # Using video_id as corpus_id
    }
    if extract_document:
        params['d'] = 'true'
    
    headers = {
        'Content-Type: multipart/form-data'
        'Accept': 'application/json',
        'x-api-key': api_key,
        # 'Authorization': f'Bearer {jwt_token}',
    }
    
    data = {}
    if doc_metadata:
        data['doc_metadata'] = json.dumps(doc_metadata)
    
    file_path = f'{video_id}.json'  # File path using video_id with .json extension
    
    # Assuming the metadata file already exists, otherwise, you need to create it
    files = {
        'file': open(file_path, 'rb'),
    }
    
    try:
        response = requests.post(url, params=params, headers=headers, files=files, data=data)
        print(response.text)
    except Exception as e:
        print(f"Failed to upload video metadata for video ID: {video_id}: {e}")
    finally:
        files['file'].close()

# Example usage
video_id = 'uT6ASPy2Dbs'
api_key = 'zut_kX8j7lr5zFJjUnhAf6XHHD83xQT3PUGhQyEBrQ'
# jwt_token = 'your_jwt_token_here'
doc_metadata = {"title": "Sample Video Title", "description": "Sample video description"}

upload_video_metadata(video_id, api_key, doc_metadata=doc_metadata, extract_document=True)


Failed to upload video metadata for video ID: uT6ASPy2Dbs: Invalid leading whitespace, reserved character(s), or returncharacter(s) in header name: 'Content-Type: multipart/form-dataAccept'


In [130]:
import requests

def upload_to_vectara(api_key, payload={}, additional_headers=None):
    """
    Upload data to the Vectara API.

    :param api_key: API key for authentication.
    :param payload: The payload to be uploaded.
    :param additional_headers: Any additional headers to be included in the request.
    """
    url = "https://api.vectara.io/v1/upload"

    # Default headers
    headers = {
        'Content-Type': 'multipart/form-data',
        'Accept': 'application/json',
        'x-api-key': api_key,
        # 'file=@"/Users/lilysu/git/PatternPivot/base.txt"',
    }
    
    # If there are any additional headers, update the default headers with them
    if additional_headers:
        headers.update(additional_headers)

    try:
        response = requests.post(url, headers=headers, data=payload)
        print("Response Status Code:", response.status_code)
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

# Example usage of the function
api_key = 'zut_kX8j7lr5zFJjUnhAf6XHHD83xQT3PUGhQyEBrQ'  # Replace <API_KEY_VALUE> with your actual API key
payload = {}  # Your payload here
additional_headers = None  # Any additional headers you want to include

response_text = upload_to_vectara(api_key, payload, additional_headers)
print(response_text)


Response Status Code: 400
{"httpCode":400,"internalCode":3,"details":"Request entity too large.","status":{}}


In [121]:
import requests
import json

url = "https://api.vectara.io/v1/query"

payload = json.dumps({
  "query": [
    {
      "query": "What is there to be thankful for?",
      "start": 0,
      "numResults": 10,
      "contextConfig": {
        "sentencesBefore": 2,
        "sentencesAfter": 2,
        "startTag": "<b>",
        "endTag": "</b>"
      },
      "corpusKey": [
        {
          "customerId": 2441028590,
          "corpusId": 2,
          "semantics": "DEFAULT",
          "lexicalInterpolationConfig": {
            "lambda": 0
          }
        }
      ],
      "summary": [
        {
          "maxSummarizedResults": 5,
          "responseLang": "eng",
        }
      ]
    }
  ]
})


headers = {
  'Content-Type': 'application/json',
  'Accept': 'application/json',
  'customer-id': '2441028590',
  'x-api-key': 'zut_kX8j7lr5zFJjUnhAf6XHHD83xQT3PUGhQyEBrQ'
}

response = requests.request("POST", url, headers=headers, data=payload)

# print(response.text)

# Check if the response status code is 200 (OK) to ensure the request was successful
if response.status_code == 200:
    try:
        # Parse the response JSON content
        data = response.json()

        # Navigate through the nested structure to extract the summary
        summaries = data['responseSet'][0]['summary']
        
        # Assuming there's at least one summary in the list and extracting the first one for simplicity
        summary_text = summaries[0]['text'] if summaries else 'No summary found.'

        # Print the formatted summary
        print("Summary:\n")
        print(summary_text)
    except KeyError as e:
        print(f"Key not found in response: {e}")
    except IndexError as e:
        print(f"Index error, might be due to unexpected response structure: {e}")
else:
    print(f"Request failed with status code: {response.status_code}, Response text: {response.text}")

Summary:

There are many things to be thankful for in life. Today is an opportunity to love and grow [1]. Feel grateful for your life and trust that everything is happening for your greatest good [2]. You are exactly where you're meant to be, and each new day is a chance to shine as your highest self [3]. Embrace the beauty and meaning in your life, and welcome the abundance flowing into your life with gratitude [4]. Choose joy, peace, balance, and light. Opt for love over fear and affirm your safety [5].
