## 1. Setting Up the Environment and Fetching the API Key

First, we need to import the necessary libraries and fetch our YouTube API key from the environment variables.

In [None]:
import csv
import os
import re
from datetime import timedelta

import googleapiclient.discovery
from youtube_transcript_api import YouTubeTranscriptApi

# YouTube Data API key, read from the environment (None when the variable is unset)
API_KEY = os.environ.get('YOUTUBE_API_KEY')


## 2. Searching for Playlists Based on a Query

Next, we will define a function to search for playlists based on a query and check their length and average video duration.


In [None]:
def search_playlist(query):
    """Return the ID of the first search result that passes both playlist checks.

    Runs a playlist search for ``query`` (up to 50 results) and tests each
    result with ``check_playlist_length`` and ``check_average_video_duration``.

    Args:
        query: Search text passed to the API as ``q``.

    Returns:
        The ``playlistId`` of the first playlist passing both checks, or
        ``None`` when no result qualifies.
    """
    client = googleapiclient.discovery.build('youtube', 'v3', developerKey=API_KEY)

    search_params = {
        'part': 'snippet',
        'q': query,
        'type': 'playlist',
        'maxResults': 50,  # look at as many candidates as a single page allows
    }
    response = client.search().list(**search_params).execute()

    candidates = response['items'] if 'items' in response else []
    for candidate in candidates:
        playlist_id = candidate['id']['playlistId']
        playlist_title = candidate['snippet']['title']
        print(f'Checking Playlist: {playlist_title} (ID: {playlist_id})')

        # The length check runs first; the duration check is skipped when it fails.
        if not (check_playlist_length(playlist_id) and check_average_video_duration(playlist_id)):
            print('Playlist does not meet the required length or average duration criteria.')
            continue

        print(f'Playlist Title: {playlist_title}')
        print(f'Playlist ID: {playlist_id}')
        return playlist_id

    print('No suitable playlists found.')
    return None


## 3. Checking Playlist Length and Average Video Duration

We will now define functions to check the playlist length and the average video duration.


In [None]:
def check_playlist_length(playlist_id, min_videos=15, max_videos=25):
    """Report whether a playlist's item count lies within an inclusive range.

    Args:
        playlist_id: ID of the playlist to inspect.
        min_videos: Smallest acceptable item count (inclusive). Defaults to 15.
        max_videos: Largest acceptable item count (inclusive). Defaults to 25.

    Returns:
        True when ``pageInfo.totalResults`` of the playlist-items response is
        between ``min_videos`` and ``max_videos``, otherwise False.
    """
    youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=API_KEY)

    # Only pageInfo.totalResults is read, so a single item per page is enough.
    request = youtube.playlistItems().list(
        part='contentDetails',
        playlistId=playlist_id,
        maxResults=1
    )

    response = request.execute()

    total_results = response['pageInfo']['totalResults']
    return min_videos <= total_results <= max_videos

def check_average_video_duration(playlist_id):
    """Report whether the playlist's videos average under 11 minutes.

    Looks at the first page of playlist items (up to 50) and fetches each
    video's duration with ``get_video_duration``.

    Args:
        playlist_id: ID of the playlist to inspect.

    Returns:
        True when the mean duration is strictly less than 11 minutes. False
        otherwise, including when the playlist page contains no items.
    """
    youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=API_KEY)

    request = youtube.playlistItems().list(
        part='contentDetails',
        playlistId=playlist_id,
        maxResults=50  # Adjust as necessary to get all videos
    )

    response = request.execute()
    video_ids = [item['contentDetails']['videoId'] for item in response.get('items', [])]

    # An empty playlist has no average; treat it as not meeting the criterion
    # instead of dividing by zero.
    if not video_ids:
        return False

    total_duration = sum(
        (get_video_duration(video_id) for video_id in video_ids),
        timedelta(),
    )

    average_duration = total_duration / len(video_ids)
    return average_duration < timedelta(minutes=11)


## 4. Extracting Video Details

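In this section we define the two duration helpers: `get_video_duration`, which fetches a video's duration from the API, and `parse_duration`, which converts that duration into a Python `timedelta`. The helpers for view count, transcript, and full video details are defined in the sections that follow.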


In [None]:
def get_video_duration(video_id):
    """Fetch a video's duration from the API and return it as a ``timedelta``.

    Args:
        video_id: ID of the video to look up.

    Returns:
        The parsed ``contentDetails.duration`` of the first returned item, or
        a zero ``timedelta`` when the response contains no items.
    """
    client = googleapiclient.discovery.build('youtube', 'v3', developerKey=API_KEY)

    lookup = client.videos().list(part='contentDetails', id=video_id)
    response = lookup.execute()

    # No matching video in the response: report a zero-length duration.
    if 'items' not in response or len(response['items']) == 0:
        return timedelta()

    first_item = response['items'][0]
    return parse_duration(first_item['contentDetails']['duration'])

def parse_duration(duration):
    """Convert an ISO 8601 duration string into a ``timedelta``.

    Handles the week/day/hour/minute/second forms, for example ``PT1H2M3S``
    (1 hour, 2 minutes, 3 seconds), ``PT15M``, ``P1DT2H3M4S`` and ``P0D``.

    Args:
        duration: ISO 8601 duration string such as ``'PT4M13S'``.

    Returns:
        A ``timedelta`` equal to the encoded duration. Components absent from
        the string count as zero.

    Raises:
        ValueError: If ``duration`` is not a weeks/days/hours/minutes/seconds
            ISO 8601 duration with whole-number components.
    """
    # The date part (weeks, days) sits between 'P' and the 'T' separator and
    # the time part follows 'T'. Slicing off a fixed two-character prefix
    # breaks on day-long values such as 'P1DT2H' and date-only ones like 'P0D'.
    match = re.fullmatch(
        r'P(?:(?P<weeks>\d+)W)?(?:(?P<days>\d+)D)?'
        r'(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?)?',
        duration,
    )
    if match is None:
        raise ValueError(f'Unsupported ISO 8601 duration: {duration!r}')

    # Group names match timedelta's keyword arguments; missing parts become 0.
    parts = {unit: int(value) for unit, value in match.groupdict(default='0').items()}
    return timedelta(**parts)


## 5. Getting Video Duration

We will define a function to get the duration of a video by its video ID.


In [None]:
def get_video_duration(video_id):
    """Fetch a video's duration from the API and return it as a ``timedelta``.

    Args:
        video_id: ID of the video to look up.

    Returns:
        The parsed ``contentDetails.duration`` of the first returned item, or
        a zero ``timedelta`` when the response contains no items.
    """
    client = googleapiclient.discovery.build('youtube', 'v3', developerKey=API_KEY)

    lookup = client.videos().list(part='contentDetails', id=video_id)
    response = lookup.execute()

    # No matching video in the response: report a zero-length duration.
    if 'items' not in response or len(response['items']) == 0:
        return timedelta()

    first_item = response['items'][0]
    return parse_duration(first_item['contentDetails']['duration'])


## 6. Parsing Video Duration

We will define a function to parse the ISO 8601 duration format returned by the YouTube API into a Python `timedelta` object.


In [None]:
def parse_duration(duration):
    """Convert an ISO 8601 duration string into a ``timedelta``.

    Handles the week/day/hour/minute/second forms, for example ``PT1H2M3S``
    (1 hour, 2 minutes, 3 seconds), ``PT15M``, ``P1DT2H3M4S`` and ``P0D``.

    Args:
        duration: ISO 8601 duration string such as ``'PT4M13S'``.

    Returns:
        A ``timedelta`` equal to the encoded duration. Components absent from
        the string count as zero.

    Raises:
        ValueError: If ``duration`` is not a weeks/days/hours/minutes/seconds
            ISO 8601 duration with whole-number components.
    """
    # The date part (weeks, days) sits between 'P' and the 'T' separator and
    # the time part follows 'T'. Slicing off a fixed two-character prefix
    # breaks on day-long values such as 'P1DT2H' and date-only ones like 'P0D'.
    match = re.fullmatch(
        r'P(?:(?P<weeks>\d+)W)?(?:(?P<days>\d+)D)?'
        r'(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?)?',
        duration,
    )
    if match is None:
        raise ValueError(f'Unsupported ISO 8601 duration: {duration!r}')

    # Group names match timedelta's keyword arguments; missing parts become 0.
    parts = {unit: int(value) for unit, value in match.groupdict(default='0').items()}
    return timedelta(**parts)


## 7. Getting Video View Count

We will define a function to get the view count of a video by its video ID.


In [None]:
def get_video_viewcount(video_id):
    """Fetch a video's view count from the API.

    Args:
        video_id: ID of the video to look up.

    Returns:
        The ``statistics.viewCount`` value of the first returned item, passed
        through as received from the API, or the string
        ``'View count not available'`` when the response has no items or the
        item carries no view count.
    """
    youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=API_KEY)

    request = youtube.videos().list(
        part='statistics',
        id=video_id
    )

    response = request.execute()

    items = response.get('items') or []
    if not items:
        return 'View count not available'

    # Fall back to the placeholder instead of raising KeyError when the
    # statistics object (or its viewCount field) is absent from the item.
    statistics = items[0].get('statistics', {})
    return statistics.get('viewCount', 'View count not available')


## 8. Getting Video Transcript

We will define a function to get the transcript of a video by its video ID using the `YouTubeTranscriptApi`.


In [None]:
def get_video_transcript(video_id):
    """Return a video's transcript as a single line of text.

    Args:
        video_id: ID of the video whose transcript is requested.

    Returns:
        The transcript segments joined with spaces, with newlines replaced by
        spaces. If anything goes wrong, the error is printed and the string
        ``'Transcript not available'`` is returned instead.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        joined = ' '.join(segment['text'] for segment in segments)
        # Flatten to one line so the text fits in a single CSV cell cleanly.
        return joined.replace('\n', ' ')
    except Exception as e:
        print(f'Error fetching transcript: {str(e)}')
        return 'Transcript not available'


## 9. Extracting Video Details

We will define a function to extract details of videos in a playlist, including title, description, URL, view count, and transcript.


In [None]:
def get_video_details(playlist_id, filepath):
    """Collect details for every video in a playlist and write them to a CSV file.

    For each playlist item this gathers the title, description, watch URL,
    view count (via ``get_video_viewcount``) and transcript (via
    ``get_video_transcript``, truncated to 25,000 characters), prints the
    record, and finally hands all records to ``write_to_csv``.

    Args:
        playlist_id: ID of the playlist to export.
        filepath: Destination path of the CSV file.

    Returns:
        None. The result is the file written at ``filepath``.
    """
    youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=API_KEY)
    playlist_items = youtube.playlistItems()

    request = playlist_items.list(
        part='snippet',
        playlistId=playlist_id,
        maxResults=50
    )

    # Create a list to store all video information
    videos_info = []

    # Follow result pages until exhausted so that no video is silently dropped;
    # list_next returns None once there is no further page.
    while request is not None:
        response = request.execute()

        for item in response.get('items', []):
            snippet = item['snippet']
            video_id = snippet['resourceId']['videoId']
            # Newlines are flattened so each field stays on one line in the CSV
            video_title = snippet['title'].replace('\n', ' ')
            video_description = snippet['description'].replace('\n', ' ')
            video_url = f'https://www.youtube.com/watch?v={video_id}'
            video_viewcount = get_video_viewcount(video_id)
            video_transcript = get_video_transcript(video_id)

            # Collect video information into a dictionary
            information = {
                'Title': video_title,
                'Description': video_description,
                'URL': video_url,
                'Transcript': video_transcript[:25000].replace('\n', ' '),
                'ViewCount': video_viewcount,
                'date': 0,  # You can set the date or any other metadata here
                'video-index': 0  # You can set the index or any other metadata here
            }

            print(information)
            videos_info.append(information)

        request = playlist_items.list_next(request, response)

    # Write collected information to CSV
    write_to_csv(videos_info, filepath)


## 10. Writing the Extracted Data to a CSV File

We will define a function to write the extracted video details to a CSV file.


In [None]:
def write_to_csv(data, filepath):
    """Write video records to a UTF-8 CSV file with a header row.

    Args:
        data: Iterable of dicts keyed by ``Title``, ``Description``, ``URL``,
            ``Transcript``, ``ViewCount``, ``date`` and ``video-index``.
        filepath: Destination path; an existing file is overwritten.

    Returns:
        None.
    """
    fieldnames = ['Title', 'Description', 'URL', 'Transcript', 'ViewCount', 'date', 'video-index']

    # newline='' leaves line-ending handling to the csv module.
    with open(filepath, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        # Write the header row
        writer.writeheader()

        # Write the data rows
        writer.writerows(data)


## Putting It All Together

Here is how you can use the defined functions to search for a playlist, extract video details, and write them to a CSV file.


In [None]:
# Example usage: look for a qualifying playlist, then export its videos to CSV
query = "Python Programming"
playlist_id = search_playlist(query)

if not playlist_id:
    # search_playlist returned None: nothing to export
    print('No suitable playlists found.')
else:
    filepath = "/tmp/youtube_data.csv"
    get_video_details(playlist_id, filepath)
    print(f'Data written to {filepath}')
