In [2]:
import yt_dlp
from youtube_transcript_api import YouTubeTranscriptApi
import os
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

def download_video(video_info, output_path):
    """Download a single video to ``output_path`` using yt-dlp.

    Args:
        video_info: Mapping describing the video; only its
            ``'webpage_url'`` entry is read here.
        output_path: Value handed to yt-dlp as the ``'outtmpl'`` option,
            i.e. where the downloaded file should be written.

    Returns:
        None. Any exception raised by yt-dlp propagates to the caller.
    """
    downloader_options = {
        # Format selector: "best" restricted to height <= 720 and mp4.
        'format': 'best[height<=720][ext=mp4]',
        'outtmpl': output_path,
        'quiet': True
    }
    with yt_dlp.YoutubeDL(downloader_options) as downloader:
        urls = [video_info['webpage_url']]
        downloader.download(urls)

def get_transcript(video_id):
    """Fetch the transcript of a YouTube video on a best-effort basis.

    Args:
        video_id: Video identifier passed unchanged to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The value returned by ``YouTubeTranscriptApi.get_transcript`` on
        success, or the sentinel string ``"Transcript not available"`` when
        the lookup raises any ``Exception``.
    """
    try:
        return YouTubeTranscriptApi.get_transcript(video_id)
    except Exception:
        # Intentionally broad so that a missing/disabled transcript or a
        # network failure never aborts the scrape. Unlike a bare ``except:``
        # this still lets KeyboardInterrupt and SystemExit propagate, so the
        # run can be cancelled.
        return "Transcript not available"

def process_video(video, topic_folder):
    """Download one video, fetch its transcript and gather its metadata.

    Args:
        video: Info dict for a single video. ``'id'`` and ``'title'`` are
            required here and ``'webpage_url'`` is required by
            ``download_video``; the remaining metadata keys are optional
            and fall back to ``'N/A'`` when absent.
        topic_folder: Directory for the current topic; the video is written
            to ``<topic_folder>/videos/<id>.mp4``.

    Returns:
        A dict with the video's metadata, the full transcript text
        (``'full_transcript'``), the per-caption timing list
        (``'transcript_with_timings'``, empty when no transcript list was
        obtained) and the path of the downloaded file (``'video_file'``).

    Raises:
        KeyError: If ``video`` lacks ``'id'``, ``'title'`` or
            ``'webpage_url'``.
        Exception: Whatever ``download_video`` raises is propagated.
    """
    video_id = video['id']

    output_path = os.path.join(topic_folder, "videos", f"{video_id}.mp4")
    download_video(video, output_path)

    transcript = get_transcript(video_id)

    processed_transcript = []

    if isinstance(transcript, list):
        last_index = len(transcript) - 1
        for i, entry in enumerate(transcript):
            start_time = entry['start']
            end_time = start_time + entry['duration']
            if i < last_index:
                # Clip the caption so it never overlaps the next one.
                end_time = min(end_time, transcript[i + 1]['start'])

            processed_transcript.append({
                'text': entry['text'],
                'start_time': start_time,
                'end_time': end_time
            })
        # Join once instead of repeated ``+=`` (which is quadratic).
        full_transcript_text = " ".join(entry['text'] for entry in transcript)
    else:
        # get_transcript returned its "not available" sentinel string.
        full_transcript_text = transcript

    video_data = {
        'video_id': video_id,
        'title': video['title'],
        # Optional metadata: use the same 'N/A' fallback as 'like_count' so
        # one missing field does not abort the whole topic with a KeyError.
        'description': video.get('description', 'N/A'),
        'view_count': video.get('view_count', 'N/A'),
        'like_count': video.get('like_count', 'N/A'),
        'duration': video.get('duration', 'N/A'),
        'upload_date': video.get('upload_date', 'N/A'),
        'channel': video.get('channel', 'N/A'),
        'full_transcript': full_transcript_text.strip(),
        'transcript_with_timings': processed_transcript,
        'video_file': output_path
    }

    return video_data

def scrape_youtube_data(topic, num_videos=10, output_folder='output'):
    """Search YouTube for ``topic`` and process the resulting videos.

    Creates ``<output_folder>/<topic with spaces replaced by '_'>/videos``,
    runs a ``ytsearch<num_videos>:<topic>`` query through yt-dlp and hands
    every returned entry to ``process_video`` on a pool of 5 worker threads.

    Args:
        topic: Search phrase; also used to derive the topic folder name.
        num_videos: Number of search results requested from yt-dlp.
        output_folder: Base directory under which the topic folder is made.

    Returns:
        A dict with keys ``'topic'``, ``'folder'`` (the topic folder path)
        and ``'videos'`` (list of ``process_video`` results, in the same
        order as the search results).

    Raises:
        Exception: Any exception raised by yt-dlp or by ``process_video``
            for one of the videos is propagated.
    """
    topic_folder = os.path.join(output_folder, topic.replace(" ", "_"))
    os.makedirs(os.path.join(topic_folder, "videos"), exist_ok=True)

    search_url = f"ytsearch{num_videos}:{topic}"

    ydl_opts = {'quiet': True}
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        search_results = ydl.extract_info(search_url, download=False)['entries']

    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [
            executor.submit(process_video, video, topic_folder)
            for video in search_results
        ]
        # Collect in submission order rather than completion order so the
        # returned list (and the JSON written from it) is reproducible and
        # follows the search ranking. All futures are awaited either way.
        scraped_data = [future.result() for future in futures]

    return {
        'topic': topic,
        'folder': topic_folder,
        'videos': scraped_data
    }

if __name__ == "__main__":
    # Each topic is passed to scrape_youtube_data, which uses it both as the
    # search phrase and (spaces replaced by underscores) as the folder name.
    # NOTE(review): "Classifer" is misspelled ("Classifier"); left unchanged
    # because altering it would change the query and the output folder name.
    topics = [
        "Linear Regression",
        "Logistic Regression",
        "Support Vector Machine",
        "Decision Tree",
        "Random Forest",
        "Naive Bayes Classifer",
        "DBSCAN",
        "K-Means Clustering",
        "Hierarchical Clustering",
        "Principal Component Analysis",
        "Convolutional Neural Network",
    ]
    output_base_folder = "video_data"
    # Single source of truth for values that appear both in a call and in a
    # progress message, so the two cannot drift apart.
    videos_per_topic = 10
    output_json_path = "all_scraped_data.json"
    all_data = []

    for topic in topics:
        print(f"Scraping data for {videos_per_topic} videos on the topic '{topic}'...")
        topic_data = scrape_youtube_data(
            topic,
            num_videos=videos_per_topic,
            output_folder=output_base_folder,
        )
        all_data.append(topic_data)
        print(f"Scraped data for {len(topic_data['videos'])} videos on '{topic}'")
        print(f"Videos saved in {topic_data['folder']}")
        print()

    # Save all data to a single JSON file
    with open(output_json_path, "w", encoding='utf-8') as f:
        json.dump(all_data, f, indent=2, ensure_ascii=False)

    print("Scraping completed for all topics.")
    print(f"All data saved to {output_json_path}")

Scraping data for 10 videos on the topic 'Linear Regression'...




Scraped data for 10 videos on 'Linear Regression'          
Videos saved in video_data\Linear_Regression

Scraping data for 10 videos on the topic 'Logistic Regression'...




Scraped data for 10 videos on 'Logistic Regression'         
Videos saved in video_data\Logistic_Regression

Scraping data for 10 videos on the topic 'Support Vector Machine'...




Scraped data for 10 videos on 'Support Vector Machine'      
Videos saved in video_data\Support_Vector_Machine

Scraping data for 10 videos on the topic 'Decision Tree'...




Scraped data for 10 videos on 'Decision Tree'              
Videos saved in video_data\Decision_Tree

Scraping data for 10 videos on the topic 'Random Forest'...




Scraped data for 10 videos on 'Random Forest'               
Videos saved in video_data\Random_Forest

Scraping data for 10 videos on the topic 'Naive Bayes Classifer'...




Scraped data for 10 videos on 'Naive Bayes Classifer'       
Videos saved in video_data\Naive_Bayes_Classifer

Scraping data for 10 videos on the topic 'DBSCAN'...




Scraped data for 10 videos on 'DBSCAN'                     
Videos saved in video_data\DBSCAN

Scraping data for 10 videos on the topic 'K-Means Clustering'...




Scraped data for 10 videos on 'K-Means Clustering'          
Videos saved in video_data\K-Means_Clustering

Scraping data for 10 videos on the topic 'Hierarchical Clustering'...




Scraped data for 10 videos on 'Hierarchical Clustering'     
Videos saved in video_data\Hierarchical_Clustering

Scraping data for 10 videos on the topic 'Principal Component Analysis'...




Scraped data for 10 videos on 'Principal Component Analysis'
Videos saved in video_data\Principal_Component_Analysis

Scraping data for 10 videos on the topic 'Convolutional Neural Network'...




Scraped data for 10 videos on 'Convolutional Neural Network'
Videos saved in video_data\Convolutional_Neural_Network

Scraping completed for all topics.
All data saved to all_scraped_data.json
