# Scraping Code Using a YouTube API Key

#### The code below lets the user scrape the following metadata from YouTube, based on the keywords defined further down

+ Search_Term
+ Video_URL
+ Video_Title
+ Channel_Name
+ Total_Subscribers
+ Total_Views
+ Total_Likes
+ Total_Dislikes
+ Total_Comments
+ Video_Description
+ Total_times_User_added_to_favorites
+ Video_Length
+ Video_Category
+ Video_Tags
+ Age_Restrictions
+ License
+ Captions_Available
+ Video Comments

# Part 1: Create a DataFrame (raw_youtube_data) with the list of URLs and their associated metadata

In [None]:
# # Install the necessary libraries if they are not already available
# !pip install google-api-python-client
# !pip install google-auth-oauthlib
# !pip install isodate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting isodate
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate
Successfully installed isodate-0.6.1


In [None]:
# import the necessary libraries
import os
import time
import isodate
import requests
import datetime
import warnings
import google.auth
import pandas as pd
from bs4 import BeautifulSoup
import googleapiclient.errors
import google_auth_oauthlib.flow
import googleapiclient.discovery
warnings.filterwarnings("ignore")
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

In [None]:
# Start the timer; the last cells subtract this from the finish time to report the total run time
start_time = datetime.datetime.now()

In [None]:
# Configuration for the YouTube Data API v3 client.
# The API key is a secret: create one for a project in the Google Developer
# Console (with YouTube Data API v3 enabled) and expose it through the
# YOUTUBE_API_KEY environment variable instead of committing it to the notebook.
# NOTE(review): the oauthlib switch below relaxes the HTTPS requirement for
# OAuth flows; this notebook only uses an API key, so it is likely unnecessary
# -- confirm before removing.
os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
api_service_name = "youtube"
api_version = "v3"
DEVELOPER_KEY = os.environ.get("YOUTUBE_API_KEY")
if not DEVELOPER_KEY:
    # print() rather than warnings.warn(): the import cell silences all warnings
    print("WARNING: YOUTUBE_API_KEY is not set; YouTube API requests will be rejected.")

In [None]:
# Search keywords for which videos are scraped; each keyword is searched separately
# Alternative keyword set (stress-management topics), currently disabled:
# keywords = ["stress management",
#             "relaxation techniques",
#             "mindfulness",
#             "Meditation",
#             "Breathing exercises",
#             "Yoga for stress relief",
#             "Guided visualization",
#             "Coping with stress",
#             "Stress relief techniques"]

keywords = [
    # Keywords later assigned to the 'Emotional exhaustion' domain
    "Emotional exhaustion",
    "Stress reduction and burnout",
    "Burnout and emotions",
    "Mindfulness and burnout",
    "Workload management",
    "Emotional well-being techniques",
    "Work-life balance techniques",
    # Keywords later assigned to the 'Depersonalization' domain
    "Depersonalization and burnout",
    "Job satisfaction and burnout",
    "Building empathy in the workplace",
    "Managing workplace cynicism",
    "Resilience building",
    "Empathy and emotional connection",
    "Self-care practices",
    # Keywords later assigned to the 'Personal achievement' domain
    "Personal accomplishment and burnout",
    "Motivation and job satisfaction",
    "Empowerment self at work",
    "Enhancing self-efficacy",
    "Goal setting and achievement",
    "Recognizing achievements and milestones",
    "Fostering growth and development"
]


In [None]:
# Number of videos to scrape for each keyword (passed as maxResults to the search request)
# NOTE(review): the search endpoint is believed to cap maxResults at 50 per request -- confirm before raising this
no_of_videos_to_be_scrapped = 30

In [None]:
# Build the YouTube Data API client used by every request in this notebook
youtube = build(api_service_name, api_version, developerKey=DEVELOPER_KEY)

In [None]:
def search_videos(query, max_results=None):
    """Search YouTube for videos matching *query*.

    Args:
        query: Free-text search term.
        max_results: Maximum number of search hits to return. Defaults to the
            module-level ``no_of_videos_to_be_scrapped``.

    Returns:
        A list of search-result items (each carries ``item['id']['videoId']``),
        at most ``max_results`` long.

    Raises:
        googleapiclient.errors.HttpError: If the API rejects a request.
    """
    if max_results is None:
        max_results = no_of_videos_to_be_scrapped

    items = []
    page_token = None
    while len(items) < max_results:
        response = youtube.search().list(
            part="id",
            # Request at most 50 hits per call (the API's documented page
            # limit); larger totals are collected by following nextPageToken
            maxResults=min(50, max_results - len(items)),
            q=query,
            type="video",
            pageToken=page_token,
        ).execute()
        page_items = response.get('items', [])
        items.extend(page_items)
        page_token = response.get('nextPageToken')
        # Stop when the result set is exhausted (or a page comes back empty)
        if not page_items or not page_token:
            break
    return items[:max_results]

In [None]:
def get_video_info(search_results, search_term=None):
    """Fetch the metadata of every video in *search_results*.

    Args:
        search_results: Search-result items as returned by ``search_videos``;
            each must provide ``item['id']['videoId']``.
        search_term: Keyword stored in the ``Search_Term`` field. When omitted,
            the module-level ``keyword`` variable (set by the scraping loop) is
            used, which keeps existing single-argument calls working.

    Returns:
        A list with one metadata dictionary per video. Videos for which the
        API returns no item are skipped.

    Raises:
        googleapiclient.errors.HttpError: If the API rejects a request.
    """
    if search_term is None:
        search_term = globals().get('keyword')

    video_info = []
    for search_result in search_results:
        video_id = search_result['id']['videoId']
        video_response = youtube.videos().list(
            part="snippet,statistics,contentDetails,status",
            id=video_id
        ).execute()

        items = video_response.get('items') or []
        if not items:
            # The lookup returned nothing for this ID; skip it instead of
            # failing on items[0]
            print(f"No metadata returned for video {video_id}; skipping")
            continue

        item = items[0]
        snippet = item['snippet']
        statistics = item['statistics']
        content_details = item['contentDetails']
        # Local name avoids shadowing the ``license`` builtin
        video_license = item['status'].get('license', 'NA')

        video_info.append({
            "Search_Term": search_term,
            "Video_URL": f"https://www.youtube.com/watch?v={video_id}",
            "Video_Title": snippet['title'],
            "Channel_Name": snippet['channelTitle'],
            # Video statistics carry no subscriber count, so this is a
            # placeholder; a later cell overwrites the column per channel
            "Total_Subscribers": statistics.get('subscriberCount', 0),
            # The view count can be absent, like the other counters
            "Total_Views": statistics.get('viewCount', 0),
            "Total_Likes": statistics.get('likeCount', 0),
            # NOTE(review): dislike counts appear to be no longer exposed
            # publicly, so this is expected to stay 0 -- confirm
            "Total_Dislikes": statistics.get('dislikeCount', 0),
            "Total_Comments": statistics.get('commentCount', 0),
            "Video_Description": snippet['description'],
            "Total_times_User_added_to_favorites": statistics.get('favoriteCount', 0),
            "Video_Length": content_details.get('duration', 'NA'),
            "Video_Category": snippet.get('categoryId', 'NA'),
            "Video_Tags": snippet.get('tags', []),
            "Age_Restrictions": content_details.get('contentRating', {}).get('ytRating', 'NA'),
            "License": video_license,
            # The caption flag lives in contentDetails, not in snippet
            "Captions_Available": content_details.get('caption', 'NA'),
        })
    return video_info

In [None]:
# Accumulator for the per-video metadata dictionaries of all keywords
# (re-running only the scraping loop appends to it again, so re-run this cell first)
video_data_list = []

In [None]:
# For every keyword: run the search, fetch the metadata of the hits and
# add the resulting dictionaries to video_data_list
for keyword in keywords:
    search_results = search_videos(keyword)
    video_info = get_video_info(search_results)
    video_data_list.extend(video_info)

In [None]:
# Build the metadata dataframe: one row per collected video dictionary, one column per dictionary key
raw_youtube_data = pd.DataFrame(video_data_list)

In [None]:
# Preview the first two rows of the scraped metadata
raw_youtube_data.head(2)

Unnamed: 0,Search_Term,Video_URL,Video_Title,Channel_Name,Total_Subscribers,Total_Views,Total_Likes,Total_Dislikes,Total_Comments,Video_Description,Total_times_User_added_to_favorites,Video_Length,Video_Category,Video_Tags,Age_Restrictions,License,Captions_Available
0,Emotional exhaustion,https://www.youtube.com/watch?v=tVhEz_jceuU,8 Things That Lead To Emotional Exhaustion,Psych2Go,0,183411,10798,0,726,Ever feel like life is stressful every single ...,0,PT6M1S,27,"[emotional exhaustion, 8 Things That Lead To E...",,creativeCommon,
1,Emotional exhaustion,https://www.youtube.com/watch?v=CDvnw8eOjFQ,7 Signs of Emotional Burnout,Psych2Go,0,2255329,99671,0,8098,Emotional burnout is defined as a negative sta...,0,PT6M14S,27,"[emotional burnout, emotional burnout symptoms...",,creativeCommon,


In [None]:
# Report how many (keyword, video) rows were collected
scraped_row_count = len(raw_youtube_data)
print("Total number of URL metadata scraped:", scraped_row_count)

Total number of URL metadata scraped: 630


In [None]:
def get_duration(duration):
    """Convert an ISO 8601 duration string to ``HH:MM:SS``.

    Args:
        duration: ISO 8601 duration such as ``"PT6M1S"``.

    Returns:
        The duration as a zero-padded ``HH:MM:SS`` string. The hour field is
        not wrapped at 24, so a 26-hour duration yields ``"26:00:00"``.
    """
    total_seconds = int(isodate.parse_duration(duration).total_seconds())
    # Split by hand instead of time.gmtime()/strftime(): those interpret the
    # value as a time of day and wrap durations of 24 hours or more
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f'{hours:02d}:{minutes:02d}:{seconds:02d}'

In [None]:
def clean_duration(duration_str):
    """Convert an ISO 8601 duration such as ``"PT6M1S"`` to ``"00:06:01"``.

    Args:
        duration_str: Duration text; week, day, hour, minute and second
            components are understood (``PnWnDTnHnMnS``).

    Returns:
        ``HH:MM:SS`` with weeks and days folded into the hour field
        (``"P1DT2H"`` -> ``"26:00:00"``). Anything that is not a string, or
        does not look like such a duration (for example the ``'NA'``
        placeholder), is returned unchanged.
    """
    import re  # local import so the function does not depend on the import cell

    if not isinstance(duration_str, str):
        return duration_str

    match = re.fullmatch(
        r'P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
        duration_str.strip(),
    )
    if match is None:
        return duration_str

    weeks, days, hours, minutes, seconds = (
        int(part) if part else 0 for part in match.groups()
    )
    hours += 24 * (days + 7 * weeks)
    return f'{hours:02d}:{minutes:02d}:{seconds:02d}'

In [None]:
# Rewrite the Video_Length column from ISO 8601 durations to hh:mm:ss text
video_lengths = raw_youtube_data['Video_Length']
raw_youtube_data['Video_Length'] = video_lengths.apply(clean_duration)

In [None]:
# Preview the first two rows to check the reformatted Video_Length values
raw_youtube_data.head(2)

Unnamed: 0,Search_Term,Video_URL,Video_Title,Channel_Name,Total_Subscribers,Total_Views,Total_Likes,Total_Dislikes,Total_Comments,Video_Description,Total_times_User_added_to_favorites,Video_Length,Video_Category,Video_Tags,Age_Restrictions,License,Captions_Available
0,Emotional exhaustion,https://www.youtube.com/watch?v=tVhEz_jceuU,8 Things That Lead To Emotional Exhaustion,Psych2Go,0,183411,10798,0,726,Ever feel like life is stressful every single ...,0,00:06:01,27,"[emotional exhaustion, 8 Things That Lead To E...",,creativeCommon,
1,Emotional exhaustion,https://www.youtube.com/watch?v=CDvnw8eOjFQ,7 Signs of Emotional Burnout,Psych2Go,0,2255329,99671,0,8098,Emotional burnout is defined as a negative sta...,0,00:06:14,27,"[emotional burnout, emotional burnout symptoms...",,creativeCommon,


In [None]:
# Recreate the YouTube client (the API key comes from DEVELOPER_KEY) and
# download the list of video categories for the US region
youtube = build('youtube', 'v3', developerKey=DEVELOPER_KEY)

category_request = youtube.videoCategories().list(part='snippet', regionCode='US')
categories_response = category_request.execute()

In [None]:
# Lookup table: category ID -> human-readable category title
categories = {
    entry['id']: entry['snippet']['title']
    for entry in categories_response['items']
}

In [None]:
# Replace category IDs with category names; IDs that are not keys of `categories`
# (including the 'NA' placeholder) become NaN
raw_youtube_data['Video_Category'] = raw_youtube_data['Video_Category'].map(categories)

In [None]:
# Preview the first two rows to check the category names
raw_youtube_data.head(2)

Unnamed: 0,Search_Term,Video_URL,Video_Title,Channel_Name,Total_Subscribers,Total_Views,Total_Likes,Total_Dislikes,Total_Comments,Video_Description,Total_times_User_added_to_favorites,Video_Length,Video_Category,Video_Tags,Age_Restrictions,License,Captions_Available
0,Emotional exhaustion,https://www.youtube.com/watch?v=tVhEz_jceuU,8 Things That Lead To Emotional Exhaustion,Psych2Go,0,183411,10798,0,726,Ever feel like life is stressful every single ...,0,00:06:01,Education,"[emotional exhaustion, 8 Things That Lead To E...",,creativeCommon,
1,Emotional exhaustion,https://www.youtube.com/watch?v=CDvnw8eOjFQ,7 Signs of Emotional Burnout,Psych2Go,0,2255329,99671,0,8098,Emotional burnout is defined as a negative sta...,0,00:06:14,Education,"[emotional burnout, emotional burnout symptoms...",,creativeCommon,


In [None]:
def get_subscriber_count(channel_name, api_key):
    """Look up a channel's subscriber count through the YouTube Data API.

    NOTE(review): the lookup uses the ``forUsername`` filter, which appears to
    match legacy account usernames rather than channel display titles; the
    recorded output below shows it failing for every channel title passed in.
    A lookup by channel ID would be needed for reliable results -- confirm
    against the channels.list documentation.

    Args:
        channel_name: Value sent as the ``forUsername`` filter.
        api_key: YouTube Data API key.

    Returns:
        The subscriber count as an ``int``, or ``None`` when the request
        fails, no channel matches, or the count is not exposed.
    """
    try:
        # Passing params lets requests URL-encode names that contain spaces,
        # '&' or other reserved characters; the timeout keeps a stalled
        # connection from hanging the whole notebook
        response = requests.get(
            "https://www.googleapis.com/youtube/v3/channels",
            params={
                "part": "statistics",
                "forUsername": channel_name,
                "key": api_key,
            },
            timeout=30,
        )
    except requests.RequestException as error:
        print(f"Error: API request failed for channel {channel_name}: {error}")
        return None

    if response.status_code != 200:
        print(f"Error: API request failed with status code {response.status_code}")
        return None

    items = response.json().get("items") or []
    if not items or "statistics" not in items[0]:
        print(f"Error: API response is missing expected keys for channel {channel_name}")
        return None

    # The count is read with .get() because the key can be missing from the statistics
    subscriber_count = items[0]["statistics"].get("subscriberCount")
    if subscriber_count is None:
        print(f"Error: subscriber count is not available for channel {channel_name}")
        return None
    return int(subscriber_count)

In [None]:
# Fill the Total_Subscribers column.
# Each distinct channel is looked up once (many rows share a channel), and the
# column is assigned in one step so that no string is written cell by cell
# into an integer column. Channels without a result are marked "unknown".
subscribers_by_channel = {
    channel_name: get_subscriber_count(channel_name, DEVELOPER_KEY)
    for channel_name in raw_youtube_data["Channel_Name"].unique()
}
raw_youtube_data["Total_Subscribers"] = [
    "unknown" if subscribers_by_channel.get(channel_name) is None
    else subscribers_by_channel[channel_name]
    for channel_name in raw_youtube_data["Channel_Name"]
]

Error: API response is missing expected keys for channel Psych2Go
Error: API response is missing expected keys for channel Psych2Go
Error: API response is missing expected keys for channel Steven Furtick
Error: API response is missing expected keys for channel Dr Julie
Error: API response is missing expected keys for channel Nena Lavonne, Psy.M.
Error: API response is missing expected keys for channel Psych2Go
Error: API response is missing expected keys for channel Psych2Go
Error: API response is missing expected keys for channel Psych2Go
Error: API response is missing expected keys for channel TEDx Talks
Error: API response is missing expected keys for channel Psych2Go
Error: API response is missing expected keys for channel The Chocolate Therapist
Error: API response is missing expected keys for channel Psych2Go
Error: API response is missing expected keys for channel Psychology Element
Error: API response is missing expected keys for channel The Vine
Error: API response is missing 

In [None]:
# Preview the first two rows to check the Total_Subscribers values
raw_youtube_data.head(2)

Unnamed: 0,Search_Term,Video_URL,Video_Title,Channel_Name,Total_Subscribers,Total_Views,Total_Likes,Total_Dislikes,Total_Comments,Video_Description,Total_times_User_added_to_favorites,Video_Length,Video_Category,Video_Tags,Age_Restrictions,License,Captions_Available
0,Emotional exhaustion,https://www.youtube.com/watch?v=tVhEz_jceuU,8 Things That Lead To Emotional Exhaustion,Psych2Go,unknown,183411,10798,0,726,Ever feel like life is stressful every single ...,0,00:06:01,Education,"[emotional exhaustion, 8 Things That Lead To E...",,creativeCommon,
1,Emotional exhaustion,https://www.youtube.com/watch?v=CDvnw8eOjFQ,7 Signs of Emotional Burnout,Psych2Go,unknown,2255329,99671,0,8098,Emotional burnout is defined as a negative sta...,0,00:06:14,Education,"[emotional burnout, emotional burnout symptoms...",,creativeCommon,


In [None]:
# Number of metadata rows currently in the dataframe
len(raw_youtube_data)

630

In [None]:
# Composite key "<search term>_<video URL>" identifying one (keyword, video) pair
raw_youtube_data['search_term_video_url'] = raw_youtube_data['Search_Term'] + '_' + raw_youtube_data['Video_URL']

In [None]:
# Distinct composite keys; their count equals the row count when no (keyword, video) pair is duplicated
unique_values = raw_youtube_data['search_term_video_url'].unique()

# print the number of unique values
print(len(unique_values))

630


In [None]:
# Tag every row with the burnout domain its search keyword belongs to;
# rows whose keyword is in none of the lists keep the empty string
raw_youtube_data['domain'] = ''

emotional_exhaustion_keywords = [
    "Emotional exhaustion",
    "Stress reduction and burnout",
    "Burnout and emotions",
    "Mindfulness and burnout",
    "Workload management",
    "Emotional well-being techniques",
    "Work-life balance techniques",
]

depersonalization_keywords = [
    "Depersonalization and burnout",
    "Job satisfaction and burnout",
    "Building empathy in the workplace",
    "Managing workplace cynicism",
    "Resilience building",
    "Empathy and emotional connection",
    "Self-care practices",
]

personal_achievement_keywords = [
    "Personal accomplishment and burnout",
    "Motivation and job satisfaction",
    "Empowerment self at work",
    "Enhancing self-efficacy",
    "Goal setting and achievement",
    "Recognizing achievements and milestones",
    "Fostering growth and development",
]

# Applied in this order, one domain label per keyword list
domain_keyword_lists = (
    ('Emotional exhaustion', emotional_exhaustion_keywords),
    ('Depersonalization', depersonalization_keywords),
    ('Personal achievement', personal_achievement_keywords),
)
for domain_label, domain_keywords in domain_keyword_lists:
    in_domain = raw_youtube_data['Search_Term'].isin(domain_keywords)
    raw_youtube_data.loc[in_domain, 'domain'] = domain_label

In [None]:
# Save the metadata to yt_videos_meta.csv without the dataframe index column
raw_youtube_data.to_csv('yt_videos_meta.csv', index=False)

In [None]:
# Column names for the copy of the dataframe that is written to the database.
# 'search_term_video_url' and 'domain' keep their names, so they need no entry.
db_column_names = {
    'Search_Term': 'search_term',
    'Video_URL': 'video_url',
    'Video_Title': 'video_title',
    'Channel_Name': 'channel_name',
    'Total_Subscribers': 'total_subscribers',
    'Total_Views': 'total_views',
    'Total_Likes': 'total_likes',
    'Total_Dislikes': 'total_dislikes',
    'Total_Comments': 'total_comments',
    'Video_Description': 'video_desc',
    'Total_times_User_added_to_favorites': 'total_times_User_added_to_favorites',
    'Video_Length': 'video_length',
    'Video_Category': 'video_category',
    'Video_Tags': 'video_tag',
    'Age_Restrictions': 'age_restrictions',
    'License': 'license',
    'Captions_Available': 'captions_available',
}
raw_youtube_data_db = raw_youtube_data.rename(columns=db_column_names)


In [None]:
import os
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
from sqlalchemy import create_engine

# Connect to the PostgreSQL database.
# Connection settings are read from the PGHOST / PGPORT / PGDATABASE / PGUSER /
# PGPASSWORD environment variables. The non-secret settings fall back to the
# previous values; the password has no fallback so it is not stored in the notebook.
host = os.environ.get('PGHOST', '173.249.1.23')
port = os.environ.get('PGPORT', '5432')
db = os.environ.get('PGDATABASE', 'dj_project')
user = os.environ.get('PGUSER', 'postgres')
password = os.environ.get('PGPASSWORD', '')
# quote_plus keeps characters such as '@', ':' or '/' in the credentials from
# breaking the connection URL
conn_str = f'postgresql://{quote_plus(user)}:{quote_plus(password)}@{host}:{port}/{db}'
engine = create_engine(conn_str)


# Write the dataframe to a PostgreSQL table.
# if_exists='append' adds to an existing table, so re-running this cell
# inserts the same rows again.
table_name = 'youtube_videos_meta'
raw_youtube_data_db.to_sql(table_name, engine, if_exists='append', index=False)


630

# Part 2: Create a new DataFrame (raw_youtube_data_comments) with the list of URLs and their associated comments

In [None]:
# Reuse the metadata dataframe from Part 1 when it is still in memory;
# otherwise reload it from the CSV written at the end of Part 1, so that
# Part 2 can be run on its own. Only the Video_URL column is used below.
try:
    raw_youtube_data
except NameError:
    raw_youtube_data = pd.read_csv('yt_videos_meta.csv')

In [None]:
# Empty comments table (one row per video URL / comment text pair) and the
# list of video URLs for which no comments could be read
raw_youtube_data_comments = pd.DataFrame(columns=['Video_URL', 'Comment'])
videos_with_disabled_comments = []

In [None]:
def _fetch_top_level_comments(video_id):
    """Return the text of every top-level comment on *video_id*, following pagination."""
    comments = []
    next_page_token = None
    while True:
        comment_response = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            maxResults=100,
            pageToken=next_page_token
        ).execute()
        comments.extend(
            item['snippet']['topLevelComment']['snippet']['textDisplay']
            for item in comment_response['items']
        )
        next_page_token = comment_response.get('nextPageToken')
        if not next_page_token:
            return comments


# Rows are collected in a plain list and turned into a dataframe once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
comment_rows = []

for url in raw_youtube_data['Video_URL']:
    video_id = url.split('=')[1]

    try:
        # Inside the try block so that a failing lookup skips this video
        # instead of aborting the whole run
        video_response = youtube.videos().list(
            part='snippet,statistics',
            id=video_id
        ).execute()

        # A missing or zero comment count is treated as "comments disabled"
        comment_count = video_response['items'][0]['statistics'].get('commentCount', None)
        if comment_count is None or comment_count == '0':
            print(f'Comments are disabled for {url}')
            videos_with_disabled_comments.append(url)
            comment_rows.append({'Video_URL': url,
                                 'Comment': 'Video owner has disabled comments on their video'})
            continue

        # Comments are added only after all pages were fetched, so a video
        # whose download fails half-way contributes no partial data
        comment_rows.extend(
            {'Video_URL': url, 'Comment': comment}
            for comment in _fetch_top_level_comments(video_id)
        )

    except HttpError as error:
        print(f'An HTTP error occurred: {error}')

    except (KeyError, IndexError):
        # Unexpected response shape (including an empty 'items' list);
        # recorded the same way as a video with disabled comments
        print(f'Comments are disabled for {url}')
        videos_with_disabled_comments.append(url)
        comment_rows.append({'Video_URL': url,
                             'Comment': 'Video owner has disabled comments on their video'})

# Append the collected rows to the comments dataframe in one step
raw_youtube_data_comments = pd.concat(
    [raw_youtube_data_comments,
     pd.DataFrame(comment_rows, columns=['Video_URL', 'Comment'])],
    ignore_index=True,
)

Comments are disabled for https://www.youtube.com/watch?v=ZUw5_EeO0xc
Comments are disabled for https://www.youtube.com/watch?v=C1ntSs5CgNQ
Comments are disabled for https://www.youtube.com/watch?v=ceIR-g4wU4Y
Comments are disabled for https://www.youtube.com/watch?v=aaVJrATVe1w
Comments are disabled for https://www.youtube.com/watch?v=BAsGjOxriIk
Comments are disabled for https://www.youtube.com/watch?v=7KE6OTHiw-U
Comments are disabled for https://www.youtube.com/watch?v=LfPY4OfMOCg
Comments are disabled for https://www.youtube.com/watch?v=4VztJH2k99E
Comments are disabled for https://www.youtube.com/watch?v=HXA330433uI
Comments are disabled for https://www.youtube.com/watch?v=NsyO7NQy0fE
Comments are disabled for https://www.youtube.com/watch?v=VGBQ_PjQ890
Comments are disabled for https://www.youtube.com/watch?v=jO-vcvhKC4I
Comments are disabled for https://www.youtube.com/watch?v=BNajuzhEC_A
Comments are disabled for https://www.youtube.com/watch?v=Po9katvlldw
Comments are disable

In [None]:
# Preview the first rows of the comments dataframe
raw_youtube_data_comments.head()

Unnamed: 0,Video_URL,Comment
0,https://www.youtube.com/watch?v=tVhEz_jceuU,Are you already feeling emotionally exhausted?
1,https://www.youtube.com/watch?v=tVhEz_jceuU,her voice maks me sleepy
2,https://www.youtube.com/watch?v=tVhEz_jceuU,you should put an ASMR warning. i want to puke...
3,https://www.youtube.com/watch?v=tVhEz_jceuU,I just started crying
4,https://www.youtube.com/watch?v=tVhEz_jceuU,💔😢🙁😭


In [None]:
# Boolean mask: True for rows whose Video_URL starts with the YouTube watch-URL prefix
mask = raw_youtube_data_comments['Video_URL'].str.startswith('https://www.youtube.com/wa')

In [None]:
# NOTE(review): len(mask) is the number of rows in the mask, not the number of
# True values; use mask.sum() if the count of matching URLs is what is wanted
print(len(mask))

316581


In [None]:
# Save the comments to video_comments.csv; the dataframe index is written as the first
# column because index=False is not passed here (unlike the metadata CSV)
raw_youtube_data_comments.to_csv('video_comments.csv')

In [None]:
# Append the comments dataframe to the yt_vid_comments table in PostgreSQL
table_name = 'yt_vid_comments'
raw_youtube_data_comments.to_sql(
    name=table_name,
    con=engine,
    if_exists='append',
    index=False,
)


581

In [None]:
# Report the number of rows in the comments dataframe
downloaded_comment_count = len(raw_youtube_data_comments)
print("Total number of comments downloaded:", downloaded_comment_count)

Total number of comments downloaded: 316581


In [None]:
# Stop the timer: record the time at which the notebook reached this cell
end_time = datetime.datetime.now()

In [None]:
# Total run time as a timedelta (difference between the end and start timestamps)
elapsed_time  = end_time - start_time

In [None]:
# Print the total run time. Subtracting two datetimes always yields a
# timedelta, so the former type check and its error branch could never
# trigger; str() renders the value as h:mm:ss followed by the microseconds.
print("Elapsed time (hh:mm:ss):", str(elapsed_time))

Elapsed time (hh:mm:ss): 0:20:01.333568


In [None]:
# Building blocks for the search phrases
emotional_keywords = [
    "Overcoming emotional exhaustion at work",
    "Coping with people-related work stress",
    "Managing work-induced breakdown",
    "Reducing frustration at work",
    "Preventing excessive workload",
    "Stress management in client-facing roles",
    "Building resilience in high-stress environments",
]

professions = [
    'Sales Executive',
    'Nursing Staff',
    'Customer Service Representative',
    'Marketing Professionals',
]

age_groups = ['young', 'middle-aged', 'elderly']

genders = ['Female', 'Male']

# One phrase "<keyword> among <gender> <age group> <profession>" for every
# combination, varying the profession fastest and the keyword slowest
combinations = [
    f"{keyword} among {gender} {age_group} {profession}"
    for keyword in emotional_keywords
    for gender in genders
    for age_group in age_groups
    for profession in professions
]

# Print the list of combinations, one per line
print(*combinations, sep="\n")

