# Importing Libraries

In [1]:
# Importing Packages
import pandas as pd
from google.cloud import storage
import os
from dotenv import load_dotenv
import io
from googleapiclient.discovery import build
from datetime import timedelta, datetime
import datetime

# Setting Variables

In [2]:
# Variables
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
bucket_name = os.getenv('BUCKET_NAME')
folder_path = os.getenv('FOLDER_PATH')
project_name = os.getenv('PROJECT_NAME')
running_data_folder = os.getenv('RUNNING_DATA_FOLDER')
client = storage.Client()
blobs = client.list_blobs(bucket_name, prefix=folder_path)

In [3]:
dfs = []

for blob in blobs:
    csv_file = blob.download_as_string()
    df = pd.read_csv(io.BytesIO(csv_file), encoding='utf-8')
    dfs.append(df)

df_concat = pd.concat(dfs)

In [4]:
df_concat.reset_index(drop=True, inplace=True)
df_concat

Unnamed: 0,video_id,channel_id,published_at,title,description,tags,category_id,duration,caption,licensed_content,default_language,content_rating,view_count,like_count,favourite_count,comment_count,extraction_date
0,LciDiBeBCyY,UCteRPiisgIoHtMgqHegpWAQ,2023-12-15T15:00:16Z,I Built Python & Data Analyst Custom ChatGPT f...,Get your Excel data visualization template (fr...,"['data science', 'data scientist', 'self-taugh...",28,PT12M,True,True,en-US,{},1285,94,0,10,2023-12-15 23:01:00
1,56i1uBshzmA,UCJublDh2UsiIKsAE1553miw,2023-12-15T13:00:08Z,My Favorite Stack Question! | Daily Temperatur...,"leetcode, coding interview question, data stru...","['leetcode', 'coding interview question', 'dat...",27,PT58S,False,True,en,{},4971,338,0,8,2023-12-15 23:01:00
2,OPyoXx0yA0I,UCVhQ2NnY5Rskt6UjCUkJ_DA,2023-12-15T16:00:16Z,Requests vs Httpx vs Aiohttp | Which One to Pick?,"Exploring API communication in your app, consi...","['requests vs httpx', 'httpx', 'python request...",27,PT15M11S,True,True,en-US,{},4912,456,0,27,2023-12-15 23:01:00
3,WjhKxXCwFZA,UCzL_0nIe8B4-7ShhVPfJkgw,2023-12-15T18:47:11Z,Data Analytics and Generative AI,Generative AI is a rapidly evolving field with...,"['data analytics', 'generative ai', 'large lan...",27,PT55M18S,True,False,en-US,{},52,1,0,0,2023-12-15 23:01:00
4,NI1Psgs1tyI,UCzL_0nIe8B4-7ShhVPfJkgw,2023-12-15T16:35:30Z,Enterprise LLM Applications - Not Just a Techn...,Generative Al and Large Language Models have t...,[],24,P0D,False,False,en-US,{},0,0,0,0,2023-12-15 23:01:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7488,Ryj174SB8Uk,UCzL_0nIe8B4-7ShhVPfJkgw,2024-02-06T22:19:15Z,The Battle of Giants: Causal AI vs NLP,With over a dozen new papers accepted at NeurI...,[],24,P0D,False,False,en-US,{},0,0,0,0,2024-02-06 23:00:48
7489,4WDmkZ3uTfY,UCzL_0nIe8B4-7ShhVPfJkgw,2024-02-06T21:31:10Z,Data Science Dojo Information Session,Are you ready to start building large language...,[],24,P0D,False,False,en-US,{},0,0,0,0,2024-02-06 23:00:48
7490,VDWHijujgmU,UC79Gv3mYp6zKiSwYemEik9A,2024-02-06T12:43:35Z,How DataCamp Portfolios Will Help Your Career,,[],24,P0D,False,False,,{},0,0,0,0,2024-02-06 23:00:48
7491,KlAKAarfLRQ,UCDybamfye5An6p-j1t2YMsg,2024-02-06T15:18:59Z,TABLEAU PORTFOLIO PROJECT | End-To-End Data Vi...,🛣️ Data Roadmap ➡️ https://mochen.info/\n💼 Ult...,[],22,PT34M32S,False,True,,{},798,78,0,6,2024-02-06 23:00:48


# Cleaning Base Data Ready for Upload

In [5]:
load_dotenv()

api_key = os.getenv('API_KEY')

youtube = build('youtube', 'v3', developerKey=api_key)

In [6]:
def clean(data):

    # Dealing with Nulls
     # String Columns
    data.title.fillna('', inplace=True)
    data.description.fillna('', inplace=True)

     # Count Columns
    data.comment_count.fillna(0, inplace=True)
    data.view_count.fillna(0, inplace=True)
    data.like_count.fillna(0, inplace=True)
    
    # Columns to String
    columns_to_str = ['video_id', 'channel_id', 'title', 'description']

    for column in columns_to_str:
        data[column] = data[column].astype('string')

    # Object to Int
    columns_to_int = ['view_count', 'like_count', 'comment_count']

    for column in columns_to_int:
        data[column] = data[column].astype('int64')

    # YouTube Categories
    def get_categories(youtube, wanted_categories):

        all_data = []
        
        request = youtube.videoCategories().list(part='snippet', id=','.join(wanted_categories))
        
        response = request.execute()
        
        for i in range(len(response['items'])):
            data = dict(category_id = response['items'][i]['id'],
                    category= response['items'][i]['snippet']['title'])
            all_data.append(data)

        return all_data
    
    data['category_id'] = data['category_id'].astype('str')
    category_ids = data['category_id'].unique()
    category_dict = get_categories(youtube, category_ids)
    category_df = pd.DataFrame(category_dict)
    data = pd.merge(data, category_df, on='category_id', how='left')

    # Extracting Duration
    data['duration'] = data['duration'].str.replace('PT', '')
    def format_duration(time_string):

        # store hours, minutes, seconds as integers
        H = 0
        M = 0
        S = 0

        # check if vid time contains hours, minutes and/or seconds
        if 'H' in time_string:
            H += int(time_string.split('H')[0])
        if 'M' in time_string:
            M += int(time_string.split('M')[0].split('H')[-1])
        if 'S' in time_string:
            S += int(time_string.split('S')[0].split('M')[-1].split('H')[-1])
        
        formatted_time = timedelta(hours=H, minutes=M, seconds=S)

        return formatted_time

    data['duration_formatted'] = data['duration'].apply(format_duration)

    # Object to Bool
    columns_to_bool = ['caption', 'licensed_content']

    for column in columns_to_bool:
        data[column] = data[column].astype('bool')

    # Creating No. of Tags Column
    data['no_of_tags'] = data['tags'].apply(lambda x: len(set(x)))

    # Creating Title Length Column
    data['title_length'] = data['title'].apply(len)

    # Creating Description Length Column
    data['description_length'] = data['description'].apply(len)

    # Dropping Columns
    columns_to_drop = ['category_id', 'duration', 'content_rating', 'default_language', 'favourite_count']
    data.drop(columns=columns_to_drop, inplace=True)

    return data

In [7]:
running_df = clean(df_concat)
running_df.head()

Unnamed: 0,video_id,channel_id,published_at,title,description,tags,caption,licensed_content,view_count,like_count,comment_count,extraction_date,category,duration_formatted,no_of_tags,title_length,description_length
0,LciDiBeBCyY,UCteRPiisgIoHtMgqHegpWAQ,2023-12-15T15:00:16Z,I Built Python & Data Analyst Custom ChatGPT f...,Get your Excel data visualization template (fr...,"['data science', 'data scientist', 'self-taugh...",True,True,1285,94,10,2023-12-15 23:01:00,Science & Technology,0 days 00:12:00,37,62,4194
1,56i1uBshzmA,UCJublDh2UsiIKsAE1553miw,2023-12-15T13:00:08Z,My Favorite Stack Question! | Daily Temperatur...,"leetcode, coding interview question, data stru...","['leetcode', 'coding interview question', 'dat...",False,True,4971,338,8,2023-12-15 23:01:00,Education,0 days 00:00:58,26,63,91
2,OPyoXx0yA0I,UCVhQ2NnY5Rskt6UjCUkJ_DA,2023-12-15T16:00:16Z,Requests vs Httpx vs Aiohttp | Which One to Pick?,"Exploring API communication in your app, consi...","['requests vs httpx', 'httpx', 'python request...",True,True,4912,456,27,2023-12-15 23:01:00,Education,0 days 00:15:11,24,49,1932
3,WjhKxXCwFZA,UCzL_0nIe8B4-7ShhVPfJkgw,2023-12-15T18:47:11Z,Data Analytics and Generative AI,Generative AI is a rapidly evolving field with...,"['data analytics', 'generative ai', 'large lan...",True,False,52,1,0,2023-12-15 23:01:00,Education,0 days 00:55:18,21,32,1069
4,NI1Psgs1tyI,UCzL_0nIe8B4-7ShhVPfJkgw,2023-12-15T16:35:30Z,Enterprise LLM Applications - Not Just a Techn...,Generative Al and Large Language Models have t...,[],False,False,0,0,0,2023-12-15 23:01:00,Entertainment,0 days 00:00:00,2,64,1144


In [8]:
def day_gen(data):

    # converting published at
    data['published_at_formatted'] = data['published_at'].str.replace('Z','')
    data['published_at_formatted'] = data.published_at_formatted.apply(datetime.datetime.fromisoformat)

    # converting extraction date
    data['extraction_date_formatted'] = pd.to_datetime(data['extraction_date'], format='%Y-%m-%d %H:%M:%S')

    # creating Day column
    data['Day'] =  ((data['extraction_date_formatted'] - data['published_at_formatted']).dt.days) + 1

    data.drop(columns=['published_at', 'extraction_date'], inplace=True)

    return data

In [9]:
running_df = day_gen(running_df)
running_df

Unnamed: 0,video_id,channel_id,title,description,tags,caption,licensed_content,view_count,like_count,comment_count,category,duration_formatted,no_of_tags,title_length,description_length,published_at_formatted,extraction_date_formatted,Day
0,LciDiBeBCyY,UCteRPiisgIoHtMgqHegpWAQ,I Built Python & Data Analyst Custom ChatGPT f...,Get your Excel data visualization template (fr...,"['data science', 'data scientist', 'self-taugh...",True,True,1285,94,10,Science & Technology,0 days 00:12:00,37,62,4194,2023-12-15 15:00:16,2023-12-15 23:01:00,1
1,56i1uBshzmA,UCJublDh2UsiIKsAE1553miw,My Favorite Stack Question! | Daily Temperatur...,"leetcode, coding interview question, data stru...","['leetcode', 'coding interview question', 'dat...",False,True,4971,338,8,Education,0 days 00:00:58,26,63,91,2023-12-15 13:00:08,2023-12-15 23:01:00,1
2,OPyoXx0yA0I,UCVhQ2NnY5Rskt6UjCUkJ_DA,Requests vs Httpx vs Aiohttp | Which One to Pick?,"Exploring API communication in your app, consi...","['requests vs httpx', 'httpx', 'python request...",True,True,4912,456,27,Education,0 days 00:15:11,24,49,1932,2023-12-15 16:00:16,2023-12-15 23:01:00,1
3,WjhKxXCwFZA,UCzL_0nIe8B4-7ShhVPfJkgw,Data Analytics and Generative AI,Generative AI is a rapidly evolving field with...,"['data analytics', 'generative ai', 'large lan...",True,False,52,1,0,Education,0 days 00:55:18,21,32,1069,2023-12-15 18:47:11,2023-12-15 23:01:00,1
4,NI1Psgs1tyI,UCzL_0nIe8B4-7ShhVPfJkgw,Enterprise LLM Applications - Not Just a Techn...,Generative Al and Large Language Models have t...,[],False,False,0,0,0,Entertainment,0 days 00:00:00,2,64,1144,2023-12-15 16:35:30,2023-12-15 23:01:00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7488,Ryj174SB8Uk,UCzL_0nIe8B4-7ShhVPfJkgw,The Battle of Giants: Causal AI vs NLP,With over a dozen new papers accepted at NeurI...,[],False,False,0,0,0,Entertainment,0 days 00:00:00,2,38,858,2024-02-06 22:19:15,2024-02-06 23:00:48,1
7489,4WDmkZ3uTfY,UCzL_0nIe8B4-7ShhVPfJkgw,Data Science Dojo Information Session,Are you ready to start building large language...,[],False,False,0,0,0,Entertainment,0 days 00:00:00,2,37,766,2024-02-06 21:31:10,2024-02-06 23:00:48,1
7490,VDWHijujgmU,UC79Gv3mYp6zKiSwYemEik9A,How DataCamp Portfolios Will Help Your Career,,[],False,False,0,0,0,Entertainment,0 days 00:00:00,2,45,0,2024-02-06 12:43:35,2024-02-06 23:00:48,1
7491,KlAKAarfLRQ,UCDybamfye5An6p-j1t2YMsg,TABLEAU PORTFOLIO PROJECT | End-To-End Data Vi...,🛣️ Data Roadmap ➡️ https://mochen.info/ 💼 Ulti...,[],False,True,798,78,6,People & Blogs,0 days 00:34:32,2,99,2516,2024-02-06 15:18:59,2024-02-06 23:00:48,1


# Uploading to Cloud

In [10]:
running_folder_path = os.getenv('RUNNING_DATA_FOLDER')

In [11]:
bucket = client.get_bucket(bucket_name)
csv_string = running_df.to_csv(index=False)

In [12]:
blob = bucket.blob(f'{running_folder_path}running_youtube_video_data.csv')
blob.upload_from_string(csv_string, content_type='text/csv')

# Sanity Check

In [13]:
checking_df = pd.read_csv('YouTube_Running_Data_running_youtube_video_data.csv')
checking_df.head()

Unnamed: 0,video_id,channel_id,title,description,tags,caption,licensed_content,view_count,like_count,comment_count,category,duration_formatted,no_of_tags,title_length,description_length,published_at_formatted,extraction_date_formatted,Day
0,LciDiBeBCyY,UCteRPiisgIoHtMgqHegpWAQ,I Built Python & Data Analyst Custom ChatGPT f...,Get your Excel data visualization template (fr...,"['data science', 'data scientist', 'self-taugh...",True,True,1285,94,10,Science & Technology,0 days 00:12:00,37,62,4194,2023-12-15 15:00:16,2023-12-15 23:01:00,1
1,56i1uBshzmA,UCJublDh2UsiIKsAE1553miw,My Favorite Stack Question! | Daily Temperatur...,"leetcode, coding interview question, data stru...","['leetcode', 'coding interview question', 'dat...",False,True,4971,338,8,Education,0 days 00:00:58,26,63,91,2023-12-15 13:00:08,2023-12-15 23:01:00,1
2,OPyoXx0yA0I,UCVhQ2NnY5Rskt6UjCUkJ_DA,Requests vs Httpx vs Aiohttp | Which One to Pick?,"Exploring API communication in your app, consi...","['requests vs httpx', 'httpx', 'python request...",True,True,4912,456,27,Education,0 days 00:15:11,24,49,1932,2023-12-15 16:00:16,2023-12-15 23:01:00,1
3,WjhKxXCwFZA,UCzL_0nIe8B4-7ShhVPfJkgw,Data Analytics and Generative AI,Generative AI is a rapidly evolving field with...,"['data analytics', 'generative ai', 'large lan...",True,False,52,1,0,Education,0 days 00:55:18,21,32,1069,2023-12-15 18:47:11,2023-12-15 23:01:00,1
4,NI1Psgs1tyI,UCzL_0nIe8B4-7ShhVPfJkgw,Enterprise LLM Applications - Not Just a Techn...,Generative Al and Large Language Models have t...,[],False,False,0,0,0,Entertainment,0 days 00:00:00,2,64,1144,2023-12-15 16:35:30,2023-12-15 23:01:00,1


In [14]:
print(running_df.shape)
print(checking_df.shape)

(7493, 18)
(7493, 18)
