# Importing Libraries

In [10]:
# Importing Packages
import pandas as pd
from google.cloud import storage
import os
from dotenv import load_dotenv
import io
from googleapiclient.discovery import build
from datetime import timedelta, datetime
import datetime
import io

In [11]:
# Load settings from the .env file before any of them are read below.
load_dotenv()

# YouTube Data API v3 client, keyed with API_KEY from the environment
# (api_key is None when the variable is not set).
api_key = os.environ.get('API_KEY')
youtube = build('youtube', 'v3', developerKey=api_key)

In [12]:
# Variables
# os.getenv() reads os.environ, so writing a value that was found straight back
# is a no-op; assigning None (variable missing) raises TypeError. Only write the
# value when it exists so that a missing variable leaves credential discovery
# to storage.Client() instead of crashing this cell.
credentials_path = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
if credentials_path:
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

# Storage locations and project settings; each is None when its variable is unset.
bucket_name = os.getenv('BUCKET_NAME')
folder_path = os.getenv('FOLDER_PATH')
project_name = os.getenv('PROJECT_NAME')
running_data_folder = os.getenv('RUNNING_DATA_FOLDER')

# Shared Cloud Storage client used by the functions below.
client = storage.Client()
# Listing of the objects under folder_path.
# NOTE(review): nothing in the cells shown here consumes `blobs` - confirm it is still needed.
blobs = client.list_blobs(bucket_name, prefix=folder_path)

In [13]:
def get_channel_stats(youtube_key, channel_ids):
    """Fetch name, id, headline statistics and uploads playlist for channels.

    Parameters
    ----------
    youtube_key : object
        YouTube Data API client. Only ``channels().list(...).execute()`` is
        called on it.
    channel_ids : iterable of str
        Channel ids to look up. May be empty.

    Returns
    -------
    pandas.DataFrame
        One row per channel item returned by the API, with the columns
        ``channel_name``, ``channel_id``, ``subscribers``, ``total_views``,
        ``total_videos`` and ``playlist_id`` (always present, even when no
        rows come back). Statistic values are passed through exactly as the
        API returned them; a statistic missing from the response is ``None``.
    """
    columns = ['channel_name', 'channel_id', 'subscribers',
               'total_views', 'total_videos', 'playlist_id']
    # The API documents maxResults as 0-50, so ids are sent in batches of 50.
    batch_size = 50

    ids = list(channel_ids)
    all_data = []

    for start in range(0, len(ids), batch_size):
        batch = ids[start:start + batch_size]
        request = youtube_key.channels().list(
            part='snippet,contentDetails,statistics',
            id=','.join(batch),
            # Ask for a page as large as the batch so no channel is cut off
            # by a smaller default page size.
            maxResults=batch_size,
        )
        response = request.execute()

        # 'items' can be absent when none of the ids match a channel.
        for item in response.get('items', []):
            statistics = item.get('statistics', {})
            all_data.append(dict(
                channel_name=item['snippet']['title'],
                channel_id=item['id'],
                # Individual counters are read with .get() because a response
                # may omit one of them.
                subscribers=statistics.get('subscriberCount'),
                total_views=statistics.get('viewCount'),
                total_videos=statistics.get('videoCount'),
                playlist_id=item['contentDetails']['relatedPlaylists']['uploads'],
            ))

    # Passing the column list keeps the schema stable for an empty result, so
    # callers can still merge on 'channel_id'.
    channel_stats_df = pd.DataFrame(all_data, columns=columns)

    return channel_stats_df

In [14]:
def database_creator(data, days):
    """Build one wide row per video covering its first ``days`` days of metrics.

    Only videos that have an observation on day ``days`` are kept. Their
    ``view_count``, ``like_count`` and ``comment_count`` values for every day
    up to ``days`` are pivoted into columns, joined with the video metadata
    and with channel statistics fetched through ``get_channel_stats`` using
    the module-level ``youtube`` client.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame with a ``Day`` column, a ``video_id`` column, the
        three metric columns and the metadata columns listed in
        ``columns_to_keep`` below. It is not modified.
    days : int or str
        Day number to build the dataset for; converted with ``int()``.

    Returns
    -------
    pandas.DataFrame
        Columns are ``video_id``, then one column per metric and day labelled
        with the tuple ``(metric, 'Day<N>')``, then the metadata columns, then
        the columns returned by ``get_channel_stats``.

    Raises
    ------
    ValueError
        If no video in ``data`` has an observation on day ``days``.
    """
    max_day = int(days)

    # Keep videos observed on the requested day, and only their rows up to it.
    tracked_ids = data.loc[data['Day'] == max_day, 'video_id']
    filtered_df = data[data['video_id'].isin(tracked_ids) & (data['Day'] <= max_day)]
    if filtered_df.empty:
        raise ValueError(f'No videos have data for day {max_day}.')

    # assign() builds a new frame, so the relabelling neither touches `data`
    # nor triggers SettingWithCopyWarning on the filtered slice.
    filtered_df = filtered_df.assign(Day='Day' + filtered_df['Day'].astype(str))

    # Flip: one column per (metric, day) pair.
    df_pivoted = filtered_df.pivot_table(index='video_id', columns='Day',
                                         values=['view_count', 'like_count', 'comment_count'],
                                         aggfunc='first', fill_value=0)
    # Flatten the two-level columns into plain tuple labels *before* merging:
    # pandas 2.x refuses to merge frames whose columns have different numbers
    # of levels. The resulting labels are the same tuples as before.
    df_pivoted.columns = df_pivoted.columns.to_flat_index()
    df_pivoted = df_pivoted.reset_index()

    columns_to_keep = ['video_id', 'channel_id', 'title', 'description', 'tags', 'caption',
        'licensed_content','category', 'duration_formatted',
        'no_of_tags', 'title_length', 'description_length',
        'published_at_formatted', 'extraction_date_formatted']
    # One metadata row per video (the first one encountered).
    df_unique_ids = (
        filtered_df[columns_to_keep]
        .drop_duplicates(subset='video_id', keep='first')
        .reset_index(drop=True)
    )

    # Attach the metadata to the pivoted metrics.
    result_df = df_pivoted.merge(df_unique_ids, on='video_id', how='left')

    # Distinct channel ids of the selected videos; missing ids are skipped
    # because they cannot be sent to the API.
    channel_ids_list = result_df['channel_id'].dropna().unique().tolist()

    # get channel stats dataframe
    channel_df = get_channel_stats(youtube, channel_ids_list)

    # merge final dataframe
    final_df = pd.merge(result_df, channel_df, on='channel_id', how='left')

    return final_df

In [15]:
def get_running_df(blob_name='YouTube_Running_Data/running_youtube_video_data.csv'):
    """Download a CSV object from the configured bucket and load it as a DataFrame.

    Parameters
    ----------
    blob_name : str, optional
        Object path inside the bucket named by the module-level
        ``bucket_name``. Defaults to the running video data file.

    Returns
    -------
    pandas.DataFrame
        The CSV contents, decoded as UTF-8.
    """
    bucket = client.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)
    # download_as_string() is a deprecated alias of download_as_bytes().
    running_csv = blob.download_as_bytes()
    running_df = pd.read_csv(io.BytesIO(running_csv), encoding='utf-8')

    return running_df

In [16]:
def interface_function(days=None):
    """Build the per-video dataset for a requested number of days.

    Parameters
    ----------
    days : int or str, optional
        Day number to build the dataset for. When omitted the user is
        prompted for it; pass it explicitly to make the run reproducible
        from code alone.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``database_creator`` from the running data
        returned by ``get_running_df``.

    Raises
    ------
    ValueError
        If ``days`` (or the typed answer) is not a whole number.
    """
    if days is None:
        days = input('How many days data would you like? ')

    # Validate before downloading so a typo does not cost a bucket round-trip.
    try:
        day_count = int(days)
    except (TypeError, ValueError):
        raise ValueError(f'Expected a whole number of days, got {days!r}.') from None

    df = get_running_df()

    new_df = database_creator(df, day_count)

    return new_df

In [17]:
# Remove the column cap so wide frames are rendered with every column.
pd.options.display.max_columns = None

In [18]:
# Prompt for the number of days, build the dataset and keep it in `database`.
# The saved output below has Day1-Day3 columns, i.e. it came from answering 3.
database = interface_function() # tested function to return videos that have data across 3 days
# Bare last expression: Jupyter renders the frame as the cell output.
database

  result_df = df_pivoted.merge(df_unique_ids, on='video_id', how='left')


Unnamed: 0,video_id,"(comment_count, Day1)","(comment_count, Day2)","(comment_count, Day3)","(like_count, Day1)","(like_count, Day2)","(like_count, Day3)","(view_count, Day1)","(view_count, Day2)","(view_count, Day3)",channel_id,title,description,tags,caption,licensed_content,category,duration_formatted,no_of_tags,title_length,description_length,published_at_formatted,extraction_date_formatted,channel_name,subscribers,total_views,total_videos,playlist_id
0,-4xVbmIzGfE,31,40,47,1026,1463,1767,14035,22114,29240,UCJublDh2UsiIKsAE1553miw,Very Common FAANG Interview Question! | Index ...,"dynamic programming, leetcode, coding intervie...","['leetcode', 'coding interview question', 'dat...",False,True,Education,0 days 00:00:55,26,95,112,2024-02-03 02:30:00,2024-02-03 23:00:46,Greg Hogg,85600,9407006,640,UUJublDh2UsiIKsAE1553miw
1,-PAq8nh5txE,37,43,46,680,762,820,17862,21191,23918,UCNU_lfiiWBdtULKOw6X0Dig,In this Mass Layoff Becoming Recession proof-I...,Becoming Recession proof,[],False,True,Education,0 days 00:03:54,2,61,24,2024-02-03 02:47:19,2024-02-03 23:00:46,Krish Naik,888000,90296973,1813,UUNU_lfiiWBdtULKOw6X0Dig
2,-Tv1T40Kypw,8,16,16,24,50,59,306,927,1145,UCw_LFe2pS8x3NyipGNJgeEA,DataCamp Review - Is It Worth It? (2024),👉Try DataCamp for free:\nhttps://bit.ly/3HWXK8...,"['data science', 'data analytics', 'data analy...",False,True,Science & Technology,0 days 00:07:49,31,40,383,2024-02-09 18:17:35,2024-02-09 23:00:46,Learn with Lukas,33600,1886454,74,UUw_LFe2pS8x3NyipGNJgeEA
3,-f1o7i0ExhM,10,19,22,626,1065,1303,6644,13297,16866,UCJublDh2UsiIKsAE1553miw,VERY COMMON FAANG INTERVIEW QUESTION | Leetcod...,"dynamic programming, leetcode, coding intervie...","['leetcode', 'coding interview question', 'dat...",False,True,Education,0 days 00:00:58,26,67,112,2024-01-23 12:25:48,2024-01-23 23:00:42,Greg Hogg,85600,9407006,640,UUJublDh2UsiIKsAE1553miw
4,-ny5_RSMV6k,37,47,52,458,551,591,6256,8328,9082,UCNU_lfiiWBdtULKOw6X0Dig,End To End Multi Language Invoice Extractor Pr...,Join me in this exciting video as we develop a...,"['yt:cc=on', 'machine learnign tutorials', 'in...",False,True,Education,0 days 00:25:59,28,90,2414,2023-12-27 04:41:32,2023-12-27 23:00:42,Krish Naik,888000,90296973,1813,UUNU_lfiiWBdtULKOw6X0Dig
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303,z4u_fts6yfE,0,0,0,2,3,3,58,74,72,UC79Gv3mYp6zKiSwYemEik9A,#178 Making SMARTER Decisions with Lori Silver...,We don’t think about every decision we make. S...,"['podcast', 'data', 'ai', 'dataframed', 'lori ...",False,False,Entertainment,0 days 01:07:45,24,94,3271,2024-02-01 09:58:37,2024-02-01 23:00:45,DataCamp,158000,24138315,1582,UU79Gv3mYp6zKiSwYemEik9A
304,zNPrifp7djc,0,0,0,0,0,0,0,0,0,UC79Gv3mYp6zKiSwYemEik9A,Get Started Analyzing Survey Data with SQL & P...,,[],False,False,Entertainment,0 days 00:00:00,2,51,0,2024-01-16 12:38:53,2024-01-16 23:00:41,DataCamp,158000,24138315,1582,UU79Gv3mYp6zKiSwYemEik9A
305,zT-cc1IblsQ,12,14,14,192,279,296,3546,11346,14669,UCNU_lfiiWBdtULKOw6X0Dig,Complete MLOPS Platform To Build LLMs Applicat...,PostgresML is a complete MLOps platform in a P...,"['yt::cc=on', 'postgresml', 'postgresml tutori...",False,True,Education,0 days 00:17:37,25,94,2086,2023-12-18 14:51:44,2023-12-18 23:00:52,Krish Naik,888000,90296973,1813,UUNU_lfiiWBdtULKOw6X0Dig
306,zee_QlhO1kQ,0,0,0,1,2,4,0,0,0,UC79Gv3mYp6zKiSwYemEik9A,Data & AI Trends & Predictions 2024,,[],False,False,Entertainment,0 days 00:00:00,2,35,0,2024-01-08 12:00:11,2024-01-08 23:00:54,DataCamp,158000,24138315,1582,UU79Gv3mYp6zKiSwYemEik9A
