In [23]:
import pandas as pd
import numpy as np
import json 
import requests
import os
import sys
import pycountry
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.data_module import read_processed_df

# Create channel_df

In [24]:
videos_df = read_processed_df.read_video_df()
videos_df.head()

Unnamed: 0,video_id,title,published,view_count,like_count,comment_count,duration,definition,tags,default_audio_language,madeforkid,channelTitle
0,hh6gE0LxfO8,"#5 Machine Learning Specialization [Course 1, ...",2022-12-01 13:21:30+00:00,30742,319.0,6.0,0 days 00:07:17,hd,,en,False,DeepLearningAI
1,_i3aqgKVNQI,C5W3L01 Basic Models,2018-02-05 19:50:14+00:00,110835,436.0,12.0,0 days 00:06:19,hd,,en,False,DeepLearningAI
2,uvTL1N02f04,"#23 Machine Learning Specialization [Course 1,...",2022-12-01 13:22:38+00:00,11639,107.0,4.0,0 days 00:06:53,hd,,en,False,DeepLearningAI
3,H4YK_7MAckk,ChatGPT Prompt Engineering for Developers: A s...,2023-04-27 14:49:09+00:00,143057,2847.0,121.0,0 days 00:01:58,hd,"[ChatGPT, prompt engineering, openai, deeplear...",en,False,DeepLearningAI
4,YVtP5UGdgXg,"#25 Machine Learning Specialization [Course 1,...",2022-12-01 13:22:45+00:00,11542,106.0,1.0,0 days 00:06:35,hd,,en,False,DeepLearningAI


In [25]:
channel_df = pd.DataFrame(columns=['channel_name', 'video_count', 'subscriber_count', 'view_count', 'like_count','comment_count', 'join_date', 'country', 'description', 'link'])

for channel in videos_df.channelTitle.unique():
    name = channel
    temp_df = videos_df[videos_df.channelTitle == channel]
    # Drop duplicates video id
    temp_df = temp_df.drop_duplicates(subset='video_id')
    like_count = temp_df.like_count.sum()
    video_count = temp_df.video_id.nunique()
    view_count = temp_df.view_count.sum()
    comment_count = temp_df.comment_count.sum()

    channel_df = pd.concat([channel_df, pd.DataFrame([[name, video_count, view_count, like_count,comment_count]], columns=['channel_name', 'video_count', 'view_count', 'like_count', 'comment_count'])])



Because there are some missing information in the video_df, we need to request them from the YouTube API.

In [26]:
api_key = json.load(open('../data/external/API.json', 'r'))['Nhat']
channel_id_nam = json.load(open('../data/external/channel_id_nam.json', 'r'))
channel_id_Phuc = json.load(open('../data/external/channel_id_Phuc.json', 'r'))
channel_ids = {**channel_id_nam, **channel_id_Phuc}

def get_country_name(country_code):
    try:
        country_name = pycountry.countries.get(alpha_2=country_code).name
    except:
        country_name = 'Not available'
    return country_name

def getChannelInfo(channel_id):
    url = 'https://www.googleapis.com/youtube/v3/channels?part=snippet%2CcontentDetails%2Cstatistics%2CbrandingSettings&id={}&key={}'.format(channel_id, api_key)
    response = requests.get(url)
    data = response.json()
    # Extract the required information
    subscriber_count = data['items'][0]['statistics']['subscriberCount']
    join_date = data['items'][0]['snippet']['publishedAt']
    country_code = data['items'][0]['snippet'].get('country', 'Not available')  # Some channels might not have a 'country' field
    link = "https://www.youtube.com/channel/" + data['items'][0]['id']
    description = data['items'][0]['snippet']['description']
    country = get_country_name(country_code)
    return subscriber_count, join_date, country, link, description


In [27]:

for channel in channel_df.channel_name:
    channel_id = channel_ids[channel]
    subscriber_count, join_date, country, link, description = getChannelInfo(channel_id)
    channel_df.loc[channel_df.channel_name == channel, ['subscriber_count', 'join_date', 'country', 'link', 'description']] = subscriber_count, join_date, country, link,description
channel_df

Unnamed: 0,channel_name,video_count,subscriber_count,view_count,like_count,comment_count,join_date,country,description,link
0,DeepLearningAI,319,272000,11306319,130976.0,4322.0,2017-08-22T22:27:58Z,Not available,Welcome to the official DeepLearning.AI YouTub...,https://www.youtube.com/channel/UCcIXc5mJsHVYT...
0,3Blue1Brown,155,5760000,420677024,11791180.0,377602.0,2015-03-03T23:11:55Z,United States,My name is Grant Sanderson. Videos here cover ...,https://www.youtube.com/channel/UCYO_jab_esuFR...
0,Joma Tech,95,2270000,178760989,6481158.0,202994.0,2016-08-31T22:22:07Z,United States,I talk about life in Silicon Valley and at big...,https://www.youtube.com/channel/UCV0qA-eDDICsR...
0,DataCamp,437,155000,12866441,17380.0,680.0,2014-03-25T14:48:01Z,United States,Welcome to DataCamp's official YouTube channel...,https://www.youtube.com/channel/UC79Gv3mYp6zKi...
0,CS Dojo,95,1920000,76016992,1879873.0,82131.0,2016-02-26T01:49:30Z,Canada,"Hello! My name is YK, and I usually make video...",https://www.youtube.com/channel/UCxX9wt5FWQUAA...
0,sentdex,501,1280000,60203871,847653.0,76757.0,2012-12-16T20:49:11Z,United States,"Python Programming tutorials, going further th...",https://www.youtube.com/channel/UCfzlCWGWYyIQ0...
0,StatQuest with Josh Starmer,263,1060000,56419479,1244841.0,85855.0,2011-05-24T01:52:48Z,United States,"Statistics, Machine Learning and Data Science ...",https://www.youtube.com/channel/UCtYLUTtgS3k1F...
0,Tech With Tim,464,1400000,76914763,2185506.0,78572.0,2014-04-23T01:57:10Z,Canada,"Learn programming, software engineering, machi...",https://www.youtube.com/channel/UC4JX40jDee_tI...
0,365 Data Science,222,304000,13610599,269761.0,7674.0,2017-08-07T15:17:05Z,Bulgaria,At 365 Data Science we make #DataScience acces...,https://www.youtube.com/channel/UCEBpSZhI1X8Wa...
0,Data Professor,286,172000,5077282,127798.0,12465.0,2019-08-17T15:59:56Z,Thailand,"Data Science, Machine Learning, Bioinformatics...",https://www.youtube.com/channel/UCV8e2g4IWQqK7...


In [28]:
# Save to csv
channel_df.to_csv('../data/processed/df_channels_processed.csv', index=False)