In [68]:
import pandas as pd
import numpy as np
import json 
import requests
import os
import sys
import pycountry
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.data_module import read_processed_df

# Repair channel_df

In [69]:
videos_df = read_processed_df.read_video_df()
videos_df.head()

Unnamed: 0,video_id,title,published,view_count,like_count,comment_count,duration,definition,tags,default_audio_language,madeforkid,channelTitle
0,hh6gE0LxfO8,"#5 Machine Learning Specialization [Course 1, ...",2022-12-01 13:21:30+00:00,30742,319.0,6.0,0 days 00:07:17,hd,,en,False,DeepLearningAI
1,_i3aqgKVNQI,C5W3L01 Basic Models,2018-02-05 19:50:14+00:00,110835,436.0,12.0,0 days 00:06:19,hd,,en,False,DeepLearningAI
2,uvTL1N02f04,"#23 Machine Learning Specialization [Course 1,...",2022-12-01 13:22:38+00:00,11639,107.0,4.0,0 days 00:06:53,hd,,en,False,DeepLearningAI
3,H4YK_7MAckk,ChatGPT Prompt Engineering for Developers: A s...,2023-04-27 14:49:09+00:00,143057,2847.0,121.0,0 days 00:01:58,hd,"[ChatGPT, prompt engineering, openai, deeplear...",en,False,DeepLearningAI
4,YVtP5UGdgXg,"#25 Machine Learning Specialization [Course 1,...",2022-12-01 13:22:45+00:00,11542,106.0,1.0,0 days 00:06:35,hd,,en,False,DeepLearningAI


In [70]:
channel_df = pd.DataFrame(columns=['channel_name', 'video_count', 'subscriber_count', 'view_count', 'like_count','comment_count', 'join_date', 'country', 'description', 'link'])

for channel in videos_df.channelTitle.unique():
    name = channel
    temp_df = videos_df[videos_df.channelTitle == channel]
    # Drop duplicates video id
    temp_df = temp_df.drop_duplicates(subset='video_id')
    like_count = temp_df.like_count.sum()
    video_count = temp_df.video_id.nunique()
    view_count = temp_df.view_count.sum()
    comment_count = temp_df.comment_count.sum()

    channel_df = pd.concat([channel_df, pd.DataFrame([[name, video_count, view_count, like_count,comment_count]], columns=['channel_name', 'video_count', 'view_count', 'like_count', 'comment_count'])])



Because there are some missing information in the video_df, we need to request them from the YouTube API.

In [71]:
api_key = json.load(open('../data/external/API.json', 'r'))['Nhat']
channel_id_nam = json.load(open('../data/external/channel_id_nam.json', 'r'))
channel_id_Phuc = json.load(open('../data/external/channel_id_Phuc.json', 'r'))
channel_ids = {**channel_id_nam, **channel_id_Phuc}

def get_country_name(country_code):
    try:
        country_name = pycountry.countries.get(alpha_2=country_code).name
    except:
        country_name = 'Not available'
    return country_name

def getChannelInfo(channel_id):
    url = 'https://www.googleapis.com/youtube/v3/channels?part=snippet%2CcontentDetails%2Cstatistics%2CbrandingSettings&id={}&key={}'.format(channel_id, api_key)
    response = requests.get(url)
    data = response.json()
    # Extract the required information
    subscriber_count = data['items'][0]['statistics']['subscriberCount']
    join_date = data['items'][0]['snippet']['publishedAt']
    country_code = data['items'][0]['snippet'].get('country', 'Not available')  # Some channels might not have a 'country' field
    video_count = data['items'][0]['statistics']['videoCount']
    view_count = data['items'][0]['statistics']['viewCount']
    link = "https://www.youtube.com/channel/" + data['items'][0]['id']
    description = data['items'][0]['snippet']['description']
    country = get_country_name(country_code)
    return subscriber_count, join_date, country, link, description, int(video_count), int(view_count)


In [72]:
check_df = channel_df.copy()
for channel in channel_df.channel_name:
    channel_id = channel_ids[channel]
    subscriber_count, join_date, country, link, description, video_count, view_count = getChannelInfo(channel_id)
    check_df.loc[check_df.channel_name == channel, 'video_count'] -= video_count
    check_df.loc[check_df.channel_name == channel, 'view_count'] -= view_count
    channel_df.loc[channel_df.channel_name == channel, ['subscriber_count', 'join_date', 'country', 'link', 'description']] = subscriber_count, join_date, country, link,description
check_df

Unnamed: 0,channel_name,video_count,subscriber_count,view_count,like_count,comment_count,join_date,country,description,link
0,DeepLearningAI,-79,,-5173423,130976.0,4322.0,,,,
0,3Blue1Brown,-5,,-3289841,11791175.0,377602.0,,,,
0,Joma Tech,-3,,-308625,6481158.0,202994.0,,,,
0,DataCamp,-1102,,-10202623,17380.0,680.0,,,,
0,CS Dojo,-9,,-4209421,1879873.0,82131.0,,,,
0,sentdex,-750,,-52731391,847653.0,76757.0,,,,
0,StatQuest with Josh Starmer,-3,,-1521511,1244841.0,85855.0,,,,
0,Tech With Tim,-482,,-55052254,2185506.0,78572.0,,,,
0,365 Data Science,-1,,-2953,269761.0,7674.0,,,,
0,Data Professor,-39,,-478944,127798.0,12465.0,,,,


In [73]:
# Save to csv
channel_df.to_csv('../data/processed/df_channels_processed.csv', index=False)