In [None]:
#import main libraries
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import os
import time
import re
from tqdm import tqdm

In [None]:
# Load the YouTube Data API key from a local .env file / the environment
# (the key itself is never written into the notebook).
_ = load_dotenv()
api_key = os.environ.get('api_key', '').strip()
if not api_key:
    # Fail here with an actionable message instead of letting a missing or
    # blank key surface later as an API error.
    raise KeyError("api_key is not set: add `api_key=<your key>` to .env or the environment")

In [None]:
# Build the client for the 'youtube' service, version 'v3', authenticated with the
# developer key loaded above. Later cells pass this object to the helper functions.

youtube = build('youtube', 'v3', developerKey = api_key)

In [None]:
#create function to get channel ids
def get_channel_ids(youtube, channel_names, delay=5):
    """Look up a channel ID for each channel name via the search endpoint.

    Args:
        youtube: API client exposing ``search().list(...).execute()``.
        channel_names: iterable of channel names / handles to search for.
        delay: seconds to wait between consecutive search requests
            (default 5, the pause the notebook used originally).

    Returns:
        dict mapping each channel name to the ID of the first search hit,
        or to ``None`` when the search returned no channel. Recording ``None``
        instead of raising keeps the IDs that were already fetched.
    """
    unique_channel_ids = {}  # channel name -> channel ID (or None)

    for position, channel_name in enumerate(channel_names):
        # Pause only between requests, not before the first or after the last.
        if position and delay:
            time.sleep(delay)

        # Search for the channel using the channel name
        search_response = youtube.search().list(
            q=channel_name,
            part='id',
            type='channel',
            maxResults=1
        ).execute()

        # An empty result list means no channel matched this name.
        items = search_response.get('items') or []
        unique_channel_ids[channel_name] = (
            items[0]['id']['channelId'] if items else None
        )

    return unique_channel_ids

In [None]:
# Channel names to resolve; each one is used as the search query in get_channel_ids.

channel_names = ['blockchaindailynews', 'brainbrocrypto', 'cryptoMOC',
               'GiantCutie-CH', 'grenadetw', 'desmondcrypto', 
               'cryptoalvin0617', 'mrblocktw', 'BonnieBlockchain',
               'skywee97', 'shuqinbtc', 'CakeBaBa',
               'Web3TV_0xTrade', 'goldenrichacademy', 'GongYouChai',
               'youtubercrypto', 'ajgameficlub', 'alfred.blockfinance',
               'xiao_lin_shuo', 'AhJu']

# Resolve every name to a channel ID (one search request per name).
unique_channel_ids = get_channel_ids(youtube, channel_names)

In [None]:
# Show the name -> channel ID mapping returned by get_channel_ids
print(unique_channel_ids)

In [None]:
# Two-column table built from the mapping: one row per (channel name, channel ID) pair.
df = pd.DataFrame(list(unique_channel_ids.items()), columns=['Channel Name', 'Channel ID'])
df.head()

In [None]:
# Persist the name/ID table. The row index is left out so that reading the file back
# yields only the 'Channel Name' and 'Channel ID' columns (no 'Unnamed: 0' column).
df.to_csv('ChannelIDs.csv', index=False)

In [None]:
### loading from csv for testing purpose only ###
# Reads back the file written by the cell above, so the channel IDs can be reused
# without calling the search endpoint again.

channel_df = pd.read_csv('ChannelIDs.csv')
channel_df.head()

In [None]:
### creating list from csv for testing purpose only ###
# NOTE(review): this rebinds unique_channel_ids from the name -> ID dict created
# earlier to a plain list of IDs taken from the 'Channel ID' column.
unique_channel_ids = channel_df['Channel ID'].tolist()
unique_channel_ids

In [None]:
#creating a function to extract channel statistics
def get_channel_stats(youtube, unique_channel_ids):
    """Fetch title, uploads playlist and basic statistics for each channel.

    Args:
        youtube: API client exposing ``channels().list(...).execute()``.
        unique_channel_ids: iterable of channel ID strings, or a dict whose
            values are channel IDs (the shape returned by get_channel_ids).
            Entries that are not non-empty strings are skipped.

    Returns:
        list of dicts with keys ``Channel_name``, ``Channel_playlist``,
        ``Subscriber_count`` and ``Video_count``; an empty list when there are
        no IDs. A statistic missing from the response is reported as ``None``.
    """
    # Accept the name -> ID mapping directly; iterating a dict would yield names.
    if isinstance(unique_channel_ids, dict):
        unique_channel_ids = unique_channel_ids.values()
    channel_ids = [cid for cid in unique_channel_ids if isinstance(cid, str) and cid]

    ids_per_request = 50  # request the IDs in comma-separated groups of at most 50
    all_channel_data = []

    for start in range(0, len(channel_ids), ids_per_request):
        chunk = channel_ids[start:start + ids_per_request]
        response = youtube.channels().list(
            part='snippet,contentDetails,statistics',
            id=','.join(chunk)
        ).execute()

        for item in response.get('items', []):
            statistics = item.get('statistics', {})
            all_channel_data.append(dict(
                Channel_name=item['snippet']['title'],
                Channel_playlist=item['contentDetails']['relatedPlaylists']['uploads'],
                # .get(): a missing counter must not abort the whole fetch
                Subscriber_count=statistics.get('subscriberCount'),
                Video_count=statistics.get('videoCount'),
            ))

    return all_channel_data

In [None]:
# Fetch the per-channel records and tabulate them (one row per returned channel).
channel_database= get_channel_stats(youtube, unique_channel_ids)
channel_database_df = pd.DataFrame(channel_database)
channel_database_df

In [None]:
# Save the channel table as a tab-separated file named 'Channel_database'
# (no file extension), including the row index and the header row.
channel_database_df.to_csv('Channel_database',sep='\t', index=True,header=True)

In [None]:
# Collect each channel's uploads-playlist ID; these playlists are what
# get_all_videos walks to enumerate the channels' videos.
channel_playlists = channel_database_df['Channel_playlist'].tolist()
channel_playlists

In [None]:
# NOTE(review): identical to the earlier save of 'Channel_database'; it rewrites the
# same file from the same DataFrame and looks redundant.
channel_database_df.to_csv('Channel_database',sep='\t', index=True,header=True)

In [None]:
#creating a function to extract all the videos from the channels
def get_all_videos(playlists, client=None):
    """Collect the video IDs contained in each playlist.

    Args:
        playlists: iterable of playlist IDs.
        client: API client exposing ``playlistItems().list(...).execute()``.
            Defaults to the notebook-level ``youtube`` client, which keeps
            existing single-argument calls working.

    Returns:
        list with one entry per playlist, in input order; each entry is the
        list of that playlist's video IDs across all result pages.
    """
    if client is None:
        client = youtube  # fall back to the client built earlier in the notebook

    all_video_ids = []
    for playlist_id in playlists:
        playlist_videos = []
        next_page_token = None

        # Follow nextPageToken until the response no longer carries one.
        while True:
            response = client.playlistItems().list(
                part='contentDetails',
                playlistId=playlist_id,
                maxResults=50,
                pageToken=next_page_token
            ).execute()

            playlist_videos.extend(
                item['contentDetails']['videoId']
                for item in response.get('items', [])
            )

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

        all_video_ids.append(playlist_videos)
    return all_video_ids

In [None]:
# One list of video IDs per playlist
video_list = list(get_all_videos(channel_playlists))
# Flatten into a single list of IDs. A comprehension also handles the case of no
# playlists at all, where np.concatenate raises ValueError.
video_database = [video_id for playlist_videos in video_list for video_id in playlist_videos]
# Show the total and a small sample instead of printing every ID
len(video_database), video_database[:5]

In [None]:
# Despite the _df suffix this is a Series: one element per playlist, each element
# being that playlist's list of video IDs (video_list is a list of lists).
video_database_df = pd.Series(video_list)
video_database_df

In [None]:
# Save the per-playlist ID lists as a tab-separated file named 'Video_database',
# with the index column and without a header row.
video_database_df.to_csv('Video_database',sep='\t', index=True,header=False)

In [None]:
#creating a function to extract video data from video ids
def get_video_details(youtube, videos):
    """Fetch metadata for the given video IDs.

    IDs are requested in comma-separated groups of up to 50 per call instead of
    one call per video, and no paging parameters are sent because the lookup
    is by explicit ID.

    Args:
        youtube: API client exposing ``videos().list(...).execute()``.
        videos: iterable of video IDs (a single ID string is also accepted).

    Returns:
        dict of parallel lists keyed by ``descriptions``, ``titles``,
        ``view_counts``, ``channel_ids``, ``publish_dates``, ``tags`` and
        ``thumbnails``, with one entry per video returned by the API. Fields
        missing from a video are recorded as ``''``. With no IDs, every list
        is empty.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(videos, str):
        videos = [videos]
    video_ids = [str(video_id) for video_id in videos]

    ids_per_request = 50
    video_details = []
    for start in range(0, len(video_ids), ids_per_request):
        chunk = video_ids[start:start + ids_per_request]
        response = youtube.videos().list(
            part='snippet,contentDetails,statistics',
            id=','.join(chunk)
        ).execute()
        video_details.extend(response.get('items', []))

    # Built once, after all responses are in (also defined when there are no IDs).
    all_details = {
        'descriptions': [],
        'titles': [],
        'view_counts': [],
        'channel_ids': [],
        'publish_dates': [],
        'tags': [],
        'thumbnails': []
    }

    for video in video_details:
        snippet = video.get('snippet', {})
        statistics = video.get('statistics', {})

        all_details['descriptions'].append(snippet.get('description', ''))
        all_details['titles'].append(snippet.get('title', ''))
        all_details['view_counts'].append(statistics.get('viewCount', ''))
        all_details['channel_ids'].append(snippet.get('channelId', ''))
        all_details['publish_dates'].append(snippet.get('publishedAt', ''))
        all_details['tags'].append(snippet.get('tags', ''))
        all_details['thumbnails'].append(snippet.get('thumbnails', {}).get('default', ''))

    return all_details

In [None]:
# NOTE(review): this runs the full fetch only to display its result; the next cell
# makes the identical call and keeps the result, so every API request is made twice.
get_video_details(youtube,video_database)

In [None]:
# Fetch the details for every collected video ID and tabulate them:
# the returned dict of lists becomes one column per key.
video_data = get_video_details(youtube,video_database)
video_data_df = pd.DataFrame(video_data)
video_data_df.head()

In [None]:
### load data from CSV for testing purpose only###
# NOTE(review): no cell shown here writes 'video_details.csv'; presumably it is an
# export of video_data_df made elsewhere - confirm before relying on it.

data_df = pd.read_csv('video_details.csv')

In [None]:
#import translate libraries
from googletrans import Translator

In [None]:
#creating a translate function

def translate(text):
    """Translate ``text`` from Traditional Chinese ('zh-TW') to English.

    Missing values (``None`` / NaN) and blank strings are returned unchanged
    without calling the translation service, so empty cells neither cost the
    one-second pause nor are sent to the translator.

    Args:
        text: the value to translate.

    Returns:
        The translated text, or ``text`` itself when there is nothing to translate.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return text
    if isinstance(text, str) and not text.strip():
        return text

    time.sleep(1)  # pause before each request to the translation service
    translator = Translator()
    translation = translator.translate(text, src='zh-TW', dest='en')
    return translation.text

In [None]:
# Preview the loaded video details before translating
data_df.head()

In [None]:
# Translate the three text columns of the full DataFrame, one cell at a time, and
# time the whole run. Results are written into new 'translated <column>' columns
# of data_df itself (the frame is modified in place).
columns_to_translate = ['tags', 'descriptions','titles']
start_time = time.time() 
for column in columns_to_translate:
    data_df[f'translated {column}'] = data_df[column].apply(translate)

end_time = time.time() 
# elapsed wall-clock seconds for all three columns
total_time = end_time - start_time 
data_df.head()

In [None]:
# The full run took too long, so work on an independent copy of the first 20 rows.
df2=data_df.head(20).copy()
df2.head()

In [None]:
# Same translation loop as for the full frame, run on the 20-row sample df2 and timed.
columns_to_translate = ['tags', 'descriptions','titles']
start_time = time.time() 

for column in columns_to_translate:
    df2[f'translated {column}'] = df2[column].apply(translate)

end_time = time.time() 
# elapsed wall-clock seconds for the sample (overwrites the earlier total_time)
total_time = end_time - start_time 

df2.head()

In [None]:
# Elapsed seconds measured by the most recent timed translation run
# (the 20-row sample when cells are run in order)
print(total_time)

In [None]:
# Number of rows in the full video-details DataFrame
len(data_df)

In [None]:
# Theoretical total translation time in hours for the whole DataFrame:
# the time measured on the df2 sample is converted to seconds per row, then scaled
# to the full row count. Uses the measured values rather than hand-copied numbers.
total_time / len(df2) * len(data_df) / 60 / 60

In [None]:
# Translate the video-details frame in batches of rows

columns_to_translate = ['tags', 'descriptions','titles']

def translate_batch(batch):
    """Apply `translate` to every cell of a DataFrame slice and return the result.

    Each column is mapped with Series.map, so this does not depend on
    DataFrame.map being available in the installed pandas version.
    """
    return batch.apply(lambda column: column.map(translate))

batch_size = 100
# The text columns live in data_df (the video details), not in df (the channel-ID table).
total_rows = len(data_df)

start_time = time.time()

# Iterate over the rows in batches
for start in range(0, total_rows, batch_size):
    # Positional slicing, so the batches do not depend on the index labels
    batch = data_df.iloc[start:start + batch_size][columns_to_translate]

    # Translate the batch
    translated_batch = translate_batch(batch)

    # Write the translations back into new 'translated <column>' columns
    for column in columns_to_translate:
        translated_column = f'translated {column}'
        data_df.loc[batch.index, translated_column] = translated_batch[column]

    # Short pause between batches
    time.sleep(1)


end_time = time.time()
total_time = end_time - start_time



# Display the resulting DataFrame
data_df.head()