In [15]:
# !pip install --upgrade google-api-python-client

In [16]:
# !pip install --upgrade google-auth-oauthlib google-auth-httplib2

In [17]:
import sys
sys.path.append('../config') 

from config import YOUTUBE_API_KEY

from googleapiclient.discovery import build
import pandas as pd
from IPython.display import JSON
import urllib.request
import re

Los proyectos que habilitan la API de datos de YouTube tienen una asignación de cuota predeterminada de 10,000 unidades por día, una cantidad suficiente para la gran mayoría de los usuarios de la API. La cuota predeterminada, que está sujeta a cambios, nos ayuda a optimizar las asignaciones de cuotas y a escalar nuestra infraestructura de una manera que sea más significativa para nuestros usuarios de API. Puedes ver el uso de tu cuota en la página Cuotas en la Consola de API.

Nota: Si alcanzas el límite de cuota, puedes solicitar un aumento del cuota a través del formulario de solicitud de extensión de cuota para los servicios de la API de YouTube.

Calcula el uso de la cuota

Google calcula el uso de tu cuota mediante la asignación de un costo a cada solicitud. Los diferentes tipos de operaciones tienen diferentes costos de cuotas. Por ejemplo:

Una operación de lectura que recupera una lista de recursos (canales, videos o listas de reproducción) suele costar 1 unidad.
Una operación de escritura que crea, actualiza o borra un recurso suele tener un costo de 50 unidades.
Una solicitud de búsqueda cuesta 100 unidades.
La carga de un video cuesta 1600 unidades.

## Definición API

In [18]:
api_key = YOUTUBE_API_KEY
api_service_name = "youtube"
api_version = "v3"

youtube = build(
    api_service_name, api_version, developerKey=api_key)

## Obtencion comentarios en videos del Canal

Buscar en el codigo fuente de la pagina para obtener el ID <br>
https://www.youtube.com/channel/

### Obtención de Id del canal

In [19]:
def extract_channel_ids(urls):
    all_channels = []
    for url in urls:
        with urllib.request.urlopen(url) as response:
            html_content = response.read().decode('utf-8')

        pattern = r'https://www.youtube.com/channel/([^"]+)'
        matches = re.findall(pattern, html_content)

        all_channels.append(matches[0])

    return all_channels

In [20]:
urls = [
    'https://www.youtube.com/@IbaiLlanos',
    #More channels
]
channel_ids = extract_channel_ids(urls)

In [21]:
def get_channel_stats(youtube, channel_ids):
    all_data = []

    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id=','.join(channel_ids)
    )
    response = request.execute()

    for item in response['items']:
        data = {'channelName': item['snippet']['title'],
                'subscribers': item['statistics']['subscriberCount'],
                'views': item['statistics']['viewCount'],
                'totalVideos': item['statistics']['videoCount'],
                'playlistId': item['contentDetails']['relatedPlaylists']['uploads']
                }
        all_data.append(data)

    return (pd.DataFrame(all_data))

In [22]:
channel_stats = get_channel_stats(youtube, channel_ids)
channel_stats

Unnamed: 0,channelName,subscribers,views,totalVideos,playlistId
0,Ibai,11400000,3364911676,1370,UUaY_-ksFSQtTGk0y1HA_3YQ


### Obtencion videos del canal

In [23]:
def get_video_ids(youtube, playlist_id, limit=None, output_file="../data/youtube/yt_video_ids.txt"):
    video_ids = []
    next_page_token = True
    total_results = 0

    while next_page_token is not None and (limit is None or total_results < limit):
        request = youtube.playlistItems().list(
            part="snippet,contentDetails",
            playlistId=playlist_id,
            maxResults=50  # Máximo permitido por la API
        )
        response = request.execute()

        for item in response['items']:
            video_id = item['contentDetails']['videoId']
            video_ids.append(video_id)
            total_results += 1

            # Escribir el ID del video en el archivo de salida
            with open(output_file, "a") as file:
                file.write(video_id + "\n")

            if limit is not None and total_results == limit:
                break

        next_page_token = response.get('nextPageToken')

    return video_ids


In [24]:
video_ids = []
for index, row in channel_stats.iterrows():
    playlist_id = row['playlistId']
    video_ids.extend(get_video_ids(youtube, playlist_id, limit=1794))

video_ids

['MWn--jTEJ8k',
 '-QtaNl9EpPA',
 'pASX4QE-N50',
 'k70W5W5UFjU',
 '7Dy_noSGxFU',
 'x0HmLl8MVnU',
 'WZZ0q-dC0Is',
 'qPDowzeIg3E',
 'Z0B-Cu6q_2w',
 'v1V_R5uAfB0',
 '4fPh-__KygE',
 'Aqh0pSy-6lg',
 'KJw0PTs6WE8',
 'f5nHjpf7hio',
 'ibkVULaVBu4',
 'R038puMb48o',
 'L7LlWDpaqM4',
 'ipjGZyQEiBQ',
 'q_SODM8T51U',
 'YGfkGZnf7Dg',
 'tmWSSu3G71s',
 'pbI7-wv8ETc',
 '6xF48_ba3xY',
 't19hvvUa-Zk',
 'EPXwufbD_rs',
 'mITy7skhkj8',
 'b98Gxxjfowk',
 'bqlpGRW19Uw',
 'rlm6iQPZ6g4',
 'ARK93MEhZDo',
 'jFziAts2_A0',
 'XnjEOfqRuxk',
 '34j_3bB6AKY',
 'r_a-wOSo4Mk',
 'cdEc_QodJyE',
 'Bt3qhUjml9I',
 'Pp-0OCjmCVw',
 'QTy8S58MsRg',
 'pGAGjjiLpzY',
 'CcXCzFAn1wg',
 'm15GSAb-GRk',
 '4x_UYrEPbO8',
 'PKURQpqG25k',
 'tBZbah0Y-I4',
 'ASL5hhgRMfQ',
 'nfYzK0XNcLk',
 'qVaz55Uzo9o',
 'bPZaOY-nIfo',
 'vD4UXlj6jkI',
 'l8CinUZ0aAg',
 'MWn--jTEJ8k',
 '-QtaNl9EpPA',
 'pASX4QE-N50',
 'k70W5W5UFjU',
 '7Dy_noSGxFU',
 'x0HmLl8MVnU',
 'WZZ0q-dC0Is',
 'qPDowzeIg3E',
 'Z0B-Cu6q_2w',
 'v1V_R5uAfB0',
 '4fPh-__KygE',
 'Aqh0pSy-6lg',
 'KJw0PT

### Obtencion estadísticas de vídeos

In [25]:
def get_video_details(youtube, video_ids):
        all_video_info = []

        for i in range(0, len(video_ids), 50):
                request = youtube.videos().list(
                        part="snippet,contentDetails,statistics",
                        id=','.join(video_ids[i:i+50])
                )
                response = request.execute()

                for video in response['items']:
                        stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                                        'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                                        'contentDetails': ['duration', 'definition', 'caption']
                                        }
                        video_info = {}
                        video_info['video_id'] = video['id']

                        for k in stats_to_keep.keys():
                                for v in stats_to_keep[k]:
                                        try:
                                                video_info[v] = video[k][v]
                                        except:
                                                video_info[v] = None
                        
                        all_video_info.append(video_info)

        return pd.DataFrame(all_video_info)

In [26]:
#with open('../data/youtube/yt_video_ids.txt', 'r') as file:
    #video_ids = file.read().splitlines()

video_df = get_video_details(youtube, video_ids)
video_df.to_csv('../data/youtube/yt_video_stats.csv', sep=';', index=False)
video_df.head(5)

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,MWn--jTEJ8k,Ibai,Mi primera chamba,MI CANAL DE DIRECTOS: https://www.youtube.com/...,"[ibai, ibai llanos, reaccionando Ibai, Ibai re...",2024-03-06T11:00:28Z,10477,1282,,15,PT28S,hd,False
1,-QtaNl9EpPA,Ibai,No me lo esperaba,MI CANAL DE DIRECTOS: https://www.youtube.com/...,"[ibai, ibai llanos, reaccionando Ibai, Ibai re...",2024-03-05T11:00:00Z,48223,4190,,21,PT13S,hd,False
2,pASX4QE-N50,Ibai,YOSOYPLEX VS EL MARIANA | CARA A CARA,MI CANAL DE DIRECTOS: https://www.youtube.com/...,"[ibai, ibai llanos, reaccionando Ibai, Ibai re...",2024-03-04T22:01:34Z,910452,49368,,2473,PT17M14S,hd,False
3,k70W5W5UFjU,Ibai,EL REY DE LA PISTA | CARA A CARA,MI CANAL DE DIRECTOS: https://www.youtube.com/...,"[ibai, ibai llanos, reaccionando Ibai, Ibai re...",2024-03-04T22:00:24Z,556351,25072,,1054,PT33M42S,hd,False
4,7Dy_noSGxFU,Ibai,VIRUZZ VS SHELAO | CARA A CARA,MI CANAL DE DIRECTOS: https://www.youtube.com/...,"[ibai, ibai llanos, reaccionando Ibai, Ibai re...",2024-03-04T21:17:46Z,970454,54327,,3385,PT25M15S,hd,False


### Comentarios

In [27]:
def get_comments_in_videos(youtube, video_ids, limit):
    all_comments = []
    
    for video_id in video_ids:
        try:
            request = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=limit
            )
            response = request.execute()
        
            comments_in_video = [
                {
                    'comment': comment['snippet']['topLevelComment']['snippet']['textOriginal'],
                    'date': comment['snippet']['topLevelComment']['snippet']['publishedAt']
                }
                for comment in response['items']
            ]
            
            for comment_info in comments_in_video:
                comment_info['video_id'] = video_id
                all_comments.append(comment_info)
        
        except Exception as e:
            print(f"Could not get comments for video {video_id}: {str(e)}")
    
    return pd.DataFrame(all_comments)

In [28]:
#with open('../data/youtube/yt_video_ids.txt', 'r') as file:
    #video_ids = file.read().splitlines()

comments_df = get_comments_in_videos(youtube, video_ids , limit=None)
comments_df.to_csv('../data/youtube/yt_video_comments.csv', sep=';', index=False)
comments_df

Unnamed: 0,comment,date,video_id
0,Mi primera chamba,2024-03-06T16:53:06Z,MWn--jTEJ8k
1,Ibai lleva a anuel o bad bunny a la velada y m...,2024-03-06T16:38:23Z,MWn--jTEJ8k
2,Qyeso,2024-03-06T16:14:51Z,MWn--jTEJ8k
3,El pepe,2024-03-06T16:04:08Z,MWn--jTEJ8k
4,que grande,2024-03-06T14:32:12Z,MWn--jTEJ8k
...,...,...,...
34867,2:28 4:34\n\nYo riéndome con video graciosos a...,2024-02-24T19:37:14Z,tBZbah0Y-I4
34868,"tenia que haber ganado el de""uaaaaaaaaaaaau, r...",2024-02-24T15:09:29Z,tBZbah0Y-I4
34869,7:01 Exploté de la risa 🤣,2024-02-24T13:17:36Z,tBZbah0Y-I4
34870,Hola,2024-02-24T03:25:05Z,tBZbah0Y-I4
