In [1]:
import os
os.chdir('..') # this resolves ImportError: attempted relative import with no known parent package
# NOTE: the chdir above is relative, so re-running this cell moves up one more
# directory each time. Run it exactly once per kernel session.
from dotenv import load_dotenv
# presumably loads API_KEY (read via os.environ in the next cell) from a .env file -- verify
load_dotenv()
import googleapiclient.discovery
import pandas as pd
# project-local import; depends on the working directory set by the chdir above
from src.api.get_youtube_data import get_video_ids
from tqdm import tqdm

In [2]:
# Identify which Google API (and which version of it) the client should target,
# and pull the developer key from the environment rather than hard-coding it.
api_service_name = "youtube"
api_version = "v3"
API_KEY = os.getenv("API_KEY")

# Build the YouTube service object that the request helpers below receive.
youtube = googleapiclient.discovery.build(
    serviceName=api_service_name,
    version=api_version,
    developerKey=API_KEY,
)

In [3]:
# set channel and search details
# These names are passed as keyword arguments to get_video_ids in the next cell.
bandai_namco_america_id = "UC_ntXHv-XdKCD7CPynVvnQw"
# Publication window bounds, written as UTC timestamps (trailing "Z").
published_after = "2023-06-01T00:00:00Z"
published_before = "2023-12-28T00:00:00Z"
# Search keyword; written in lower case here.
search_term = "tekken"

In [4]:
# Collect the ids of the channel's videos that match the search settings above.
video_ids = get_video_ids(
    youtube_service_object=youtube,
    channel_id=bandai_namco_america_id,
    published_after=published_after,
    published_before=published_before,
    search_term=search_term,
)

In [5]:
# Display every id returned by the search so the list can be checked by eye.
video_ids

['6m7jNzjuoHU',
 'UgnPG2bScVQ',
 '9jJiNa4HoD0',
 'X1dgCe1jDYg',
 'EMZkmjE8wdw',
 'ToKJfywbe1o',
 '9D5vq-zq9y4',
 'y8JGUIF2pu4',
 'oeFfzCWif-Q',
 'bSCANspTDeE',
 'ucesGynb2Yk',
 'UcBcNOSoFzI',
 '8DVlK_QrZ-A',
 'Zc-yMi05vBA',
 'e1N4juHVqNc',
 'QH6s_o3dIic',
 '7skTtnpSb58',
 'bjzYbEjE-C4',
 'weVrUBszFIM',
 'Gw5nQaSF0CI',
 '3pGxqOFmIN4',
 'w0IqzD-gUOI',
 'PsCpewoF2E4',
 'cHnxJplTQuY',
 'qbUnCiTMCGE',
 'cIDK50IaVpg',
 'rDxrpSqYHD8',
 'YFJfLsJtVzM',
 'e7mqbmNb6eA',
 '_q26pgYDOV4',
 'n27RxZ7vnAU',
 'lDgv7CMIoRo',
 'nA-QTZbm_hU',
 'KHSwxMDibvE',
 'sNHv1y46dhs',
 'yowAmloydVY',
 '6hxZCQtpJ9w',
 '-l3AY19cn0M',
 'MxGp3wHXtNE',
 '4UkK_psUEVM',
 'maawo7O9Sg8',
 'Qkoba4YhbGo',
 'N5ZlZSnNyo0',
 'hEQTFXMQU7I',
 'flXHAFNT4sU',
 'RFO9Z_0wrKE',
 'kpmOhBWlDfc',
 '0Tk5YA4WRrg',
 'bMbDVh_OKZg',
 'UPPKjJgQT4A',
 'iILoqJlCa2s',
 'ODforBeu7_c',
 'DcDjyestr4Y',
 'Vs2piSWfofQ',
 '9Xgj0KYypVw',
 'LYdPz_GD8OQ',
 '6JLOD_a7or4',
 'Nh5udl01YXc']

In [6]:
# The count matters: more than 50 ids cannot be sent in a single request (see the note below).
len(video_ids)

58

# Video data from a given video ID

The video data function returns an error when it is given more than 50 video IDs. To work around this, the function below splits the ID list into batches of at most 50 and requests the data one batch at a time.

In [25]:
def get_video_data(youtube_service_object, video_ids, search_term="tekken"):
    """Retrieves snippet and statistics data for YouTube video IDs and returns a dataframe
    with the videos whose title contains ``search_term``.

    The IDs are processed in batches of at most 50 because the video data request
    fails when it is given a longer ID list.

    Parameters
    ----------
    youtube_service_object : googleapiclient object
        a service object created using `googleapiclient.discovery.build`

    video_ids : list or str
        A list of video IDs or a single string if only wanting to return data for one video ID.

    search_term : str or None, default "tekken"
        Case-insensitive text that must appear in a video's title for the row to be kept.
        Pass ``None`` to keep every video.

    Returns
    --------
    df : dataframe
        One row per unique video ID, sorted by ``publishedAt``, restricted to titles that
        contain ``search_term``. An empty dataframe with the expected columns is returned
        when there are no IDs or no data came back.

    """

    batch_size = 50  # the request fails for more than 50 ids at a time
    column_dtypes = {"publishedAt": "datetime64[ns, UTC]",
                     "viewCount": "int64",
                     "likeCount": "int64",
                     "commentCount": "int64",
                     "favoriteCount": "int64"}
    output_columns = ["channelTitle", "channelId", "videoId", "publishedAt", "title",
                      "description", "tags", "viewCount", "likeCount", "commentCount",
                      "favoriteCount"]

    def _empty_frame():
        # typed, zero-row frame so callers see the same columns/dtypes as a normal result
        return pd.DataFrame({column: pd.Series(dtype=column_dtypes.get(column, "object"))
                             for column in output_columns})

    # a bare string is ONE id; without this it would be sliced character by character
    if isinstance(video_ids, str):
        video_ids = [video_ids]
    else:
        video_ids = list(video_ids)

    # nothing to request: casting a column-less frame below would raise
    if not video_ids:
        return _empty_frame()

    all_data_dict = {}

    # split video_ids list into batches of 50 and process each batch using helper function
    for index, batch_start in tqdm(enumerate(range(0, len(video_ids), batch_size))):
        print(f"Batch {index+1} start: {batch_start}")
        batch_end = min(batch_start + batch_size, len(video_ids))
        print(f"Batch {index+1} end: {batch_end}")
        batch_ids = video_ids[batch_start:batch_end]

        # get data for the batch of <=50 video ids using helper function
        batch_data = _get_video_data_for_batch(youtube_service_object, batch_ids)
        all_data_dict.update(batch_data)

    # the API returned no items for any id (e.g. all ids unknown)
    if not all_data_dict:
        return _empty_frame()

    # create dataframe from dictionary data; the id-keyed index is discarded by
    # reset_index because each record already carries its own "videoId" column
    df = (pd.DataFrame.from_dict(all_data_dict, orient="index")
          .astype(column_dtypes)
          .drop_duplicates(subset=['videoId'])
          .sort_values(by=["publishedAt"])
          .reset_index(drop=True)
         )

    # drop rows whose title does not contain the search term (case-insensitive, literal match)
    if search_term is not None:
        title_matches = df['title'].str.lower().str.contains(search_term.lower(), regex=False)
        df = df.loc[title_matches].reset_index(drop=True)

    return df



In [26]:
def _get_video_data_for_batch(youtube_service_object, video_ids):
    """Helper function that retrieves statistics for a given YouTube video ID, in batches 
    of 50 video ids at a time.

    Parameters
    ----------
    youtube_service_object : googleapiclient object
        a service object created using `googleapiclinet.discovery.build`
    
    video_ids : list or str
        A list of video IDs or a single string if only wanting to return data for one video ID.

    Returns
    -------
    data_dict : dictionary
        A dictionary with the data for the passed video IDs.
    
    """
 
    request = youtube_service_object.videos().list(
        part="snippet,statistics",
        maxResults=50,
        id=video_ids
    )
    
    response = request.execute()
    
    data_dict = {}
    
    # loop through response and store data about each video in a dictionary
    for i, v in enumerate(response["items"]):
        data_dict_ = {
            response["items"][i]["id"]: {"channelTitle": response["items"][i]["snippet"]["channelTitle"],
                                         "channelId": response["items"][i]["snippet"]["channelId"],
                                         "videoId": response["items"][i]["id"],
                                         "publishedAt": response["items"][i]["snippet"]["publishedAt"],
                                         "title": response["items"][i]["snippet"]["title"],
                                         "description": response["items"][i]["snippet"]["description"],
                                         "tags": response["items"][0]["snippet"].get("tags"),   # use .get() so none is returned if 'tags' isn't present (video doesn't have tags)
                                         "viewCount": response["items"][i]["statistics"]["viewCount"],
                                         "likeCount": response["items"][i]["statistics"]["likeCount"],
                                         "commentCount": response["items"][i]["statistics"]["commentCount"],
                                         "favoriteCount": response["items"][i]["statistics"]["favoriteCount"],
                                        }
        }
        
        # add the data to the data dictionary
        data_dict.update(data_dict_)
    
    next_page_token = response.get("nextPageToken", None)
    more_pages = True
    
    while more_pages == True:
        if next_page_token is None:
            more_pages = False
    
        else: 
            # make a request to the youtube api to get the next page results   
            request = youtube_service_object.videos().list(
                part="snippet,statistics",
                maxResults=50,
                id=video_ids,
                pageToken=next_page_token,
            )
            
            response = request.execute()
                
            # loop through response and store data about each video in a dictionary
            for i, v in enumerate(response["items"]):
                data_dict_ = {
                    response["items"][i]["id"]: {"channelTitle": response["items"][i]["snippet"]["channelTitle"],
                                                 "channelId": response["items"][i]["snippet"]["channelId"],
                                                 "videoId": response["items"][i]["id"],
                                                 "publishedAt": response["items"][i]["snippet"]["publishedAt"],
                                                 "title": response["items"][i]["snippet"]["title"],
                                                 "description": response["items"][i]["snippet"]["description"],
                                                 "tags": response["items"][i]["snippet"].get("tags"),   # use .get() so none is returned if 'tags' isn't present (video doesn't have tags)
                                                 "viewCount": response["items"][i]["statistics"]["viewCount"],
                                                 "likeCount": response["items"][i]["statistics"]["likeCount"],
                                                 "commentCount": response["items"][i]["statistics"]["commentCount"],
                                                 "favoriteCount": response["items"][i]["statistics"]["favoriteCount"],
                                                }
                }
                
                # add the data to the data dictionary
                data_dict.update(data_dict_)
        
            next_page_token = response.get("nextPageToken", None)

    
    return data_dict

# Test function

In [27]:
# Smoke test: run the batched function on the full id list and preview the first rows.
df = get_video_data(youtube_service_object=youtube, video_ids=video_ids)
df.head()

2it [00:00, 12.32it/s]

Batch 1 start: 0
Batch 1 end: 50
Batch 2 start: 50
Batch 2 end: 58





Unnamed: 0,channelTitle,channelId,videoId,publishedAt,title,description,tags,viewCount,likeCount,commentCount,favoriteCount
0,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,Nh5udl01YXc,2023-06-09 13:05:54+00:00,TEKKEN 8 — Closed Network Test Announcement Tr...,It's time to get this show on the road 🥊\n16 c...,"[Bandai Namco, Bandai Namco Entertainment, Vid...",635713,18876,2186,0
1,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,LYdPz_GD8OQ,2023-07-19 14:00:16+00:00,TEKKEN Talk — Episode 2,We're back with more #TEKKENTalk! Hosted by su...,"[Bandai Namco, Bandai Namco Entertainment, Vid...",99885,3115,617,0
2,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,9Xgj0KYypVw,2023-07-20 13:00:13+00:00,TEKKEN 8 — Claudio Serafino Reveal & Gameplay ...,It's time to get Sirius ✨ Claudio Serafino wil...,"[Bandai Namco, Bandai Namco Entertainment, Vid...",770950,29137,3681,0
3,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,ODforBeu7_c,2023-08-05 22:22:04+00:00,TEKKEN Talk Live @ Evo 2023!,Join us for a live episode of #TEKKENTalk at E...,"[Bandai Namco, Bandai Namco Entertainment, Vid...",96718,1874,135,0
4,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,UPPKjJgQT4A,2023-08-06 20:02:43+00:00,TEKKEN 8 — Raven Reveal & Gameplay Trailer,Now you see him. Now you don't. Raven returns ...,"[Bandai Namco, Bandai Namco Entertainment, Vid...",938486,33705,4119,0


In [28]:
# Check that the date and count columns were cast to the intended dtypes and contain no nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   channelTitle   22 non-null     object             
 1   channelId      22 non-null     object             
 2   videoId        22 non-null     object             
 3   publishedAt    22 non-null     datetime64[ns, UTC]
 4   title          22 non-null     object             
 5   description    22 non-null     object             
 6   tags           22 non-null     object             
 7   viewCount      22 non-null     int64              
 8   likeCount      22 non-null     int64              
 9   commentCount   22 non-null     int64              
 10  favoriteCount  22 non-null     int64              
dtypes: datetime64[ns, UTC](1), int64(4), object(6)
memory usage: 2.0+ KB


In [29]:
# Sanity check: the number of unique ids should equal the row count reported by df.info().
df['videoId'].nunique()

22

In [31]:
# export the video data to a csv
# The path is relative to the working directory, which the first cell changed with os.chdir('..').
df.to_csv("data/raw/video_data.csv", index=False)

----------------------
----------------------

In [4]:
def get_video_data(youtube_service_object, video_ids, search_term="tekken"):
    """Retrieves snippet and statistics data for YouTube video IDs and returns a dataframe
    with the videos whose title contains ``search_term``.

    NOTE(review): a function with this same name is defined earlier in the notebook;
    whichever cell runs last is the one in effect. This version is self-contained and
    sends the IDs in batches of at most 50, because the request fails for longer lists.

    Parameters
    ----------
    youtube_service_object : googleapiclient object
        a service object created using `googleapiclient.discovery.build`

    video_ids : list or str
        A list of video IDs or a single string if only wanting to return data for one video ID.

    search_term : str or None, default "tekken"
        Case-insensitive text that must appear in a video's title for the row to be kept.
        Pass ``None`` to keep every video.

    Returns
    --------
    df : dataframe
        One row per unique video ID, sorted by ``publishedAt``, restricted to titles that
        contain ``search_term``. An empty dataframe with the expected columns is returned
        when there are no IDs or no data came back.

    """

    batch_size = 50  # the request fails for more than 50 ids at a time
    column_dtypes = {"publishedAt": "datetime64[ns, UTC]",
                     "viewCount": "int64",
                     "likeCount": "int64",
                     "commentCount": "int64",
                     "favoriteCount": "int64"}
    output_columns = ["channelTitle", "channelId", "videoId", "publishedAt", "title",
                      "description", "tags", "viewCount", "likeCount", "commentCount",
                      "favoriteCount"]

    # a bare string is ONE id; wrap it so batching slices ids rather than characters
    if isinstance(video_ids, str):
        video_ids = [video_ids]
    else:
        video_ids = list(video_ids)

    data_dict = {}

    for batch_start in range(0, len(video_ids), batch_size):
        batch_ids = video_ids[batch_start:batch_start + batch_size]
        next_page_token = None

        while True:
            request_kwargs = {"part": "snippet,statistics",
                              "maxResults": 50,
                              "id": batch_ids}
            # only follow-up requests carry a page token; the first request omits it
            if next_page_token is not None:
                request_kwargs["pageToken"] = next_page_token

            response = youtube_service_object.videos().list(**request_kwargs).execute()

            # store data about each video in a dictionary keyed by its id
            for item in response.get("items", []):
                snippet = item["snippet"]
                statistics = item["statistics"]
                data_dict[item["id"]] = {
                    "channelTitle": snippet["channelTitle"],
                    "channelId": snippet["channelId"],
                    "videoId": item["id"],
                    "publishedAt": snippet["publishedAt"],
                    "title": snippet["title"],
                    "description": snippet["description"],
                    # use .get() so None is returned if 'tags' isn't present (video doesn't have tags);
                    # tags must come from THIS item, not from the first item of the page
                    "tags": snippet.get("tags"),
                    "viewCount": statistics["viewCount"],
                    "likeCount": statistics["likeCount"],
                    "commentCount": statistics["commentCount"],
                    "favoriteCount": statistics["favoriteCount"],
                }

            # stop paging this batch once the response has no further page
            next_page_token = response.get("nextPageToken", None)
            if next_page_token is None:
                break

    # no ids given or nothing returned: casting a column-less frame would raise,
    # so hand back a typed, zero-row frame instead
    if not data_dict:
        return pd.DataFrame({column: pd.Series(dtype=column_dtypes.get(column, "object"))
                             for column in output_columns})

    # create dataframe from dictionary data; the id-keyed index is discarded by
    # reset_index because each record already carries its own "videoId" column
    df = (pd.DataFrame.from_dict(data_dict, orient="index")
          .astype(column_dtypes)
          .drop_duplicates(subset=['videoId'])
          .sort_values(by=["publishedAt"])
          .reset_index(drop=True)
         )

    # drop rows whose title does not contain the search term (case-insensitive, literal match)
    if search_term is not None:
        title_matches = df['title'].str.lower().str.contains(search_term.lower(), regex=False)
        df = df.loc[title_matches].reset_index(drop=True)

    return df