In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
import googleapiclient.discovery
import pandas as pd

In [2]:
# set parameters required to create a youtube service object
api_service_name = "youtube"
api_version = "v3"
API_KEY = os.environ.get("API_KEY")

# create a youtube api service object
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=API_KEY)

# Video Ids

In [3]:
def get_video_ids(youtube_service_object, channel_id: str, published_after, published_before, search_term: str = None):
    """Connects to the YouTube Data API using 'search' and returns video Ids for a specified request.

    
    Parameters
    ----------
    service_object : googleapiclient object
        a service object created using `googleapiclinet.discovery.build`

    channel_Id : str
        The id of the YouTube channel you want to search for videos.

    published_after : datetime
        an RFC 3339 formatted date-time value (1970-01-01T00:00:00Z).

    published_before : datetime
        an RFC 3339 formatted date-time value (1970-01-01T00:00:00Z).

    query : str
        A search term if you wish to narrow down the search using keywords. See Notes for 
        further information.



    Returns
    --------
    video_ids : list
        A list of the videoId values obtained from the request.

    
    Notes
    ------
    query : str
        Your request can also use the Boolean NOT (-) and OR (|) operators to exclude videos or 
        to find videos that are associated with one of several search terms. For example, to 
        search for videos matching either "boating" or "sailing", set the q parameter value to 
        boating|sailing. Similarly, to search for videos matching either "boating" or "sailing" 
        but not "fishing", set the q parameter value to boating|sailing -fishing. Note that the 
        pipe character must be URL-escaped when it is sent in your API request. The URL-escaped 
        value for the pipe character is %7C.


    References
    -----------
        https://developers.google.com/youtube/v3/docs/search
    
    """
    
    # make a request to the youtube api
    request = youtube_service_object.search().list(
        channelId=channel_id,
        publishedAfter=published_after,
        publishedBefore=published_before,
        q=search_term,
        part="snippet", 
        type="video",
        order="date",
        maxResults=50,
    )
    response = request.execute()
    
    video_ids = []

    # loop through response and store video Ids in a list
    for i, v in enumerate(range(len(response["items"]))):
        video_ids.append(response["items"][i]["id"]["videoId"])
    
    next_page_token = response.get("nextPageToken", None)
    more_pages = True
    
    while more_pages == True:
        if next_page_token is None:
            more_pages = False
    
        else: 
            # make a request to the youtube api to get the next page results
            request = youtube_service_object.search().list(
            channelId=channel_id,
            publishedAfter=published_after,
            publishedBefore=published_before,
            q=search_term,
            part="snippet", 
            type="video",
            order="date",
            maxResults=50,
            pageToken=next_page_token
            )
            response = request.execute()
    
            for i, v in enumerate(range(len(response["items"]))):
                video_ids.append(response["items"][i]["id"]["videoId"])
            
            next_page_token = response.get("nextPageToken", None)
    
    return video_ids

# Video data from a given video ID

In [4]:
def get_video_data(youtube_service_object, video_ids):
    """Retrieves statistics for a given YouTube video ID.

    Parameters
    ----------
    youtube_service_object : googleapiclient object
        a service object created using `googleapiclinet.discovery.build`
    
    video_ids : list or str
        A list of video IDs or a single string if only wanting to return data for one video ID.

    Returns
    --------
    df_ : dataframe
        A dataframe with the data for each video ID.
    
    """

    request = youtube_service_object.videos().list(
        part="snippet,statistics",
        maxResults=50,
        id=video_ids
    )
    
    response = request.execute()
    
    data_dict = {}
    
    # loop through response and store data about each video in a dictionary
    for i, v in enumerate(response["items"]):
        data_dict_ = {
            response["items"][i]["id"]: {"channelTitle": response["items"][i]["snippet"]["channelTitle"],
                                         "channelId": response["items"][i]["snippet"]["channelId"],
                                         "videoId": response["items"][i]["id"],
                                         "publishedAt": response["items"][i]["snippet"]["publishedAt"],
                                         "title": response["items"][i]["snippet"]["title"],
                                         "description": response["items"][i]["snippet"]["description"],
                                         "tags": response["items"][i]["snippet"]["tags"],
                                         "viewCount": response["items"][i]["statistics"]["viewCount"],
                                         "likeCount": response["items"][i]["statistics"]["likeCount"],
                                         "commentCount": response["items"][i]["statistics"]["commentCount"],
                                         "favoriteCount": response["items"][i]["statistics"]["favoriteCount"],
                                        }
        }
        
        # add the data to the data dictionary
        data_dict.update(data_dict_)
    
    next_page_token = response.get("nextPageToken", None)
    more_pages = True
    
    while more_pages == True:
        if next_page_token is None:
            more_pages = False
    
        else: 
            # make a request to the youtube api to get the next page results   
            request = youtube_service_object.videos().list(
                part="snippet,statistics",
                maxResults=50,
                id=video_ids,
                pageToken=next_page_token,
            )
            
            response = request.execute()
            
            # loop through response and store data about each video in a dictionary
            for i, v in enumerate(response["items"]):
                data_dict_ = {
                    response["items"][i]["id"]: {"channelTitle": response["items"][i]["snippet"]["channelTitle"],
                                                 "channelId": response["items"][i]["snippet"]["channelId"],
                                                 "videoId": response["items"][i]["id"],
                                                 "publishedAt": response["items"][i]["snippet"]["publishedAt"],
                                                 "title": response["items"][i]["snippet"]["title"],
                                                 "description": response["items"][i]["snippet"]["description"],
                                                 "tags": response["items"][i]["snippet"]["tags"],
                                                 "viewCount": response["items"][i]["statistics"]["viewCount"],
                                                 "likeCount": response["items"][i]["statistics"]["likeCount"],
                                                 "commentCount": response["items"][i]["statistics"]["commentCount"],
                                                 "favoriteCount": response["items"][i]["statistics"]["favoriteCount"],
                                                }
                }
                
                # add the data to the data dictionary
                data_dict.update(data_dict_)
            
            next_page_token = response.get("nextPageToken", None)

    
    # create dataframe from dictionary data
    df = (pd.DataFrame.from_dict(data_dict, orient="index")
          .rename_axis("videoId")
          .astype({"publishedAt": "datetime64[ns, UTC]", 
                           "viewCount": "int64", 
                           "likeCount": "int64", 
                           "commentCount": "int64", 
                           "favoriteCount": "int64"})
          .drop_duplicates(subset=['videoId'])
          .sort_values(by=["publishedAt"])
          .reset_index(drop=True)
         )

    # add a line that drops rows where 'tekken' isn't in the video title
    df = (df.loc[df['title'].str.lower().str.contains("tekken")]
          .reset_index(drop=True))


    return df

# Test functions

In [5]:
# set channel and search details
bandai_namco_america_id = "UC_ntXHv-XdKCD7CPynVvnQw"
published_after = "2023-06-01T00:00:00Z"
published_before = "2023-12-21T00:00:00Z"
search_term = "tekken"

In [6]:
# set parameters required to create a youtube service object
api_service_name = "youtube"
api_version = "v3"
API_KEY = os.environ.get("API_KEY")

# create a youtube service object
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=API_KEY)

## Get channel data

In [7]:
# make a request to the youtube api
request_channel_data = youtube.channels().list(
    part = "snippet,statistics,contentDetails", 
    # forUsername = "BandaiNamcoAmerica",
    id = "UC_ntXHv-XdKCD7CPynVvnQw"
)

response = request_channel_data.execute()
response

{'kind': 'youtube#channelListResponse',
 'etag': 't-ljBz4wibkJdndFyooMPL-Cgro',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5},
 'items': [{'kind': 'youtube#channel',
   'etag': 'sE8FVbDqKHWy0boDbKZocBNeBKE',
   'id': 'UC_ntXHv-XdKCD7CPynVvnQw',
   'snippet': {'title': 'Bandai Namco Entertainment America',
    'description': 'Fun for all into the future! Tune in right here for new trailers and reveals for your favorite Bandai Namco games!\n\n\n',
    'customUrl': '@bandainamcoamerica',
    'publishedAt': '2006-09-19T17:48:06Z',
    'thumbnails': {'default': {'url': 'https://yt3.ggpht.com/-K3RYA88iYssAO0kKa5DKazS_6mrkBIbgpnHpykrGeROqEIZ18QA7JmD6KPiaXwL_JXNp3hf=s88-c-k-c0x00ffffff-no-rj',
      'width': 88,
      'height': 88},
     'medium': {'url': 'https://yt3.ggpht.com/-K3RYA88iYssAO0kKa5DKazS_6mrkBIbgpnHpykrGeROqEIZ18QA7JmD6KPiaXwL_JXNp3hf=s240-c-k-c0x00ffffff-no-rj',
      'width': 240,
      'height': 240},
     'high': {'url': 'https://yt3.ggpht.com/-K3RYA88iYssAO0kKa5DKaz

In [8]:
response['items'][0]['id']

'UC_ntXHv-XdKCD7CPynVvnQw'

In [9]:
response['items'][0]['statistics']['subscriberCount']

'1100000'

## Get video Ids

In [10]:
video_ids = get_video_ids(youtube_service_object=youtube, 
                          channel_id=bandai_namco_america_id,
                         published_after=published_after,
                         published_before=published_before,
                         search_term=search_term)

In [11]:
video_ids

['UgnPG2bScVQ',
 '9jJiNa4HoD0',
 'X1dgCe1jDYg',
 'EMZkmjE8wdw',
 'ToKJfywbe1o',
 '9D5vq-zq9y4',
 'y8JGUIF2pu4',
 'oeFfzCWif-Q',
 'bSCANspTDeE',
 'ucesGynb2Yk',
 'UcBcNOSoFzI',
 '8DVlK_QrZ-A',
 'Zc-yMi05vBA',
 'e1N4juHVqNc',
 'QH6s_o3dIic',
 '7skTtnpSb58',
 'bjzYbEjE-C4',
 'weVrUBszFIM',
 'Gw5nQaSF0CI',
 '3pGxqOFmIN4',
 'w0IqzD-gUOI',
 'PsCpewoF2E4',
 'cHnxJplTQuY',
 'qbUnCiTMCGE',
 'cIDK50IaVpg',
 'rDxrpSqYHD8',
 'YFJfLsJtVzM',
 'e7mqbmNb6eA',
 '_q26pgYDOV4',
 'n27RxZ7vnAU',
 'lDgv7CMIoRo',
 'nA-QTZbm_hU',
 'KHSwxMDibvE',
 'sNHv1y46dhs',
 'yowAmloydVY',
 '6hxZCQtpJ9w',
 '-l3AY19cn0M',
 'MxGp3wHXtNE',
 '4UkK_psUEVM',
 'maawo7O9Sg8',
 'Qkoba4YhbGo',
 'N5ZlZSnNyo0',
 'hEQTFXMQU7I',
 'flXHAFNT4sU',
 'RFO9Z_0wrKE',
 'kpmOhBWlDfc',
 '0Tk5YA4WRrg',
 'bMbDVh_OKZg',
 'UPPKjJgQT4A',
 'iILoqJlCa2s',
 'ODforBeu7_c',
 'DcDjyestr4Y',
 'Vs2piSWfofQ',
 '9Xgj0KYypVw',
 'LYdPz_GD8OQ',
 '6JLOD_a7or4',
 'Nh5udl01YXc']

In [12]:
len(video_ids)

57

## Get video data

In [13]:
video_ids_test = ['UgnPG2bScVQ', '9jJiNa4HoD0','EMZkmjE8wdw','ToKJfywbe1o']
df = get_video_data(youtube_service_object=youtube, video_ids=video_ids_test)
df.head()

Unnamed: 0,channelTitle,channelId,videoId,publishedAt,title,description,tags,viewCount,likeCount,commentCount,favoriteCount
0,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,ToKJfywbe1o,2023-12-14 14:10:00+00:00,TEKKEN 8 – Official Story Trailer,Witness true power. Experience the Mishima sag...,"[Bandai Namco, Bandai Namco Entertainment, Vid...",1299243,43259,4012,0
1,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,9jJiNa4HoD0,2023-12-19 14:00:08+00:00,TEKKEN 8 – Ultimate Edition Trailer,Ultimate style. Experience everything TEKKEN 8...,"[Bandai Namco, Bandai Namco Entertainment, Vid...",364914,9367,1212,0
2,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,UgnPG2bScVQ,2023-12-20 14:00:19+00:00,TEKKEN 8 – Shaheen Reveal & Gameplay Trailer,The Desert Falcon is ready to ruffle some feat...,"[Bandai Namco, Bandai Namco Entertainment, Vid...",578721,23796,3666,0


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   channelTitle   3 non-null      object             
 1   channelId      3 non-null      object             
 2   videoId        3 non-null      object             
 3   publishedAt    3 non-null      datetime64[ns, UTC]
 4   title          3 non-null      object             
 5   description    3 non-null      object             
 6   tags           3 non-null      object             
 7   viewCount      3 non-null      int64              
 8   likeCount      3 non-null      int64              
 9   commentCount   3 non-null      int64              
 10  favoriteCount  3 non-null      int64              
dtypes: datetime64[ns, UTC](1), int64(4), object(6)
memory usage: 396.0+ bytes


In [None]:
# export the video data to a csv
df.to_csv("../data/raw/video_data.csv", index=False)