In [1]:
# --- Standard library ---
import os
import time

# --- Third-party ---
import googleapiclient.discovery
import pandas as pd
from dotenv import load_dotenv
from tqdm.notebook import tqdm

# Move up one directory so that the `src` package can be imported (this
# resolves "ImportError: attempted relative import with no known parent
# package"). The guard keeps the cell idempotent: an unconditional
# os.chdir('..') would climb one more level every time the cell is re-run.
if not os.path.isdir("src"):
    os.chdir("..")

# Load environment variables (e.g. the API key read in the next cell) from .env
load_dotenv()

# pd.set_option('display.max_colwidth', None)

# --- Project-local (must come after the chdir above) ---
from src.api.get_youtube_data import get_video_ids, get_video_data, get_top_level_comments

# Create YouTube service object for connecting to API

In [2]:
# Parameters identifying which Google API service and version to connect to
api_service_name = "youtube"
api_version = "v3"

# The key is read from the environment (load_dotenv() was called in the
# first cell). Fail fast with a clear message if it is missing, rather than
# building a client with developerKey=None and failing later on a request.
API_KEY = os.environ.get("API_KEY")
if not API_KEY:
    raise RuntimeError(
        "API_KEY is not set. Add it to your .env file or environment "
        "before running this notebook."
    )

# Create the YouTube service object used by every API call in this notebook
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=API_KEY
)

# Get video IDs and video data
**NOTE: The YouTube API sets limits on the number of requests allowed per day; when running this notebook be sure that your request is within the allowed limits to avoid incurring costs.**
- I want to get the video IDs and video data for videos published from 1 November 2023 up to 21 December 2023 (the `published_after` / `published_before` window set below).
- The video IDs retrieved belong to videos published by a given channel that have 'tekken' in the description, tags, or title; when retrieving the data for each video, we only keep videos that include 'tekken' in the title.

In [20]:
# Search settings passed to get_video_ids in the next cell.

# Keyword the videos must match
search_term = "tekken"

# Channel to search
bandai_namco_america_id = "UC_ntXHv-XdKCD7CPynVvnQw"

# Publication window, as UTC timestamps
published_after = "2023-11-01T00:00:00Z"
published_before = "2023-12-21T00:00:00Z"

In [21]:
# Collect the IDs of the channel's videos that match the search settings
# above; these IDs are then passed to get_video_data.
video_ids = get_video_ids(
    youtube_service_object=youtube,
    channel_id=bandai_namco_america_id,
    published_after=published_after,
    published_before=published_before,
    search_term=search_term,
)

In [22]:
# Report how many video IDs the search returned, then list them
print(f"Number of video IDs retrieved: {len(video_ids)}")
print(f"Video IDs:\n{video_ids}")

Number of video IDs retrived: 26
Video IDs:
['UgnPG2bScVQ', 'ta_lOrG8zrQ', '9jJiNa4HoD0', 'X1dgCe1jDYg', 'EMZkmjE8wdw', 'ToKJfywbe1o', '9D5vq-zq9y4', 'y8JGUIF2pu4', 'oeFfzCWif-Q', 'bSCANspTDeE', 'ucesGynb2Yk', 'UcBcNOSoFzI', '8DVlK_QrZ-A', 'Zc-yMi05vBA', 'e1N4juHVqNc', 'QH6s_o3dIic', '7skTtnpSb58', 'bjzYbEjE-C4', 'Gw5nQaSF0CI', '3pGxqOFmIN4', 'w0IqzD-gUOI', 'PsCpewoF2E4', 'cHnxJplTQuY', 'qbUnCiTMCGE', 'cIDK50IaVpg', 'rDxrpSqYHD8']


In [23]:
# Fetch the details of every video found above into a DataFrame
df = get_video_data(
    youtube_service_object=youtube,
    video_ids=video_ids,
)

# Preview the first three rows
df.head(3)

0it [00:00, ?it/s]

Batch 1 start: 0
Batch 1 end: 26


Unnamed: 0,channelTitle,channelId,videoId,publishedAt,title,description,tags,viewCount,likeCount,commentCount,favoriteCount
0,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,rDxrpSqYHD8,2023-11-01 16:09:18+00:00,TEKKEN 8 – THE RETURN OF LEGENDS - NEW CHARACTERS REVEAL TRAILER,"Five legends return in #TEKKEN8 for the next King of Iron Fist tournament!\n\nAnd we're not done yet. 👊 http://tekken.com\n\nTake a look at the latest fighters newly confirmed as playable in Tekken 8: Devil Jin, Zafina, Alisa Bosconovich, and Lee Chaolan (with a bonus appearance by Panda). The final characters will be revealed on November 2 and then November 12.","[Bandai Namco, Bandai Namco Entertainment, Video, Games, video games, namco bandai, United States, PS5, PS4, Xbox Series X]",913541,25048,2770,0
1,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,cIDK50IaVpg,2023-11-02 13:22:49+00:00,TEKKEN 8 – Victor Chevalier Reveal & Gameplay Trailer,"With him, violence is à la carte.\nVictor Chevalier, voiced by Vincent Cassel, slices his way into the #TEKKEN8 roster!\n\nPre-order TEKKEN 8 👉 https://bnent.eu/Preorder-TEKKEN8\nJoin the official TEKKEN server 👉 https://bnent.eu/TekkenDiscord\n\nFollow Bandai Namco Entertainment: \nInstagram: https://www.instagram.com/bandainamcous\nFacebook: https://www.facebook.com/BandaiNamcoUS\nTwitter: https://twitter.com/BandaiNamcoUS\nTwitch: http://www.twitch.tv/bandainamcous","[Bandai Namco, Bandai Namco Entertainment, Video, Games, video games, namco bandai, United States, PS5, PS4, Xbox Series X]",1500052,44643,7408,0
2,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,PsCpewoF2E4,2023-11-13 05:03:26+00:00,TEKKEN 8 — Reina Reveal & Gameplay Trailer,It's time for them to learn their place.\nReina storms the King of Iron Fist Tournament in #TEKKEN8\n\nPre-order TEKKEN 8 👉 https://bnent.eu/Preorder-TEKKEN8\nJoin the official TEKKEN server 👉 https://bnent.eu/TekkenDiscord\n\nFollow Tekken:\nInstagram: https://www.instagram.com/tekken\nTwitter: https://twitter.com/Twitter\nFacebook: https://www.facebook.com/TekkenAmericas\n\nFollow Bandai Namco Entertainment: \nInstagram: https://www.instagram.com/bandainamcous\nFacebook: https://www.facebook.com/BandaiNamcoUS\nTwitter: https://twitter.com/BandaiNamcoUS\nTwitch: http://www.twitch.tv/bandainamcous,"[Bandai Namco, Bandai Namco Entertainment, Video, Games, video games, namco bandai, United States, PS5, PS4, Xbox Series X]",2280289,62768,7859,0


In [25]:
# Summarise the video DataFrame: row count, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   channelTitle   11 non-null     object             
 1   channelId      11 non-null     object             
 2   videoId        11 non-null     object             
 3   publishedAt    11 non-null     datetime64[ns, UTC]
 4   title          11 non-null     object             
 5   description    11 non-null     object             
 6   tags           11 non-null     object             
 7   viewCount      11 non-null     int64              
 8   likeCount      11 non-null     int64              
 9   commentCount   11 non-null     int64              
 10  favoriteCount  11 non-null     int64              
dtypes: datetime64[ns, UTC](1), int64(4), object(6)
memory usage: 1.1+ KB


In [26]:
# Show the title, comment count and ID of each video so we can find the one
# we're looking for. A single .loc call selects all rows and the three
# columns, replacing the redundant chained `df.loc[:][...]` indexing.
df.loc[:, ["title", "commentCount", "videoId"]]

Unnamed: 0,title,commentCount,videoId
0,TEKKEN 8 – THE RETURN OF LEGENDS - NEW CHARACTERS REVEAL TRAILER,2770,rDxrpSqYHD8
1,TEKKEN 8 – Victor Chevalier Reveal & Gameplay Trailer,7408,cIDK50IaVpg
2,TEKKEN 8 — Reina Reveal & Gameplay Trailer,7859,PsCpewoF2E4
3,TEKKEN 8 — Leo Reveal & Gameplay Trailer,5135,QH6s_o3dIic
4,TEKKEN 8 – Steve Fox Reveal & Gameplay Trailer,4319,Zc-yMi05vBA
5,TEKKEN 8 — Dragunov Reveal & Gameplay Trailer,3348,ucesGynb2Yk
6,TEKKEN 8 — Yoshimitsu Reveal & Gameplay Trailer,5021,y8JGUIF2pu4
7,TEKKEN 8 – Exclusive Story Demo Showcase,362,9D5vq-zq9y4
8,TEKKEN 8 – Official Story Trailer,4245,ToKJfywbe1o
9,TEKKEN 8 – Ultimate Edition Trailer,1217,9jJiNa4HoD0


I want to do some topic modeling on the 'new characters reveal trailer', so I'll grab the video ID.

In [36]:
# Extract the video id for the 'new characters reveal trailer' video.
target_title = "the return of legends - new characters reveal trailer"

# Case-insensitive literal match: regex=False treats the text as a plain
# substring, and na=False makes missing titles count as "no match".
title_matches = df["title"].str.lower().str.contains(target_title, regex=False, na=False)
matching_ids = df.loc[title_matches, "videoId"]

if matching_ids.empty:
    raise ValueError(f"No video title contains {target_title!r}")

# Take the first match by position. The previous `[...]['videoId'][0]` was a
# label lookup and only worked when the matching row had index label 0.
new_characters_revealed_id = matching_ids.iloc[0]
new_characters_revealed_id

'rDxrpSqYHD8'

# Get comments for the video

In [37]:
# Download the top-level comments of the selected video into a DataFrame.
# NOTE(review): `video_ids` receives a single ID string here rather than a
# list — confirm that get_top_level_comments accepts a bare string.
df_new_character_reveal_comments = get_top_level_comments(
    youtube_service_object=youtube,
    video_ids=new_characters_revealed_id,
)

# Preview the first three comments
df_new_character_reveal_comments.head(3)

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,First. Now where is LEI WULONG?!
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,Already seen it. Ur getting less view&#39;s now bamco
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow


In [38]:
# Summarise the comments DataFrame: row count, column dtypes and non-null counts
df_new_character_reveal_comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2043 entries, 0 to 2042
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   videoId            2043 non-null   object             
 1   authorDisplayName  2043 non-null   object             
 2   publishedAt        2043 non-null   datetime64[ns, UTC]
 3   updatedAt          2043 non-null   datetime64[ns, UTC]
 4   likeCount          2043 non-null   int64              
 5   totalReplyCount    2043 non-null   int64              
 6   textDisplay        2043 non-null   object             
dtypes: datetime64[ns, UTC](2), int64(2), object(3)
memory usage: 111.9+ KB


# Export to csv

In [39]:
# Write the raw comments to disk, without the DataFrame index. The path is
# relative to the working directory set in the first cell.
comments_csv_path = "data/raw/new_character_reveal_comments.csv"

# Create the target directory if it does not exist yet, so that to_csv does
# not fail on a fresh checkout.
os.makedirs(os.path.dirname(comments_csv_path), exist_ok=True)

df_new_character_reveal_comments.to_csv(comments_csv_path, index=False)