In [1]:
import os
os.chdir('..') # this resolves ImportError: attempted relative import with no known parent package
from dotenv import load_dotenv
load_dotenv()

import googleapiclient.discovery
import pandas as pd
from src.api.get_youtube_data import get_video_ids, get_video_data
from tqdm.notebook import tqdm
import time

In [2]:
# set parameters for creating a youtube service object
api_service_name = "youtube"
api_version = "v3"
API_KEY = os.environ.get("API_KEY")

# os.environ.get returns None when the variable is missing; stop here with a
# clear message instead of building a client that has no key
if not API_KEY:
    raise RuntimeError(
        "API_KEY is not set. Add it to the environment or to the .env file "
        "read by load_dotenv() before running this notebook."
    )

# create a youtube service object
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=API_KEY)

In [3]:
# set channel and search details (all four values are passed to get_video_ids below)
bandai_namco_america_id = "UC_ntXHv-XdKCD7CPynVvnQw"  # passed as channel_id
# bounds of the publication window, written as timestamp strings with a trailing "Z"
published_after = "2023-11-01T00:00:00Z"
published_before = "2023-12-21T00:00:00Z"
search_term = "tekken"

- The `get_top_level_comments` function (defined later in this notebook) takes video IDs as input, so we first need to gather some video IDs with `get_video_ids`.
- Use `get_video_data` to create a dataframe with only the video IDs and data for videos that contain "Tekken" in the title, as we want to extract comments about the upcoming release of Tekken 8.

In [4]:
# collect video IDs for the channel, date window and search term configured above
video_ids = get_video_ids(
    youtube_service_object=youtube,
    channel_id=bandai_namco_america_id,
    published_after=published_after,
    published_before=published_before,
    search_term=search_term,
)

In [5]:
# how many video IDs get_video_ids returned
len(video_ids)

26

In [6]:
# build the per-video dataframe from the collected IDs and preview the first rows
df = get_video_data(youtube_service_object=youtube, video_ids=video_ids)
df.head()

0it [00:00, ?it/s]

Batch 1 start: 0
Batch 1 end: 26


Unnamed: 0,channelTitle,channelId,videoId,publishedAt,title,description,tags,viewCount,likeCount,commentCount,favoriteCount
0,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,rDxrpSqYHD8,2023-11-01 16:09:18+00:00,TEKKEN 8 – THE RETURN OF LEGENDS - NEW CHARACT...,Five legends return in #TEKKEN8 for the next K...,"[Bandai Namco, Bandai Namco Entertainment, Vid...",867742,24594,2809,0
1,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,cIDK50IaVpg,2023-11-02 13:22:49+00:00,TEKKEN 8 – Victor Chevalier Reveal & Gameplay ...,"With him, violence is à la carte.\nVictor Chev...","[Bandai Namco, Bandai Namco Entertainment, Vid...",1323861,42404,7289,0
2,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,PsCpewoF2E4,2023-11-13 05:03:26+00:00,TEKKEN 8 — Reina Reveal & Gameplay Trailer,It's time for them to learn their place.\nRein...,"[Bandai Namco, Bandai Namco Entertainment, Vid...",1889054,57275,7538,0
3,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,QH6s_o3dIic,2023-11-22 14:00:30+00:00,TEKKEN 8 — Leo Reveal & Gameplay Trailer,It's time to punch the truth out of them. Leo ...,"[Bandai Namco, Bandai Namco Entertainment, Vid...",822329,32381,5153,0
4,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,Zc-yMi05vBA,2023-11-29 14:00:33+00:00,TEKKEN 8 – Steve Fox Reveal & Gameplay Trailer,Time to knock out the competition🥊\nSteve will...,"[Bandai Namco, Bandai Namco Entertainment, Vid...",915840,37277,4430,0


In [7]:
# check row count, null counts and dtypes of the video dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   channelTitle   11 non-null     object             
 1   channelId      11 non-null     object             
 2   videoId        11 non-null     object             
 3   publishedAt    11 non-null     datetime64[ns, UTC]
 4   title          11 non-null     object             
 5   description    11 non-null     object             
 6   tags           11 non-null     object             
 7   viewCount      11 non-null     int64              
 8   likeCount      11 non-null     int64              
 9   commentCount   11 non-null     int64              
 10  favoriteCount  11 non-null     int64              
dtypes: datetime64[ns, UTC](1), int64(4), object(6)
memory usage: 1.1+ KB


In [9]:
# lift the column-width limit so titles are shown in full
pd.set_option("display.max_colwidth", None)
df[["title", "commentCount", "videoId"]]

Unnamed: 0,title,commentCount,videoId
0,TEKKEN 8 – THE RETURN OF LEGENDS - NEW CHARACTERS REVEAL TRAILER,2809,rDxrpSqYHD8
1,TEKKEN 8 – Victor Chevalier Reveal & Gameplay Trailer,7289,cIDK50IaVpg
2,TEKKEN 8 — Reina Reveal & Gameplay Trailer,7538,PsCpewoF2E4
3,TEKKEN 8 — Leo Reveal & Gameplay Trailer,5153,QH6s_o3dIic
4,TEKKEN 8 – Steve Fox Reveal & Gameplay Trailer,4430,Zc-yMi05vBA
5,TEKKEN 8 — Dragunov Reveal & Gameplay Trailer,3290,ucesGynb2Yk
6,TEKKEN 8 — Yoshimitsu Reveal & Gameplay Trailer,4815,y8JGUIF2pu4
7,TEKKEN 8 Exclusive Story Demo Showcase,361,9D5vq-zq9y4
8,TEKKEN 8 – Official Story Trailer,4038,ToKJfywbe1o
9,TEKKEN 8 – Ultimate Edition Trailer,1219,9jJiNa4HoD0


I want to perform NLP work on the '_Ultimate Edition Trailer_' video, which has 1,219 comments.

# Get comments

### Access the video ID I want comments from

In [10]:
# rows whose lower-cased title mentions both "tekken" and "ultimate edition"
titles_lower = df["title"].str.lower()
is_ultimate_edition = (
    titles_lower.str.contains("tekken")
    & titles_lower.str.contains("ultimate edition")
)
df.loc[is_ultimate_edition, ["title", "videoId"]]

Unnamed: 0,title,videoId
9,TEKKEN 8 – Ultimate Edition Trailer,9jJiNa4HoD0


In [11]:
# video IDs of the rows whose lower-cased title mentions both "tekken" and "ultimate edition"
titles_lower = df["title"].str.lower()
df_filtered = df.loc[
    titles_lower.str.contains("tekken")
    & titles_lower.str.contains("ultimate edition"),
    "videoId",
]

# take the first matching ID as a plain string
ultimate_edition_trailer_id = df_filtered.iloc[0]
ultimate_edition_trailer_id

'9jJiNa4HoD0'

Start with getting the comments when I pass one video id to the request...

In [25]:
# Fetch every page of top-level comments for the Ultimate Edition trailer.
# The video ID is taken from `ultimate_edition_trailer_id` (set in the cell above)
# rather than being typed in by hand for each request.
comments = []
next_page_token = None

while True:
    request_params = {
        "part": "snippet",
        "videoId": ultimate_edition_trailer_id,
        "order": "time",
        "maxResults": 50,
    }
    # the first request carries no page token; later ones continue from the previous page
    if next_page_token is not None:
        request_params["pageToken"] = next_page_token

    request = youtube.commentThreads().list(**request_params)
    response = request.execute()

    # store one row per top-level comment
    for item in response['items']:
        comment = item['snippet']['topLevelComment']['snippet']
        comments.append([
            comment['videoId'],
            comment['authorDisplayName'],
            comment['publishedAt'],
            comment['updatedAt'],
            comment['likeCount'],
            item['snippet']['totalReplyCount'],  # reply count lives on the thread snippet, not the comment snippet
            comment['textDisplay']
        ])

    # a response without "nextPageToken" is the last page
    next_page_token = response.get("nextPageToken")
    if next_page_token is None:
        break

# NOTE: rows are de-duplicated on comment text alone, so identical comments
# from different authors are collapsed into one row
df_one_video = (pd.DataFrame(data=comments, columns=['videoId', 'authorDisplayName', 'publishedAt', 'updatedAt',
                                                     'likeCount', 'totalReplyCount', 'textDisplay'])
                .astype({"publishedAt": "datetime64[ns, UTC]",
                         "updatedAt": "datetime64[ns, UTC]",
                         "likeCount": "int64",
                         "totalReplyCount": "int64"})
                .drop_duplicates(subset=["textDisplay"])
                .sort_values(by=["publishedAt"])
                .reset_index(drop=True)
               )

df_one_video.head(5)

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay
0,9jJiNa4HoD0,@aladdinleader1116,2023-12-19 14:00:38+00:00,2023-12-19 14:00:38+00:00,1,0,Wow🎉🎉🎉🎉🎉❤
1,9jJiNa4HoD0,@Awoken777,2023-12-19 14:01:03+00:00,2023-12-19 14:01:03+00:00,611,81,Tekken is literally the only Game i would buy an ultimate edition for
2,9jJiNa4HoD0,@gagagag7051,2023-12-19 14:01:21+00:00,2023-12-19 14:01:21+00:00,0,0,First
3,9jJiNa4HoD0,@UmoralCaratol,2023-12-19 14:01:23+00:00,2023-12-19 14:01:23+00:00,1,0,Clowning
4,9jJiNa4HoD0,@KodaiVibes,2023-12-19 14:01:26+00:00,2023-12-19 14:01:26+00:00,0,0,"UGH I WANT IT SOOOO BADDD, I NEED TO FIND A NEW JOB QUICK"


In [26]:
# inspect the raw API payload; `response` holds the most recent page fetched by the cell above
response

{'kind': 'youtube#commentThreadListResponse',
 'etag': '2jmBff-VdzS5GIF2IdMu4hRR1FA',
 'pageInfo': {'totalResults': 20, 'resultsPerPage': 50},
 'items': [{'kind': 'youtube#commentThread',
   'etag': '4flwgWNqi8T6pPaDgQiwFIc-_Yw',
   'id': 'Ugyd8jlTPlSrq939sfl4AaABAg',
   'snippet': {'channelId': 'UC_ntXHv-XdKCD7CPynVvnQw',
    'videoId': '9jJiNa4HoD0',
    'topLevelComment': {'kind': 'youtube#comment',
     'etag': 'zOHCs7REjITCmO-2lYfRN3B_1Kg',
     'id': 'Ugyd8jlTPlSrq939sfl4AaABAg',
     'snippet': {'channelId': 'UC_ntXHv-XdKCD7CPynVvnQw',
      'videoId': '9jJiNa4HoD0',
      'textDisplay': 'this is the only reason to buy a ps5 for me',
      'textOriginal': 'this is the only reason to buy a ps5 for me',
      'authorDisplayName': '@riccardoripiccini5880',
      'authorProfileImageUrl': 'https://yt3.ggpht.com/ytc/AIf8zZTH3CkWJlnkkD8YCMEg9JoGvm-KiJhle66GKLQy=s48-c-k-c0x00ffffff-no-rj',
      'authorChannelUrl': 'http://www.youtube.com/channel/UC7wA8PR4LmqhA9gsfFAKDDg',
      'author

In [27]:
# check row count, null counts and dtypes after de-duplication
df_one_video.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   videoId            768 non-null    object             
 1   authorDisplayName  768 non-null    object             
 2   publishedAt        768 non-null    datetime64[ns, UTC]
 3   updatedAt          768 non-null    datetime64[ns, UTC]
 4   likeCount          768 non-null    int64              
 5   totalReplyCount    768 non-null    int64              
 6   textDisplay        768 non-null    object             
dtypes: datetime64[ns, UTC](2), int64(2), object(3)
memory usage: 42.1+ KB


In [28]:
# export to csv (relative path: resolved against the working directory set by os.chdir('..') in the first cell)
df_one_video.to_csv("data/raw/ultimate_edition_trailer_comments.csv", index=False)

# Get comments function

In [20]:
def get_top_level_comments(youtube_service_object, video_ids):
    """Retrieve the top-level comments for one or more YouTube videos.

    Every page of ``commentThreads().list`` results is requested for each
    video ID (50 threads per page, ordered by time) and the comments of all
    videos are combined into a single dataframe.

    Parameters
    ----------
    youtube_service_object : object
        Service object exposing ``commentThreads().list(...).execute()``,
        e.g. the one created with ``googleapiclient.discovery.build``.
    video_ids : list or str
        A list of video IDs or a single string if only wanting to return data for one video ID.

    Returns
    --------
    df : dataframe
        One row per top-level comment with the columns ``videoId``,
        ``authorDisplayName``, ``publishedAt``, ``updatedAt``, ``likeCount``,
        ``totalReplyCount`` and ``textDisplay``, sorted by ``publishedAt``.
        An empty ``video_ids`` gives an empty dataframe with the same columns.

    Notes
    -----
    Rows are de-duplicated on ``textDisplay`` alone, so identical comment
    texts (from different authors or different videos) are collapsed into
    one row. ``textDisplay`` is stored exactly as returned by the API and
    may contain HTML entities such as ``&#39;``.
    """
    columns = ['videoId', 'authorDisplayName', 'publishedAt', 'updatedAt',
               'likeCount', 'totalReplyCount', 'textDisplay']

    # accept a bare string so a single video ID can be passed directly
    if isinstance(video_ids, str):
        video_ids = [video_ids]

    comments = []

    for video_id in video_ids:
        next_page_token = None

        while True:
            request_params = {
                "part": "snippet",
                "videoId": video_id,
                "order": "time",
                "maxResults": 50,
            }
            # the first request carries no page token; later ones continue from the previous page
            if next_page_token is not None:
                request_params["pageToken"] = next_page_token

            response = youtube_service_object.commentThreads().list(**request_params).execute()

            # store one row per top-level comment
            for item in response['items']:
                comment = item['snippet']['topLevelComment']['snippet']
                comments.append([
                    comment['videoId'],
                    comment['authorDisplayName'],
                    comment['publishedAt'],
                    comment['updatedAt'],
                    comment['likeCount'],
                    item['snippet']['totalReplyCount'],  # reply count lives on the thread snippet, not the comment snippet
                    comment['textDisplay']
                ])

            # a response without "nextPageToken" is the last page for this video
            next_page_token = response.get("nextPageToken")
            if next_page_token is None:
                break

    # build the frame once, after all videos have been fetched, so it is also
    # defined when `video_ids` is empty
    df = (pd.DataFrame(data=comments, columns=columns)
          .astype({"publishedAt": "datetime64[ns, UTC]",
                   "updatedAt": "datetime64[ns, UTC]",
                   "likeCount": "int64",
                   "totalReplyCount": "int64"})
          .drop_duplicates(subset=["textDisplay"])
          .sort_values(by=["publishedAt"])
          .reset_index(drop=True)
         )

    return df

# Test function

## Test - One video ID

In [21]:
# test with one id passed as a plain string (exercises the str -> list conversion in the function)
df_test = get_top_level_comments(youtube_service_object=youtube, video_ids=ultimate_edition_trailer_id)
df_test.head()

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay
0,9jJiNa4HoD0,@aladdinleader1116,2023-12-19 14:00:38+00:00,2023-12-19 14:00:38+00:00,1,0,Wow🎉🎉🎉🎉🎉❤
1,9jJiNa4HoD0,@Awoken777,2023-12-19 14:01:03+00:00,2023-12-19 14:01:03+00:00,611,81,Tekken is literally the only Game i would buy an ultimate edition for
2,9jJiNa4HoD0,@gagagag7051,2023-12-19 14:01:21+00:00,2023-12-19 14:01:21+00:00,0,0,First
3,9jJiNa4HoD0,@UmoralCaratol,2023-12-19 14:01:23+00:00,2023-12-19 14:01:23+00:00,1,0,Clowning
4,9jJiNa4HoD0,@KodaiVibes,2023-12-19 14:01:26+00:00,2023-12-19 14:01:26+00:00,0,0,"UGH I WANT IT SOOOO BADDD, I NEED TO FIND A NEW JOB QUICK"


In [22]:
# row count and dtypes should match df_one_video from the manual run above
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   videoId            768 non-null    object             
 1   authorDisplayName  768 non-null    object             
 2   publishedAt        768 non-null    datetime64[ns, UTC]
 3   updatedAt          768 non-null    datetime64[ns, UTC]
 4   likeCount          768 non-null    int64              
 5   totalReplyCount    768 non-null    int64              
 6   textDisplay        768 non-null    object             
dtypes: datetime64[ns, UTC](2), int64(2), object(3)
memory usage: 42.1+ KB


## Test - 3 video IDs

In [23]:
# test with 3 video ids passed as a list; comments of all videos end up in one dataframe
video_ids_3_test = ["9jJiNa4HoD0", "9D5vq-zq9y4", "rDxrpSqYHD8"]

df_3_videos_test = get_top_level_comments(youtube_service_object=youtube, video_ids=video_ids_3_test)
df_3_videos_test.head()

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,First. Now where is LEI WULONG?!
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,Already seen it. Ur getting less view&#39;s now bamco
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,Oww yeaah
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,I hope we get an angel version of Jin


In [24]:
# check row count, null counts and dtypes of the combined result
df_3_videos_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3022 entries, 0 to 3021
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   videoId            3022 non-null   object             
 1   authorDisplayName  3022 non-null   object             
 2   publishedAt        3022 non-null   datetime64[ns, UTC]
 3   updatedAt          3022 non-null   datetime64[ns, UTC]
 4   likeCount          3022 non-null   int64              
 5   totalReplyCount    3022 non-null   int64              
 6   textDisplay        3022 non-null   object             
dtypes: datetime64[ns, UTC](2), int64(2), object(3)
memory usage: 165.4+ KB


In [25]:
# confirm comments from all three requested videos are present (expect 3)
df_3_videos_test["videoId"].nunique()

3

In [26]:
# export the three-video test result to csv (path is relative to the current working directory)
df_3_videos_test.to_csv("data/raw/comments_3_videos_test.csv", index=False)

## Get comments for new characters reveal video

In [27]:
# list the candidate videos again to pick the next one
df[["title", "commentCount", "videoId"]]

Unnamed: 0,title,commentCount,videoId
0,TEKKEN 8 – THE RETURN OF LEGENDS - NEW CHARACTERS REVEAL TRAILER,2809,rDxrpSqYHD8
1,TEKKEN 8 – Victor Chevalier Reveal & Gameplay Trailer,7289,cIDK50IaVpg
2,TEKKEN 8 — Reina Reveal & Gameplay Trailer,7538,PsCpewoF2E4
3,TEKKEN 8 — Leo Reveal & Gameplay Trailer,5153,QH6s_o3dIic
4,TEKKEN 8 – Steve Fox Reveal & Gameplay Trailer,4430,Zc-yMi05vBA
5,TEKKEN 8 — Dragunov Reveal & Gameplay Trailer,3290,ucesGynb2Yk
6,TEKKEN 8 — Yoshimitsu Reveal & Gameplay Trailer,4815,y8JGUIF2pu4
7,TEKKEN 8 Exclusive Story Demo Showcase,361,9D5vq-zq9y4
8,TEKKEN 8 – Official Story Trailer,4038,ToKJfywbe1o
9,TEKKEN 8 – Ultimate Edition Trailer,1219,9jJiNa4HoD0


In [18]:
# define video ids we want to get comments for
# filter dataframe to get only the row(s) whose lower-cased title contains "characters reveal"
# note: this rebinds df_filtered, which an earlier cell used for a Series of video IDs
df_filtered = df.loc[df["title"].str.lower().str.contains("characters reveal")]

df_filtered.head()

Unnamed: 0,channelTitle,channelId,videoId,publishedAt,title,description,tags,viewCount,likeCount,commentCount,favoriteCount
0,Bandai Namco Entertainment America,UC_ntXHv-XdKCD7CPynVvnQw,rDxrpSqYHD8,2023-11-01 16:09:18+00:00,TEKKEN 8 – THE RETURN OF LEGENDS - NEW CHARACTERS REVEAL TRAILER,"Five legends return in #TEKKEN8 for the next King of Iron Fist tournament!\n\nAnd we're not done yet. 👊 http://tekken.com\n\nTake a look at the latest fighters newly confirmed as playable in Tekken 8: Devil Jin, Zafina, Alisa Bosconovich, and Lee Chaolan (with a bonus appearance by Panda). The final characters will be revealed on November 2 and then November 12.","[Bandai Namco, Bandai Namco Entertainment, Video, Games, video games, namco bandai, United States, PS5, PS4, Xbox Series X]",867777,24594,2809,0


In [19]:
# take the first matching row by position; a label lookup with [0] only works
# when the matching row happens to carry index label 0
new_characters_reveal_id = df_filtered["videoId"].iloc[0]
new_characters_reveal_id

'rDxrpSqYHD8'

In [20]:
# fetch all top-level comments for the new characters reveal trailer, using the
# ID stored in `new_characters_reveal_id` by the previous cell
df_new_character_reveal_comments = get_top_level_comments(youtube_service_object=youtube, video_ids=new_characters_reveal_id)
df_new_character_reveal_comments.head()

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,First. Now where is LEI WULONG?!
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,Already seen it. Ur getting less view&#39;s now bamco
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,Oww yeaah
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,I hope we get an angel version of Jin
...,...,...,...,...,...,...,...
2036,rDxrpSqYHD8,@muhammadrafaythaheem9731,2023-12-24 07:57:43+00:00,2023-12-24 07:57:43+00:00,0,0,Maaaarveroooosssse 🤣🤣🤣
2037,rDxrpSqYHD8,@helikoptergezgini9728,2023-12-25 21:27:41+00:00,2023-12-25 21:27:41+00:00,0,0,what you call new is in the game for almost all Tekken games. Are you kiddin me?!?! Nothin new here. Where is Eddy! Where are all the other great characters. I m just watching a sinking game. Too bad after Tekken 7 we got this. Its must be a joke...
2038,rDxrpSqYHD8,@pureOwarrior,2023-12-26 19:34:16+00:00,2023-12-26 19:34:16+00:00,0,0,Wished if this was Lee actual rage art :(
2039,rDxrpSqYHD8,@369dakuza,2023-12-27 20:24:12+00:00,2023-12-27 20:24:12+00:00,0,0,marduk? armor king? common... my mains aint in my main game? like wtf im not gonna buy until ltheir dlc come out.... very disappointed


In [21]:
# export the new characters reveal comments to csv (path is relative to the current working directory)
df_new_character_reveal_comments.to_csv("data/raw/new_character_reveal_comments.csv", index=False)