In [8]:
import googleapiclient.discovery
from googleapiclient.errors import HttpError

import pandas as pd
import time
import tqdm

In [9]:
# YouTube API key
API_KEY = "Your-api" # YOUR-API-KEY
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

In [10]:
# Get comments for 1 video. Youtube API allows only up to 100 comments per video
def get_top_korean_comments(video_id, max_results=100):
    comments = []
    try:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=max_results,
            textFormat="plainText"
        )
        response = request.execute()

        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)  
                
    except HttpError as e:
        error_reason = e.resp.get('reason')
        if error_reason == 'commentsDisabled':
            print(f"Comments are disabled for video {video_id}. Skipping.")
        elif error_reason == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()
        else:
            print(f"An error occurred: {e}")
    
    return comments

In [11]:
# Get video ids for query. Youtube API allows only up to 50 videos

def get_video_ids(query, max_results=100):
    video_ids = []
    results_per_page = 50  # YouTube API maxResults 
    pages = (max_results + results_per_page - 1) // results_per_page  # calculate #pages
    next_page_token = None
    
    for _ in range(pages): # call api as many times as #pages
        try:
            request = youtube.search().list(
                q=query,
                part="snippet",
                maxResults=results_per_page,
                type="video",
                pageToken=next_page_token
            )
            response = request.execute()      

            # Only check if 'id' key exists and 'videoId' is accessible
            for item in response['items']:
                if isinstance(item, dict) and 'id' in item and 'videoId' in item['id']:
                    video_ids.append(item['id']['videoId'])

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

        except HttpError as e:
            error_reason = e.resp.get('reason')
            if error_reason == 'quotaExceeded':
                print("Quota exceeded. Saving collected data...")
                save_data_to_csv(video_comments)
                exit()
            else:
                print(f"An error occurred: {e}")
                
    return video_ids[:max_results]

In [12]:
# Make data to dataframe
# video_comments looks like: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}
def save_data_to_csv(video_comments):    
    
    data = {"Video_ID": [], "Comment": []}
    
    for video_id, comments in video_comments.items():
        for comment in comments:
            data["Video_ID"].append(video_id)
            data["Comment"].append(comment)

    df = pd.DataFrame(data)
    
    # Export to CSV 
    df.to_csv("youtube_comments.csv", index=False)

In [13]:
participants = [ "페이커", "도란", "오너", "구마유시", "캐리아", "쇼메이커", "캐니언", "쵸비", "룰러", "기인", "데프트", "비디디", "커즈", "덕담", "제우스", "피넛", "제카", "바이퍼" ]

In [14]:
video_comments = {}
# Ex: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}

start = time.time()
query_basic = "LCK"

for participant in tqdm.tqdm(participants):
    query = query_basic + " " + participant 

    try:
        video_ids = get_video_ids(query, max_results=50) 

        for video_id in video_ids:
            comments = get_top_korean_comments(video_id)
            video_comments[video_id] = comments
    except HttpError as e:
        if e.resp.get('reason') == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()

    end = time.time()    
    print(f"{end - start}s for query: {query}")    

save_data_to_csv(video_comments)

  6%|▌         | 1/18 [00:06<01:43,  6.08s/it]

6.085813283920288s for query: LCK 페이커


 11%|█         | 2/18 [00:11<01:29,  5.59s/it]

11.322269201278687s for query: LCK 도란


 17%|█▋        | 3/18 [00:16<01:18,  5.25s/it]

16.167062282562256s for query: LCK 오너
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=rknFS4mB48w&maxResults=100&textFormat=plainText&key=AIzaSyDs8fo5xOHfIhmZ0OTcqhdG8dRqRciHBa8&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 22%|██▏       | 4/18 [00:21<01:14,  5.31s/it]

21.581918478012085s for query: LCK 구마유시


 28%|██▊       | 5/18 [00:27<01:10,  5.39s/it]

27.101588249206543s for query: LCK 캐리아


 33%|███▎      | 6/18 [00:31<01:01,  5.17s/it]

31.83569049835205s for query: LCK 쇼메이커


 39%|███▉      | 7/18 [00:36<00:53,  4.90s/it]

36.19765520095825s for query: LCK 캐니언


 44%|████▍     | 8/18 [00:40<00:46,  4.65s/it]

40.29316806793213s for query: LCK 쵸비
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=wYFtEq1MES0&maxResults=100&textFormat=plainText&key=AIzaSyDs8fo5xOHfIhmZ0OTcqhdG8dRqRciHBa8&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 50%|█████     | 9/18 [00:44<00:41,  4.62s/it]

44.863505363464355s for query: LCK 룰러


 56%|█████▌    | 10/18 [00:48<00:35,  4.46s/it]

48.95164179801941s for query: LCK 기인


 61%|██████    | 11/18 [00:53<00:31,  4.52s/it]

53.61792802810669s for query: LCK 데프트


 67%|██████▋   | 12/18 [00:57<00:26,  4.35s/it]

57.57283067703247s for query: LCK 비디디
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=16nccHJjvv8&maxResults=100&textFormat=plainText&key=AIzaSyDs8fo5xOHfIhmZ0OTcqhdG8dRqRciHBa8&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 72%|███████▏  | 13/18 [01:01<00:21,  4.31s/it]

61.80409264564514s for query: LCK 커즈


 78%|███████▊  | 14/18 [01:05<00:16,  4.14s/it]

65.5553822517395s for query: LCK 덕담


 83%|████████▎ | 15/18 [01:09<00:12,  4.21s/it]

69.9339325428009s for query: LCK 제우스


 89%|████████▉ | 16/18 [01:13<00:08,  4.09s/it]

73.7488386631012s for query: LCK 피넛
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=Gl6ggNBrSj0&maxResults=100&textFormat=plainText&key=AIzaSyDs8fo5xOHfIhmZ0OTcqhdG8dRqRciHBa8&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 94%|█████████▍| 17/18 [01:17<00:03,  3.92s/it]

77.278888463974s for query: LCK 제카


100%|██████████| 18/18 [01:21<00:00,  4.51s/it]

81.16586709022522s for query: LCK 바이퍼





# Merge youtube_comments with movie_rating_dataset

In [15]:
comments = pd.read_csv("youtube_comments.csv")

In [16]:
comments.head()

Unnamed: 0,Video_ID,Comment
0,yhrIGG4Uk8g,미쳒다
1,yhrIGG4Uk8g,봐도모르겠넹 페이커캐릭이먼지
2,yhrIGG4Uk8g,롤이라는 게임이 유명한 게임이라는것도 도움이 되었겠지만 우리나라 사람들이 게임 진짜...
3,yhrIGG4Uk8g,전략은 피지컬을 이긴다
4,yhrIGG4Uk8g,클템 해설 존나 듣기 싫다


In [17]:
import urllib.request
# download naver movie ratings dataset
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")

('ratings.txt', <http.client.HTTPMessage at 0x78c658c0cc10>)

In [18]:
movie_data = pd.read_table('ratings.txt')
movie_data.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [19]:
comments.head()

Unnamed: 0,Video_ID,Comment
0,yhrIGG4Uk8g,미쳒다
1,yhrIGG4Uk8g,봐도모르겠넹 페이커캐릭이먼지
2,yhrIGG4Uk8g,롤이라는 게임이 유명한 게임이라는것도 도움이 되었겠지만 우리나라 사람들이 게임 진짜...
3,yhrIGG4Uk8g,전략은 피지컬을 이긴다
4,yhrIGG4Uk8g,클템 해설 존나 듣기 싫다


In [20]:
print(f"movie data length: {len(movie_data)}")
print(f"comments data length: {len(comments)}")

movie data length: 200000
comments data length: 39116


In [21]:
# Merge two dataset because number of Comments dataset is not big enough to train word vectors.
df1_text = movie_data[['document']].rename(columns={'document': 'text'})
df2_text = comments[['Comment']].rename(columns={'Comment': 'text'})

# merge movie_data and yt_comments_data
merged_df = pd.concat([df1_text, df2_text], ignore_index=True)
merged_df

Unnamed: 0,text
0,어릴때보고 지금다시봐도 재밌어요ㅋㅋ
1,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산..."
2,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.
3,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...
4,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.
...,...
239111,침착해 큰 거 온 다 (원딜의신 라이브 기원)
239112,"0:29 누군가를 ""꼭"" 집어 말하지 않고 ""비틀어"" 말하네요"
239113,나는 바이퍼 원딜의 신 신선생 협곡 탈주메이커 상대편 원딜을 흥건하게 만들어버리는 ...
239114,원하는 건 복기가 아니고 앵콜이라고 ㅋㅋㅋ 아


In [22]:
# NULL check
print(merged_df.isnull().values.any())

True


In [23]:
merged_df = merged_df.dropna(how = 'any') # drop rows with null values
print(merged_df.isnull().values.any()) 

False


In [24]:
print(len(merged_df)) 

239107


In [25]:
# remove all characters other than Hangeul
merged_df['text'] = merged_df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['text'] = merged_df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)


In [26]:
pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m76.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading jpype1-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.1/494.1 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.2 konlpy-0.6.0
Note: you may need to restart the kernel to use updated packages.


In [27]:
from konlpy.tag import Okt
okt = Okt()

In [28]:
# NULL check
print(comments.isnull().values.any()) # => True

comments = comments.dropna(how = 'any') # drop rows with null values

print(comments.isnull().values.any()) # => False

True
False


In [29]:
from konlpy.tag import Okt
import tqdm

okt = Okt()
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
tokenized_data = []

for sentence in tqdm.tqdm(comments['Comment']):
    sent = str(sentence).strip()
    if not sent:
        continue

    morphs = okt.morphs(sent, stem=True)
    filtered = [
        w for w in morphs
        if w not in stopwords
           and len(w) >= 2
           and w.isalpha()
    ]
    # — 만약 원문에 '쵸비'가 있는데 토큰화에서 빠졌으면, 강제로 추가
    if '쵸비' in sent and '쵸비' not in filtered:
        filtered.append('쵸비')

    if filtered:
        tokenized_data.append(filtered)

100%|██████████| 39115/39115 [01:08<00:00, 571.66it/s] 


In [30]:
pip install gensim

Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.15.2
    Uninstalling scipy-1.15.2:
      Successfully uninstalled scipy-1.15.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= "3.10", but you have scipy 1.13.1 which is incompatible.
nilearn 0.1

In [31]:
from gensim.models import Word2Vec

model = Word2Vec(
    sentences=tokenized_data,
    vector_size=100,
    window=5,
    min_count=1,    
    workers=4,
    sg=0           
)

In [32]:
vocab_size = len(model.wv.key_to_index) 

In [33]:
vector_dim = model.wv.vectors.shape[1]

In [34]:
print((vocab_size, vector_dim))

(22074, 100)


In [35]:
model.wv.save_word2vec_format('ko_w2v')

In [36]:
!python -m gensim.scripts.word2vec2tensor --input ko_w2v --output ko_w2v

In [37]:
from IPython.display import FileLink

FileLink("ko_w2v_metadata.tsv")

In [38]:
FileLink("ko_w2v_metadata.tsv")

# Visualization for embedding

In [40]:
## Go to https://projector.tensorflow.org/
## and load ko_w2v_tensor.tsv and ko_w2v_metadata.tsv

In [41]:
# 전체 Word2Vec 모델 저장 (Gensim 형식)
model.save('ko_w2v.model')
# 또는 벡터만 텍스트/바이너리 포맷으로 저장
model.wv.save_word2vec_format('ko_w2v.bin', binary=True)