In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [75]:
import googleapiclient.discovery
from googleapiclient.errors import HttpError

import pandas as pd
import time
import tqdm

In [None]:
# YouTube API key
API_KEY = "Your-API-key"
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

In [77]:
# Get video ids for query. Youtube API allows only up to 50 videos

def get_video_ids(query, max_results=100):
    video_ids = []
    results_per_page = 50  # YouTube API maxResults 
    pages = (max_results + results_per_page - 1) // results_per_page  # calculate #pages
    next_page_token = None
    
    for _ in range(pages): # call api as many times as #pages
        try:
            request = youtube.search().list(
                q=query,
                part="snippet",
                maxResults=results_per_page,
                type="video",
                pageToken=next_page_token
            )
            response = request.execute()      

            # Only check if 'id' key exists and 'videoId' is accessible
            for item in response['items']:
                if isinstance(item, dict) and 'id' in item and 'videoId' in item['id']:
                    video_ids.append(item['id']['videoId'])

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

        except HttpError as e:
            error_reason = e.resp.get('reason')
            if error_reason == 'quotaExceeded':
                print("Quota exceeded. Saving collected data...")
                save_data_to_csv(video_comments)
                exit()
            else:
                print(f"An error occurred: {e}")
                
    return video_ids[:max_results]

In [78]:
# Get comments for 1 video. Youtube API allows only up to 100 comments per video
def get_top_korean_comments(video_id, max_results=100):
    comments = []
    try:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=max_results,
            textFormat="plainText"
        )
        response = request.execute()

        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)  
                
    except HttpError as e:
        error_reason = e.resp.get('reason')
        if error_reason == 'commentsDisabled':
            print(f"Comments are disabled for video {video_id}. Skipping.")
        else:
            print(f"An error occurred: {e}")
    
    return comments

In [79]:
# Make data to dataframe
# video_comments looks like: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}
def save_data_to_csv(video_comments):    
    
    data = {"Video_ID": [], "Comment": []}
    
    for video_id, comments in video_comments.items():
        for comment in comments:
            data["Video_ID"].append(video_id)
            data["Comment"].append(comment)

    df = pd.DataFrame(data)
    
    # Export to CSV 
    df.to_csv("youtube_comments.csv", index=False)

In [80]:
participants = ["흑백요리사","백종원","안성재","에드워드 리","나폴리 맛피아","트리플스타","요리하는 돌아이","최현석","장호준","여경래","안유성","정지선","최강록","조은주","오세득","파브리치오 페라리","이영숙","선경 롱게스트","김도윤","박준우"]


In [81]:
video_comments = {}

start = time.time()
query_baisic = "흑백요리사"

for participant in tqdm.tqdm(participants):
    query = query_baisic + " " + participant 

    try:
        video_ids = get_video_ids(query, max_results=50) 

        for video_id in video_ids:
            comments = get_top_korean_comments(video_id)
            video_comments[video_id] = comments
    except HttpError as e:
        if e.resp.get('reason') == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()

    end = time.time()    
    print(f"{end - start}s for query: {query}")    

save_data_to_csv(video_comments) 

  5%|▌         | 1/20 [00:05<01:49,  5.74s/it]

5.748027086257935s for query: 흑백요리사 흑백요리사


 10%|█         | 2/20 [00:11<01:38,  5.48s/it]

11.048986673355103s for query: 흑백요리사 백종원


 15%|█▌        | 3/20 [00:15<01:28,  5.23s/it]

15.968203067779541s for query: 흑백요리사 안성재


 20%|██        | 4/20 [00:20<01:20,  5.05s/it]

20.74695897102356s for query: 흑백요리사 에드워드 리


 25%|██▌       | 5/20 [00:25<01:12,  4.82s/it]

25.150463104248047s for query: 흑백요리사 나폴리 맛피아


 30%|███       | 6/20 [00:29<01:07,  4.79s/it]

29.89877963066101s for query: 흑백요리사 트리플스타


 35%|███▌      | 7/20 [00:34<01:00,  4.68s/it]

34.360440731048584s for query: 흑백요리사 요리하는 돌아이


 40%|████      | 8/20 [00:40<01:03,  5.29s/it]

40.9348521232605s for query: 흑백요리사 최현석


 45%|████▌     | 9/20 [00:45<00:54,  5.00s/it]

45.29467010498047s for query: 흑백요리사 장호준


 50%|█████     | 10/20 [00:49<00:47,  4.76s/it]

49.53543281555176s for query: 흑백요리사 여경래


 55%|█████▌    | 11/20 [00:54<00:42,  4.74s/it]

54.214513540267944s for query: 흑백요리사 안유성


 60%|██████    | 12/20 [00:59<00:38,  4.76s/it]

59.03972578048706s for query: 흑백요리사 정지선


 65%|██████▌   | 13/20 [01:04<00:34,  4.86s/it]

64.11499118804932s for query: 흑백요리사 최강록


 70%|███████   | 14/20 [01:07<00:26,  4.50s/it]

67.77390742301941s for query: 흑백요리사 조은주


 75%|███████▌  | 15/20 [01:11<00:21,  4.38s/it]

71.88321709632874s for query: 흑백요리사 오세득


 80%|████████  | 16/20 [01:15<00:17,  4.28s/it]

75.94300413131714s for query: 흑백요리사 파브리치오 페라리


 85%|████████▌ | 17/20 [01:20<00:12,  4.31s/it]

80.31026434898376s for query: 흑백요리사 이영숙
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=_yOU-oKKSXg&maxResults=100&textFormat=plainText&key=AIzaSyAGafvTbW8FwUDH5aZVTb7nxuw-ZumcK-U&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 90%|█████████ | 18/20 [01:24<00:08,  4.32s/it]

84.67224431037903s for query: 흑백요리사 선경 롱게스트


 95%|█████████▌| 19/20 [01:28<00:04,  4.19s/it]

88.53426885604858s for query: 흑백요리사 김도윤


100%|██████████| 20/20 [01:32<00:00,  4.63s/it]

92.54286623001099s for query: 흑백요리사 박준우





In [97]:
comments = pd.read_csv("youtube_comments.csv")

In [98]:
import urllib.request
# download naver movie ratings dataset
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")

('ratings.txt', <http.client.HTTPMessage at 0x7b84ee79a170>)

In [99]:
movie_data = pd.read_table('ratings.txt')
movie_data.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [100]:
pip install konlpy

Note: you may need to restart the kernel to use updated packages.


In [88]:
from konlpy.tag import Okt
okt = Okt()

In [101]:
print(comments.isnull().values.any())
comments = comments.dropna(how = 'any') # drop rows with null values
print(comments.isnull().values.any())

True
False


In [106]:
stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다']

tokenized_data = []
for sentence in tqdm.tqdm(comments['Comment']):  # merged_df['text']
    sentence = str(sentence)  # 스트링으로 자료형으로 변경 
    
    if not sentence:
        continue

    tokenized_sentence = okt.morphs(sentence, stem=True)  # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence 
                                  if word not in stopwords and len(word) >= 2 and word.isalpha()]  # 불용어 제거 및 조건 추가

    if stopwords_removed_sentence:
        tokenized_data.append(stopwords_removed_sentence)


100%|██████████| 44055/44055 [03:18<00:00, 222.46it/s]


In [107]:
from gensim.models import Word2Vec

model = Word2Vec(sentences = tokenized_data, vector_size = 100, window = 5, min_count = 5, workers = 4, sg = 0)

In [108]:
model.wv.vectors.shape

(6648, 100)

In [109]:
print(model.wv.most_similar("백종원"))

[('성재', 0.9100303053855896), ('기준', 0.8862870931625366), ('안성', 0.885806679725647), ('한테', 0.8565773963928223), ('블라인드', 0.854834258556366), ('평가', 0.854598343372345), ('정확', 0.8433785438537598), ('재는', 0.8389924764633179), ('램지', 0.8366859555244446), ('의원', 0.833152174949646)]


In [110]:
print(model.wv.most_similar("최현석"))

[('정지선', 0.938397228717804), ('셰프', 0.9277676343917847), ('여경', 0.9238823056221008), ('성재', 0.9130982160568237), ('안유', 0.8952603340148926), ('쉐프', 0.8849378824234009), ('이랑', 0.8802188038825989), ('제자', 0.8703275322914124), ('이영숙', 0.8688126802444458), ('헤드', 0.8636547923088074)]


In [111]:
model.wv.save_word2vec_format('ko_w2v')

In [112]:
!python -m gensim.scripts.word2vec2tensor --input ko_w2v --output ko_w2v

In [113]:
## Go to https://projector.tensorflow.org/
## and load ko_w2v_tensor.tsv and ko_w2v_metadata.tsv