<a href="https://colab.research.google.com/github/Jeanne-kony/youtube-comment-analysis/blob/main/youtube_comments_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install streamlit
!pip install google-api-python-client
!pip install matplotlib
!pip install wordcloud
!pip install ipadic
!pip install beautifulsoup4
!pip install mecab-python3
!pip install SudachiPy==0.5.4 ginza==4.0.6 ja-ginza==4.0.0
!pip install japanize-matplotlib
!pip install pyvis
!pip install streamlit openai
!pip install bertopic==0.16.0
!pip install beautifulsoup4

In [None]:
# Work around package-loading errors by reloading pkg_resources.
# importlib.reload is used because the legacy `imp` module is deprecated
# and no longer exists on Python 3.12+.
import importlib
import pkg_resources
importlib.reload(pkg_resources)

In [None]:
%%writefile app.py

# 必要なライブラリのインポート
import streamlit as st
import json
import pandas as pd
import numpy as np
import unicodedata
import MeCab
from collections import Counter
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import ipadic
import re
import os
import networkx as nx
import matplotlib.cm as cm
import japanize_matplotlib
from networkx.algorithms.community import greedy_modularity_communities
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import openai
from pathlib import Path
from bertopic import BERTopic
from googleapiclient.discovery import build
from bertopic.vectorizers import ClassTfidfTransformer
import yaml
from yaml.loader import SafeLoader
import streamlit_authenticator as stauth
from streamlit_authenticator.utilities.hasher import Hasher

# Load the Japanese natural-language-processing model
nlp = spacy.load('ja_ginza')

# API keys and other constants.
# Each key is taken from an environment variable when one is set, so real
# secrets do not have to be written into this file; otherwise the
# placeholder literal (to be replaced by hand) is used as before.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "YouTube Data API v3のAPIキーを入力してください")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "OPENAIのAPIキーを入力してください")
MAX_COMMENTS = 1000

# Configure the OpenAI and YouTube API clients
openai.api_key = OPENAI_API_KEY
youtube = build('youtube', 'v3', developerKey=API_KEY)

# Fetch one page of comment threads for a YouTube video
def get_comments(video_id, pageToken):
    """Request a single page of comment threads from the YouTube Data API.

    Args:
        video_id: ID of the video whose comment threads are requested.
        pageToken: Token of the page to fetch. A falsy value ("" or None)
            is sent as None, i.e. no explicit page is requested.

    Returns:
        The decoded JSON body of the HTTP response.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server does not answer within the timeout.
    """
    comment_url = "https://www.googleapis.com/youtube/v3/commentThreads"
    param = {
        "key": API_KEY,
        "videoId": video_id,
        "part": "replies,snippet",
        "maxResults": "100",
        "pageToken": pageToken or None,
    }
    # Without a timeout a stalled connection would block the app forever.
    req = requests.get(comment_url, params=param, timeout=30)
    return req.json()

# Extract the video ID from a YouTube video URL
def extract_video_id(url):
    """Return the video ID contained in a YouTube URL, or None.

    Recognised forms are a ``v=`` query parameter, the path prefixes
    ``/embed/``, ``/v/``, ``/shorts/`` and ``/live/``, and ``youtu.be/``
    short links.

    Args:
        url: The URL text entered by the user; may be empty or None.

    Returns:
        The ID string, or None when *url* is empty or no ID is found.
    """
    if not url:
        return None
    # The ID ends at the next query separator, path separator, fragment
    # marker or whitespace, so "#t=30" or a trailing space never leaks
    # into the returned ID.
    video_id_match = re.search(
        r'(?:v=|/(?:embed|v|shorts|live)/|youtu\.be/)([^&/?#\s]+)', url
    )
    return video_id_match.group(1) if video_id_match else None

# Strip HTML tags from a piece of text
def remove_html_tags(text):
    """Return the text content of *text* as extracted by BeautifulSoup.

    The input is parsed with the "html.parser" backend and only the text
    obtained from ``get_text()`` is returned.
    """
    soup = BeautifulSoup(text, "html.parser")
    plain_text = soup.get_text()
    return plain_text

# Tokenize text by morphological analysis
def mecab_tokenizer(text):
    """Split *text* into tokens with MeCab and keep selected tokens.

    The text is first passed through ``preprocess_text``. Only surfaces
    whose first feature field is 名詞, 動詞 or 形容詞 are kept, and surfaces
    made up solely of hiragana are dropped.

    Args:
        text: Raw text to tokenize.

    Returns:
        List of surface strings in their order of appearance.
    """
    tagger = MeCab.Tagger(ipadic.MECAB_ARGS)
    cleaned = preprocess_text(text)

    # The last two pieces of the split output are discarded; presumably
    # MeCab's "EOS" line and the empty string after the final newline.
    morpheme_lines = tagger.parse(cleaned).split("\n")[:-2]

    wanted_pos = ("名詞", "動詞", "形容詞")
    # Drops hiragana-only tokens such as inflection fragments
    hiragana_only = re.compile("^[ぁ-ゖ]+$")

    tokens = []
    for line in morpheme_lines:
        columns = line.split("\t")
        surface = columns[0]
        part_of_speech = columns[1].split(",")[0]
        if part_of_speech not in wanted_pos:
            continue
        if hiragana_only.match(surface):
            continue
        tokens.append(surface)
    return tokens

# Normalize and clean text before tokenization
def preprocess_text(text):
    """Return *text* normalized and stripped of unwanted characters.

    Steps, in order: NFKC normalization, upper-casing, then four regex
    substitutions that remove the listed brackets and spaces, turn square
    brackets into spaces, remove ``@``-mentions and remove numbers.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    cleaned = unicodedata.normalize("NFKC", text)
    cleaned = cleaned.upper()  # upper-case every character

    # (pattern, replacement) pairs, applied one after another
    substitutions = (
        (r'[【】 () （） 『』　「」]', ''),
        (r'[\[\］\]]', ' '),
        (r'[@＠]\w+', ''),
        (r'\d+\.*\d*', ''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Extract the words that carry the given part-of-speech tags
def extract_words(sent, pos_tags):
    """Collect the lemmas of the tokens in *sent* whose tag is wanted.

    Args:
        sent: Iterable of tokens exposing ``pos_`` and ``lemma_``.
        pos_tags: Container of part-of-speech tags to keep.

    Returns:
        List of non-empty lemmas, in token order.
    """
    lemmas = []
    for token in sent:
        if token.pos_ not in pos_tags:
            continue
        if not token.lemma_:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# Compute word co-occurrence counts
def count_cooccurrence(sents, token_length='{2,}'):
    """Build word counts and a word-by-word co-occurrence matrix.

    Args:
        sents: Iterable of strings, one per sentence/document.
        token_length: Regex quantifier inserted after ``\\w`` in the token
            pattern; the default ``'{2,}'`` skips one-character tokens.

    Returns:
        Tuple ``(words, word_counts, Xc, X)``: the vocabulary, the total
        count of each word, the product ``X.T * X`` of the binarized
        document-term matrix, and that binarized matrix itself.
    """
    pattern = f'\\b\\w{token_length}\\b'
    vectorizer = CountVectorizer(token_pattern=pattern)
    X = vectorizer.fit_transform(sents)
    vocabulary = vectorizer.get_feature_names_out()

    # Totals are taken before binarization so they reflect real frequencies
    totals = np.asarray(X.sum(axis=0)).reshape(-1)

    # Binarize so that repeated occurrences within one sentence are ignored
    X[X > 0] = 1
    cooccurrence = X.T * X
    return vocabulary, totals, cooccurrence, X

# Build the co-occurrence network
def create_network(words, word_counts, Xc, weight_cutoff):
    """Create a weighted graph of words linked by co-occurrence.

    Args:
        words: Sequence of vocabulary words; index i matches row/column i
            of *Xc*.
        word_counts: Array of per-word counts, aligned with *words*.
        Xc: Co-occurrence matrix supporting ``max()``, ``nonzero()`` and
            ``Xc[i, j]`` indexing.
        weight_cutoff: Fraction of the largest entry of *Xc* that a pair
            must exceed to become an edge.

    Returns:
        A ``networkx.Graph`` whose nodes carry ``weight`` (count divided
        by the largest count) and whose edges carry ``weight`` (entry
        divided by the largest entry). Nodes without edges are removed.
    """
    G = nx.Graph()

    # Computed once; previously recomputed for every single word.
    max_count = word_counts.max()
    G.add_nodes_from(
        (word, {'weight': count / max_count})
        for word, count in zip(words, word_counts)
    )

    Xc_max = Xc.max()
    threshold = weight_cutoff * Xc_max
    weighted_edges = []
    for i, j in zip(*Xc.nonzero()):
        if i >= j:
            # Keep each unordered pair once and skip the diagonal.
            continue
        value = Xc[i, j]  # look the entry up once instead of twice
        if value > threshold:
            weighted_edges.append((words[i], words[j], value / Xc_max))
    G.add_weighted_edges_from(weighted_edges)

    G.remove_nodes_from(list(nx.isolates(G)))
    return G

# Visualize the network
def pyplot_network(G, layout, layout_parameter_k, weight_cutoff, node_size_, text_size):
    """Draw *G* with matplotlib and render the figure in Streamlit.

    Nodes are coloured by the community found with
    ``greedy_modularity_communities`` and sized by their ``weight``
    attribute; edge widths follow the edge ``weight`` attribute.

    Args:
        G: Graph whose nodes and edges carry a ``weight`` attribute.
        layout: Layout name forwarded to ``get_layout``.
        layout_parameter_k: Layout parameter forwarded to ``get_layout``.
        weight_cutoff: Accepted for interface compatibility; not used in
            the drawing.
        node_size_: Multiplier applied to node weights to get node sizes.
        text_size: Font size of the node labels.
    """
    # Draw on an explicit figure/axes instead of pyplot's global state so
    # that exactly this figure is handed to Streamlit and closed afterwards.
    fig, ax = plt.subplots(figsize=(25, 10), dpi=300)
    try:
        pos = get_layout(G, layout, layout_parameter_k)

        communities = list(greedy_modularity_communities(G))
        palette = cm.Pastel1(np.linspace(0.1, 0.9, len(communities)))
        # One dictionary lookup per node instead of scanning every community.
        color_by_node = {
            node: palette[i]
            for i, members in enumerate(communities)
            for node in members
        }
        node_colors = [color_by_node[node] for node in G.nodes()]

        weights_n = np.array(list(nx.get_node_attributes(G, 'weight').values()))
        weights_e = np.array(list(nx.get_edge_attributes(G, 'weight').values()))

        nx.draw_networkx_nodes(G, pos, ax=ax, alpha=0.3, node_color=node_colors, node_size=node_size_ * weights_n)
        nx.draw_networkx_edges(G, pos, ax=ax, alpha=0.4, edge_color="whitesmoke", width=20 * weights_e)
        nx.draw_networkx_labels(G, pos, ax=ax, font_family='IPAexGothic', font_size=text_size)
        ax.axis("off")
        st.pyplot(fig)
    finally:
        # A 25x10 inch figure at 300 dpi is large; release it on every run.
        plt.close(fig)

# Get the node layout for the network
def get_layout(G, layout, layout_parameter_k):
    """Return node positions for *G* using the requested layout.

    Args:
        G: Graph to lay out.
        layout: Layout name; only ``'spring'`` is implemented.
        layout_parameter_k: Value passed as ``k`` to ``nx.spring_layout``.

    Returns:
        The position mapping produced by ``nx.spring_layout``.

    Raises:
        ValueError: If *layout* is not a supported name. Previously the
            function fell through and returned None, which only failed
            later inside the drawing calls.
    """
    if layout == 'spring':
        return nx.spring_layout(G, k=layout_parameter_k, iterations=50, weight='weight')
    raise ValueError(f"Unsupported layout: {layout!r} (only 'spring' is implemented)")

# Summarize the criticisms found in the comments
def summarize_criticisms(comments):
    """Ask the OpenAI chat API to summarize criticism in *comments*.

    Args:
        comments: Iterable of comment strings; they are joined with
            newlines and appended to a fixed Japanese instruction.

    Returns:
        The message content of the first choice in the API response.
    """
    instruction = "以下のコメントから動画に対する批判の内容を総括してください。\n\n"
    joined_comments = "\n".join(comments)
    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": instruction + joined_comments}],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Main function: top-level logic of the Streamlit application
def main():
    """Render the page: URL input, comment fetching and the analyses.

    Nothing beyond the title and the input box is shown until a URL is
    entered. A URL without a recognisable video ID, or a video for which
    no comments come back, produces a short message instead of the
    analysis tabs.
    """
    st.title('YouTube Comments Analysis')
    video_url = st.text_input('Enter YouTube Video URL')
    if not video_url:
        return

    video_id = extract_video_id(video_url)
    if not video_id:
        st.write('Invalid YouTube URL')
        return

    st.write(f'Extracted Video ID: {video_id}')
    comments = fetch_comments(video_id)
    if not comments:
        st.write("No comments found or error occurred while fetching comments.")
        return

    cloud_tab, network_tab, cluster_tab = st.tabs(
        ["word cloud", "word co-occurrence networks", "clustering"]
    )
    with cloud_tab:
        display_word_cloud(comments)
    with network_tab:
        display_word_cooccurrence_network(comments)
    with cluster_tab:
        display_topic_clustering(comments)
    # The criticism summary is rendered below the tabs, not inside one
    display_criticism_summary(comments)

# Fetch the comments of a video
def fetch_comments(video_id):
    """Collect up to MAX_COMMENTS comment texts for *video_id*.

    Pages are requested through ``get_comments`` until there is no next
    page token, a response has no ``items`` key, or the limit is reached.
    Top-level comments and the replies included in each thread are both
    collected, with HTML tags removed. A Streamlit spinner and progress
    bar are shown while fetching.

    Args:
        video_id: ID of the YouTube video.

    Returns:
        List of comment strings (possibly empty).
    """
    collected = []
    with st.spinner('Fetching comments...'):
        bar = st.progress(0)

        def _store(raw_html):
            # Record one comment and advance the progress bar
            collected.append(remove_html_tags(raw_html))
            bar.progress(len(collected) / MAX_COMMENTS)

        next_token = ""
        while next_token is not None and len(collected) < MAX_COMMENTS:
            response = get_comments(video_id, next_token)
            if 'items' not in response:
                # No items in the response: stop paging
                break
            for thread in response["items"]:
                if len(collected) >= MAX_COMMENTS:
                    break
                top_level = thread["snippet"]["topLevelComment"]["snippet"]
                _store(top_level["textDisplay"])
                if "replies" not in thread:
                    continue
                for reply in thread["replies"]["comments"]:
                    if len(collected) >= MAX_COMMENTS:
                        break
                    _store(reply["snippet"]["textDisplay"])
            next_token = response.get("nextPageToken")
    return collected

# Display the word cloud
def display_word_cloud(comments):
    """Render a word cloud of the tokens extracted from *comments*.

    All comments are joined, tokenized with ``mecab_tokenizer`` and fed to
    ``WordCloud``. The font and stopword file paths are placeholders that
    must be replaced with real paths.

    Args:
        comments: List of comment strings.
    """
    words = ' '.join(mecab_tokenizer(' '.join(comments)))
    font_path = "フォントのパスを入力してください"
    stopwords_path = "stopwordsのパスを入力してください"
    # Read the stopword list as UTF-8 explicitly so the result does not
    # depend on the platform's default encoding.
    with open(stopwords_path, "r", encoding="utf-8") as f:
        stopwords = set(f.read().split("\n"))

    cloud_builder = WordCloud(
        background_color="white",
        width=800,
        height=800,
        font_path=font_path,
        colormap="viridis",
        stopwords=stopwords,
        max_words=100,
    )
    try:
        wordcloud = cloud_builder.generate(words)
    except ValueError:
        # WordCloud raises ValueError when no word is left to draw, e.g.
        # when tokenization or the stopword list removed everything.
        st.write("No words available to build a word cloud.")
        return

    fig, ax = plt.subplots(figsize=(10, 10))
    try:
        ax.imshow(wordcloud, interpolation="bilinear")
        ax.axis("off")
        st.pyplot(fig)
    finally:
        # Release the figure so repeated reruns do not accumulate figures.
        plt.close(fig)

# Display the word co-occurrence network
def display_word_cooccurrence_network(comments):
    """Build and draw the noun co-occurrence network of *comments*.

    All comments are joined and tokenized with ``mecab_tokenizer``; the
    space-joined tokens are parsed by the spaCy pipeline ``nlp``. Lemmas of
    NOUN and PROPN tokens are collected per sentence and their
    co-occurrence within a sentence defines the network edges.

    Args:
        comments: List of comment strings.
    """
    token_text = ' '.join(mecab_tokenizer(' '.join(comments)))
    # NOTE(review): every comment ends up in one document, so sentence
    # boundaries come only from how the pipeline splits the space-joined
    # token string; confirm this is the intended co-occurrence unit.
    parsed = nlp(token_text)

    include_pos = ('NOUN', 'PROPN')
    sents = [' '.join(extract_words(sent, include_pos)) for sent in parsed.sents]
    words, word_counts, Xc, _ = count_cooccurrence(sents, token_length='{1,}')

    # The same cutoff is used for building and for drawing the network.
    weight_cutoff = 0.025
    G = create_network(words, word_counts, Xc, weight_cutoff)
    pyplot_network(G, 'spring', 0.4, weight_cutoff, 7500, 9)

# Display the topic clustering
def display_topic_clustering(comments):
    """Fit a BERTopic model on *comments* and show its topic bar chart.

    Tokenization for the topic representation uses ``mecab_tokenizer``.
    When the model yields only a single topic label, a message is shown
    instead of the chart.

    Args:
        comments: List of comment strings, one document per comment.
    """
    vectorizer = CountVectorizer(tokenizer=mecab_tokenizer)
    model = BERTopic(
        embedding_model="paraphrase-multilingual-MiniLM-L12-v2",
        vectorizer_model=vectorizer,
        language="japanese",
        calculate_probabilities=True,
        verbose=True,
        # Must be an int to request a reduction to that many topics;
        # BERTopic documents nr_topics as an int or "auto", and the
        # previous string "20" was not a topic count.
        nr_topics=20,
    )
    topics, _probs = model.fit_transform(comments)

    if len(set(topics)) > 1:
        fig = model.visualize_barchart()
        st.plotly_chart(fig)
    else:
        st.write("コメント数が不足しているため、ビジュアライゼーションができません。")

# Display the summary of criticisms
def display_criticism_summary(comments):
    """Show the criticism summary produced by ``summarize_criticisms``.

    The summary is requested first; the subheader and the summary text
    are written to the page afterwards.

    Args:
        comments: List of comment strings to summarize.
    """
    criticism_overview = summarize_criticisms(comments)
    st.subheader("批判点の総括:")
    st.write(criticism_overview)

# Run the main function when this file is executed as a script
if __name__ == '__main__':
    main()

In [None]:
# Fetch https://loca.lt/mytunnelpassword and print the response
# (presumably the password the localtunnel page asks for; confirm)
!curl https://loca.lt/mytunnelpassword

In [None]:
# Launch Streamlit and expose it through a localtunnel tunnel
# Open the URL printed after "your url is:" in a separate browser tab
# Press "Click to Continue" and check that the Streamlit app works
# Stop this cell once you have finished checking
!streamlit run app.py & sleep 3 && npx localtunnel --port 8501