In [26]:
import os
import json
import re
import datetime
from collections import Counter
import nltk
import matplotlib
matplotlib.use("Agg")
from youtube_comment_downloader import YoutubeCommentDownloader
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt

nltk.download("stopwords")

DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)

def fetch_comments(video_id: str, max_comments: int = 1000):
    """
    Step 1: Download up to max_comments from YouTube video and save to raw_comments.json.
    P.S: I do NOT pass sort_by_time here, to avoid the TypeError.
    """
    downloader = YoutubeCommentDownloader()
    comments = []
    
    for c in downloader.get_comments(video_id):
        comments.append({
            "author": c["author"],
            "text": c["text"],
            "time": c["time"],  
        })
        if len(comments) >= max_comments:
            break
    with open(f"{DATA_DIR}/raw_comments.json", "w", encoding="utf-8") as f:
        json.dump(comments, f, ensure_ascii=False, indent=2)
    print(f"[1/5] Fetched {len(comments)} comments.")

def clean_text(text: str) -> str:
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = text.lower().strip()
    stops = set(nltk.corpus.stopwords.words("english"))
    return " ".join(tok for tok in text.split() if tok not in stops)


def preprocess():
    """Step 2: Read raw_comments.json → add clean_text → write clean_comments.json."""
    with open(f"{DATA_DIR}/raw_comments.json", encoding="utf-8") as f:
        data = json.load(f)
    for c in data:
        c["clean_text"] = clean_text(c["text"])
    with open(f"{DATA_DIR}/clean_comments.json", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print("[2/5] Preprocessing done.")


def analyze_sentiment():
    """Step 3: Read clean_comments.json → VADER analysis → write sentiment_comments.json."""
    analyzer = SentimentIntensityAnalyzer()
    with open(f"{DATA_DIR}/clean_comments.json", encoding="utf-8") as f:
        data = json.load(f)
    for c in data:
        c["sentiment"] = analyzer.polarity_scores(c["clean_text"])
    with open(f"{DATA_DIR}/sentiment_comments.json", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print("[3/5] Sentiment analysis done.")

def load_data():
    with open(f"{DATA_DIR}/sentiment_comments.json", encoding="utf-8") as f:
        return json.load(f)

def sentiment_distribution(data):
    dist = Counter()
    for c in data:
        comp = c["sentiment"]["compound"]
        if comp >= 0.05:      dist["positive"] += 1
        elif comp <= -0.05:   dist["negative"] += 1
        else:                 dist["neutral"]  += 1
    return dist

def plot_sentiment(dist):
    labels, counts = zip(*dist.items())
    plt.figure()
    plt.bar(labels, counts)
    plt.title("Sentiment Distribution")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(f"{DATA_DIR}/sentiment_dist.png")
    plt.close()
    print("[4a/5] Saved sentiment_dist.png")

def length_distribution(data):
    lengths = [len(c["clean_text"].split()) for c in data]
    plt.figure()
    plt.hist(lengths, bins=20)
    plt.title("Comment Length Distribution")
    plt.xlabel("Number of Tokens")
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.savefig(f"{DATA_DIR}/length_dist.png")
    plt.close()
    print("[4b/5] Saved length_dist.png")


def time_series(data):
    """
    Step 4: Try to create a time series plot of comment counts per day.
    """
    valid_dates = []
    for c in data:
        try:
            # Convert milliseconds to seconds, then to datetime.date
            ts = int(c["time"]) / 1000
            date = datetime.datetime.fromtimestamp(ts).date()
            valid_dates.append(date)
        except (KeyError, TypeError, ValueError):
            continue  

    if not valid_dates:
        print("⚠️ No valid timestamps found. Skipping time_series plot.")
        return

    # Count comments per date
    date_counts = Counter(valid_dates)
    sorted_dates = sorted(date_counts.items())

    x, y = zip(*sorted_dates)

    plt.figure(figsize=(10, 5))
    plt.plot(x, y, marker='o', linestyle='-')
    plt.title("Comments Over Time")
    plt.xlabel("Date")
    plt.ylabel("Number of Comments")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f"{DATA_DIR}/time_series.png")
    plt.close()
    print("[4c/5] Saved time_series.png")

def plot_top_keywords(data, top_n=10):
    """
    Step 4d/5: Plot top N keywords from clean_text.
    """
    # 1. Gather all tokens
    tokens = []
    for c in data:
        tokens.extend(c["clean_text"].split())
    # 2. Get the top N most common
    freq = Counter(tokens).most_common(top_n)
    words, counts = zip(*freq)

    # 3. Plot & save
    plt.figure(figsize=(10, 5))
    plt.bar(words, counts)
    plt.title(f"Top {top_n} Keywords in Comments")
    plt.xlabel("Keyword")
    plt.ylabel("Frequency")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f"{DATA_DIR}/top_keywords.png")
    plt.close()
    print("[4d/5] Saved top_keywords.png")

def top_keywords(data, top_n=10):
    tokens = []
    for c in data:
        tokens.extend(c["clean_text"].split())
    return Counter(tokens).most_common(top_n)

def main():
    video_id = "fK85SQzm0Z0"
    fetch_comments(video_id)
    preprocess()
    analyze_sentiment()

    data = load_data()
    dist = sentiment_distribution(data)
    print(f"[5/5] Sentiment counts = {dict(dist)}")

    plot_sentiment(dist)
    length_distribution(data)
    time_series(data)
    plot_top_keywords(data)         

    top10 = top_keywords(data)
    print("Top 10 keywords:", top10)


if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\82154\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[1/5] Fetched 1000 comments.
[2/5] Preprocessing done.
[3/5] Sentiment analysis done.
[5/5] Sentiment counts = {'positive': 289, 'neutral': 587, 'negative': 124}
[2/5] Preprocessing done.
[3/5] Sentiment analysis done.
[5/5] Sentiment counts = {'positive': 289, 'neutral': 587, 'negative': 124}
[4a/5] Saved sentiment_dist.png
[4b/5] Saved length_dist.png
⚠️ No valid timestamps found. Skipping time_series plot.
[4a/5] Saved sentiment_dist.png
[4b/5] Saved length_dist.png
⚠️ No valid timestamps found. Skipping time_series plot.
[4d/5] Saved top_keywords.png
Top 10 keywords: [('china', 214), ('streams', 202), ('korea', 180), ('speed', 154), ('chinese', 122), ('people', 115), ('world', 93), ('speeds', 84), ('country', 74), ('like', 71)]
[4d/5] Saved top_keywords.png
Top 10 keywords: [('china', 214), ('streams', 202), ('korea', 180), ('speed', 154), ('chinese', 122), ('people', 115), ('world', 93), ('speeds', 84), ('country', 74), ('like', 71)]


# Emotion Analysis of YouTube Comments
This section analyzes the emotions present in the comments using the NRC Emotion Lexicon.

In [27]:
# Download NRC Emotion Lexicon if not present
import pandas as pd
import requests
NRC_URL = "https://raw.githubusercontent.com/words/lexicon/master/nrc/nrc.txt"
NRC_PATH = os.path.join(DATA_DIR, "nrc_lexicon.txt")
if not os.path.exists(NRC_PATH):
    try:
        r = requests.get(NRC_URL)
        with open(NRC_PATH, "w", encoding="utf-8") as f:
            f.write(r.text)
    except Exception as e:
        print("Could not download NRC lexicon:", e)

In [28]:
# Load NRC Lexicon
def load_nrc_lexicon():
    lexicon = {}
    with open(NRC_PATH, encoding="utf-8") as f:
        for line in f:
            word, emotion, value = line.strip().split("\t")
            if int(value) == 1:
                if word not in lexicon:
                    lexicon[word] = set()
                lexicon[word].add(emotion)
    return lexicon

In [29]:
# Analyze emotions in comments
def analyze_emotions(data):
    lexicon = load_nrc_lexicon()
    emotions = [
        "anger", "anticipation", "disgust", "fear", "joy", "sadness", "surprise", "trust"
    ]
    emotion_counter = Counter({e: 0 for e in emotions})
    for c in data:
        tokens = c["clean_text"].split()
        for tok in tokens:
            if tok in lexicon:
                for emo in lexicon[tok]:
                    if emo in emotion_counter:
                        emotion_counter[emo] += 1
    return emotion_counter

In [30]:
# Plot emotion distribution
def plot_emotions(emotion_counts):
    emotions, counts = zip(*emotion_counts.items())
    plt.figure(figsize=(10, 5))
    bars = plt.bar(emotions, counts, color="purple")
    for bar, count in zip(bars, counts):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height(), f'{count}', ha='center', va='bottom', fontsize=9, color='blue')
    plt.title("Emotion Distribution in Comments")
    plt.xlabel("Emotion")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(f"{DATA_DIR}/emotion_dist.png")
    plt.close()
    print("Saved emotion_dist.png")

In [31]:
# Example: Run emotion analysis after loading data
data = load_data()
emotion_counts = analyze_emotions(data)
print("Emotion counts:", emotion_counts)
plot_emotions(emotion_counts)

ValueError: not enough values to unpack (expected 3, got 1)

# Topic Modeling of YouTube Comments
Discover the main discussion topics in the comments using Latent Dirichlet Allocation (LDA).

In [37]:
# Install required packages if needed
try:
    import gensim
    from gensim import corpora
except ImportError:
    import sys
    !{sys.executable} -m pip install gensim
    import gensim
    from gensim import corpora

In [43]:
# Topic modeling with LDA
def run_lda_topic_modeling(data, num_topics=5, num_words=7):
    # Prepare documents
    documents = [c['clean_text'].split() for c in data]
    # Create dictionary and corpus
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    # Train LDA model
    lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=42)
    topics = lda_model.print_topics(num_words=num_words)
    for topic in topics:
        print(f'Topic: {topic}')
    return topics

In [44]:
# Visualize LDA topics as a bar chart
def plot_lda_topics(topics):
    topic_labels = []
    topic_words = []
    for idx, topic in enumerate(topics):
        topic_str = topic[1] if isinstance(topic, tuple) else topic
        # Extract words from topic string
        words = [w.split('*')[1].replace('"','').strip() for w in topic_str.split(' + ')]
        topic_labels.append(f'Topic {idx+1}')
        topic_words.append(', '.join(words))
    plt.figure(figsize=(10, 5))
    bars = plt.bar(topic_labels, [1]*len(topic_labels), color='teal')
    for bar, words in zip(bars, topic_words):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height(), words, ha='center', va='bottom', fontsize=10, color='blue', rotation=45)
    plt.title('LDA Topics (Top Words per Topic)')
    plt.xlabel('Topic')
    plt.ylabel('Relative Importance (for display only)')
    plt.tight_layout()
    plt.savefig(f'{DATA_DIR}/lda_topics.png')
    plt.close()
    print('Saved lda_topics.png')

In [45]:
# Example: Run topic modeling and plot topics after loading data
data = load_data()
topics = run_lda_topic_modeling(data, num_topics=5, num_words=7)
plot_lda_topics(topics)

Topic: (0, '0.035*"streams" + 0.033*"korea" + 0.014*"speeds" + 0.013*"koreans" + 0.013*"chinese" + 0.013*"world" + 0.011*"irl"')
Topic: (1, '0.030*"santai" + 0.028*"china" + 0.020*"speed" + 0.013*"w" + 0.012*"come" + 0.010*"like" + 0.007*"stream"')
Topic: (2, '0.017*"china" + 0.015*"speed" + 0.014*"people" + 0.009*"bro" + 0.007*"chinese" + 0.007*"even" + 0.006*"love"')
Topic: (3, '0.019*"china" + 0.013*"jackson" + 0.010*"wang" + 0.009*"speed" + 0.008*"people" + 0.006*"girl" + 0.005*"bro"')
Topic: (4, '0.032*"china" + 0.030*"speed" + 0.011*"dont" + 0.010*"chinese" + 0.007*"im" + 0.006*"ishowspeed" + 0.006*"bro"')
Saved lda_topics.png


In [46]:
# Save LDA topics graph as HTML
import matplotlib.pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from matplotlib.backends.backend_svg import FigureCanvasSVG
def save_lda_topics_html(topics):
    import base64
    import io
    fig, ax = plt.subplots(figsize=(10, 5))
    topic_labels = []
    topic_words = []
    for idx, topic in enumerate(topics):
        topic_str = topic[1] if isinstance(topic, tuple) else topic
        words = [w.split('*')[1].replace('"','').strip() for w in topic_str.split(' + ')]
        topic_labels.append(f'Topic {idx+1}')
        topic_words.append(', '.join(words))
    bars = ax.bar(topic_labels, [1]*len(topic_labels), color='teal')
    for bar, words in zip(bars, topic_words):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height(), words, ha='center', va='bottom', fontsize=10, color='blue', rotation=45)
    ax.set_title('LDA Topics (Top Words per Topic)')
    ax.set_xlabel('Topic')
    ax.set_ylabel('Relative Importance (for display only)')
    fig.tight_layout()
    buf = io.BytesIO()
    fig.savefig(buf, format='png')
    buf.seek(0)
    img_base64 = base64.b64encode(buf.read()).decode('utf-8')
    html = f'''<html><head><title>LDA Topics</title></head><body><h2>LDA Topics (Top Words per Topic)</h2><img src="data:image/png;base64,{img_base64}"/></body></html>'''
    with open(f'{DATA_DIR}/lda_topics.html', 'w', encoding='utf-8') as f:
        f.write(html)
    plt.close(fig)
    print('Saved lda_topics.html')

In [47]:
# Example: Run topic modeling and save HTML after loading data
data = load_data()
topics = run_lda_topic_modeling(data, num_topics=5, num_words=7)
save_lda_topics_html(topics)

Topic: (0, '0.035*"streams" + 0.033*"korea" + 0.014*"speeds" + 0.013*"koreans" + 0.013*"chinese" + 0.013*"world" + 0.011*"irl"')
Topic: (1, '0.030*"santai" + 0.028*"china" + 0.020*"speed" + 0.013*"w" + 0.012*"come" + 0.010*"like" + 0.007*"stream"')
Topic: (2, '0.017*"china" + 0.015*"speed" + 0.014*"people" + 0.009*"bro" + 0.007*"chinese" + 0.007*"even" + 0.006*"love"')
Topic: (3, '0.019*"china" + 0.013*"jackson" + 0.010*"wang" + 0.009*"speed" + 0.008*"people" + 0.006*"girl" + 0.005*"bro"')
Topic: (4, '0.032*"china" + 0.030*"speed" + 0.011*"dont" + 0.010*"chinese" + 0.007*"im" + 0.006*"ishowspeed" + 0.006*"bro"')
Saved lda_topics.html
