In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import locale
from datetime import datetime
from datetime import timedelta
from collections import Counter
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
import ipywidgets as widgets
from IPython.display import display, clear_output
from transformers import pipeline


# Download NLTK Ressourcen
nltk.download("punkt")
nltk.download('punkt_tab')
nltk.download("stopwords")

# Set matplotlib to use English for dates
try:
    locale.setlocale(locale.LC_TIME, 'en_US.UTF-8')
except locale.Error:
    locale.setlocale(locale.LC_TIME, 'C')

# Load and prepare data
df = pd.read_csv("../data/comments_post.csv")

# Basic cleaning
df = df[df["text_comment"].notna() & (df["playCount"] > 0)].copy()
df["text_comment"] = df["text_comment"].astype(str).str.lower()

# Parse datetime and remove timezone (tz-naive)
df["createTimeISO"] = pd.to_datetime(df["createTimeISO"]).dt.tz_localize(None)

# Add week column
df["week"] = df["createTimeISO"].dt.to_period("W").dt.start_time

[nltk_data] Downloading package punkt to C:\Users\Surface/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Surface/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Surface/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df = pd.read_csv("../data/comments_post.csv")


In [2]:
# Define standard and custom stopwords
standard_stopwords = set(stopwords.words("german")) | set(stopwords.words("english"))
custom_stopwords = {
    "i", "you", "it", "me", "this", "that", "we", "she", "he", "they", "u", "ur",
    "my", "your", "yours", "ours", "their", "its", "use", "like", "get", "need", 
    "please", "one", "would", "watch", "want", "que", "tbh", "idk", "fr", 
    "literally", "actually", "honestly", "kinda", "thing", "stuff", "good", "bad", 
    "😭", "😂", "✨", "💀", "💅", "🥰", "😩", "😅", "🥺", "😳"
}

# Unigram-specific stopwords (standard + custom)
combined_stopwords = standard_stopwords | custom_stopwords

Engagement per skincare influencer

In [3]:
# Calculate engagement
df["engagement"] = df["diggCount"] + df["commentCount"] + df["shareCount"]

# Set up interactive widgets (standardized date range 6 months)
max_date = df["createTimeISO"].max().date()
six_months_ago = max_date - timedelta(days=180)

start_date = widgets.DatePicker(description='Start Date', value=six_months_ago)
end_date = widgets.DatePicker(description='End Date', value=max_date)
min_posts_slider = widgets.IntSlider(value=3, min=1, max=20, step=1, description='Min Posts')

# Define interactive update function
def update_plot(start, end, min_posts):
    if start is None or end is None:
        print("Please select both start and end date.")
        return

    start = pd.to_datetime(start)
    end = pd.to_datetime(end)

    # Filter data by date range
    mask = (df["createTimeISO"] >= start) & (df["createTimeISO"] <= end)
    filtered_df = df[mask].copy()

    if filtered_df.empty:
        print("No data available for selected date range.")
        return

    # Group by influencer
    grouped = filtered_df.groupby("author_nickName").agg({
        "engagement": "sum",
        "playCount": "sum",
        "author_fans": "first",
        "id": "count"
    }).rename(columns={"id": "num_posts"}).reset_index()

    # Filter by minimum number of posts
    grouped = grouped[grouped["num_posts"] >= min_posts]

    if grouped.empty:
        print(f"No influencers with at least {min_posts} posts in this period.")
        return

    # Sort and select top 10
    top = grouped.sort_values(by="engagement", ascending=False).head(10)

    # Plot
    plt.figure(figsize=(10, 6))
    plt.barh(top["author_nickName"], top["engagement"])
    plt.xlabel("Total Engagement")
    plt.ylabel("Influencer")
    plt.title(f"Top 10 Influencers by Engagement\n({start.date()} to {end.date()}, Min Posts: {min_posts})")
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

# Display interactive widgets
widgets.interact(update_plot, start=start_date, end=end_date, min_posts=min_posts_slider);

interactive(children=(DatePicker(value=datetime.date(2024, 9, 30), description='Start Date', step=1), DatePick…

Spam phrases, viral memes phrases

In [4]:
# Ensure all necessary columns are included from the start
comments_df = df[df["text_comment"].notna()][["text_comment", "author_nickName", "createTimeISO"]].copy()

# Basic text cleaning: lowercase and strip whitespace
comments_df["text_comment_clean"] = comments_df["text_comment"].str.strip().str.lower()

# Remove exact duplicate comments
comments_df = comments_df.drop_duplicates(subset=["text_comment_clean"])

# Define known spam/meme phrases to filter out
spam_phrases = [
    "speed made you famous",
    "amy made speed",
    "speed speed",
    "famous speed made",
    "made speed famous",
    "speed made",
    "speed"
]
spam_pattern = "|".join(re.escape(p) for p in spam_phrases)

# Exclude comments that match any of the spam patterns
filtered_comments_df = comments_df[~comments_df["text_comment_clean"].str.contains(spam_pattern)]

# Optionally: filter out comments that contain 'speed' more than once
filtered_comments_df["speed_count"] = filtered_comments_df["text_comment_clean"].str.count(r"\bspeed\b")
filtered_comments_df = filtered_comments_df[filtered_comments_df["speed_count"] <= 1]

# Clean up: remove the helper column
filtered_comments_df = filtered_comments_df.drop(columns=["speed_count"])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_comments_df["speed_count"] = filtered_comments_df["text_comment_clean"].str.count(r"\bspeed\b")


Common phrases in comments per influencer

In [5]:
# Text input for influencer name
influencer_input = widgets.Text(
    description="Influencer:",
    placeholder="Enter name, e.g. amyflamy"
)

# Date pickers
today = df["createTimeISO"].max().date()
six_months_ago = today - timedelta(days=180)
start_ngram = widgets.DatePicker(description="Start Date", value=six_months_ago)
end_ngram = widgets.DatePicker(description="End Date", value=today)

# Function for dynamic bigram/trigram output
def show_ngrams_manual(influencer, start, end):
    clear_output(wait=True)

    if not influencer or start is None or end is None:
        print("Please enter an influencer name and valid dates.")
        return

    # Filter comments
    start = pd.to_datetime(start)
    end = pd.to_datetime(end)

    df_filtered = filtered_comments_df[
        (filtered_comments_df["author_nickName"] == influencer) &
        (filtered_comments_df["createTimeISO"] >= start) &
        (filtered_comments_df["createTimeISO"] <= end)
    ]

    if df_filtered.empty:
        print(f"No comments found for '{influencer}' in selected time range.")
        return

    # Tokenize and clean
    all_text = " ".join(df_filtered["text_comment"].dropna())
    tokens = [
        word for word in word_tokenize(all_text.lower())
        if re.match(r'^[a-z]{3,}$', word) and word not in combined_stopwords
    ]

    bigrams = list(ngrams(tokens, 2))
    trigrams = list(ngrams(tokens, 3))

    bigram_freq = Counter(bigrams).most_common(20)
    trigram_freq = Counter(trigrams).most_common(20)

    # Output
    print(f"Top 20 Bigrams for '{influencer}' ({start.date()} to {end.date()}):")
    for bg in bigram_freq:
        print("  ", " ".join(bg[0]), "-", bg[1])

    print(f"\nTop 20 Trigrams for '{influencer}':")
    for tg in trigram_freq:
        print("  ", " ".join(tg[0]), "-", tg[1])

# Create interactive output
out = widgets.interactive_output(
    show_ngrams_manual,
    {'influencer': influencer_input, 'start': start_ngram, 'end': end_ngram}
)

# Pack widgets in Layout
ui = widgets.VBox([influencer_input, start_ngram, end_ngram])

# Display UI and Output
display(ui, out)

VBox(children=(Text(value='', description='Influencer:', placeholder='Enter name, e.g. amyflamy'), DatePicker(…

Output()

In [6]:
# Choose influencer
influencer_name = "amyflamy"

# Filter cleaned, spam-free comments from the selected influencer
influencer_comments = filtered_comments_df[filtered_comments_df["author_nickName"] == influencer_name]

# Combine all comments into a single string
all_text = " ".join(influencer_comments["text_comment"].dropna())

# Tokenize and filter tokens using combined stopwords
tokens = [
    word for word in word_tokenize(all_text.lower())
    if re.match(r'^[a-z]{3,}$', word) and word not in combined_stopwords
]

# Create bigrams and trigrams
bigrams = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))

# Count most common ones
bigram_freq = Counter(bigrams).most_common(20)
trigram_freq = Counter(trigrams).most_common(20)

# Display results
print("Top 20 Bigrams:")
for bg in bigram_freq:
    print(" ".join(bg[0]), "-", bg[1])

print("\nTop 20 Trigrams:")
for tg in trigram_freq:
    print(" ".join(tg[0]), "-", tg[1])

Top 20 Bigrams:
korean skincare - 11
without makeup - 9
cleansing oil - 7
song kang - 6
skin care - 5
take care - 5
skincare routine - 4
beautiful without - 4
dry skin - 4
white fungus - 4
hair routine - 3
much better - 3
love videos - 3
amy pretty - 3
got girl - 3
relief cream - 3
anyone grwm - 3
oily skin - 3
skincare products - 3
clear skin - 3

Top 20 Trigrams:
beautiful without makeup - 4
hair perfume oil - 3
winter shade summer - 2
korean skincare really - 2
love korean skincare - 2
vaseline clog pores - 2
content creator easy - 2
creator easy job - 2
oil instead cleansing - 2
instead cleansing oil - 2
love white fungus - 2
white fungus mushrooms - 2
cant believe got - 1
believe got meet - 1
got meet heize - 1
meet heize see - 1
heize see cried - 1
see cried girl - 1
cried girl hair - 1
girl hair routine - 1
