# YouTube & TikTok Trends — Interactive EDA Dashboard (Dash)

This notebook contains a fully runnable Dash app that reads the CSV and renders the 5-question exploration with interactive filters.

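**Data path**: set the `DATA_PATH` env var, or place the file at `../data/youtube_shorts_tiktok_trends_2025.csv` relative to this notebook (project layout: `Viz_Project/code` for the notebook, `Viz_Project/data` for the CSV).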


In [55]:
import os, numpy as np, pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
import dash
from dash import dcc, html, Input, Output, State

# Plotly Express defaults: figures built with `px.*` below pick up this
# template and fixed pixel size unless a figure overrides them.
px.defaults.template = "plotly_white"
px.defaults.width = 1150
px.defaults.height = 700

def make_sparse_marks(vmin, vmax, step=10, fmt=str):
    """Build a ``{tick: label}`` dict of slider marks spaced ``step`` apart.

    The lower bound is floored and the upper bound is ceiled to integers;
    ticks start at the floored lower bound and never exceed the ceiled upper
    bound. Each label is produced by calling ``fmt`` on the integer tick.
    """
    lo = int(np.floor(vmin))
    hi = int(np.ceil(vmax))
    marks = {}
    for tick in range(lo, hi + 1, step):
        marks[tick] = fmt(tick)
    return marks

def fmt_si(x):
    """Format a number with an SI magnitude suffix (k, M, G, T).

    ``~s`` is a d3/Plotly tick-format code, not a Python format spec, so using
    it in an f-string always raised and the function fell back to ``str(x)``.
    This version does the scaling itself: the value is divided by the largest
    matching power of 1000, rendered with at most two decimals, and trailing
    zeros are trimmed (``1500 -> '1.5k'``, ``1_000_000 -> '1M'``).

    Parameters
    ----------
    x : number or numeric string
        Value to format.

    Returns
    -------
    str
        The SI-formatted value, or ``str(x)`` when ``x`` cannot be parsed as a
        finite number.
    """
    try:
        val = float(pd.to_numeric(x))
    except Exception:
        return str(x)
    if not np.isfinite(val):
        return str(x)

    def _trim(number):
        # Two decimals, then drop trailing zeros and a dangling decimal point.
        return f"{number:.2f}".rstrip("0").rstrip(".")

    magnitude = abs(val)
    for threshold, suffix in ((1e12, "T"), (1e9, "G"), (1e6, "M"), (1e3, "k")):
        if magnitude >= threshold:
            return f"{_trim(val / threshold)}{suffix}"
    return _trim(val)


In [56]:
# Resolve the CSV location. An explicit DATA_PATH environment variable wins;
# otherwise fall back to the project layout (notebook in code/, CSV in data/).
DEFAULT_DATA_PATH = "../data/youtube_shorts_tiktok_trends_2025.csv"
DATA_PATH = Path(os.environ.get("DATA_PATH") or DEFAULT_DATA_PATH).expanduser().resolve()

print("Using CSV:", DATA_PATH)
# is_file() rather than exists(): a directory at that path is not a usable CSV.
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"CSV not found at: {DATA_PATH}\n"
        "Set the DATA_PATH environment variable, or "
        "check your project structure: Viz_Project/code (this notebook) and Viz_Project/data (CSV)."
    )

df = pd.read_csv(DATA_PATH)

# --- coerce the known metric columns to numeric dtypes -----------------------
# Values that cannot be parsed become NaN (errors="coerce"); columns that are
# absent from the CSV are skipped.
num_cols = [
    "duration_sec", "views", "likes", "comments", "shares", "saves",
    "engagement_rate", "engagement_like_rate", "engagement_comment_rate",
    "engagement_share_rate", "avg_watch_time_sec", "completion_rate",
    "upload_hour", "trend_duration_days", "engagement_velocity",
]
present_num_cols = [name for name in num_cols if name in df.columns]
for name in present_num_cols:
    df[name] = pd.to_numeric(df[name], errors="coerce")

# --- normalize publish_dayofweek & weekend ----------------------------------
# publish_dayofweek becomes an ordered categorical Monday..Sunday. Numeric
# codes 0..6 and (case-insensitive) 3-letter / full names / digit strings are
# accepted; anything else ends up as a missing category.
WEEK_ORDER = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

if "publish_dayofweek" in df.columns:
    raw_dow = df["publish_dayofweek"]

    if pd.api.types.is_numeric_dtype(raw_dow):
        # 0 -> Monday ... 6 -> Sunday
        named_dow = raw_dow.map(dict(enumerate(WEEK_ORDER)))
    else:
        # Lower-case lookup: "mon" / "monday" / "0" -> "monday", etc.
        alias = {}
        for position, day in enumerate(WEEK_ORDER):
            full_name = day.lower()
            for key in (full_name[:3], full_name, str(position)):
                alias[key] = full_name
        cleaned = raw_dow.astype(str).str.strip().str.lower()
        named_dow = cleaned.map(lambda token: alias.get(token, token)).str.capitalize()

    df["publish_dayofweek"] = pd.Categorical(named_dow, categories=WEEK_ORDER, ordered=True)

# Derive is_weekend only when the CSV does not already provide it.
if "is_weekend" not in df.columns:
    has_dow = "publish_dayofweek" in df.columns
    df["is_weekend"] = (
        df["publish_dayofweek"].isin(WEEK_ORDER[5:]).astype(int) if has_dow else 0
    )

# --- safe ranges for sliders (robust to outliers / missing) -----------------
# Duration bounds come from the 1st / 99th percentiles (NaN ignored), clamped
# to [0, 180] with at least one second between them; 0..90 is the fallback
# when the column is missing or entirely NaN.
_durations = df["duration_sec"] if "duration_sec" in df.columns else None
if _durations is not None and _durations.notna().any():
    _q_low, _q_high = np.nanquantile(_durations, [0.01, 0.99])
    dur_min = max(0, int(_q_low))
    dur_max = min(180, max(dur_min + 1, int(_q_high)))
else:
    dur_min, dur_max = 0, 90

hour_min, hour_max = 0, 23

# --- dropdown options --------------------------------------
# Country / category dropdowns list the 40 most frequent values, each labelled
# with its row count; the platform dropdown lists every non-null platform.
country_opts = []
if "country" in df.columns:
    top_c = df["country"].value_counts().head(40).reset_index()
    top_c.columns = ["country", "n"]
    country_opts = [
        {"label": f"{name} (N={int(count):,})", "value": name}
        for name, count in zip(top_c["country"], top_c["n"])
    ]

category_opts = []
if "category" in df.columns:
    top_cat = df["category"].value_counts().head(40).reset_index()
    top_cat.columns = ["category", "n"]
    category_opts = [
        {"label": f"{name} (N={int(count):,})", "value": name}
        for name, count in zip(top_cat["category"], top_cat["n"])
    ]

platform_opts = []
if "platform" in df.columns:
    platform_opts = [
        dict(label=name, value=name) for name in df["platform"].dropna().unique()
    ]

print(
    f"Rows={len(df):,} | duration range ~ [{dur_min}, {dur_max}] | "
    f"countries={len(country_opts)} | categories={len(category_opts)} | platforms={len(platform_opts)}"
)


Using CSV: D:\Viz_Project\data\youtube_shorts_tiktok_trends_2025.csv
Rows=48,079 | duration range ~ [10, 89] | countries=30 | categories=19 | platforms=2


In [57]:
# --- Helper: apply filters --------------------------------------------------
def apply_filters(dfin, platforms, countries, categories, dur_range, hour_range):
    """Return a filtered copy of ``dfin`` according to the dashboard controls.

    Parameters
    ----------
    dfin : pandas.DataFrame
        Source frame; it is never modified.
    platforms, countries, categories : list or None
        Allowed values for the matching column. An empty or ``None`` selection
        leaves that column unfiltered.
    dur_range, hour_range : sequence of two numbers or None
        Inclusive ``[low, high]`` bounds on ``duration_sec`` / ``upload_hour``.
        They are applied only when the column exists and a range is given;
        rows whose value is NaN do not pass a range filter.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding the rows that pass every active filter.
    """
    selected = dfin

    # Categorical "is one of" filters.
    membership_filters = (
        ('platform', platforms),
        ('country', countries),
        ('category', categories),
    )
    for column, allowed in membership_filters:
        if allowed:
            selected = selected[selected[column].isin(allowed)]

    # Inclusive numeric range filters.
    range_filters = (
        ('duration_sec', dur_range),
        ('upload_hour', hour_range),
    )
    for column, bounds in range_filters:
        if column in selected.columns and bounds:
            low, high = bounds[0], bounds[1]
            in_range = (selected[column] >= low) & (selected[column] <= high)
            selected = selected[in_range]

    return selected.copy()


In [58]:
# --- Figure builders with tidy axes/labels ---------------------------------
def fig_q1_1_platform_box(d):
    """Box plot of engagement rate per platform.

    Each platform's median is written next to its box, and the per-platform
    row counts (rows with a missing platform included) are appended to the
    title.
    """
    base_title = "Q1-1 Engagement Rate by Platform"

    # Row counts per platform; dropna=False keeps a bucket for missing platforms.
    counts = d.groupby('platform', dropna=False)['engagement_rate'].size().to_dict()
    counts_label = ', '.join([f'{name} N={count:,}' for name, count in counts.items()])

    fig = px.box(d, x='platform', y='engagement_rate', color='platform',
                 points=False, title=base_title)

    # Label each box with its median.
    medians = d.groupby('platform')['engagement_rate'].median()
    for platform_name, median_value in medians.items():
        fig.add_annotation(
            x=platform_name, y=median_value, text=f"{median_value:.1%}",
            showarrow=False, yshift=10, font=dict(size=12),
        )

    fig.update_yaxes(tickformat=".0%", nticks=6)
    fig.update_layout(
        margin=dict(l=70, r=20, t=60, b=50),
        legend_title_text="platform",
        title=dict(text=f"{base_title} — {counts_label}"),
    )
    return fig

def fig_q1_2_structure_box(d):
    """Grouped box plot comparing the engagement sub-rates across platforms.

    The frame is melted to long form (one row per video and metric), using only
    those rate columns that are present; the x-axis keeps the fixed metric order.
    """
    metric_order = [
        'engagement_rate',
        'engagement_like_rate',
        'engagement_comment_rate',
        'engagement_share_rate',
    ]
    available = [name for name in metric_order if name in d.columns]
    long_form = d.melt(
        id_vars=['platform'],
        value_vars=available,
        var_name='metric',
        value_name='value',
    )

    fig = px.box(
        long_form, x='metric', y='value', color='platform', points=False,
        category_orders={'metric': metric_order},
        title='Q1-2 Engagement Structure by Platform',
    )
    fig.update_yaxes(tickformat=".0%", nticks=6)
    fig.update_layout(margin=dict(l=70, r=20, t=60, b=60))
    return fig

def fig_q1_3_views_likes(d):
    """Log–log scatter of views vs likes per platform, with fitted slopes.

    Only rows with strictly positive views and likes are plotted (a log axis
    cannot show the rest). Above 12,000 rows, each platform is down-sampled to
    at most 6,000 points with a fixed seed. A least-squares slope on the log10
    scale is annotated for every platform with more than 10 points.

    Returns an empty figure when a required column is missing, and a titled
    empty figure when no row has positive views and likes. Without that guard
    the decade-tick computation hits ``int(NaN)`` and raises ``ValueError``,
    which makes the whole dashboard callback fail.
    """
    title = 'Q1-3 Views vs Likes by Platform (log–log; slope annotated)'
    layout_margin = dict(l=70, r=20, t=60, b=50)

    if not {'views', 'likes', 'platform'}.issubset(d.columns):
        return go.Figure()

    dd = d[(d['views'] > 0) & (d['likes'] > 0)].copy()
    if dd.empty:
        # Nothing to plot: min()/max() would be NaN and the tick ranges undefined.
        placeholder = go.Figure()
        placeholder.update_layout(title=title, margin=layout_margin)
        return placeholder

    if len(dd) > 12000:
        dd = dd.groupby('platform', group_keys=False).apply(lambda x: x.sample(min(6000, len(x)), random_state=42))

    fig = px.scatter(
        dd, x='views', y='likes', color='platform', opacity=0.55,
        title=title
    )

    # Build decade ticks explicitly to avoid the log+dtick text replication bug.
    vx_min = np.floor(np.log10(dd['views'].min()))
    vx_max = np.ceil(np.log10(dd['views'].max()))
    vy_min = np.floor(np.log10(dd['likes'].min()))
    vy_max = np.ceil(np.log10(dd['likes'].max()))
    x_tickvals = [10 ** e for e in range(int(vx_min), int(vx_max) + 1)]
    y_tickvals = [10 ** e for e in range(int(vy_min), int(vy_max) + 1)]

    def si(n):  # 1_000 -> '1k', 1_000_000 -> '1M'
        for u, t in [(1e12, 'T'), (1e9, 'B'), (1e6, 'M'), (1e3, 'k')]:
            if n >= u:
                return f"{int(n/u)}{t}"
        return str(int(n))

    fig.update_xaxes(type='log', tickmode='array', tickvals=x_tickvals, ticktext=[si(v) for v in x_tickvals])
    fig.update_yaxes(type='log', tickmode='array', tickvals=y_tickvals, ticktext=[si(v) for v in y_tickvals])

    # Per-platform slope on log10 scale
    txts = []
    for plat, g in dd.groupby('platform'):
        if len(g) > 10:
            k = np.polyfit(np.log10(g['views']), np.log10(g['likes']), 1)[0]
            txts.append(f"{plat}: slope≈{k:.4f}")

    if txts:
        # Pin the summary to the bottom-left corner of the plotting area.
        fig.add_annotation(
            x=0, y=0, xref='paper', yref='paper',
            xanchor='left', yanchor='bottom',
            text=' | '.join(txts),
            showarrow=False,
            font=dict(size=12),
            bgcolor='rgba(255,255,255,0.6)',
            bordercolor='rgba(0,0,0,0.08)'
        )

    fig.update_layout(margin=layout_margin)
    return fig


def fig_q2_1_hour_week_heat(d):
    """Heatmap of median engagement rate by weekday (rows) and upload hour (columns).

    Returns an empty figure when either grouping column is missing. The pivot
    is reindexed to the full 7-day x 24-hour grid, so combinations without data
    appear as gaps instead of being dropped.
    """
    required = ('publish_dayofweek', 'upload_hour')
    if any(column not in d.columns for column in required):
        return go.Figure()

    medians = d.groupby(['publish_dayofweek','upload_hour'], as_index=False)['engagement_rate'].median()

    # Full grid: Monday..Sunday by hours 0..23.
    weekdays = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
    all_hours = list(range(0, 24))
    pivoted = medians.pivot(index='publish_dayofweek', columns='upload_hour', values='engagement_rate')
    grid = pivoted.reindex(index=weekdays, columns=all_hours)

    fig = px.imshow(
        grid,
        aspect='auto',
        color_continuous_scale='Viridis',
        origin='upper',
        labels=dict(x='Hour', y='Weekday', color='Median'),
    )
    fig.update_layout(
        title='Q2-1 Median Engagement — Hour × Weekday',
        margin=dict(l=70, r=20, t=60, b=50),
    )
    fig.update_xaxes(tickmode='array', tickvals=list(range(0, 24, 2)))
    fig.update_coloraxes(colorbar_tickformat='.1%')
    return fig

def fig_q2_2_weekend_box(d):
    """Box plot of engagement rate for weekend vs weekday uploads.

    Rows with ``is_weekend == 1`` are labelled 'Weekend', all others 'Weekday';
    each group's median is annotated on its box.
    """
    labelled = d.assign(
        weekend=np.where(d['is_weekend'] == 1, 'Weekend', 'Weekday')
    )
    fig = px.box(
        labelled, x='weekend', y='engagement_rate', points=False,
        title='Q2-2 Engagement Rate — Weekend vs Weekday',
    )

    group_medians = labelled.groupby('weekend')['engagement_rate'].median()
    for group_name, median_value in group_medians.items():
        fig.add_annotation(x=group_name, y=median_value, text=f"{median_value:.1%}",
                           showarrow=False, yshift=10)

    fig.update_yaxes(tickformat='.0%', nticks=6)
    fig.update_layout(margin=dict(l=70, r=20, t=60, b=50))
    return fig

def fig_q3_1_duration_completion(d):
    """2-D density heatmap of video duration (x) against completion rate (y).

    Uses 28 x-bins and 24 y-bins; the colour encodes the number of rows per cell.
    """
    heatmap_options = dict(
        x='duration_sec',
        y='completion_rate',
        nbinsx=28,
        nbinsy=24,
        color_continuous_scale='Viridis',
        title='Q3-1 Duration vs Completion Rate (Density)',
    )
    fig = px.density_heatmap(d, **heatmap_options)

    # Axis cosmetics: a tick every 10 seconds, percentages on the y-axis.
    fig.update_xaxes(tickmode='linear', dtick=10)
    fig.update_yaxes(tickformat='.0%', nticks=8)
    fig.update_coloraxes(colorbar_title='Count')
    fig.update_layout(margin=dict(l=70, r=20, t=60, b=50))
    return fig

def fig_q3_2_duration_watchtime(d):
    """Scatter of video duration vs average watch time, coloured by platform.

    Above 12,000 rows each platform is down-sampled to at most 6,000 points
    (fixed seed) to keep the figure light.
    """
    def _cap_group(group):
        # At most 6,000 rows per platform, reproducibly.
        return group.sample(min(6000, len(group)), random_state=42)

    points = d.copy()
    if len(points) > 12000:
        points = points.groupby('platform', group_keys=False).apply(_cap_group)

    fig = px.scatter(
        points,
        x='duration_sec',
        y='avg_watch_time_sec',
        color='platform',
        opacity=0.55,
        title='Q3-2 Duration vs Avg Watch Time (by Platform)',
    )
    fig.update_xaxes(tickmode='linear', dtick=10)
    fig.update_layout(margin=dict(l=70, r=20, t=60, b=50))
    return fig

def fig_q4_1_top_categories(d, topn=20):
    """Horizontal bar chart of the ``topn`` categories by median engagement rate.

    Parameters
    ----------
    d : pandas.DataFrame
        Filtered data; needs ``category`` and ``engagement_rate``.
    topn : int, default 20
        Number of categories to keep after sorting by median (descending).

    Returns
    -------
    plotly.graph_objects.Figure
        The bar chart, or an empty figure when ``category`` is missing.

    The title reflects the actual ``topn`` (it used to say "Top-20" whatever
    value was passed); with the default argument the title text is unchanged.
    """
    if 'category' not in d.columns:
        return go.Figure()
    g = (d.groupby('category')
           .agg(median_eng=('engagement_rate', 'median'), N=('engagement_rate', 'size'))
           .reset_index())
    g = g.sort_values('median_eng', ascending=False).head(topn)
    # Bar label = category name plus its sample size; astype(str) keeps the
    # concatenation valid for non-string category values.
    g['label'] = g['category'].astype(str) + g['N'].map(lambda x: f" (N={x:,})")
    fig = px.bar(
        g, y='label', x='median_eng', orientation='h',
        title=f'Q4-1 Top-{topn} Categories — Median Engagement (sorted by median; N = sample size)',
    )
    fig.update_xaxes(tickformat='.0%', nticks=6)
    # Wide left margin leaves room for the category labels.
    fig.update_layout(margin=dict(l=150, r=20, t=60, b=40))
    return fig

def fig_q4_2_top_hashtags(d, topn=30):
    """Horizontal bar chart of the ``topn`` hashtags by median share rate.

    Parameters
    ----------
    d : pandas.DataFrame
        Filtered data; needs ``hashtag`` and ``engagement_share_rate``.
    topn : int, default 30
        Number of hashtags to keep after sorting by median (descending).

    Returns
    -------
    plotly.graph_objects.Figure
        The bar chart, or an empty figure when ``hashtag`` is missing.

    The title reflects the actual ``topn`` (it used to say "Top-30" whatever
    value was passed); with the default argument the title text is unchanged.
    """
    if 'hashtag' not in d.columns:
        return go.Figure()
    g = (d.groupby('hashtag')
           .agg(median_share=('engagement_share_rate', 'median'), N=('engagement_share_rate', 'size'))
           .reset_index())
    g = g.sort_values('median_share', ascending=False).head(topn)
    # Bar label = hashtag plus its sample size; astype(str) keeps the
    # concatenation valid for non-string hashtag values.
    g['label'] = g['hashtag'].astype(str) + g['N'].map(lambda x: f"  (N={x:,})")
    fig = px.bar(
        g, y='label', x='median_share', orientation='h',
        title=f'Q4-2 Top-{topn} Hashtags — Median Share Rate (sorted by median; N = sample size)',
    )
    fig.update_xaxes(tickformat='.1%', nticks=6)
    # Wide left margin leaves room for the hashtag labels.
    fig.update_layout(margin=dict(l=220, r=20, t=60, b=40))
    return fig

def fig_q5_1_creator_tier(d):
    """Box plot of engagement rate per creator tier, with each tier's median annotated.

    Returns an empty figure when the ``creator_tier`` column is missing.
    """
    if 'creator_tier' not in d.columns:
        return go.Figure()

    fig = px.box(
        d, x='creator_tier', y='engagement_rate', points=False,
        title='Q5-1 Engagement Rate by Creator Tier',
    )

    tier_medians = d.groupby('creator_tier')['engagement_rate'].median()
    for tier, median_value in tier_medians.items():
        fig.add_annotation(x=tier, y=median_value, text=f"{median_value:.1%}",
                           showarrow=False, yshift=10)

    fig.update_yaxes(tickformat='.0%', nticks=6)
    fig.update_layout(margin=dict(l=70, r=20, t=60, b=50))
    return fig

def fig_q5_2_trend_velocity(d):
    """Scatter of trend duration vs engagement velocity (log y), sized by views.

    Returns an empty figure when any required column is missing. ``platform``
    is now part of that check because it drives both the colouring and the
    per-platform down-sampling.

    Rows whose ``views`` is missing or negative are dropped before plotting.
    ``views`` is coerced with ``errors="coerce"`` at load time, so it may hold
    NaN, and Plotly validates marker sizes and rejects NaN / negative values.
    Above 12,000 rows each platform is down-sampled to at most 6,000 points
    with a fixed seed.
    """
    required = {'trend_duration_days', 'engagement_velocity', 'views', 'platform'}
    if not required.issubset(d.columns):
        return go.Figure()

    # `>= 0` is False for NaN, so this keeps only usable marker sizes.
    dd = d[d['views'] >= 0].copy()
    if len(dd) > 12000:
        dd = dd.groupby('platform', group_keys=False).apply(lambda x: x.sample(min(6000, len(x)), random_state=42))

    fig = px.scatter(dd, x='trend_duration_days', y='engagement_velocity', color='platform', size='views',
                     size_max=18, opacity=0.6,
                     title='Q5-2 Trend Duration vs Engagement Velocity (log y; size=views)')
    fig.update_yaxes(type='log', tickformat='~s', dtick=1)
    fig.update_layout(margin=dict(l=70, r=20, t=60, b=50))
    return fig




In [59]:
# --- Build Dash layout ------------------------------------------------------
app = dash.Dash(__name__)
app.title = "YouTube & TikTok Trends — EDA"

# --- Filter widgets, one wrapper Div per control ----------------------------
platform_filter = html.Div(
    [
        html.Label("Platform"),
        # All platforms are pre-selected.
        dcc.Dropdown(id='platform-dd', options=platform_opts,
                     value=[opt['value'] for opt in platform_opts], multi=True),
    ],
    style={"flex":"1","minWidth":"240px","marginRight":"12px"},
)

country_filter = html.Div(
    [
        html.Label("Country (top options)"),
        dcc.Dropdown(id='country-dd', options=country_opts, multi=True, placeholder="Select..."),
    ],
    style={"flex":"1","minWidth":"260px","marginRight":"12px"},
)

category_filter = html.Div(
    [
        html.Label("Category (optional)"),
        dcc.Dropdown(id='category-dd', options=category_opts, multi=True, placeholder="Select..."),
    ],
    style={"flex":"1","minWidth":"260px"},
)

duration_filter = html.Div(
    [
        html.Label("Duration (sec)"),
        dcc.RangeSlider(
            id='duration-slider', min=dur_min, max=dur_max, step=1,
            value=[dur_min, dur_max],
            marks=make_sparse_marks(dur_min, dur_max, step=10, fmt=str),
            allowCross=False, dots=False,
            tooltip={"placement":"bottom","always_visible":False},
        ),
    ],
    style={"flex":"1","minWidth":"420px","marginRight":"12px"},
)

hour_filter = html.Div(
    [
        html.Label("Upload hour"),
        dcc.RangeSlider(
            id='hour-slider', min=0, max=23, step=1, value=[0,23],
            # A mark every three hours: 0, 3, ..., 21.
            marks=make_sparse_marks(0, 23, step=3),
            allowCross=False, dots=False,
            tooltip={"placement":"bottom","always_visible":False},
        ),
    ],
    style={"flex":"1","minWidth":"420px"},
)

# --- Header plus two wrapping flex rows: dropdowns, then sliders -------------
controls = html.Div(
    [
        html.H1("YouTube & TikTok Trends — Interactive EDA Dashboard"),
        html.P("Data source: YouTube Shorts & TikTok Trends 2025 (local CSV)", className='muted'),
        html.Div(
            [platform_filter, country_filter, category_filter],
            style={"display":"flex","flexWrap":"wrap","marginBottom":"8px"},
        ),
        html.Div(
            [duration_filter, hour_filter],
            style={"display":"flex","flexWrap":"wrap","marginBottom":"12px"},
        ),
    ],
    style={"marginBottom":"8px"},
)

# KPI cards: (caption, id of the H2 that the callback fills, footnote).
_kpi_cards = [
    ("Rows", 'kpi-rows', "After filters"),
    ("Median engagement", 'kpi-med-eng', ""),
    ("Median duration", 'kpi-med-dur', ""),
    ("Median completion", 'kpi-med-comp', ""),
]

kpi_row = html.Div(
    [
        html.Div([html.Div(caption), html.H2(id=value_id), html.Div(footnote)])
        for caption, value_id, footnote in _kpi_cards
    ],
    id='kpi-row',
    style={"display":"grid","gridTemplateColumns":"repeat(4, 1fr)","gap":"14px","marginBottom":"12px"},
)

# One tab per question: (tab label, tab value, number of graphs in the tab).
# Graph ids follow the pattern 'fig-<tab value>-<1-based position>'.
_tab_specs = [
    ('Q1 Platform Differences', 'q1', 3),
    ('Q2 When to Post', 'q2', 2),
    ('Q3 Length & Retention', 'q3', 2),
    ('Q4 Topics & Hashtags', 'q4', 2),
    ('Q5 Creators & Trend Cycle', 'q5', 2),
]

tabs = dcc.Tabs(
    id='tabs-container',
    value='q1',
    children=[
        dcc.Tab(
            label=tab_label,
            value=tab_value,
            className='tab',
            selected_className='tab--selected',
            children=[
                html.Div([
                    dcc.Graph(id=f'fig-{tab_value}-{position}', className='dash-graph')
                    for position in range(1, graph_count + 1)
                ])
            ],
        )
        for tab_label, tab_value, graph_count in _tab_specs
    ],
)


# Everything below the controls lives in one wrapper so that a callback can
# show or hide it as a whole through its `style`.
viz_wrap = html.Div(id='viz-wrap', children=[kpi_row, tabs])

# Page = controls on top, visualisations underneath, centred at 1200px max.
app.layout = html.Div(
    children=[controls, viz_wrap],
    style={"maxWidth":"1200px","margin":"18px auto"},
)


In [60]:
# --- Callbacks --------------------------------------------------------------
@app.callback(
    [Output('kpi-rows','children'), Output('kpi-med-eng','children'), Output('kpi-med-dur','children'), Output('kpi-med-comp','children'),
     Output('fig-q1-1','figure'), Output('fig-q1-2','figure'), Output('fig-q1-3','figure'),
     Output('fig-q2-1','figure'), Output('fig-q2-2','figure'),
     Output('fig-q3-1','figure'), Output('fig-q3-2','figure'),
     Output('fig-q4-1','figure'), Output('fig-q4-2','figure'),
     Output('fig-q5-1','figure'), Output('fig-q5-2','figure')],
    [Input('platform-dd','value'), Input('country-dd','value'), Input('category-dd','value'),
     Input('duration-slider','value'), Input('hour-slider','value')]
)
def update_all(platforms, countries, categories, dur_rng, hour_rng):
    """Recompute the four KPIs and all eleven figures for the current filters.

    Parameters mirror the five filter controls, in the same order as the
    declared Inputs. Returns a 15-tuple matching the declared Outputs: four
    KPI strings followed by the figures for Q1 to Q5.

    A KPI shows "—" when its column is missing, and now also when the median
    is undefined (no rows after filtering, or all values NaN) instead of
    rendering "nan%" / "nans".
    """
    d = apply_filters(df, platforms, countries, categories, dur_rng, hour_rng)

    def kpi_median(column, spec, suffix=""):
        # Format the column median, or return the placeholder when unavailable.
        if column not in d:
            return "—"
        value = d[column].median()
        if pd.isna(value):
            return "—"
        return f"{value:{spec}}{suffix}"

    # KPIs
    rows = f"{len(d):,}"
    med_eng = kpi_median('engagement_rate', '.1%')
    med_dur = kpi_median('duration_sec', '.0f', suffix="s")
    med_comp = kpi_median('completion_rate', '.1%')

    # Figures, in the order of the declared Outputs.
    builders = (
        fig_q1_1_platform_box, fig_q1_2_structure_box, fig_q1_3_views_likes,
        fig_q2_1_hour_week_heat, fig_q2_2_weekend_box,
        fig_q3_1_duration_completion, fig_q3_2_duration_watchtime,
        fig_q4_1_top_categories, fig_q4_2_top_hashtags,
        fig_q5_1_creator_tier, fig_q5_2_trend_velocity,
    )
    figures = tuple(build(d) for build in builders)
    return (rows, med_eng, med_dur, med_comp) + figures


@app.callback(
    Output('viz-wrap', 'style'),
    Input('platform-dd', 'value')
)
def toggle_viz_wrap(platforms):
    """Hide the KPI/tab area while no platform is selected; show it otherwise."""
    hidden_style = {'display': 'none'}
    return {} if platforms else hidden_style

    
@app.callback(
    Output('country-dd', 'disabled'),
    Output('category-dd', 'disabled'),
    Output('duration-slider', 'disabled'),
    Output('hour-slider', 'disabled'),
    Output('country-dd', 'value'),
    Output('category-dd', 'value'),
    Output('country-dd', 'placeholder'),
    Output('category-dd', 'placeholder'),
    Input('platform-dd', 'value'),
    prevent_initial_call=True
)
def disable_and_hint(platforms):
    """Lock the secondary filters until at least one platform is selected.

    With no platform selected, the country/category dropdowns and both sliders
    are disabled, the two dropdown selections are cleared, and their
    placeholders ask for a platform. Otherwise everything is enabled, the
    current selections are left untouched and the default placeholder returns.
    """
    locked = not platforms
    if locked:
        selection = None                       # clear country / category
        hint = "Please select platform(s) first"
    else:
        selection = dash.no_update             # keep whatever is selected
        hint = "Select..."

    return (
        locked, locked, locked, locked,
        selection, selection,
        hint, hint,
    )

In [61]:
# --- Run server --------------------------------------------------------------
# Running this cell starts the Dash development server in debug mode.
# Newer Dash releases expose `app.run` and have retired `app.run_server`, so
# prefer `run` when it exists and fall back for older installs.
# NOTE(review): confirm against the Dash version pinned for this project.
_start_server = getattr(app, "run", None) or app.run_server
print("Dash app defined; starting the development server (debug mode)...")
_start_server(debug=True)


Dash app defined. Run `app.run_server()` in a separate cell to start.


































































