In [None]:
import re
from pathlib import Path
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from lifelines import KaplanMeierFitter
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.stats import chi2_contingency, ttest_ind, f_oneway, kruskal
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
import networkx as nx
import calendar
import plotly.io as pio

# Plotly figures are emitted through the iframe renderer, which displays
# reliably in Quarto HTML output.
pio.renderers.default = "iframe"


# Locate the repository root from the current working directory.
# NOTE: the root is taken to be exactly one level above the working directory,
# so the notebook is expected to run from a first-level subfolder of the repo.
notebook_dir = Path(".").resolve()
repo_root = notebook_dir.parents[0]

In [3]:
# Read the Billboard top-ten singles table stored under <repo_root>/data
csv_path = repo_root.joinpath("data", "billboard_data_2025_09.csv")
df = pd.read_csv(csv_path)

# Preview the first rows
df.head(5)

Unnamed: 0,Top Ten Entry Date,Single Name,Artist(s),Peak,Peak Date,Weeks in Top Ten,Ref,Year
0,1960-12-12,Wonderland by Night,Bert Kaempfert,1,1961-01-09,10,,1960
1,1960-12-12,Exodus,Ferrante & Teicher,2,1961-01-23,11,,1960
2,1960-12-26,"Corrina, Corinna",Ray Peterson,9,1961-01-09,5,,1960
3,1960-12-31,Angel Baby,Rosie and the Originals,5,1961-01-23,7,,1960
4,1961-01-09,Will You Love Me Tomorrow,The Shirelles,1,1961-01-30,7,,1961


In [4]:
# Preprocessing: parse both date columns (values that cannot be parsed become NaT)
for date_col in ('Top Ten Entry Date', 'Peak Date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Decade of chart entry, e.g. 1967 -> 1960
df['Decade'] = df['Top Ten Entry Date'].dt.year.floordiv(10).mul(10)

In [137]:
# Questions I want answered:
# 1. Which 10 genres have the most songs in the dataset?
# 2. What is the average number of weeks in the top ten, by decade?
# 3. How does the average number of weeks in the top ten compare for songs that peaked at #1 vs. #10?
# 4. Same as question 3, but broken down by decade.

In [138]:
# Headline number: mean stay in the top ten across all songs
overall_avg_weeks = df['Weeks in Top Ten'].mean()
print(f"The overall average number of a weeks a song spends in the top 10 is {overall_avg_weeks} weeks")

# --- Figure 1: how many songs lasted exactly N weeks ---
weeks_counts = df['Weeks in Top Ten'].value_counts().sort_index().reset_index()
fig = px.bar(
    weeks_counts,
    x='Weeks in Top Ten',
    y='count',
    text_auto=True,
    title="Overall Distribution of Weeks in Top Ten (Exact Weeks)",
)
fig.update_xaxes(title_text="Weeks in Top Ten")
fig.update_yaxes(title_text="Number of Songs")
fig.show()

# --- Figure 2: mean weeks in the top ten for each decade ---
decade_stats = df.groupby('Decade')['Weeks in Top Ten'].mean().reset_index()
fig = px.bar(
    decade_stats,
    x='Decade',
    y='Weeks in Top Ten',
    text_auto='.2f',
    title="Average Weeks in Top Ten by Decade",
)
# Value labels sit above the bars and are not clipped at the axis edge
fig.update_traces(
    textfont=dict(size=12),
    textangle=0,
    textposition="outside",
    cliponaxis=False,
)
fig.update_xaxes(title_text="Decade")
fig.update_yaxes(title_text="Average Weeks in Top Ten")
fig.show()

# --- Figure 3: the same average, resolved per year ---
trend = df.groupby("Year", as_index=False)["Weeks in Top Ten"].mean()
fig = px.line(
    trend,
    x="Year",
    y="Weeks in Top Ten",
    title="Average Weeks in Top Ten Over Time Per Year",
    labels={"Weeks in Top Ten": "Avg Weeks", "Year": "Year"},
)
fig.show()

The overall average number of a weeks a song spends in the top 10 is 6.641318848866018 weeks


In [139]:
# --- Figure 1: number of songs that peaked at each position ---
peak_counts = df['Peak'].value_counts().sort_index().reset_index()
fig = px.bar(
    peak_counts,
    x='Peak',
    y='count',
    text_auto=True,
    title="Overall Distribution of Peak Positions",
)
# dtick=1 forces a tick label for every peak position
fig.update_xaxes(title_text="Peak Position", tickmode='linear', dtick=1)
fig.update_yaxes(title_text="Number of Songs")
fig.show()

# --- Figure 2: average peak position by decade ---
decade_stats = df.groupby('Decade')['Peak'].mean().reset_index()
fig = px.bar(
    decade_stats,
    x='Decade',
    y='Peak',
    text_auto='.2f',
    title="Average Peak Position in Top Ten by Decade",
)
# Value labels sit above the bars and are not clipped at the axis edge
fig.update_traces(
    textfont=dict(size=12),
    textangle=0,
    textposition="outside",
    cliponaxis=False,
)
fig.update_xaxes(title_text="Decade")
fig.update_yaxes(title_text="Average Peak Position in Top Ten")
fig.show()

# --- Figure 3: average peak position per year ---
trend = df.groupby("Year", as_index=False)["Peak"].mean()
fig = px.line(
    trend,
    x="Year",
    y="Peak",
    title="Average Peak Position in Top Ten Over Time Per Year",
    labels={"Peak": "Avg Peak Position", "Year": "Year"},
)
fig.show()

In [140]:
# Kaplan-Meier survival curve over all songs.
# Every song is marked as an observed event (it eventually left the top ten),
# so nothing is treated as censored.
kmf = KaplanMeierFitter()
kmf.fit(df["Weeks in Top Ten"], event_observed=np.ones(len(df), dtype=int))

# Survival function as a two-column frame with readable headers
survival_df = kmf.survival_function_.reset_index().set_axis(
    ["Weeks in Top Ten", "Probability of Staying"], axis=1
)

fig = px.line(
    survival_df,
    x="Weeks in Top Ten",
    y="Probability of Staying",
    title="Survival Curve of Billboard Top Ten Singles",
    labels={
        "Weeks in Top Ten": "Weeks in Top Ten",
        "Probability of Staying": "Probability of Staying in Top Ten"
    },
)

# Show a marker at every step and pin the probability axis to [0, 1]
fig.update_traces(mode="lines+markers")
fig.update_yaxes(range=[0, 1])

fig.show()

In [141]:
# Survival analysis by decade: one Kaplan-Meier curve per decade, each tagged
# with an arrow annotation instead of a legend entry.
fig = go.Figure()
kmf = KaplanMeierFitter()

# Distinct decades present in the data, oldest first
decades = sorted(df['Decade'].dropna().unique())

# Number of annotations placed on each curve
n_labels = 1

for i, decade in enumerate(decades):
    group = df.loc[df['Decade'] == decade]
    if group.empty:
        continue

    # Every song is treated as an observed exit from the top ten (no censoring)
    kmf.fit(group["Weeks in Top Ten"], event_observed=np.ones(len(group), dtype=int))
    surv = kmf.survival_function_.reset_index()
    surv.columns = ["Weeks in Top Ten", "Probability of Staying"]

    # The decade's survival curve
    fig.add_trace(
        go.Scatter(
            x=surv["Weeks in Top Ten"],
            y=surv["Probability of Staying"],
            mode="lines",
            name=str(decade),
            line={"width": 3},
        )
    )

    # Label positions are spread between 30% and 95% of the way through the
    # curve's rows; with a single label this resolves to the 30% row.
    start_idx = int(len(surv) * 0.3)
    end_idx = int(len(surv) * 0.95)
    indices = np.linspace(start_idx, end_idx, n_labels, dtype=int)

    for j, idx in enumerate(indices):
        # Arrow length varies per decade (and per label) to reduce overlap
        y_offset = 15 * (i - len(decades)/2) + 10 * (j - (n_labels-1)/2)

        fig.add_annotation(
            x=surv["Weeks in Top Ten"].iloc[idx],
            y=surv["Probability of Staying"].iloc[idx],
            text=f"{decade}",
            showarrow=True,
            arrowhead=2,
            font={"size": 12},
            ax=0,
            ay=-y_offset,
        )

# Titles, fixed probability axis, and no legend (curves are labelled inline)
fig.update_layout(
    title="Survival Curves of Billboard Top Ten Singles by Decade",
    showlegend=False,
)
fig.update_xaxes(title_text="Weeks in Top Ten")
fig.update_yaxes(title_text="Probability of Staying in Top Ten", range=[0, 1])

fig.show()


In [142]:
# Survival curves by decade, variant with a percentage y-axis and inline
# decade labels (default arrow placement, no manual staggering).
fig = go.Figure()
kmf = KaplanMeierFitter()

# Distinct decades present in the data, oldest first
decades = sorted(df['Decade'].dropna().unique())

for decade in decades:
    group = df[df['Decade'] == decade]
    if group.empty:
        continue

    # Fit Kaplan-Meier curve; every song is an observed exit (no censoring)
    kmf.fit(group["Weeks in Top Ten"], event_observed=[1]*len(group))
    surv = kmf.survival_function_.reset_index()
    surv.columns = ["Weeks in Top Ten", "Probability of Staying"]

    # Add survival curve
    fig.add_trace(
        go.Scatter(
            x=surv["Weeks in Top Ten"],
            y=surv["Probability of Staying"],
            mode="lines",
            name=str(decade),
            line=dict(width=3)
        )
    )

    # Anchor the label at the row 45% of the way through the curve
    idx = int(len(surv) * 0.45)

    # Add annotation with custom font
    fig.add_annotation(
        x=surv["Weeks in Top Ten"].iloc[idx],
        y=surv["Probability of Staying"].iloc[idx],
        text=f"{decade}",
        showarrow=True,
        arrowhead=3,
        font=dict(family="Arial", size=13, color="black"),  # base font
        xanchor="left",
        yanchor="middle"
    )

# Layout
fig.update_layout(
    title="Survival Curves of Billboard Top Ten Singles by Decade",
    xaxis_title="Weeks in Top Ten",
    yaxis_title="Probability of Staying in Top Ten",
    yaxis=dict(
        range=[0, 1],
        # Ticks at 10%, 20%, ..., 100% (the 0% tick is intentionally omitted).
        # linspace is used because a float-step arange can gain or lose an
        # endpoint through rounding.
        tickvals=np.linspace(0.1, 1.0, 10),
        tickformat=".0%",
    ),
    showlegend=False
)

fig.show()


In [143]:
# Mean number of weeks in the top ten for each peak position, ordered by peak
longevity_by_peak = (
    df.groupby("Peak", as_index=False)["Weeks in Top Ten"]
    .mean()
    .sort_values("Peak")
)

fig = px.bar(
    longevity_by_peak,
    x="Peak",
    y="Weeks in Top Ten",
    text="Weeks in Top Ten",
    labels={"Peak": "Peak Position", "Weeks in Top Ten": "Avg Weeks"},
    title="Average Weeks in Top Ten by Peak Position",
)

# Print each bar's value, rounded to two decimals, above the bar
fig.update_traces(texttemplate="%{text:.2f}", textposition="outside")

# Axis titles; labels that would shrink below 8pt are hidden instead
fig.update_layout(uniformtext=dict(minsize=8, mode='hide'))
fig.update_xaxes(title_text="Peak Position")
fig.update_yaxes(title_text="Avg Weeks")

fig.show()

In [144]:
# Count of top-ten entries (rows with a song title) for each year
entries_per_year = (
    df.groupby("Year")["Single Name"]
    .count()
    .reset_index(name="Count")
)

fig = px.line(
    entries_per_year,
    x="Year",
    y="Count",
    title="Number of Top Ten Entries per Year",
    labels={"Year": "Year", "Count": "Number of Entries"},
)

fig.show()

In [145]:
# The ten most frequent values of the "Artist(s)" credit
top_artists = (
    df["Artist(s)"]
    .value_counts()
    .head(10)
    .reset_index()
    .set_axis(["Artist", "Entries"], axis=1)
)

# Horizontal bars, one per artist
fig = px.bar(
    top_artists,
    x="Entries",
    y="Artist",
    orientation="h",
    text="Entries",
    title="Top Artists by Top Ten Entries",
)

# Print the entry count next to each bar
fig.update_traces(texttemplate="%{text}", textposition="outside")

# Reversed y-axis keeps the artist with the most entries at the top
fig.update_xaxes(title_text="Number of Entries")
fig.update_yaxes(title_text="Artist", autorange="reversed")

fig.show()

# One-hit wonders: share of distinct "Artist(s)" credits with exactly one
# distinct song title. Credits are compared as raw strings, so a collaboration
# credit counts as its own artist.
artist_song_counts = df.groupby("Artist(s)")["Single Name"].nunique()
one_hit_pct = artist_song_counts.eq(1).mean() * 100
print(f"One-hit wonders (Artists that reach the top 10 only once): {one_hit_pct:.1f}% of artists")

One-hit wonders (Artists that reach the top 10 only once): 70.2% of artists


In [5]:
# Total weeks in the top ten summed per "Artist(s)" credit; keep the ten largest
artist_real_estate = (
    df.groupby("Artist(s)", as_index=False)["Weeks in Top Ten"]
    .sum()
    .sort_values("Weeks in Top Ten", ascending=False)
    .head(10)
)

# Horizontal bars with the total printed next to each one
fig = px.bar(
    artist_real_estate,
    x="Weeks in Top Ten",
    y="Artist(s)",
    orientation="h",
    text="Weeks in Top Ten",
    title="Top 10 Artists by Total Weeks in Top Ten (Chart Real Estate)",
)

fig.update_traces(textposition="outside")

# Reversed y-axis puts the largest total at the top
fig.update_xaxes(title_text="Total Weeks in Top Ten")
fig.update_yaxes(title_text="Artist", autorange="reversed")

fig.show()

In [146]:
# Seasonal Top Ten Entries by Month and Decade

# Derive the grouping keys from the entry date
df['Decade'] = (df['Top Ten Entry Date'].dt.year // 10) * 10
df['Month'] = df['Top Ten Entry Date'].dt.month

# Count entries by month and decade (rows with a missing key are dropped by groupby)
entries_by_month_decade = (
    df.groupby(['Month', 'Decade'])['Single Name']
    .count()
    .reset_index(name="Entries")
)

# Month number -> abbreviated month name. The keys are floats when any entry
# date is NaT, and calendar.month_abbr only accepts integer indices, so cast first.
entries_by_month_decade['Month Name'] = (
    entries_by_month_decade['Month'].astype(int).map(lambda m: calendar.month_abbr[m])
)

# A numeric colour column makes Plotly Express use a continuous colour scale
# and a single trace, so barmode='group' has nothing to group. A string label
# gives one discrete trace per decade.
entries_by_month_decade['Decade Label'] = (
    entries_by_month_decade['Decade'].astype(int).astype(str)
)
month_order = [calendar.month_abbr[m] for m in range(1, 13)]
decade_order = sorted(entries_by_month_decade['Decade Label'].unique(), key=int)

# Plot grouped bar chart
fig = px.bar(
    entries_by_month_decade,
    x='Month Name',
    y='Entries',
    color='Decade Label',
    barmode='group',  # one bar per decade within each month
    text='Entries',
    title='Top Ten Entries by Month and Decade',
    height=700,
    labels={'Decade Label': 'Decade'},
    # Pin Jan..Dec and oldest-to-newest order regardless of row order
    category_orders={'Month Name': month_order, 'Decade Label': decade_order},
)

# Show numerical labels inside bars
fig.update_traces(textposition='inside')

# Axis titles; labels that would shrink below 8pt are hidden instead
fig.update_layout(
    xaxis_title="Month",
    yaxis_title="Number of Entries",
    uniformtext_minsize=8,
    uniformtext_mode='hide'
)

fig.show()


In [147]:
# Song Trajectory (Entry - Peak Lag)

# Days between entering the top ten and the peak date
df["Lag to Peak"] = df["Peak Date"].sub(df["Top Ten Entry Date"]).dt.days

# Rows with a missing date have no lag and are left out of the histogram
lag_data = df["Lag to Peak"].dropna()

# Histogram of the lag
fig = px.histogram(
    lag_data,
    nbins=30,
    title="Distribution of Days to Peak Position",
    labels={"value": "Days", "count": "Count"},
)

# Bar colour and transparency
fig.update_traces(marker_color='blue', opacity=0.8)

# Axis titles, bar spacing and figure size
fig.update_layout(bargap=0.05, height=600)
fig.update_xaxes(title_text="Days")
fig.update_yaxes(title_text="Count")

fig.show()


# The ten songs with the largest lag, with a combined "song – artist" label
top_lag = (
    df.nlargest(10, "Lag to Peak")
    .loc[:, ["Single Name", "Artist(s)", "Lag to Peak", "Top Ten Entry Date", "Peak Date"]]
    .assign(Label=lambda t: t["Single Name"] + " – " + t["Artist(s)"])
)

# Horizontal bars; hover shows the lag and both dates
fig_top_lag = px.bar(
    top_lag,
    x="Lag to Peak",
    y="Label",
    orientation='h',
    text="Lag to Peak",
    hover_data={
        "Lag to Peak": True,
        "Top Ten Entry Date": True,
        "Peak Date": True,
        "Label": False  # already shown on the axis
    },
    title="Top 10 Songs with Greatest Lag to Peak",
)

# Print the lag next to each bar
fig_top_lag.update_traces(textposition="outside")

# Reversed y-axis keeps the largest lag at the top
fig_top_lag.update_layout(height=800)
fig_top_lag.update_xaxes(title_text="Days to Peak")
fig_top_lag.update_yaxes(title_text="", autorange="reversed")

fig_top_lag.show()

In [148]:

# The twenty songs with the most weeks in the top ten, with a combined
# "song – artist" label for the axis
longest = (
    df.nlargest(20, "Weeks in Top Ten")
    .loc[:, ["Single Name", "Artist(s)", "Weeks in Top Ten"]]
    .assign(Label=lambda t: t["Single Name"] + " – " + t["Artist(s)"])
)

fig_longest = px.bar(
    longest,
    x="Weeks in Top Ten",
    y="Label",
    orientation='h',
    text="Weeks in Top Ten",
    title="Top 20 Longest-Running Top Ten Songs",
)

# Print the week count next to each bar
fig_longest.update_traces(textposition="outside")

# Reversed y-axis puts the longest run at the top; automargin makes room for
# the long labels
fig_longest.update_layout(height=800)
fig_longest.update_yaxes(automargin=True, autorange="reversed")
fig_longest.show()


In [149]:
def clustering_plotly(df, k=4):
    """Cluster songs with K-means and show the clusters as a scatter plot.

    Clustering uses three columns as-is (no scaling): "Lag to Peak",
    "Weeks in Top Ten" and "Peak". Rows with a missing value in any of them
    are excluded from clustering and from the plot.

    Side effect: the assigned cluster id is written, as a string, to
    ``df["Cluster"]`` for every clustered row. The frame is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three feature columns plus "Single Name" and
        "Artist(s)", which are shown on hover.
    k : int, default 4
        Number of K-means clusters.

    Returns
    -------
    None
        The figure is displayed, not returned.
    """
    print("=== Clustering Song Trajectories ===")

    # Feature matrix; dropna keeps the original index so labels can be written back
    features = df[["Lag to Peak", "Weeks in Top Ten", "Peak"]].dropna()

    # NOTE(review): features are not standardised, so the column with the
    # largest numeric spread presumably dominates the distances. Confirm this
    # is intended before changing it; cluster ids would shift.
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(features)

    # String labels make Plotly Express treat the clusters as discrete colours
    df.loc[features.index, "Cluster"] = km.labels_.astype(str)

    # Scatter plot of K-means clusters
    fig_scatter = px.scatter(
        df.loc[features.index],
        x="Lag to Peak",
        y="Weeks in Top Ten",
        color="Cluster",
        hover_data=["Peak", "Lag to Peak", "Weeks in Top Ten", "Single Name", "Artist(s)"],
        title="K-means Clustering of Songs",
        height=600
    )
    fig_scatter.show()

clustering_plotly(df, k=4)

=== Clustering Song Trajectories ===


In [150]:
# Looks like my sleeper hits (long lag but long weeks in top ten) are:
# NOTE(review): "2" is a K-means label taken from df["Cluster"]; which cluster
# holds the sleeper hits was presumably read off the scatter plot. Re-check
# it whenever the data or the clustering settings change.
cluster_2_songs = df[df["Cluster"] == "2"][["Single Name", "Artist(s)", "Weeks in Top Ten", "Lag to Peak", "Peak"]]

# Best peak first; within the same peak, longest stay first
cluster_2_songs_sorted = cluster_2_songs.sort_values(by=["Peak", "Weeks in Top Ten"], ascending=[True, False])

# Create label combining song and artist
cluster_2_songs_sorted['Label'] = cluster_2_songs_sorted["Single Name"] + " – " + cluster_2_songs_sorted["Artist(s)"]

# Horizontal bar chart (the figure height is set once, in update_layout below)
fig_cluster2 = px.bar(
    cluster_2_songs_sorted,
    x="Weeks in Top Ten",
    y="Label",
    orientation='h',
    text="Weeks in Top Ten",
    hover_data=["Peak", "Lag to Peak"],
    title="Sleeper Hits (Songs that Peaked Late but Stayed Long)"
)

# Show numbers outside bars
fig_cluster2.update_traces(textposition="outside")

# Reversed y-axis keeps the first sorted row (best peak) at the top
fig_cluster2.update_layout(
    yaxis=dict(autorange="reversed", automargin=True),
    xaxis_title="Weeks in Top Ten",
    yaxis_title="Single – Artist",
    height=1000
)

fig_cluster2.show()

In [151]:
# Artist Collaboration Networks by Decade

# Separators between credited artists: "&", ",", "and", "feat", "feat.", "featuring".
# The optional dot is matched AFTER the word boundary so "feat." is consumed
# whole. With `feat\.?\b` there is no boundary after the dot, so the dot was
# left behind and glued onto the next artist's name.
# NOTE: "&", "," and "and" are always treated as separators, so a group whose
# own name contains one of them is split into several nodes.
ARTIST_SEPARATOR = re.compile(
    r'\s*(?:&|,|\bfeat(?:uring)?\b\.?|\band\b)\s*',
    flags=re.IGNORECASE,
)


def split_artist_names(credit):
    """Split one "Artist(s)" credit string into a list of non-empty artist names."""
    return [name.strip() for name in ARTIST_SEPARATOR.split(str(credit)) if name.strip()]


frames = []
decade_list = sorted(df['Decade'].dropna().unique())

# Build one animation frame (a small network drawing) per decade
for decade in decade_list:
    group = df[df['Decade'] == decade]

    # Undirected graph: an edge joins every pair of artists sharing a credit
    G = nx.Graph()
    for artists in group["Artist(s)"].dropna():
        names = split_artist_names(artists)
        for i in range(len(names)):
            for j in range(i+1, len(names)):
                G.add_edge(names[i], names[j])

    # Keep the ten artists with the highest degree centrality
    deg = nx.degree_centrality(G)
    top_artists = sorted(deg.items(), key=lambda x: x[1], reverse=True)[:10]
    subG = G.subgraph(dict(top_artists).keys())
    pos = nx.spring_layout(subG, seed=42)  # fixed seed -> reproducible layout

    # Edge traces: each segment is (x0, x1, None) so Plotly breaks the line
    # between edges
    edge_x, edge_y = [], []
    for edge in subG.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=1, color='#888'),
        hoverinfo='none',
        mode='lines'
    )

    # Node traces: marker size scales with centrality in the full decade graph
    node_x, node_y, node_text, node_size = [], [], [], []
    for node in subG.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        node_text.append(node)
        node_size.append(deg[node]*2000)

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        hovertext=node_text,
        text=node_text,
        textposition='top center',
        marker=dict(size=node_size, color='lightblue'),
        showlegend=False
    )

    frames.append(go.Frame(data=[edge_trace, node_trace], name=str(decade)))

# Slider steps: each step jumps straight to the frame named after its decade
slider_steps = [
    dict(method="animate",
         args=[[str(decade)],
               {"frame": {"duration": 0, "redraw": True},
                "mode": "immediate"}],
         label=str(decade))
    for decade in decade_list
]

# Initial figure shows the first decade; the slider switches frames
fig = go.Figure(
    data=frames[0].data,
    layout=go.Layout(
        title="Top Artist Collaboration Network by Decade",
        width=1000,   # increase figure width
        height=1000,   # increase figure height
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        showlegend=False,
        sliders=[dict(
            active=0,
            currentvalue={"prefix": "Decade: "},
            pad={"t": 50},
            steps=slider_steps
        )]
    ),
    frames=frames
)

fig.show()
