In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm


In [12]:
df1 =  pd.read_csv("results/dk_news_2016_2024_ai_shard_1_v2.csv")
df2 =  pd.read_csv("results/dk_news_2016_2024_ai_shard_2_v2.csv")
df3 =  pd.read_csv("results/dk_news_2016_2024_ai_shard_3_v2.csv")
df4 =  pd.read_csv("results/dk_news_2016_2024_ai_shard_3_v2.csv")
df = pd.concat([df1, df2, df3,df4], axis=0)
print(df.shape)
print(df.columns)
df.describe()
# only keep year larger than 2015
df = df[pd.to_datetime(df["published_date"], errors='coerce').dt.year >= 2016]




(7145, 18)
Index(['plain_text', 'published_date', 'title', 'tags', 'categories', 'author',
       'sitename', 'publisher', 'keyword_score', 'ai_score', 'is_ai',
       'orig_index', 'is_ai_llm', 'ai_relevance_llm', 'ai_topic_llm',
       'ai_barrier_llm', 'ai_barrier_type_llm', 'ai_barrier_summary_llm'],
      dtype='object')


In [13]:
import os
import pandas as pd
import numpy as np
import plotly.express as px

# =====================================================
# 0. Assume df is already loaded & concatenated
# =====================================================
# You said you already did:
# df1 =  pd.read_csv("results/dk_news_2016_2024_ai_shard_1_v2.csv")
# df2 =  pd.read_csv("results/dk_news_2016_2024_ai_shard_2_v2.csv")
# df3 =  pd.read_csv("results/dk_news_2016_2024_ai_shard_3_v2.csv")
# df4 =  pd.read_csv("results/dk_news_2016_2024_ai_shard_4_v2.csv")  # <- small fix: shard_4, not 3
# df = pd.concat([df1, df2, df3, df4], axis=0, ignore_index=True)

# If not, uncomment and fix paths:
# df1 =  pd.read_csv("results/dk_news_2016_2024_ai_shard_1_v2.csv")
# df2 =  pd.read_csv("results/dk_news_2016_2024_ai_shard_2_v2.csv")
# df3 =  pd.read_csv("results/dk_news_2016_2024_ai_shard_3_v2.csv")
# df4 =  pd.read_csv("results/dk_news_2016_2024_ai_shard_4_v2.csv")
# df = pd.concat([df1, df2, df3, df4], axis=0, ignore_index=True)

print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# Directory for figures
FIG_DIR = "figures_ai_llm_overview"
os.makedirs(FIG_DIR, exist_ok=True)

# =====================================================
# 1. Basic cleaning & helpers
# =====================================================

# 1.1 Parse dates and extract year
if "published_date" in df.columns:
    df["published_date"] = pd.to_datetime(df["published_date"], errors="coerce")
    df["year"] = df["published_date"].dt.year
else:
    df["year"] = np.nan

# 1.2 Convert LLM AI flag to boolean
def to_bool_series(series: pd.Series) -> pd.Series:
    """
    Robust boolean conversion:
    - true:  True, 1, "1", "true", "True", "ja"
    - false: False, 0, "0", "false", "False", "nej"
    - others: NaN
    """
    s = series.astype(str).str.strip().str.lower()
    true_vals = {"1", "true", "ja", "yes"}
    false_vals = {"0", "false", "nej", "no"}

    def _map_val(v: str):
        if v in true_vals:
            return True
        if v in false_vals:
            return False
        return np.nan

    return s.map(_map_val)

if "is_ai_llm" in df.columns:
    df["is_ai_llm_flag"] = to_bool_series(df["is_ai_llm"])
else:
    df["is_ai_llm_flag"] = np.nan

if "is_ai" in df.columns:
    df["is_ai_flag"] = to_bool_series(df["is_ai"])
else:
    df["is_ai_flag"] = np.nan

# 1.3 Ensure relevance score is numeric
if "ai_relevance_llm" in df.columns:
    df["ai_relevance_llm"] = pd.to_numeric(df["ai_relevance_llm"], errors="coerce")

# 1.4 Ensure barrier flag is boolean
if "ai_barrier_llm" in df.columns:
    df["ai_barrier_llm_flag"] = to_bool_series(df["ai_barrier_llm"])
else:
    df["ai_barrier_llm_flag"] = np.nan

# =====================================================
# 2. Basic descriptive statistics (printed)
# =====================================================

print("\n=== Basic info ===")
print(df.info())

print("\n=== Missing value ratio per column ===")
missing_ratio = df.isna().mean().sort_values(ascending=False)
print(missing_ratio)

print("\n=== Numeric describe ===")
print(df.describe())

print("\n=== Example topic / barrier columns head ===")
cols_preview = [
    "orig_index", "published_date", "sitename", "publisher",
    "is_ai_llm_flag", "ai_relevance_llm", "ai_topic_llm",
    "ai_barrier_llm_flag", "ai_barrier_type_llm"
]
cols_preview = [c for c in cols_preview if c in df.columns]
print(df[cols_preview].head(10))

# =====================================================
# 3. Distribution of AI relevance scores
# =====================================================

if "ai_relevance_llm" in df.columns:
    fig = px.histogram(
        df,
        x="ai_relevance_llm",
        nbins=50,
        title="Distribution of AI Relevance (LLM Score 0–100)",
    )
    fig.update_layout(
        xaxis_title="AI Relevance (LLM, 0–100)",
        yaxis_title="Number of Articles",
    )
    fig.show()
    fig.write_html(os.path.join(FIG_DIR, "01_ai_relevance_distribution.html"))
    # If kaleido is installed, you can also save PNG:
    # fig.write_image(os.path.join(FIG_DIR, "01_ai_relevance_distribution.png"))

# =====================================================
# 4. Relationship: ai_score / keyword_score vs LLM relevance
# =====================================================

MAX_POINTS_SCATTER = 50_000

scatter_df = df[["ai_relevance_llm", "ai_score", "keyword_score"]].copy()
scatter_df = scatter_df.dropna()
if len(scatter_df) > MAX_POINTS_SCATTER:
    scatter_df = scatter_df.sample(MAX_POINTS_SCATTER, random_state=42)

# 4.1 ai_score vs ai_relevance_llm
if "ai_score" in df.columns:
    fig = px.scatter(
        scatter_df,
        x="ai_score",
        y="ai_relevance_llm",
        title="SBERT AI Score vs LLM AI Relevance",
        opacity=0.4,
    )
    fig.update_layout(
        xaxis_title="ai_score (SBERT-based)",
        yaxis_title="ai_relevance_llm (0–100)",
    )
    fig.show()
    fig.write_html(os.path.join(FIG_DIR, "02_sbert_vs_llm_relevance.html"))

# 4.2 keyword_score vs ai_relevance_llm
if "keyword_score" in df.columns:
    fig = px.scatter(
        scatter_df,
        x="keyword_score",
        y="ai_relevance_llm",
        title="Keyword Score vs LLM AI Relevance",
        opacity=0.4,
    )
    fig.update_layout(
        xaxis_title="keyword_score",
        yaxis_title="ai_relevance_llm (0–100)",
    )
    fig.show()
    fig.write_html(os.path.join(FIG_DIR, "03_keyword_vs_llm_relevance.html"))

# =====================================================
# 5. Yearly trends: total vs AI vs AI share
# =====================================================

if "year" in df.columns:
    yearly = (
        df.groupby("year")
        .agg(
            total_articles=("orig_index", "count"),
            ai_articles=("is_ai_llm_flag", lambda x: np.nansum(x.astype(float))),
        )
        .reset_index()
    )
    yearly["ai_share"] = yearly["ai_articles"] / yearly["total_articles"]

    # 5.1 Total vs AI articles per year (stacked bar / line combo)
    fig = px.bar(
        yearly,
        x="year",
        y=["total_articles", "ai_articles"],
        barmode="group",
        title="Total vs AI-related Articles per Year (LLM labels)",
    )
    fig.update_layout(
        xaxis_title="Year",
        yaxis_title="Number of Articles",
        legend_title="Metric",
    )
    fig.show()
    fig.write_html(os.path.join(FIG_DIR, "04_yearly_total_vs_ai.html"))

    # 5.2 AI share per year (line)
    fig = px.line(
        yearly,
        x="year",
        y="ai_share",
        markers=True,
        title="Share of AI-related Articles per Year (LLM labels)",
    )
    fig.update_layout(
        xaxis_title="Year",
        yaxis_title="AI Share (AI articles / all articles)",
    )
    fig.show()
    fig.write_html(os.path.join(FIG_DIR, "05_yearly_ai_share.html"))

# =====================================================
# 6. AI share per publisher
# =====================================================

if "publisher" in df.columns:
    ai_df = df.copy()
    ai_df["ai_flag"] = df["is_ai_llm_flag"]

    publisher_stats = (
        ai_df.groupby("publisher")
        .agg(
            total_articles=("orig_index", "count"),
            ai_articles=("ai_flag", lambda x: np.nansum(x.astype(float))),
        )
        .reset_index()
    )
    publisher_stats["ai_share"] = (
        publisher_stats["ai_articles"] / publisher_stats["total_articles"]
    )

    # Focus on publishers with enough volume
    MIN_ARTICLES = 500
    publisher_stats = publisher_stats[publisher_stats["total_articles"] >= MIN_ARTICLES]
    publisher_stats = publisher_stats.sort_values(
        "ai_share", ascending=False
    ).head(20)

    fig = px.bar(
        publisher_stats,
        x="publisher",
        y="ai_share",
        hover_data=["total_articles", "ai_articles"],
        title="Top Publishers by AI Share (LLM labels, min 500 articles)",
    )
    fig.update_layout(
        xaxis_title="Publisher",
        yaxis_title="AI Share",
    )
    fig.show()
    fig.write_html(os.path.join(FIG_DIR, "06_publisher_ai_share.html"))

# =====================================================
# 7. Topic distribution (ai_topic_llm)
# =====================================================

if "ai_topic_llm" in df.columns:
    topic_counts = (
        df["ai_topic_llm"]
        .astype(str)
        .str.strip()
        .replace("", np.nan)
        .dropna()
        .value_counts()
        .reset_index()
    )
    topic_counts.columns = ["ai_topic_llm", "count"]

    # Focus on top N topics
    TOP_N_TOPICS = 30
    topic_counts_top = topic_counts.head(TOP_N_TOPICS)

    fig = px.bar(
        topic_counts_top,
        x="ai_topic_llm",
        y="count",
        title=f"Top {TOP_N_TOPICS} LLM Topic Labels",
    )
    fig.update_layout(
        xaxis_title="ai_topic_llm (free label)",
        yaxis_title="Number of Articles",
        xaxis_tickangle=45,
    )
    fig.show()
    fig.write_html(os.path.join(FIG_DIR, "07_topic_distribution.html"))

# =====================================================
# 8. Topic × Year heatmap
# =====================================================

if "ai_topic_llm" in df.columns and "year" in df.columns:
    # restrict to AI-related articles to get meaningful AI topics
    topic_year = (
        df[df["is_ai_llm_flag"] == True]
        .groupby(["year", "ai_topic_llm"])
        .agg(n=("orig_index", "count"))
        .reset_index()
    )

    # Limit to top topics to keep heatmap readable
    top_topics = (
        topic_year.groupby("ai_topic_llm")["n"].sum().sort_values(ascending=False)
    )
    top_topics = top_topics.head(20).index.tolist()
    topic_year = topic_year[topic_year["ai_topic_llm"].isin(top_topics)]

    heat = topic_year.pivot_table(
        index="year", columns="ai_topic_llm", values="n", fill_value=0
    )

    fig = px.imshow(
        heat,
        labels=dict(x="Topic (ai_topic_llm)", y="Year", color="Number of Articles"),
        title="AI-related Articles by Topic and Year (Top 20 topics)",
    )
    fig.show()
    fig.write_html(os.path.join(FIG_DIR, "08_topic_year_heatmap.html"))

# =====================================================
# 9. Barrier analysis
# =====================================================

# 9.1 Barrier presence distribution
if "ai_barrier_llm_flag" in df.columns:
    barrier_counts = (
        df["ai_barrier_llm_flag"]
        .value_counts(dropna=False)
        .reset_index()
    )
    barrier_counts.columns = ["ai_barrier_llm_flag", "count"]

    fig = px.bar(
        barrier_counts,
        x="ai_barrier_llm_flag",
        y="count",
        title="Distribution of AI Barrier Presence (LLM)",
    )
    fig.update_layout(
        xaxis_title="ai_barrier_llm_flag (True/False/NaN)",
        yaxis_title="Number of Articles",
    )
    fig.show()
    fig.write_html(os.path.join(FIG_DIR, "09_barrier_presence_distribution.html"))

# 9.2 Barrier type distribution (for AI-related + has_barrier)
if "ai_barrier_type_llm" in df.columns:
    barrier_type_df = df[
        (df["is_ai_llm_flag"] == True) & (df["ai_barrier_llm_flag"] == True)
    ].copy()

    barrier_type_counts = (
        barrier_type_df["ai_barrier_type_llm"]
        .astype(str)
        .str.strip()
        .replace("", np.nan)
        .dropna()
        .value_counts()
        .reset_index()
    )
    barrier_type_counts.columns = ["ai_barrier_type_llm", "count"]

    fig = px.bar(
        barrier_type_counts,
        x="ai_barrier_type_llm",
        y="count",
        title="Barrier Types in AI-related Articles (LLM labels)",
    )
    fig.update_layout(
        xaxis_title="ai_barrier_type_llm (free label)",
        yaxis_title="Number of Articles",
        xaxis_tickangle=45,
    )
    fig.show()
    fig.write_html(os.path.join(FIG_DIR, "10_barrier_type_distribution.html"))

# 9.3 Barrier type × Year heatmap
if {"ai_barrier_type_llm", "year"}.issubset(df.columns):
    barrier_year = (
        df[(df["is_ai_llm_flag"] == True) & (df["ai_barrier_llm_flag"] == True)]
        .groupby(["year", "ai_barrier_type_llm"])
        .agg(n=("orig_index", "count"))
        .reset_index()
    )

    # Focus on top barrier types
    top_barriers = (
        barrier_year.groupby("ai_barrier_type_llm")["n"]
        .sum()
        .sort_values(ascending=False)
        .head(20)
        .index
        .tolist()
    )
    barrier_year = barrier_year[barrier_year["ai_barrier_type_llm"].isin(top_barriers)]

    heat_b = barrier_year.pivot_table(
        index="year", columns="ai_barrier_type_llm", values="n", fill_value=0
    )

    fig = px.imshow(
        heat_b,
        labels=dict(
            x="Barrier Type (ai_barrier_type_llm)",
            y="Year",
            color="Number of Articles",
        ),
        title="Barrier Types in AI-related Articles by Year (Top 20)",
    )
    fig.show()
    fig.write_html(os.path.join(FIG_DIR, "11_barrier_type_year_heatmap.html"))

print("\nAll figures saved under:", FIG_DIR)


Shape: (7097, 18)
Columns: ['plain_text', 'published_date', 'title', 'tags', 'categories', 'author', 'sitename', 'publisher', 'keyword_score', 'ai_score', 'is_ai', 'orig_index', 'is_ai_llm', 'ai_relevance_llm', 'ai_topic_llm', 'ai_barrier_llm', 'ai_barrier_type_llm', 'ai_barrier_summary_llm']

=== Basic info ===
<class 'pandas.core.frame.DataFrame'>
Index: 7097 entries, 1 to 1785
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   plain_text              7097 non-null   object        
 1   published_date          7097 non-null   datetime64[ns]
 2   title                   7097 non-null   object        
 3   tags                    1626 non-null   object        
 4   categories              4384 non-null   object        
 5   author                  6566 non-null   object        
 6   sitename                7097 non-null   object        
 7   publisher               7097 non-null  


All figures saved under: figures_ai_llm_overview
