In [1]:
import os
import pandas as pd
import os
import pandas as pd
import numpy as np
import re
import torch
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import seaborn as sns
import plotly.express as px
import os
import matplotlib
import plotly.express as px
import plotly.graph_objects as go
import json

# ----------------------------------------------------
# 0) Load all results from your results/ folder
# ----------------------------------------------------
# NOTE: Using a placeholder base_dir. Ensure this path is correct in your environment.
base_dir = r"C:\Users\qswwq\Documents\semesterproject\results"
df = pd.read_csv(os.path.join(base_dir, "02_ai_scores.csv"))
emb = np.load(os.path.join(base_dir, "01_embeddings.npy"))
df = df[df['is_ai'] == True].copy()
df = df[df['ai_score'] >= 0.5].copy()
emb = emb[df.index]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
event_queries = {
    "amateur_football_match_report":                     "Robotgenereret kampreferat fra dansk fodbold i 2. division med stilling og næste kamp",
    "taxation_system_criticism":                         "Skarpt debatindlæg der kritiserer ny ejendomsvurdering og hele det danske skattesystem",
    "ai_job_displacement":                               "Artikel om ChatGPT og AI der truer med at erstatte programmører, journalister og lærere",
    "gender_power_dynamics_opinion":                     "Kontroversielt indlæg om kønsmagt, feminisme, samtykkeloven og kvinders ansvar",
    "ai_ethics_and_society":                             "Microsofts syn på kunstig intelligens, etik, gennemsigtighed og ansvarlig udvikling",
    "art_exhibition_event":                              "Lokal kunstudstilling i Glostrup med maler Dag Aronson og jazzkoncert",
    "political_accountability_report":                   "Politisk kommentar om Mette Frederiksens ansvar og Minkkommissionens rapport",
    "it_industry_newsletter":                            "Dansk IT-branche nyhedsbrev med sikkerhed, machine learning og jobannoncer",
    "robotic_automation_startup":                        "Succeshistorie om Odense-robotvirksomhed Nordbo Robotics og industriel automatisering",
    "ai_employment_impact_list":                         "Liste over jobs der mest sandsynligt bliver erstattet af ChatGPT og generativ AI",
    "danish_political_scandal_analysis":                 "Dybdegående analyse af stor dansk politisk skandale med statsminister og officiel undersøgelse"
}

In [3]:
device = "cpu"
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device=device)
event_emb = model.encode(list(event_queries.values()), normalize_embeddings=True)
event_scores = util.cos_sim(emb, event_emb).cpu().numpy()


In [4]:
df["event_type"] = [list(event_queries.keys())[i] for i in event_scores.argmax(axis=1)]

df.to_csv("results/03_event_types.csv", index=False)

In [5]:
df.columns


Index(['plain_text', 'published_date', 'title', 'tags', 'categories', 'author',
       'sitename', 'publisher', 'ai_score', 'is_ai', 'event_type'],
      dtype='object')

In [7]:
import kaleido
fig_dir = r"C:\Users\qswwq\Documents\semesterproject\figures"
df_ai = df[df["is_ai"] == True]         # <-- FIXED
event_counts = df_ai["event_type"].value_counts()
df["year"] = pd.to_datetime(df["published_date"]).dt.year
trend = df.groupby("year").size().reset_index(name="count")

# ----------------------------------------
# 1. AI Trend by Year
# ----------------------------------------
fig1 = px.line(
    trend.reset_index(),
    x="year",
    y="count",
    markers=True,
    title="AI-Related News Trend in Denmark (2016–2024)"
)
fig1.update_layout(
    xaxis_title="Year",
    yaxis_title="Number of AI Articles"
)
fig1.show()
fig1.write_image(os.path.join(fig_dir, "01_ai_trend_year.png"))

# ----------------------------------------
# 2. AI Event Type Distribution
# ----------------------------------------
fig2 = px.pie(
    names=event_counts.index,
    values=event_counts.values,
    title="Distribution of AI News Event Types"
)
fig2.show()
fig2.write_image(os.path.join(fig_dir, "02_ai_event_pie.png"))

fig2b = px.bar(
    x=event_counts.values,
    y=event_counts.index,
    orientation="h",
    title="AI News Article Count by Event Type",
    labels={"x": "Articles", "y": "Event Type"}
)
fig2b.show()
fig2b.write_image(os.path.join(fig_dir, "02_ai_event_bar.png"))

# ----------------------------------------
# 3. Company Mention Heatmap
# ----------------------------------------
heat = df_ai.pivot_table(
    index="year",
    columns="publisher",
    values="title",          # or "plain_text", doesn't matter
    aggfunc="count",
    fill_value=0
)
fig3 = px.imshow(
    heat,
    text_auto=True,
    color_continuous_scale="YlOrRd",
    title="Company Mentions in AI News (Year × Company)"
)
fig3.update_layout(
    xaxis_title="Company",
    yaxis_title="Year"
)
fig3.show()
fig3.write_image(os.path.join(fig_dir, "03_company_year_heatmap.png"))

# ----------------------------------------
# 4. Top 15 Cities by AI News
# ----------------------------------------
top_cities = city_ai.head(15)

fig4 = px.bar(
    data_frame=top_cities,
    x="count",
    y="city",
    orientation="h",
    title="Top 15 Cities by AI-Related News Count",
    labels={"count": "Articles", "city": "City"},
    color="count",
    color_continuous_scale="Blues"
)
fig4.show()
fig4.write_image(os.path.join(fig_dir, "04_ai_city_top15.png"))

# ----------------------------------------
# 5. AI Semantic Similarity Score Distribution
# ----------------------------------------
fig5 = px.histogram(
    ai_scores,
    x="ai_score",
    nbins=50,
    title="Semantic Similarity Scores to ‘AI’ Query",
    opacity=0.75
)

fig5.add_vline(
    x=0.32,
    line_dash="dash",
    line_color="red",
    annotation_text="Threshold 0.32"
)

fig5.update_layout(
    xaxis_title="AI Similarity Score",
    yaxis_title="Count"
)
fig5.show()
fig5.write_image(os.path.join(fig_dir, "05_ai_score_distribution.png"))

# ----------------------------------------
# 6. PCA Visualization for AI Articles
# ----------------------------------------
# Event label → numeric mapping
event_map = {v: i for i, v in enumerate(df_ai["event_type"].unique())}

fig6 = px.scatter(
    topic_pca,
    x="pc1",
    y="pc2",
    color=df_ai["event_type"].map(event_map),
    title="PCA Projection of AI News (SBERT Embeddings)",
    labels={"pc1": "PC1", "pc2": "PC2", "color": "Event Type"},
    opacity=0.7
)
fig6.show()
fig6.write_image(os.path.join(fig_dir, "06_ai_pca_clusters.png"))

# ----------------------------------------
# 7. Save Statistical Summary
# ----------------------------------------
summary_path = os.path.join(base_dir, "final_statistics_summary.txt")

with open(summary_path, "w", encoding="utf-8") as f:
    f.write("=== Denmark News (2016–2024) – AI Coverage Summary ===\n\n")
    f.write(f"Total articles               : {len(df):,}\n")
    f.write(f"AI-related articles          : {df['is_ai'].sum():,} ({df['is_ai'].mean():.1%})\n")

    f.write("\nAI Event Type Distribution:\n")
    for typ, cnt in event_counts.items():
        f.write(f"  - {typ}: {cnt} articles\n")

    f.write(f"\nMost mentioned company       : {comp_stats.iloc[0].name} ({comp_stats.iloc[0,0]} mentions)\n")
    f.write(f"City with most AI coverage   : {city_ai.iloc[0]['city']} ({city_ai.iloc[0]['count']} articles)\n")
    f.write(f"Peak year of AI reporting    : {trend['count'].idxmax()} ({trend['count'].max()} articles)\n")

print("All Plotly figures saved to:", fig_dir)
print("Statistical summary saved to:", summary_path)

NameError: name 'city_ai' is not defined