In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/danishnews.csv')


Mounted at /content/drive


In [None]:
!pip install spacy
!python -m spacy download da_core_news_lg

Collecting da-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/da_core_news_lg-3.8.0/da_core_news_lg-3.8.0-py3-none-any.whl (567.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.1/567.1 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: da-core-news-lg
Successfully installed da-core-news-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('da_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
AI_KEYWORDS = [
    "AI", "kunstig intelligens", "maskinlæring", "dyb læring", "neural netværk", "automatisering",
                  "robotik", "dataanalyse", "algoritme", "intelligente systemer", "GPT", "OPENAI", "LLM", "chatbot",
                    "sprogmodel", "generativ AI", "AI-assistent", "AI-drevet", "computer vision", "naturlig sprogbehandling",
                    "AI-platform", "AI-teknologi", "AI-forskning", "AI-innovation", "AI-applikationer", "AI-løsninger",
                    "AI-udvikling", "AI-sikkerhed", "AI-etik", "AI-regulering", "AI-politik", "AI-strategi", "AI-investering",
                    "AI-startup", "AI-industrien", "AI-marked", "AI-trends", "AI-fremtid", "robotter", "automatiserede systemer",
                    "intelligente maskiner", "AI-integration", "AI-implementering", "AI-optimering", "AI-overvågning"]


def is_ai_article(text):
    if not isinstance(text, str):
        return False
    t = text.lower()
    return any(k in t for k in AI_KEYWORDS)

df["is_ai"] = df["plain_text"].apply(is_ai_article)

df["is_ai"].value_counts()


Unnamed: 0_level_0,count
is_ai,Unnamed: 1_level_1
False,1195277
True,5179


In [None]:
df_ai = df[df["is_ai"]].copy()

print("AI-related articles:", df_ai.shape[0])


AI-related articles: 5179


In [None]:
ai_df = df[df["is_ai"] == True].copy()


In [None]:
MEDIA_PATTERNS = [
    "avis", "blad", "bt", "b.t", "berlingske", "politiken",
    "jyllands", "reuters", "ritzau", "afp", "bbc", "cnn",
    "tv2", "dr", ".dk", "radio","DR", "TV2", "TV 2", "Berlingske", "Politiken", "Jyllands-Posten", "Ekstra Bladet",
    "BT", "Information", "Kristeligt Dagblad", "Weekendavisen", "Børsen",
    "Ritzau", "Altinget", "Version2", "Computerworld", "Ingeniøren",
]

SPORTS_PATTERNS = [
    "fc ", "f.c", "aab", "agf", "dbu", "superliga",
    "league", "champions", "united", "city", "boldklub",
]

PUBLIC_PATTERNS = [
    "styrelsen", "institut", "ministerium", "direktorat",
    "politi", "region", "kommune", "folketing", "universitet","Socialdemokratiet", "Venstre", "Dansk Folkeparti", "DF", "Radikale Venstre",
    "SF", "Socialistisk Folkeparti", "Enhedslisten", "Alternativet", "Konservative",
    "Liberal Alliance", "Moderaterne", "Danmarksdemokraterne",
]

def classify_actor(org):
    o = org.lower().strip()

    if any(p in o for p in MEDIA_PATTERNS):
        return "Media"

    if any(p in o for p in SPORTS_PATTERNS):
        return "Sports"

    if any(p in o for p in PUBLIC_PATTERNS):
        return "Public Authority"

    return "Company"

In [None]:
import spacy
from collections import Counter
import re

# Load the Danish model
nlp = spacy.load("da_core_news_lg")

# Expanded blacklist based on observed noise and recommendations
NON_COMPANY_KEYWORDS = {
    # Major Danish media & science media
    "ekstra bladet", "b.t.", "berlingske", "politiken", "jyllands-posten", "bt", "tv2", "tv 2", "dr",
    "information", "weekendavisen", "børsen", "ritzau", "altinget", "version2", "computerworld",
    "ingeniøren", "frederiksborg amts avis", "sjællandske", "nordvestnyt", "sn.dk", "jv.dk",
    "videnskab.dk", "videnskab", "tech & viden", "finans",

    # International media
    "reuters", "afp", "bbc", "cnn", "new york times", "washington post", "the guardian",
    "bloomberg", "cnbc",

    # Political parties
    "venstre", "socialdemokratiet", "dansk folkeparti", "df", "radikale venstre", "sf",
    "socialistisk folkeparti", "enhedslisten", "alternativet", "konservative", "liberal alliance",
    "moderaterne", "danmarksdemokraterne",

    # Government & public institutions / associations
    "folketinget", "regeringen", "eu", "eu-kommissionen", "kommissionen", "ministeriet", "styrelsen",
    "direktoratet", "politi", "nordsjællands politi", "kommune", "region", "domstol", "ret", "retten",
    "landsret", "højesteret", "universitet", "ku", "københavns universitet", "dtu", "danmarks tekniske universitet",
    "cbs", "copenhagen business school", "sdu", "syddansk universitet", "aau", "aarhus universitet",
    "dansk industri", "norden", "nasdaq",

    # International orgs & exchanges
    "nato", "fn", "who", "oecd", "verdensbanken",

    # Events & generic
    "summit", "copy",

    # Sports
    "fc", "f.c.", "brøndby", "fck", "agf", "aab", "superliga", "champions league", "dbu",

    # Obvious noise
    "corona", "covid", "sars-cov-2", "musk"
}

# Generic terms that are almost never companies
GENERIC_TERMS = {
    "gruppen", "foreningen", "rådet", "udvalget", "kommissionen", "myndighederne",
    "domstolen", "parlamentet", "sprognævnet"
}

def is_likely_company(org_name):
    org_lower = org_name.lower().strip()

    # Reject if matches any blacklisted keyword
    if any(kw in org_lower for kw in NON_COMPANY_KEYWORDS):
        return False

    # Reject generic organizational terms
    if any(gen in org_lower for gen in GENERIC_TERMS):
        return False

    # Reject very short acronyms (common for public bodies)
    if len(org_name) <= 3 and org_name.isupper():
        return False

    # Strong positive signal: Danish company suffixes
    if re.search(r'\b(a/s|ap s|aps|ivs|gmbh|inc|ltd|s/a)\b', org_name, re.IGNORECASE):
        return True


    # Default: assume it's a company (conservative to avoid missing obscure Danish firms)
    return True

def extract_companies(text, max_chars=2000):
    if not isinstance(text, str):
        return []
    doc = nlp(text[:max_chars])
    companies = set()
    for ent in doc.ents:
        if ent.label_ == "ORG":
            name = ent.text.strip()
            if len(name) > 2 and is_likely_company(name):
                companies.add(name)
    return list(companies)

# === APPLY TO YOUR AI-FILTERED DATASET (ai_df) ===
ai_df["company_entities"] = ai_df["plain_text"].apply(extract_companies)

# Get all unique companies across all articles
all_unique_companies = set()
for entities in ai_df["company_entities"]:
    all_unique_companies.update(entities)

print(f"Number of unique likely companies: {len(all_unique_companies)}")

# Most frequently mentioned companies
company_counter = Counter(
    company for sublist in ai_df["company_entities"] for company in sublist
)

print("\nTop 30 most mentioned companies:")
for company, count in company_counter.most_common(30):
    print(f"{count:4d} × {company}")

Number of unique likely companies: 7095

Top 30 most mentioned companies:
 198 × Microsoft
 196 × Twitter
 179 × Facebook
 166 × Apple
 143 × ChatGPT
 104 × Amazon
  94 × Instagram
  86 × Nvidia
  70 × Netflix
  64 × Universal Robots
  58 × Tesla
  57 × Samsung
  41 × Microsofts
  41 × Alphabet
  41 × Mobile Industrial Robots
  36 × Nvidias
  31 × Blue Ocean Robotics
  29 × Spotify
  27 × Samsungs
  27 × Cambridge Analytica
  27 × Binance
  25 × 2023
  24 × NASA
  24 × Dansk Erhverv
  24 × Teknologisk Institut
  23 × Snapchat
  22 × Kristeligt Dagblad
  22 × Mærsk
  21 × Internet of Things
  21 × Odense Robotics
