## Skab database

In [14]:

import sqlite3

# Schema of the single `articles` table. URL is declared UNIQUE, so each
# article address can be stored only once.
CREATE_ARTICLES_SQL = """CREATE TABLE IF NOT EXISTS articles (
            headline text,  
            URL text UNIQUE,
            date text,
            is_nyhed integer,
            type_article text,
            text_body text,
            countries text,
            KM_index integer, 
            language text,
            media text,
            processed integer
            )"""

# IF NOT EXISTS makes this cell safe to re-run against an existing database.
conn = sqlite3.connect('database.db')
conn.execute(CREATE_ARTICLES_SQL)
conn.commit()
conn.close()



## Hjælpefunktioner

In [None]:
# GET COLUMN NAMES
# Imported here too, so the cell runs on its own in a fresh kernel instead of
# relying on an import made by a different cell.
import sqlite3

conn = sqlite3.connect('database.db')
try:
    # The cursor's description is available right after the SELECT has been
    # executed; no rows need to be fetched to read the column names.
    c = conn.execute('select * from articles')
    column_names = [description[0] for description in c.description]
    print("Column Names:", column_names)
finally:
    # Release the database handle even if the query fails.
    conn.close()

Column Names: ['headline', 'URL', 'date', 'is_nyhed', 'type_article', 'text_body', 'countries', 'KM_index', 'language', 'media', 'processed']


In [2]:
# PREVIEW: print up to 10 rows (countries, language, media, processed)
import sqlite3

PREVIEW_QUERY = 'SELECT countries, language, media, processed FROM articles'
# To inspect a single outlet only, append a filter such as: WHERE media = "DR"

conn = sqlite3.connect('database.db')
cursor = conn.execute(PREVIEW_QUERY)

# fetchmany(10) returns at most ten tuples; fewer when the table is smaller.
for preview_row in cursor.fetchmany(10):
    print(preview_row)

conn.close()

('Germany, Israel, Palestine, Republic of Serbia, Ukraine, United States of America', 'German', 'ARD', 1)
('Egypt, Germany, Israel, Palestine', 'German', 'ARD', 1)
('France', 'German', 'ARD', 1)
('Israel, Palestine', 'English', 'BBC', 1)
('Russia, United Kingdom, United States of America', 'English', 'BBC', 1)
('Egypt, France, Israel, Italy, Palestine, Spain, United Kingdom', 'English', 'BBC', 1)
('Russia, Ukraine', 'English', 'BBC', 1)
('Ecuador, Russia, Spain, United States of America', 'English', 'BBC', 1)
('China, Denmark, United States of America', 'Danish', 'DR', 1)
('Denmark', 'Danish', 'DR', 1)


In [6]:
import sqlite3

# LIST DISTINCT DATES
# Read-only query: shows which values of the `date` column occur in `articles`.
conn = sqlite3.connect('database.db')
try:
    dates = conn.execute("SELECT DISTINCT date FROM articles").fetchall()
finally:
    # Nothing is modified, so there is nothing to commit; just release the handle.
    conn.close()

# Each entry is a one-element tuple, e.g. ('2025-10-11',).
print(dates)

[('2025-10-11',), ('2025-10-12',), ('2025-10-13',), ('2025-10-14',)]


In [7]:
# DELETE BAD BBC DATA
import sqlite3

# Removes every row whose media is 'BBC' and whose date is '2025-10-11'.
# Destructive: once committed, the rows are gone.
PURGE_SQL = "DELETE FROM articles WHERE media = 'BBC' AND date = '2025-10-11'"

conn = sqlite3.connect('database.db')
conn.execute(PURGE_SQL)
conn.commit()  # make the deletion permanent
conn.close()

In [2]:
#  NUMBER OF ITEMS
# last fetch gave 192 12/10
import sqlite3

conn = sqlite3.connect('database.db')
try:
    # Let SQLite do the counting instead of loading every row (including the
    # full text_body of each article) into memory just to call len() on it.
    (article_count,) = conn.execute("SELECT COUNT(*) FROM articles").fetchone()
finally:
    # Read-only query: nothing to commit, only the handle to release.
    conn.close()

print(article_count)

573


In [12]:
# Get specific article
import sqlite3

url = 'https://www.bbc.com/news/articles/cvgqgl2dxweo'

conn = sqlite3.connect('database.db')
try:
    # Read-only lookup; the URL is passed as a bound parameter, not pasted
    # into the SQL text.
    c = conn.execute("SELECT text_body FROM articles WHERE url = ?", (url,))
    # fetchone() gives a one-element tuple (text_body,), or None when no row
    # matches the URL.
    print(c.fetchone())
finally:
    # Nothing was modified, so no commit is needed; just release the handle.
    conn.close()

('Indian politicians and journalists have criticised the government for failing to speak out after female journalists were excluded from a press event with the Afghan Taliban foreign minister in Delhi. Around 16 male reporters were selected to attend a forum on Friday with Foreign Minister Amir Khan Muttaqi at the Afghan embassy. Journalists observed women and foreign media being turned away. India\'s Ministry of External Affairs (MEA) said it "had no involvement in the press interaction" at the Afghan embassy. A source in the Taliban government admitted women had not been invited to attend. They told the BBC "female journalists were excluded due to lack of proper coordination and will be invited to next conference if held in Delhi".  Opposition leader Rahul Gandhi said by allowing the event to go ahead, India\'s Prime Minister Narendra Modi was "telling every woman in India that you are too weak to stand up for them". The Editors Guild of India strongly condemned the exclusion and sai

In [15]:
# RESET: flag every article as unprocessed and blank its countries column
import sqlite3

RESET_SQL = "UPDATE articles SET processed = 0, countries = ''"

conn = sqlite3.connect('database.db')
conn.execute(RESET_SQL)  # no WHERE clause: applies to all rows
conn.commit()
conn.close()

## Find unikke lande i artiklen

In [2]:
import spacy
import demonym

#  SETUP (RUN ONCE)
# Name of the spaCy pipeline to load for each article language.
MODEL_MAP = dict(
    Danish="da_core_news_sm",
    English="en_core_web_sm",
    German="de_core_news_sm",
)

# Lookup table from the `demonym` module for each article language.
LOOKUP_MAP = dict(
    Danish=demonym.DANISH,
    English=demonym.ENGLISH,
    German=demonym.GERMAN,
)

# Load every pipeline once up front; later cells fetch them by language name.
loaded_models = {
    language_name: spacy.load(pipeline_name)
    for language_name, pipeline_name in MODEL_MAP.items()
}
        
   

In [3]:

import sqlite3
import analyze

# Find the countries in every article that is still flagged processed = 0 and
# write the result back. Relies on `loaded_models` and `LOOKUP_MAP` from the
# setup cell.

conn = sqlite3.connect('database.db')
# Row objects allow columns to be read by name.
conn.row_factory = sqlite3.Row

# Explicit counters: the enumerate() index would be one too low as a count and
# would not exist at all when there is nothing to process.
processed_count = 0
skipped_count = 0

try:
    c = conn.cursor()
    c.execute('SELECT URL, text_body, language FROM articles WHERE processed = 0')
    articles_to_process = c.fetchall()

    for article in articles_to_process:
        url = article['URL']
        text = article['text_body']
        language = article['language']

        nlp = loaded_models.get(language)
        lookup = LOOKUP_MAP.get(language)

        # Without a model/lookup for the language, or without any text, the
        # article cannot be analysed. Leave it unprocessed rather than letting
        # one bad row abort the whole (not yet committed) batch.
        if nlp is None or lookup is None or text is None:
            print(f"Skipping {url}: unsupported language {language!r} or missing text")
            skipped_count += 1
            continue

        doc = nlp(text)
        countries_found = analyze.find_countries(doc, lookup)

        if countries_found:
            print(f"Updating {url} -> Countries: {countries_found}")
            c.execute(
                "UPDATE articles SET countries = ?, processed = 1 WHERE URL = ?",
                (countries_found, url),
            )
        else:
            # Nothing found: only mark the article as handled.
            c.execute("UPDATE articles SET processed = 1 WHERE URL = ?", (url,))
        processed_count += 1

    # Single commit at the end: either the whole batch is stored or none of it.
    conn.commit()
    print("Database has been updated successfully.")
    print(f"Processed {processed_count} articles")
    if skipped_count:
        print(f"Skipped {skipped_count} articles")
finally:
    # Always release the database handle, also when the analysis raises.
    conn.close()
        

Updating https://www.bbc.com/news/articles/cr430epq45go -> Countries: Israel, Palestine
Updating https://www.bbc.com/news/articles/cd9kd2e073wo -> Countries: Canada, China, India, Singapore
Updating https://www.bbc.com/news/articles/ce3x1qrd2kno -> Countries: Germany, Ireland
Updating https://www.bbc.com/sport/cricket/articles/crexdqd9zxdo -> Countries: Australia, India
Updating https://www.bbc.com/sport/snooker/articles/cze65163222o -> Countries: China, United Kingdom
Updating https://www.bbc.com/sport/football/articles/cr7mr47gzxyo -> Countries: Israel, Norway
Updating https://www.dr.dk/sporten/seneste-sport/dansker-skyder-sig-til-vm-bronze -> Countries: Austria, Belgium, Croatia, Czechia, Denmark, Finland, France, Germany, Greece, Indonesia, Israel, Italy, Japan, Netherlands, New Zealand, Norway, Palestine, Portugal, Slovenia, Spain, Sweden, Taiwan, Thailand, Ukraine, United States of America
Updating https://www.tagesschau.de/ausland/asien/trump-rede-knesset-100.html -> Countries: 

In [None]:
# demonymerne er lidt rigtige i det mindste

danske_lande = (set(demonym.DANISH.values()))
engelske_lande = (set(demonym.ENGLISH.values()))
tyske_lande = (set(demonym.GERMAN.values()))

print(danske_lande == engelske_lande == tyske_lande)




True


## Generating datafiles

In [4]:
import sqlite3
import csv
import os

# EXPORT: write one CSV row per article, with URL serving as the unique ID.
# NOTE(review): despite "long_format" in the file name, `countries` is written
# exactly as stored (a single string per article) — confirm that any
# splitting into one row per country happens downstream.
output_filename = 'export_data/articles_long_format_v1.csv'
header = ['URL', 'media', 'is_nyhed', 'type_article', 'date', 'countries']

conn = sqlite3.connect('database.db')
conn.row_factory = sqlite3.Row
try:
    c = conn.cursor()
    c.execute('SELECT URL, media, is_nyhed, type_article, date, countries FROM articles')
    all_articles = c.fetchall()
finally:
    # All rows are in memory now; the database is not needed while writing.
    conn.close()

# open() does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(output_filename), exist_ok=True)

# Explicit UTF-8 so the output does not depend on the platform's default
# encoding; newline='' is what the csv module expects for files it writes.
with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)

    for article in all_articles:
        writer.writerow([
            article['URL'],
            article['media'],
            article['is_nyhed'],
            article['type_article'],
            article['date'],
            # NULL or empty countries are exported as an explicit placeholder.
            article['countries'] or 'NO_COUNTRIES',
        ])