In [4]:
import glob
import os
import polars as pl

In [5]:
# List all of the publishers
def print_all_english_publishers(folder_path: str):
    """
    Print every distinct publisher that has at least one English article.

    Each ``*.parquet`` file in `folder_path` is scanned lazily with Polars,
    filtered to rows where ``language == 'en'``, and reduced to its unique
    ``publisher`` values. The per-file results are merged into one set so a
    publisher appearing in several files is printed only once.

    Args:
        folder_path: Directory that contains the ``.parquet`` files.

    Returns:
        None. The result is printed, one publisher per line, in sorted order.
    """
    # Sort so files are always scanned (and logged) in the same order.
    parquet_files = sorted(glob.glob(os.path.join(folder_path, "*.parquet")))
    if not parquet_files:
        print(f"No Parquet files found in: {folder_path}")
        return

    all_english_publishers = set()

    for pq_file in parquet_files:
        print(f"Scanning file: {pq_file}")

        # Only the de-duplicated publisher column is materialised per file.
        pubs_df = (
            pl.scan_parquet(pq_file)
            .filter(pl.col("language") == "en")
            .select(pl.col("publisher").unique())
            .collect()
        )

        # A null publisher arrives here as None. Skip it: it is not a real
        # outlet, and sorting a set that mixes None with str raises TypeError.
        all_english_publishers.update(
            pub for pub in pubs_df["publisher"].to_list() if pub is not None
        )

    # Print all unique publishers that publish in English
    if all_english_publishers:
        print("\nAll publishers that publish in English:")
        for pub in sorted(all_english_publishers):
            print(pub)
    else:
        print("\nNo publishers found that publish in English.")
def main():
    """Run the English-publisher listing against the local CC_news 2024 folder."""
    # Point this at your own data directory. On Windows, write the path as a
    # raw string (as done here), with escaped backslashes, or with forward slashes.
    print_all_english_publishers(
        r"C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024"
    )

# Entry-point guard: main() runs when this code is executed directly (a notebook
# cell also runs with __name__ == "__main__"), but not when it is imported.
if __name__ == "__main__":
    main()


Scanning file: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0000.parquet
Scanning file: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0001.parquet
Scanning file: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0002.parquet
Scanning file: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0003.parquet
Scanning file: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0004.parquet
Scanning file: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0005.parquet
Scanning file: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0006.parquet
Scanning file: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0007.parquet
Scanning file: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0008.parquet
Scanning file: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0009.parquet
Scanning file: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0010.parquet
Scanning file: C:\Use

In [6]:
# Build a dictionary of every publisher with more than 1500 English articles, mapped to its article count
import glob
import os
import polars as pl

def count_english_articles_by_publisher(folder_path: str, min_articles: int = 1500):
    """
    Count English articles per publisher across all Parquet files in a folder.

    1. For each .parquet file in `folder_path`, read lazily with Polars.
    2. Filter rows where language == 'en'.
    3. Group by 'publisher' and count the English articles of each publisher.
    4. Accumulate the per-file counts into one dictionary.
    5. Keep publishers with more than `min_articles` articles, sort them by
       count (descending), print the result and return it.

    Args:
        folder_path: Directory that contains the ``.parquet`` files.
        min_articles: Exclusive lower bound on the total article count a
            publisher needs in order to be kept. Defaults to 1500.

    Returns:
        dict mapping publisher -> total English article count, ordered from
        the largest count to the smallest. An empty dict is returned when no
        Parquet files exist or no publisher exceeds the threshold, so callers
        can always use dict methods such as ``.items()`` on the result.
    """
    # Sort so files are processed (and logged) in a deterministic order.
    parquet_files = sorted(glob.glob(os.path.join(folder_path, "*.parquet")))
    if not parquet_files:
        print(f"No Parquet files found in: {folder_path}")
        return {}

    counts_dict = {}

    for pq_file in parquet_files:
        print(f"Processing: {pq_file}")

        # Only the small per-publisher aggregate of each file is materialised.
        grouped_df = (
            pl.scan_parquet(pq_file)
            .filter(pl.col("language") == "en")
            .group_by("publisher")
            .agg(pl.count("publisher").alias("count"))
            .collect()  # Execute the lazy query
        )

        # Merge this file's counts into the running totals.
        for row in grouped_df.to_dicts():
            pub = row["publisher"]
            counts_dict[pub] = counts_dict.get(pub, 0) + row["count"]

    # Drop publishers at or below the threshold, then order by count (descending).
    # Dicts preserve insertion order, so the returned dict stays sorted.
    sorted_dict = dict(
        sorted(
            ((pub, cnt) for pub, cnt in counts_dict.items() if cnt > min_articles),
            key=lambda item: item[1],
            reverse=True,
        )
    )

    if not sorted_dict:
        print(f"No publishers found with more than {min_articles} English articles.")
        return {}

    print(f"\nPublishers with more than {min_articles} English articles sorted by article count (descending):")
    print(sorted_dict)
    return sorted_dict


# Set this to your folder path containing the Parquet files.
# The path is machine-specific; the call below scans every *.parquet file in it.
folder_path = r"C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024"  # Adjust path as needed.
# `dictionary` (publisher -> English article count, for publishers that pass the
# article threshold) is reused by the publisher-filtering cell further down.
dictionary = count_english_articles_by_publisher(folder_path)

Processing: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0000.parquet
Processing: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0001.parquet
Processing: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0002.parquet
Processing: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0003.parquet
Processing: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0004.parquet
Processing: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0005.parquet
Processing: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0006.parquet
Processing: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0007.parquet
Processing: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0008.parquet
Processing: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0009.parquet
Processing: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0010.parquet
Processing: C:\Users\Admin\PycharmProjects\AI-Projects

We manually selected the news sources we considered most valuable to analyze, based on each outlet's country of origin, its reputation, and how actively it publishes.

In [7]:
# Hand-picked publisher domains to keep for the analysis; used below to filter
# both the per-publisher counts and the rows exported to CSV.
publishers = ["abcactionnews.com", "abcnews.go.com", "africanews.com", "aljazeera.com", "azernews.az", "bnnbloomberg.ca", "bnnbreaking.com", "bostonglobe.com", "ca.news.yahoo.com", "cambridge-news.co.uk", "cbc.ca", "cbsnews.com", "chicago.suntimes.com", "chicagotribune.com", "cnbc.com", "cnn.com", "cyprus-mail.com", "dailymail.co.uk", "dailypost.co.uk", "dailysabah.com", "dailystar.co.uk", "dw.com", "edinburghlive.co.uk", "eurasiareview.com", "euronews.com", "forbes.com", "foxnews.com", "gazettelive.co.uk", "heraldscotland.com", "hindustantimes.com", "huffingtonpost.co.uk", "huffpost.com", "hurriyetdailynews.com", "independent.co.uk", "india.com", "indiatimes.com", "inews.co.uk", "irishmirror.ie", "irishtimes.com", "israelnationalnews.com", "japannews.yomiuri.co.jp", "japantimes.co.jp", "jewishpress.com", "jpost.com", "kyivindependent.com", "kyivpost.com", "nbcnews.com", "newsday.com", "northernirelandworld.com", "northumberlandgazette.co.uk", "nottinghampost.com", "npr.org", "ntv.ca", "nysun.com", "nzherald.co.nz", "palestinechronicle.com", "politico.eu", "pressandjournal.co.uk", "pressherald.com", "romania-insider.com", "sfstandard.com", "sputnikglobe.com", "standard.net.au", "telegraph.co.uk", "the-independent.com", "thedailystar.net", "thefederal.com", "theglobeandmail.com", "thehindu.com", "thesouthafrican.com", "thestar.co.uk", "thestatesman.com", "thesun.my", "theweek.com", "time.com", "timeslive.co.za", "timesofindia.indiatimes.com", "toronto.citynews.ca", "tribuneindia.com", "turan.az", "yahoo.com", "themoscowtimes.com", "globalnews.ca","syrianobserver.com","tass.com","rt.com"]

# Print the article count of every hand-picked publisher, largest count first.
for key, value in sorted(dictionary.items(), key=lambda pair: pair[1], reverse=True):
    if key not in publishers:
        continue  # not one of the selected outlets
    print(key, value)

bnnbreaking.com 221938
dailymail.co.uk 193599
yahoo.com 104867
timesofindia.indiatimes.com 100280
hindustantimes.com 77636
thehindu.com 60483
cbsnews.com 44200
forbes.com 41495
independent.co.uk 36282
ca.news.yahoo.com 28310
dailystar.co.uk 27886
nzherald.co.nz 25685
telegraph.co.uk 25499
foxnews.com 22766
bnnbloomberg.ca 21496
thedailystar.net 20037
globalnews.ca 16648
jpost.com 13870
theglobeandmail.com 13759
irishtimes.com 13745
irishmirror.ie 12951
pressandjournal.co.uk 12021
tribuneindia.com 10703
inews.co.uk 10394
indiatimes.com 10069
thestatesman.com 9764
nottinghampost.com 9171
abcnews.go.com 8643
india.com 8486
israelnationalnews.com 7763
toronto.citynews.ca 7645
bostonglobe.com 7353
tass.com 7218
pressherald.com 7201
timeslive.co.za 7170
cyprus-mail.com 6799
thestar.co.uk 6782
euronews.com 6569
rt.com 6560
aljazeera.com 6384
sputnikglobe.com 6115
kyivindependent.com 6098
azernews.az 5730
chicago.suntimes.com 5704
nbcnews.com 5664
dailysabah.com 5655
heraldscotland.com 5561
hu

In [10]:
import io
import glob
import os
import polars as pl

def _file_ends_with_newline(path: str) -> bool:
    """Return True if the file at `path` is empty or its last byte is a newline."""
    with open(path, "rb") as fh:
        fh.seek(0, os.SEEK_END)
        if fh.tell() == 0:
            return True
        fh.seek(-1, os.SEEK_END)
        return fh.read(1) == b"\n"


def selected_pubs_to_csv(folder_path: str, publishers: list, csv_filepath: str = "2024.csv"):
    """
    Export the English articles of the selected publishers into one CSV file.

    Every ``*.parquet`` file in `folder_path` is scanned lazily with Polars and
    filtered to rows where ``language == 'en'`` and ``publisher`` is one of
    `publishers`. The first batch creates `csv_filepath` (with a header row);
    every later batch is appended to it without repeating the header.

    Note that an already existing, non-empty `csv_filepath` is appended to,
    not overwritten, so running the export twice duplicates the rows.

    Args:
        folder_path: Directory that contains the ``.parquet`` files.
        publishers: Publisher values to keep.
        csv_filepath: Destination CSV path. Defaults to "2024.csv".

    Returns:
        None. Progress is reported with print().
    """
    # Sort so rows land in the CSV in a deterministic file order.
    parquet_files = sorted(glob.glob(os.path.join(folder_path, "*.parquet")))
    if not parquet_files:
        print(f"No Parquet files found in: {folder_path}")
        return None  # Return None if no files found

    for pq_file in parquet_files:
        print(f"Processing: {pq_file}")

        # Filter lazily, then materialise only the matching rows of this file.
        df = (
            pl.scan_parquet(pq_file)
            .filter((pl.col("language") == "en") & pl.col("publisher").is_in(publishers))
            .collect()
        )

        # A missing (or zero-byte) target still needs the header row, so let
        # Polars write the complete CSV including the header.
        if not os.path.exists(csv_filepath) or os.path.getsize(csv_filepath) == 0:
            df.write_csv(csv_filepath)
            print(f"File {pq_file} created at: {csv_filepath}")
            continue

        # Render the batch to text and cut off the header line; everything
        # after the first newline is data. (Assumes no column name contains an
        # embedded newline.)
        buffer = io.StringIO()
        df.write_csv(buffer)
        _header, _sep, data_rows = buffer.getvalue().partition("\n")

        if data_rows:  # an empty batch contributes nothing, not even a blank line
            if not data_rows.endswith("\n"):
                data_rows += "\n"
            # Only insert a separating newline when the existing file lacks one;
            # writing it unconditionally would leave a blank row in the CSV.
            needs_separator = not _file_ends_with_newline(csv_filepath)
            # newline="" stops Windows from rewriting "\n" as "\r\n", which would
            # also alter line breaks inside quoted text fields.
            with open(csv_filepath, "a", encoding="utf-8", newline="") as f:
                if needs_separator:
                    f.write("\n")
                f.write(data_rows)

        print(f"Appended {pq_file} to existing file at: {csv_filepath}")
            
# Export the selected publishers' English articles to 2024.csv.
# NOTE: if 2024.csv already exists the rows are appended to it, so delete the
# file before re-running this cell to avoid duplicated rows.
selected_pubs_to_csv(r'C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024', publishers, "2024.csv")

Processing: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0000.parquet
File C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0000.parquet created at: 2024.csv
Appended C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0000.parquet to existing file at: 2024.csv
Processing: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0001.parquet
Appended C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0001.parquet to existing file at: 2024.csv
Processing: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0002.parquet
Appended C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0002.parquet to existing file at: 2024.csv
Processing: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0003.parquet
Appended C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0003.parquet to existing file at: 2024.csv
Processing: C:\Users\Admin\PycharmProjects\AI-Projects\CC_news\2024\2024_0004.parquet
Appended C:\Users\

In [11]:
# 1. Lazily read the CSV file
lazy_df = pl.scan_csv("2024.csv")

# 2. Group by the "publisher" column and count the rows of each group.
#    pl.len() replaces the bare pl.count(), which is deprecated; the alias keeps
#    the output column named "count" so the sort (and later cells) still work.
counts_lazy = (
    lazy_df
    .group_by("publisher")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# 3. Execute the lazy plan (collect) to get the result
counts_df = counts_lazy.collect()

# 4. Raise the row display limit so the printed table is not truncated
pl.Config.set_tbl_rows(100_000)
print(counts_df)


  .agg(pl.count())  # or .agg(pl.col("*").count()) to count rows


shape: (87, 2)
┌─────────────────────────────┬────────┐
│ publisher                   ┆ count  │
│ ---                         ┆ ---    │
│ str                         ┆ u32    │
╞═════════════════════════════╪════════╡
│ bnnbreaking.com             ┆ 221938 │
│ dailymail.co.uk             ┆ 193599 │
│ yahoo.com                   ┆ 104867 │
│ timesofindia.indiatimes.com ┆ 100280 │
│ hindustantimes.com          ┆ 77636  │
│ thehindu.com                ┆ 60483  │
│ cbsnews.com                 ┆ 44200  │
│ forbes.com                  ┆ 41495  │
│ independent.co.uk           ┆ 36282  │
│ ca.news.yahoo.com           ┆ 28310  │
│ dailystar.co.uk             ┆ 27886  │
│ nzherald.co.nz              ┆ 25685  │
│ telegraph.co.uk             ┆ 25499  │
│ foxnews.com                 ┆ 22766  │
│ bnnbloomberg.ca             ┆ 21496  │
│ thedailystar.net            ┆ 20037  │
│ globalnews.ca               ┆ 16648  │
│ jpost.com                   ┆ 13870  │
│ theglobeandmail.com         ┆ 13759  │
│

In [12]:
# Pull the publisher column out of the counts table as a plain Python list.
lst = counts_df.get_column("publisher").to_list()
print(lst)

['bnnbreaking.com', 'dailymail.co.uk', 'yahoo.com', 'timesofindia.indiatimes.com', 'hindustantimes.com', 'thehindu.com', 'cbsnews.com', 'forbes.com', 'independent.co.uk', 'ca.news.yahoo.com', 'dailystar.co.uk', 'nzherald.co.nz', 'telegraph.co.uk', 'foxnews.com', 'bnnbloomberg.ca', 'thedailystar.net', 'globalnews.ca', 'jpost.com', 'theglobeandmail.com', 'irishtimes.com', 'irishmirror.ie', 'pressandjournal.co.uk', 'tribuneindia.com', 'inews.co.uk', 'indiatimes.com', 'thestatesman.com', 'nottinghampost.com', 'abcnews.go.com', 'india.com', 'israelnationalnews.com', 'toronto.citynews.ca', 'bostonglobe.com', 'tass.com', 'pressherald.com', 'timeslive.co.za', 'cyprus-mail.com', 'thestar.co.uk', 'euronews.com', 'rt.com', 'aljazeera.com', 'sputnikglobe.com', 'kyivindependent.com', 'azernews.az', 'chicago.suntimes.com', 'nbcnews.com', 'dailysabah.com', 'heraldscotland.com', 'huffpost.com', 'japannews.yomiuri.co.jp', 'thefederal.com', 'edinburghlive.co.uk', 'cnn.com', 'northernirelandworld.com', '

In [13]:
# Maps each publisher domain to a free-text description of where the outlet is
# based; values may carry extra detail in parentheses (city, region, ownership).
newspaper_locations = {
    "bnnbreaking.com": "AI generated news",
    "dailymail.co.uk": "United Kingdom",
    "yahoo.com": "United States (global services)",
    "timesofindia.indiatimes.com": "India",
    "hindustantimes.com": "India",
    "thehindu.com": "India",
    "cbsnews.com": "United States",
    "forbes.com": "United States",
    "independent.co.uk": "United Kingdom",
    "ca.news.yahoo.com": "Canada",
    "dailystar.co.uk": "United Kingdom",
    "nzherald.co.nz": "New Zealand",
    "telegraph.co.uk": "United Kingdom",
    "foxnews.com": "United States",
    "bnnbloomberg.ca": "Canada",
    "thedailystar.net": "Bangladesh",
    "globalnews.ca": "Canada",
    "jpost.com": "Israel",
    "theglobeandmail.com": "Canada",
    "irishtimes.com": "Ireland",
    "irishmirror.ie": "Ireland",
    "pressandjournal.co.uk": "United Kingdom (Aberdeen, Scotland)",
    "tribuneindia.com": "India",
    "inews.co.uk": "United Kingdom",
    "indiatimes.com": "India",
    "thestatesman.com": "India",
    "nottinghampost.com": "United Kingdom (Nottingham, England)",
    "abcnews.go.com": "United States",
    "india.com": "India",
    "israelnationalnews.com": "Israel",
    "toronto.citynews.ca": "Canada",
    "bostonglobe.com": "United States (Boston, Massachusetts)",
    "tass.com": "Russia (state news agency)",
    "pressherald.com": "United States (Portland, Maine)",
    "timeslive.co.za": "South Africa",
    "cyprus-mail.com": "Cyprus",
    "thestar.co.uk": "United Kingdom (Sheffield)",
    "euronews.com": "France (Pan-European)",
    "rt.com": "Russia (state-funded)",
    "aljazeera.com": "Qatar (international focus)",
    "sputnikglobe.com": "Russia (state-affiliated)",
    "kyivindependent.com": "Ukraine",
    "azernews.az": "Azerbaijan",
    "chicago.suntimes.com": "United States (Chicago, Illinois)",
    "nbcnews.com": "United States",
    "dailysabah.com": "Turkey",
    "heraldscotland.com": "United Kingdom (Scotland)",
    "huffpost.com": "United States (global editions)",
    "japannews.yomiuri.co.jp": "Japan",
    "thefederal.com": "India",
    "edinburghlive.co.uk": "United Kingdom (Edinburgh, Scotland)",
    "cnn.com": "United States",
    "northernirelandworld.com": "Northern Ireland (UK)",
    "japantimes.co.jp": "Japan",
    "gazettelive.co.uk": "United Kingdom (Teesside, England)",
    "cbc.ca": "Canada",
    "thesun.my": "Malaysia",
    "newsday.com": "United States (Long Island, New York)",
    "eurasiareview.com": "Global (Online publication)",
    "hurriyetdailynews.com": "Turkey",
    "thesouthafrican.com": "South Africa",
    "jewishpress.com": "United States (New York)",
    "abcactionnews.com": "United States (Tampa, Florida)",
    "nysun.com": "United States (New York)",
    "dw.com": "Germany",
    "huffingtonpost.co.uk": "United Kingdom",
    "time.com": "United States",
    "standard.net.au": "Australia",
    "politico.eu": "Belgium (European edition of Politico)",
    "dailypost.co.uk": "United Kingdom (Wales)",
    "northumberlandgazette.co.uk": "United Kingdom (Northumberland, England)",
    "cambridge-news.co.uk": "United Kingdom (Cambridge)",
    "kyivpost.com": "Ukraine",
    "npr.org": "United States",
    "theweek.com": "United States",
    "cnbc.com": "United States",
    "ntv.ca": "Canada (Newfoundland)",
    "the-independent.com": "United States (Grand Island, Nebraska)",
    "africanews.com": "Pan-African (headquartered in Congo-Brazzaville)",
    "turan.az": "Azerbaijan",
    "chicagotribune.com": "United States (Chicago, Illinois)",
    "palestinechronicle.com": "Focus on Palestine (editorial offices often in North America)",
    "romania-insider.com": "Romania",
    "sfstandard.com": "United States (San Francisco, California)",
    "themoscowtimes.com": "Russia/Europe (historically Russia-based)",
    "syrianobserver.com": "Focus on Syria (often run by international or exiled groups)"
    # "None" is omitted as it doesn't map to a real outlet
}
# Maps each publisher domain to a single country label (no parenthetical
# detail). UK outlets are labelled by constituent country (England, Scotland,
# Wales, Northern Ireland); a few entries use non-country labels such as
# "Global", "International" or "AI generated news".
newspaper_countries = {
    "bnnbreaking.com": "AI generated news",   # Keeping as-is from original data
    "dailymail.co.uk": "England",
    "yahoo.com": "United States",
    "timesofindia.indiatimes.com": "India",
    "hindustantimes.com": "India",
    "thehindu.com": "India",
    "cbsnews.com": "United States",
    "forbes.com": "United States",
    "independent.co.uk": "England",
    "ca.news.yahoo.com": "Canada",
    "dailystar.co.uk": "England",
    "nzherald.co.nz": "New Zealand",
    "telegraph.co.uk": "England",
    "foxnews.com": "United States",
    "bnnbloomberg.ca": "Canada",
    "thedailystar.net": "Bangladesh",
    "globalnews.ca": "Canada",
    "jpost.com": "Israel",
    "theglobeandmail.com": "Canada",
    "irishtimes.com": "Ireland",
    "irishmirror.ie": "Ireland",
    "pressandjournal.co.uk": "Scotland",
    "tribuneindia.com": "India",
    "inews.co.uk": "England",
    "indiatimes.com": "India",
    "thestatesman.com": "India",
    "nottinghampost.com": "England",
    "abcnews.go.com": "United States",
    "india.com": "India",
    "israelnationalnews.com": "Israel",
    "toronto.citynews.ca": "Canada",
    "bostonglobe.com": "United States",
    "tass.com": "Russia",
    "pressherald.com": "United States",
    "timeslive.co.za": "South Africa",
    "cyprus-mail.com": "Cyprus",
    "thestar.co.uk": "England",
    "euronews.com": "France",
    "rt.com": "Russia",
    "aljazeera.com": "Qatar",
    "sputnikglobe.com": "Russia",
    "kyivindependent.com": "Ukraine",
    "azernews.az": "Azerbaijan",
    "chicago.suntimes.com": "United States",
    "nbcnews.com": "United States",
    "dailysabah.com": "Turkey",
    "heraldscotland.com": "Scotland",
    "huffpost.com": "United States",
    "japannews.yomiuri.co.jp": "Japan",
    "thefederal.com": "India",
    "edinburghlive.co.uk": "Scotland",
    "cnn.com": "United States",
    "northernirelandworld.com": "Northern Ireland",
    "japantimes.co.jp": "Japan",
    "gazettelive.co.uk": "England",
    "cbc.ca": "Canada",
    "thesun.my": "Malaysia",
    "newsday.com": "United States",
    "eurasiareview.com": "Global",
    "hurriyetdailynews.com": "Turkey",
    "thesouthafrican.com": "South Africa",
    "jewishpress.com": "United States",
    "abcactionnews.com": "United States",
    "nysun.com": "United States",
    "dw.com": "Germany",
    "huffingtonpost.co.uk": "England",
    "time.com": "United States",
    "standard.net.au": "Australia",
    "politico.eu": "Belgium",
    "dailypost.co.uk": "Wales",
    "northumberlandgazette.co.uk": "England",
    "cambridge-news.co.uk": "England",
    "kyivpost.com": "Ukraine",
    "npr.org": "United States",
    "theweek.com": "United States",
    "cnbc.com": "United States",
    "ntv.ca": "Canada",
    "the-independent.com": "United States",
    "africanews.com": "Republic of the Congo",  # Headquartered in Congo-Brazzaville
    "turan.az": "Azerbaijan",
    "chicagotribune.com": "United States",
    "palestinechronicle.com": "United States", # Based in the US, but its coverage focuses on Palestine
    "romania-insider.com": "Romania",
    "sfstandard.com": "United States",
    "themoscowtimes.com": "Russia", # They now operate out of Amsterdam
    "syrianobserver.com": "International" # Often run by international or exiled groups from Syria
}
newspaper_details= {
    "bnnbreaking.com": {
        "location": "Likely United States (or global online aggregator)",
        "political_side": "Varies (aggregated content)",
        "summary": "Online news aggregator focusing on breaking news headlines.",
        "popularity": "Less mainstream brand; predominantly online audience.",
        "known_facts": "Not to be confused with BNN Bloomberg; aims at quick ‘breaking’ updates."
    },
    "dailymail.co.uk": {
        "location": "United Kingdom",
        "political_side": "Right-leaning (conservative tabloid)",
        "summary": "A British tabloid known for sensational headlines and wide online reach (MailOnline).",
        "popularity": "Very popular online, one of the most visited English news sites globally.",
        "known_facts": "Founded in 1896; criticized for sensationalism, strong editorial stances."
    },
    "yahoo.com": {
        "location": "United States (global services)",
        "political_side": "Varies widely (aggregator for multiple sources)",
        "summary": "Major web portal offering email, search, and aggregated news content.",
        "popularity": "Historically one of the most visited websites globally.",
        "known_facts": "Founded in 1994; was once a dominant internet brand, still large in news aggregation."
    },
    "timesofindia.indiatimes.com": {
        "location": "India",
        "political_side": "Center to center-right (Times Group editorial line can vary)",
        "summary": "Largest English-language daily in India by circulation.",
        "popularity": "Extremely high readership, major presence online (Times Internet).",
        "known_facts": "Founded in 1838; part of The Times Group (Bennett, Coleman & Co.)."
    },
    "hindustantimes.com": {
        "location": "India",
        "political_side": "Generally center to slightly center-left",
        "summary": "Major English-language daily newspaper in India.",
        "popularity": "One of India’s top newspapers with large readership.",
        "known_facts": "Founded in 1924; HQ in Delhi, also significant presence in Mumbai."
    },
    "thehindu.com": {
        "location": "India",
        "political_side": "Center-left editorial stance",
        "summary": "One of India’s most respected English-language broadsheets.",
        "popularity": "High circulation nationally, especially in southern India.",
        "known_facts": "Founded in 1878; known for in-depth coverage, strong editorial standards."
    },
    "cbsnews.com": {
        "location": "United States",
        "political_side": "Center (traditional US network news approach)",
        "summary": "Major American TV network news division and digital platform.",
        "popularity": "One of the “Big Three” US broadcast networks.",
        "known_facts": "Produces ‘60 Minutes’ and other long-running news programs."
    },
    "forbes.com": {
        "location": "United States",
        "political_side": "Pro-business, typically center-right on economic matters",
        "summary": "Global media company focusing on business, investing, tech, and entrepreneurship.",
        "popularity": "Well-known for rich lists (Forbes 400) and extensive business journalism.",
        "known_facts": "Founded in 1917; famous for its “billionaires list” and entrepreneurial coverage."
    },
    "independent.co.uk": {
        "location": "United Kingdom",
        "political_side": "Center to center-left editorial stance",
        "summary": "British online newspaper (formerly a print paper) known as ‘The Indy.’",
        "popularity": "Well-known in the UK, with a growing international digital readership.",
        "known_facts": "Founded in 1986; went digital-only in 2016."
    },
    "ca.news.yahoo.com": {
        "location": "Canada",
        "political_side": "Varies (aggregator and wire service content)",
        "summary": "Yahoo’s Canada news portal, aggregating news from various sources.",
        "popularity": "Widely used as a news aggregator platform in Canada.",
        "known_facts": "Part of Yahoo’s global network; content from major Canadian outlets."
    },
    "dailystar.co.uk": {
        "location": "United Kingdom",
        "political_side": "Right-leaning tabloid style",
        "summary": "Tabloid newspaper known for celebrity gossip and sensational stories.",
        "popularity": "Popular for entertainment/tabloid coverage in the UK.",
        "known_facts": "Launched in 1978; sister paper is Daily Star Sunday."
    },
    "nzherald.co.nz": {
        "location": "New Zealand",
        "political_side": "Center-right editorial (largest newspaper in NZ)",
        "summary": "New Zealand’s biggest daily newspaper based in Auckland.",
        "popularity": "Dominant daily in NZ with a strong digital presence.",
        "known_facts": "Founded in 1863; owned by NZME."
    },
    "telegraph.co.uk": {
        "location": "United Kingdom",
        "political_side": "Right-leaning (conservative editorial line)",
        "summary": "The Daily Telegraph, a major national broadsheet in the UK.",
        "popularity": "High circulation, wide digital readership internationally.",
        "known_facts": "Founded in 1855; often referred to simply as ‘The Telegraph’."
    },
    "foxnews.com": {
        "location": "United States",
        "political_side": "Right-leaning (conservative)",
        "summary": "24-hour news channel and website with strong conservative commentary.",
        "popularity": "Very high in the U.S., consistently top cable news ratings.",
        "known_facts": "Launched in 1996 by Rupert Murdoch; known for influential prime-time hosts."
    },
    "bnnbloomberg.ca": {
        "location": "Canada",
        "political_side": "Business-focused, generally center",
        "summary": "Business news channel/website in partnership with Bloomberg.",
        "popularity": "Popular among Canadian business audiences.",
        "known_facts": "Formerly ‘Business News Network (BNN)’; rebranded after partnership with Bloomberg."
    },
    "thedailystar.net": {
        "location": "Bangladesh",
        "political_side": "Center-liberal editorial viewpoint",
        "summary": "Largest English-language daily in Bangladesh.",
        "popularity": "Highly influential among English-speaking population domestically and abroad.",
        "known_facts": "Founded in 1991; known for strong editorials and investigative reporting."
    },
    "globalnews.ca": {
        "location": "Canada",
        "political_side": "Generally center (mainstream Canadian broadcaster)",
        "summary": "National English-language TV news and online service from Corus Entertainment.",
        "popularity": "One of the major TV news providers in Canada, along with CTV and CBC.",
        "known_facts": "Operates local stations in various Canadian provinces."
    },
    "jpost.com": {
        "location": "Israel",
        "political_side": "Center-right or centrist from an Israeli perspective",
        "summary": "Jerusalem Post, a major English-language newspaper in Israel.",
        "popularity": "Highly read by English speakers interested in Israeli news.",
        "known_facts": "Founded in 1932 as The Palestine Post; global audience includes Jewish diaspora."
    },
    "theglobeandmail.com": {
        "location": "Canada",
        "political_side": "Center-right editorial stance (historically)",
        "summary": "National daily known for business and political reporting.",
        "popularity": "One of Canada’s most influential newspapers.",
        "known_facts": "Founded in 1844 (as The Globe); has won numerous awards."
    },
    "irishtimes.com": {
        "location": "Ireland",
        "political_side": "Center to liberal editorial line",
        "summary": "Leading Irish broadsheet newspaper covering national and international news.",
        "popularity": "Highly influential in Ireland’s political and cultural discourse.",
        "known_facts": "Founded in 1859; considered a ‘paper of record’ in Ireland."
    },
    "irishmirror.ie": {
        "location": "Ireland",
        "political_side": "Popular tabloid style (affiliated with the UK Mirror group, generally center-left)",
        "summary": "Irish version of the Daily Mirror, covering news, sports, and showbiz.",
        "popularity": "Well-read tabloid in Ireland.",
        "known_facts": "Focus on celebrity, sports, and local Irish news."
    },
    "pressandjournal.co.uk": {
        "location": "United Kingdom (Aberdeen, Scotland)",
        "political_side": "Local/regional coverage (historically leans conservative-liberal mix)",
        "summary": "One of Scotland’s oldest daily newspapers, covering north/northeast Scotland.",
        "popularity": "Regionally dominant in Aberdeenshire.",
        "known_facts": "Founded in 1747; often referred to as ‘the P&J’."
    },
    "tribuneindia.com": {
        "location": "India",
        "political_side": "Center to slightly center-left editorial stance",
        "summary": "The Tribune, English-language daily from North India.",
        "popularity": "Widely read in Punjab, Haryana, Himachal region.",
        "known_facts": "Established in 1881; known for regional coverage and editorial independence."
    },
    "inews.co.uk": {
        "location": "United Kingdom",
        "political_side": "Center to slightly center-left (balanced coverage)",
        "summary": "Website of the ‘i’ newspaper, known for concise coverage.",
        "popularity": "Moderate but growing digital presence in the UK.",
        "known_facts": "Originally a sister publication of The Independent."
    },
    "indiatimes.com": {
        "location": "India",
        "political_side": "Varies (part of The Times Group, often center)",
        "summary": "Portal under The Times Group; covers lifestyle, social media trends, and news.",
        "popularity": "High digital readership; wide content coverage.",
        "known_facts": "Operated by Bennett, Coleman & Co., owners of The Times of India."
    },
    "thestatesman.com": {
        "location": "India",
        "political_side": "Center to center-left (traditional Indian broadsheet)",
        "summary": "One of India’s oldest English newspapers, with roots in West Bengal.",
        "popularity": "Historically influential, moderate circulation now.",
        "known_facts": "Founded in 1875; known for editorial independence and legacy."
    },
    "nottinghampost.com": {
        "location": "United Kingdom (Nottingham, England)",
        "political_side": "Local/regional coverage",
        "summary": "Newspaper/website covering Nottingham and surrounding areas.",
        "popularity": "Widely read locally for news, sports, culture.",
        "known_facts": "Also known historically as the Evening Post."
    },
    "abcnews.go.com": {
        "location": "United States",
        "political_side": "Center to slightly center-left",
        "summary": "ABC News is a major US television news network and digital outlet.",
        "popularity": "Widely consumed nationally, part of the “Big Three” US networks.",
        "known_facts": "Owned by Disney; produces ‘Good Morning America’, ‘World News Tonight’."
    },
    "india.com": {
        "location": "India",
        "political_side": "Generally mainstream/center (mix of lifestyle & news)",
        "summary": "Digital news portal covering entertainment, sports, general news.",
        "popularity": "Popular for general interest and entertainment stories in India.",
        "known_facts": "Joint venture between Zee Entertainment and Penske Media Corporation."
    },
    "israelnationalnews.com": {
        "location": "Israel",
        "political_side": "Right-leaning religious nationalist viewpoint",
        "summary": "English-language site of Arutz Sheva, focusing on Israeli/Jewish affairs.",
        "popularity": "Niche among religious and right-wing audiences in Israel and abroad.",
        "known_facts": "Associated with Religious Zionist movement; also offers radio broadcasts."
    },
    "toronto.citynews.ca": {
        "location": "Canada",
        "political_side": "Local coverage, typically center",
        "summary": "CityNews Toronto covering local GTA news, weather, community issues.",
        "popularity": "Popular among Toronto residents; recognized TV brand in Canada.",
        "known_facts": "Owned by Rogers Media; integrated broadcast and digital platform."
    },
    "bostonglobe.com": {
        "location": "United States (Boston, Massachusetts)",
        "political_side": "Generally center-left editorial stance",
        "summary": "Major regional newspaper in New England with national influence.",
        "popularity": "Highly influential in the Boston area; known nationally for investigative journalism.",
        "known_facts": "Founded in 1872; multiple Pulitzer Prizes."
    },
    "tass.com": {
        "location": "Russia (state news agency)",
        "political_side": "Pro-Russian government stance",
        "summary": "Major Russian news agency with broad coverage and official angles.",
        "popularity": "Russia’s oldest news agency; official statements often come from TASS.",
        "known_facts": "Founded in 1904; previously known as Telegraph Agency of the Soviet Union."
    },
    "pressherald.com": {
        "location": "United States (Portland, Maine)",
        "political_side": "Traditionally center-left editorial stance",
        "summary": "Portland Press Herald, a major daily in Maine.",
        "popularity": "Most widely read newspaper in Maine.",
        "known_facts": "Traces its roots to 1862; known for local reporting."
    },
    "timeslive.co.za": {
        "location": "South Africa",
        "political_side": "Varies, mainstream coverage under Arena Holdings",
        "summary": "Online portal for The Sunday Times and related publications in South Africa.",
        "popularity": "Among the most visited news sites in South Africa.",
        "known_facts": "Focuses on local news, politics, sports, and opinion."
    },
    "cyprus-mail.com": {
        "location": "Cyprus",
        "political_side": "Generally center (English-language daily)",
        "summary": "One of Cyprus’s longest-running English-language newspapers.",
        "popularity": "Primarily read by English-speaking locals and expats in Cyprus.",
        "known_facts": "Founded in 1945; covers local and international news."
    },
    "thestar.co.uk": {
        "location": "United Kingdom (Sheffield)",
        "political_side": "Local/regional coverage",
        "summary": "Regional newspaper covering Sheffield and South Yorkshire.",
        "popularity": "Highly read locally.",
        "known_facts": "Founded in 1887; sister paper to the Sheffield Telegraph."
    },
    "euronews.com": {
        "location": "France (Pan-European)",
        "political_side": "Generally tries to present multiple European perspectives (center)",
        "summary": "European multilingual news network covering European and global news.",
        "popularity": "Widely distributed across Europe in multiple languages.",
        "known_facts": "Launched in 1993; known for multicultural coverage."
    },
    "rt.com": {
        "location": "Russia (state-funded)",
        "political_side": "Pro-Russian government viewpoint",
        "summary": "TV network and online outlet providing news from a Russian perspective.",
        "popularity": "Global brand, especially known for English broadcasts; controversial in Western media.",
        "known_facts": "Formerly ‘Russia Today’; widely considered propaganda by many Western observers."
    },
    "aljazeera.com": {
        "location": "Qatar (international focus)",
        "political_side": "Varies by region; often seen as progressive on Middle East coverage",
        "summary": "Major international news network funded by the Qatari government.",
        "popularity": "Global reach, particularly known for in-depth Middle Eastern coverage.",
        "known_facts": "Launched in 1996; recognized for documentaries and investigative journalism."
    },
    "sputnikglobe.com": {
        "location": "Russia (state-affiliated)",
        "political_side": "Pro-Russian government perspective",
        "summary": "International news agency established by Rossiya Segodnya (Russian gov’t-owned).",
        "popularity": "Known internationally for Russian state viewpoint; widely labeled propaganda in the West.",
        "known_facts": "Launched in 2014 to replace RIA Novosti’s international arm."
    },
    "kyivindependent.com": {
        "location": "Ukraine",
        "political_side": "Pro-democracy, liberal viewpoint",
        "summary": "English-language independent outlet launched by former Kyiv Post journalists.",
        "popularity": "Gained prominence internationally during the Russia-Ukraine conflict.",
        "known_facts": "Founded in 2021; crowdfunded, focusing on Ukrainian politics and war coverage."
    },
    "azernews.az": {
        "location": "Azerbaijan",
        "political_side": "Typically pro-government or neutral from Western perspective",
        "summary": "English-language newspaper focusing on Azerbaijani and regional news.",
        "popularity": "Niche internationally, better known within Azerbaijan.",
        "known_facts": "One of the main English news sources from Azerbaijan."
    },
    "chicago.suntimes.com": {
        "location": "United States (Chicago, Illinois)",
        "political_side": "Generally considered center-left editorially",
        "summary": "Tabloid-style daily newspaper in Chicago.",
        "popularity": "Well-known regionally; second-largest paper in Chicago after the Tribune.",
        "known_facts": "Founded in 1948; known for tabloid format and local focus."
    },
    "nbcnews.com": {
        "location": "United States",
        "political_side": "Center to center-left mainstream broadcast stance",
        "summary": "News division of NBC, a major American broadcast network.",
        "popularity": "One of the largest TV news audiences in the U.S.",
        "known_facts": "Produces ‘Today,’ ‘Nightly News,’ ‘Meet the Press,’ among others."
    },
    "dailysabah.com": {
        "location": "Turkey",
        "political_side": "Pro-government / conservative leaning",
        "summary": "Turkish daily newspaper published in English, close to the ruling AK Party perspective.",
        "popularity": "Known internationally among those who follow Turkish politics.",
        "known_facts": "Launched in 2014; part of the Turkuvaz Media Group."
    },
    "heraldscotland.com": {
        "location": "United Kingdom (Scotland)",
        "political_side": "Historically liberal/centrist",
        "summary": "The Herald is a Scottish broadsheet covering national and international news.",
        "popularity": "One of Scotland’s major newspapers.",
        "known_facts": "Founded in 1783; among the oldest newspapers in the world."
    },
    "huffpost.com": {
        "location": "United States (global editions)",
        "political_side": "Left-leaning overall",
        "summary": "American online news aggregator and blog founded by Arianna Huffington.",
        "popularity": "High in the U.S. digital media space.",
        "known_facts": "Won a Pulitzer Prize in 2012; one of the first digital-only outlets to gain mainstream acceptance."
    },
    "japannews.yomiuri.co.jp": {
        "location": "Japan",
        "political_side": "Generally considered conservative-leaning (Yomiuri Shimbun group)",
        "summary": "English-language version of Yomiuri Shimbun, a major Japanese newspaper.",
        "popularity": "Yomiuri is the highest-circulation newspaper in Japan (in Japanese).",
        "known_facts": "One of Japan’s oldest and most influential newspapers."
    },
    "thefederal.com": {
        "location": "India",
        "political_side": "Generally center to center-left",
        "summary": "Digital news platform focusing on Indian politics, economy, and society.",
        "popularity": "Smaller, newer digital outlet but growing audience.",
        "known_facts": "Offers explanatory journalism and analysis on Indian current affairs."
    },
    "edinburghlive.co.uk": {
        "location": "United Kingdom (Edinburgh, Scotland)",
        "political_side": "Local/regional coverage",
        "summary": "Digital news outlet covering Edinburgh and surrounding areas.",
        "popularity": "Locally popular for community news, events, local politics.",
        "known_facts": "Likely part of the Reach PLC group or similar local media."
    },
    "cnn.com": {
        "location": "United States",
        "political_side": "Center-left to center (often described as leaning left in US context)",
        "summary": "24-hour cable news channel and website, pioneering round-the-clock news coverage.",
        "popularity": "Internationally recognized with large global viewership.",
        "known_facts": "Founded by Ted Turner in 1980; known for major breaking news coverage."
    },
    "northernirelandworld.com": {
        "location": "Northern Ireland (UK)",
        "political_side": "Local/regional coverage (varied political environment)",
        "summary": "Covers regional news, politics, and community updates in Northern Ireland.",
        "popularity": "Locally recognized for community-level journalism.",
        "known_facts": "Likely part of JPIMedia or a similar local news group."
    },
    "japantimes.co.jp": {
        "location": "Japan",
        "political_side": "Historically liberal-leaning (English-language perspective)",
        "summary": "Oldest English-language newspaper in Japan, covering local and global news.",
        "popularity": "Widely read by English-speaking expatriates and international readers in Japan.",
        "known_facts": "Founded in 1897; focuses on Japanese politics, culture for English readers."
    },
    "gazettelive.co.uk": {
        "location": "United Kingdom (Teesside, England)",
        "political_side": "Local/regional coverage",
        "summary": "Online news for Teesside region, affiliated with The Gazette newspaper.",
        "popularity": "Locally significant in Northeast England.",
        "known_facts": "Focuses on local crime, community, sports, and events."
    },
    "cbc.ca": {
        "location": "Canada",
        "political_side": "Generally perceived as center-left (public broadcaster)",
        "summary": "Canada’s national public broadcaster, offering TV, radio, and online news.",
        "popularity": "Very high in Canada, widely trusted public media outlet.",
        "known_facts": "Federally funded; oldest existing broadcasting network in Canada."
    },
    "thesun.my": {
        "location": "Malaysia",
        "political_side": "Generally pro-government or center (Malaysia’s media environment varies)",
        "summary": "Free daily newspaper in Malaysia, widely distributed in urban areas.",
        "popularity": "Significant readership as a free paper.",
        "known_facts": "Operated by Sun Media; focuses on local/national news."
    },
    "newsday.com": {
        "location": "United States (Long Island, New York)",
        "political_side": "Moderate/centrist editorial viewpoint historically",
        "summary": "Daily newspaper primarily covering Long Island and the NYC region.",
        "popularity": "Widely read in Long Island; historically among top US papers by circulation.",
        "known_facts": "Founded in 1940; multiple Pulitzer Prizes for local reporting."
    },
    "eurasiareview.com": {
        "location": "Global (Online publication)",
        "political_side": "Varies (broad coverage on Eurasia/global issues)",
        "summary": "Independent journal focusing on geopolitical, social, and economic analysis.",
        "popularity": "Niche among policy analysts and those interested in Eurasian affairs.",
        "known_facts": "Publishes commentary and analysis from various authors/think-tanks."
    },
    "hurriyetdailynews.com": {
        "location": "Turkey",
        "political_side": "Historically more secular, can vary with ownership changes",
        "summary": "English-language Turkish newspaper covering national and regional news.",
        "popularity": "One of Turkey’s oldest English dailies, read by expats/diplomats.",
        "known_facts": "Part of Hürriyet group; editorial stance shifts with Turkey’s media climate."
    },
    "thesouthafrican.com": {
        "location": "South Africa",
        "political_side": "Generally neutral but focuses on SA diaspora news",
        "summary": "Online portal covering South African news, sports, culture, aimed also at expats.",
        "popularity": "Popular among South Africans abroad and at home for updates.",
        "known_facts": "Launched in 2003 in London; now also operating in South Africa."
    },
    "jewishpress.com": {
        "location": "United States (New York)",
        "political_side": "Right-leaning Orthodox Jewish perspective",
        "summary": "Weekly Jewish newspaper/online site focusing on religious and Israel-related news.",
        "popularity": "Niche among Orthodox and conservative Jewish communities.",
        "known_facts": "Founded in 1960; covers Jewish world news and religious commentary."
    },
    "abcactionnews.com": {
        "location": "United States (Tampa, Florida)",
        "political_side": "Generally center (local ABC affiliate)",
        "summary": "Local affiliate of ABC network covering Tampa Bay news.",
        "popularity": "Regionally popular in the Tampa Bay area.",
        "known_facts": "Part of the E. W. Scripps Company; focuses on local news, weather, consumer issues."
    },
    "nysun.com": {
        "location": "United States (New York)",
        "political_side": "Historically conservative or libertarian-leaning editorial page",
        "summary": "Online revival of the former daily The New York Sun.",
        "popularity": "More niche than mainstream, known for editorial commentary.",
        "known_facts": "Active in various forms since 2002; emphasizes political/cultural commentary."
    },
    "dw.com": {
        "location": "Germany",
        "political_side": "Generally center, public broadcaster perspective",
        "summary": "Deutsche Welle is Germany’s international broadcaster with global coverage.",
        "popularity": "Well-regarded internationally for factual news and cultural programs.",
        "known_facts": "Funded by the German government; provides content in 30+ languages."
    },
    "huffingtonpost.co.uk": {
        "location": "United Kingdom",
        "political_side": "Generally left-liberal commentary (UK edition)",
        "summary": "UK edition of HuffPost, focusing on British news, politics, entertainment.",
        "popularity": "Notable online presence in the UK.",
        "known_facts": "Owned by BuzzFeed (previously Verizon Media); known for blogging platform origins."
    },
    "time.com": {
        "location": "United States",
        "political_side": "Center-liberal mainstream magazine",
        "summary": "Long-running weekly news magazine covering global events, politics, culture.",
        "popularity": "Internationally recognized brand.",
        "known_facts": "Founded in 1923; famous for ‘Time Person of the Year’ cover."
    },
    "standard.net.au": {
        "location": "Australia",
        "political_side": "Local/regional (part of Australian community press)",
        "summary": "Likely The Warrnambool Standard or a regional ‘Standard’ publication in Australia.",
        "popularity": "Regional coverage focusing on local events, sports, and issues.",
        "known_facts": "Serves local Australian communities."
    },
    "politico.eu": {
        "location": "Belgium (European edition of Politico)",
        "political_side": "Center to center-left (policy-focused coverage)",
        "summary": "Focuses on EU institutions, European politics, and policy news.",
        "popularity": "Influential among European policymakers, journalists, and lobbyists.",
        "known_facts": "Launched as part of Politico’s expansion beyond the U.S."
    },
    "dailypost.co.uk": {
        "location": "United Kingdom (Wales)",
        "political_side": "Local/regional coverage",
        "summary": "Daily newspaper covering the North Wales region.",
        "popularity": "Popular locally in North Wales.",
        "known_facts": "Focus on local community issues, events, and sports."
    },
    "northumberlandgazette.co.uk": {
        "location": "United Kingdom (Northumberland, England)",
        "political_side": "Local/regional coverage",
        "summary": "Local paper focusing on Northumberland area news and events.",
        "popularity": "Popular in its coverage area.",
        "known_facts": "Long history of rural and community reporting."
    },
    "cambridge-news.co.uk": {
        "location": "United Kingdom (Cambridge)",
        "political_side": "Local/regional coverage (no strong national affiliation)",
        "summary": "Local newspaper/website covering the Cambridge region.",
        "popularity": "Locally popular for community and regional updates.",
        "known_facts": "Established in 1888; focuses on local politics, events, and schools."
    },
    "kyivpost.com": {
        "location": "Ukraine",
        "political_side": "Generally pro-Western, independent-liberal stance",
        "summary": "Ukraine’s oldest English-language newspaper, covering national/global news.",
        "popularity": "Key resource for English-language reporting on Ukraine.",
        "known_facts": "Founded in 1995; known for editorial independence coverage of Ukrainian politics."
    },
    "npr.org": {
        "location": "United States",
        "political_side": "Center-left public media perspective",
        "summary": "National Public Radio: non-profit media organization with news/cultural programs.",
        "popularity": "High among U.S. public radio audiences; respected for in-depth reporting.",
        "known_facts": "Publicly funded, with member stations across the US."
    },
    "theweek.com": {
        "location": "United States",
        "political_side": "Generally center (aggregates perspectives from left and right)",
        "summary": "Weekly news magazine that curates and summarizes stories from multiple angles.",
        "popularity": "Fairly popular for condensed news analysis/opinion.",
        "known_facts": "Started in the UK (1995), then US edition (2001); known for bridging differing viewpoints."
    },
    "cnbc.com": {
        "location": "United States",
        "political_side": "Business-focused, generally center",
        "summary": "24-hour business news channel/website under NBCUniversal.",
        "popularity": "Popular among business viewers and stock market followers.",
        "known_facts": "Launched in 1989; focuses on real-time financial market coverage."
    },
    "ntv.ca": {
        "location": "Canada (Newfoundland)",
        "political_side": "Local/regional coverage (private station)",
        "summary": "Canada’s oldest privately owned TV station, focusing on Newfoundland news.",
        "popularity": "Dominant in Newfoundland; lesser-known outside the province.",
        "known_facts": "Launched in 1955; strong local presence in Atlantic Canada."
    },
    "the-independent.com": {
        "location": "United States (Grand Island, Nebraska)",
        "political_side": "Local/regional coverage (no strong national affiliation)",
        "summary": "Grand Island Independent, a local daily in Nebraska.",
        "popularity": "Locally well-known; not widely known nationwide.",
        "known_facts": "Focuses on local reporting for central Nebraska communities."
    },
    "africanews.com": {
        "location": "Pan-African (headquartered in Congo-Brazzaville)",
        "political_side": "Varies (factual/news-oriented, affiliated with Euronews)",
        "summary": "Pan-African multilingual news outlet covering African and world news.",
        "popularity": "Gaining traction across African nations and internationally.",
        "known_facts": "Launched by Euronews in 2016; offers coverage in English and French."
    },
    "turan.az": {
        "location": "Azerbaijan",
        "political_side": "Independent stance, sometimes critical of government",
        "summary": "News agency providing local/regional coverage in Azerbaijan.",
        "popularity": "Niche but recognized as one of few independent sources in Azerbaijan.",
        "known_facts": "Has faced governmental pressure for investigative reporting."
    },
    "chicagotribune.com": {
        "location": "United States (Chicago, Illinois)",
        "political_side": "Historically center-right editorial page, though varied",
        "summary": "One of the largest newspapers in the Midwest, founded mid-19th century.",
        "popularity": "Highly influential in Illinois and the broader Midwest.",
        "known_facts": "Owned by Tribune Publishing; known for political endorsements/investigations."
    },
    "palestinechronicle.com": {
        "location": "Focus on Palestine (editorial offices often in North America)",
        "political_side": "Pro-Palestinian perspective, progressive-left stance",
        "summary": "Online publication focusing on Palestinian affairs, Middle East commentary.",
        "popularity": "Niche among readers interested in Middle East conflict/advocacy.",
        "known_facts": "Founded by Palestinian author Ramzy Baroud; features opinion pieces."
    },
    "romania-insider.com": {
        "location": "Romania",
        "political_side": "Generally neutral/center for English-speaking expats and business community",
        "summary": "Online portal covering Romanian news in English.",
        "popularity": "Popular among foreigners, investors, and diaspora seeking English news on Romania.",
        "known_facts": "Focuses on business, economic, and cultural updates in Romania."
    },
    "sfstandard.com": {
        "location": "United States (San Francisco, California)",
        "political_side": "Local/regional coverage in a generally progressive city",
        "summary": "Online outlet covering local SF news, culture, and politics.",
        "popularity": "Known regionally in the Bay Area’s digital news sphere.",
        "known_facts": "Founded relatively recently as a digital-first news platform."
    },
    "themoscowtimes.com": {
        "location": "Russia/Europe (historically Russia-based)",
        "political_side": "Independent, often critical of Russian government",
        "summary": "English-language newspaper covering Russian affairs.",
        "popularity": "Relied upon by expats and international audiences for Russia news in English.",
        "known_facts": "Founded in 1992; relocated much editorial work outside Russia due to restrictions."
    },
    "syrianobserver.com": {
        "location": "Focus on Syria (often run internationally or by exiled groups)",
        "political_side": "Opposition-friendly or independent perspective",
        "summary": "Daily news site translating and aggregating Syrian press, focusing on politics and civil society.",
        "popularity": "Specialized readership for Syrian conflict and Middle East watchers.",
        "known_facts": "Provides English translations of local Syrian publications with commentary on conflict."
    }
    # Skipping "None" entry.
}

### Create a new CSV that has one row for each newspaper

In [16]:
import polars as pl
import os
import csv

# Build newspapers2024.csv with one row per publisher: the publisher name, the
# year, that publisher's article texts, the article count, and placeholder
# columns that are left empty in this cell.
#
# This cell reads `publishers` (an iterable of publisher names) from the
# kernel; it is not defined here and must exist before the cell runs.

# Path to the output CSV file and header definition
output_file = "newspapers2024.csv"
header = [
    "Publisher",
    "Year",
    "Political Alignment",
    "Articles",
    "Article Count",
    "Corpus",
    "Corpus Word Count",
    "Unique Word Count"
]

# The header is needed when the file does not exist yet, and also when it
# exists but is still empty (otherwise the data rows would have no header).
write_header = (
    not os.path.exists(output_file) or os.path.getsize(output_file) == 0
)

# Values that are identical for every row. csv.writer renders None as an
# empty field, so the placeholder columns are written as empty cells.
year = 2024
political_alignment = None
corpus = None
corpus_word_count = None
unique_word_count = None

# Building the lazy query is loop-invariant, so do it once. Each .collect()
# below still executes its own filtered scan of 2024.csv.
articles_2024_lazy = pl.scan_csv("2024.csv")

# Open the output file in append mode.
# NOTE: because of "a", re-running this cell adds the same publishers again
# instead of replacing the file; delete newspapers2024.csv first for a clean run.
with open(output_file, "a", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)

    # Write header if file is new (or empty)
    if write_header:
        writer.writerow(header)

    # Iterate over every publisher in the list
    for publisher in publishers:
        # Keep only this publisher's rows and only the 'plain_text' column
        df_lazy = (
            articles_2024_lazy
            .filter(pl.col("publisher") == publisher)
            .select("plain_text")
        )
        filtered_df = df_lazy.collect()  # Materialize the filtered data

        # Plain Python list of this publisher's article texts
        articles = filtered_df["plain_text"].to_list()
        article_count = len(articles)

        # Append the new row to the CSV file. The article list is stored as
        # its Python list text form (e.g. "['first text', 'second text']");
        # passing a Polars expression here would write the expression's
        # description instead of the article texts.
        writer.writerow([
            publisher,
            year,
            political_alignment,
            str(articles),
            article_count,
            corpus,
            corpus_word_count,
            unique_word_count
        ])
        print(f"Data for '{publisher}' appended successfully.")


Data for 'abcactionnews.com' appended successfully.
Data for 'abcnews.go.com' appended successfully.
Data for 'africanews.com' appended successfully.
Data for 'aljazeera.com' appended successfully.
Data for 'azernews.az' appended successfully.
Data for 'bnnbloomberg.ca' appended successfully.
Data for 'bnnbreaking.com' appended successfully.
Data for 'bostonglobe.com' appended successfully.
Data for 'ca.news.yahoo.com' appended successfully.
Data for 'cambridge-news.co.uk' appended successfully.
Data for 'cbc.ca' appended successfully.
Data for 'cbsnews.com' appended successfully.
Data for 'chicago.suntimes.com' appended successfully.
Data for 'chicagotribune.com' appended successfully.
Data for 'cnbc.com' appended successfully.
Data for 'cnn.com' appended successfully.
Data for 'cyprus-mail.com' appended successfully.
Data for 'dailymail.co.uk' appended successfully.
Data for 'dailypost.co.uk' appended successfully.
Data for 'dailysabah.com' appended successfully.
Data for 'dailystar.

### Read the CSV file and display the first few rows

In [17]:
import polars as pl
# Preview the per-publisher CSV. The scan only builds a lazy query; nothing
# is materialized until .collect() is called.
df_lazy = pl.scan_csv("newspapers2024.csv")

# Restrict the query to the first 5 rows, then materialize and show them.
sample_df = df_lazy.limit(5).collect()
print(sample_df)

shape: (5, 8)
┌──────────────┬──────┬──────────────┬──────────────┬─────────┬────────┬─────────────┬─────────────┐
│ Publisher    ┆ Year ┆ Political    ┆ Articles     ┆ Article ┆ Corpus ┆ Corpus Word ┆ Unique Word │
│ ---          ┆ ---  ┆ Alignment    ┆ ---          ┆ Count   ┆ ---    ┆ Count       ┆ Count       │
│ str          ┆ i64  ┆ ---          ┆ str          ┆ ---     ┆ str    ┆ ---         ┆ ---         │
│              ┆      ┆ str          ┆              ┆ i64     ┆        ┆ str         ┆ str         │
╞══════════════╪══════╪══════════════╪══════════════╪═════════╪════════╪═════════════╪═════════════╡
│ abcactionnew ┆ 2024 ┆ null         ┆ ['TAMPA,     ┆ 3312    ┆ null   ┆ null        ┆ null        │
│ s.com        ┆      ┆              ┆ Fla. — Lyme  ┆         ┆        ┆             ┆             │
│              ┆      ┆              ┆ disease i…   ┆         ┆        ┆             ┆             │
│ abcnews.go.c ┆ 2024 ┆ null         ┆ ['Last round ┆ 8643    ┆ null   ┆ null