In [None]:
import requests
from bs4 import BeautifulSoup
import time
import json

In [None]:
def get_articles(base_url, max_pages=115, timeout=30):
    """Crawl the paginated listing at ``base_url`` and collect its articles.

    For each page number from 1 to ``max_pages`` the listing is requested at
    ``f"{base_url}&page={page}"``. Every ``div.views-row`` that contains an
    ``<h2><a href=...>`` link yields one record, and the linked article page
    is downloaded through ``get_article_text``.

    Args:
        base_url: Listing URL that already carries a query string; the page
            number is appended as ``&page=N``.
        max_pages: Highest page number to request (inclusive).
        timeout: Seconds to wait for each listing request before giving up.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``date``,
        ``summary`` and ``text``. Crawling stops at the first listing page
        that cannot be fetched (request exception or non-200 status); the
        articles collected up to that point are still returned.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    all_articles = []

    for page in range(1, max_pages + 1):
        url = f"{base_url}&page={page}"
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            # Treat a network failure like a bad status: stop, keep what we have.
            print(f"Error fetching page {page}: {exc}")
            break
        if response.status_code != 200:
            print(f"Error fetching page {page}: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        articles_list = soup.find_all('div', class_='views-row')
        print(f"Found {len(articles_list)} articles on page {page}.")

        for article in articles_list:
            title_tag = article.find('h2')
            link_tag = title_tag.find('a') if title_tag else None
            href = link_tag.get('href') if link_tag else None
            if not href:
                # Rows without a usable title link are not articles; skip them.
                continue

            date_tag = article.find('span', class_='datespan')
            summary_tag = article.find('p')

            title = link_tag.text.strip()
            article_url = 'https://www.understandingwar.org' + href
            date = date_tag.text.strip() if date_tag else "Unknown date"
            summary = summary_tag.text.strip() if summary_tag else "No summary available"

            print(f"Fetching article: {title} ({article_url})")

            text = get_article_text(article_url)
            all_articles.append({
                'title': title,
                'url': article_url,
                'date': date,
                'summary': summary,
                # Keep the downloaded body; it was previously fetched and discarded.
                'text': text,
            })
            # Pause between article downloads to avoid hammering the site.
            time.sleep(2)

    return all_articles

In [None]:
def get_article_text(article_url, timeout=30):
    """Download one article page and return the text of its body element.

    Args:
        article_url: Absolute URL of the article page.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        The stripped text of the first ``div.field-item`` on the page.
        Returns the sentinel ``"Error fetching article"`` when the request
        fails or the response status is not 200, and ``"No content found"``
        when the page has no ``div.field-item``.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(article_url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Report network failures through the same sentinel as a bad status so
        # one unreachable article does not abort the caller's loop.
        print(f"Error fetching article: {article_url} ({exc})")
        return "Error fetching article"

    if response.status_code != 200:
        print(f"Error fetching article: {article_url}")
        return "Error fetching article"

    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.find('div', class_='field-item')

    return content.text.strip() if content else "No content found"

if __name__ == "__main__":
    # Publications listing URL, written one query parameter per line. The
    # adjacent literals are concatenated into a single string.
    base_url = (
        "https://www.understandingwar.org/publications"
        "?type%5B0%5D=backgrounder"
        "&type%5B1%5D=map"
        "&type%5B2%5D=other_work"
        "&type%5B3%5D=report"
        "&tid%5B0%5D=300"
        "&field_lastname_value="
        "&sort_by=created"
        "&sort_order=DESC"
    )

    # Crawl listing pages 1..114 and collect one record per article.
    articles = get_articles(base_url, max_pages=114)

    if not articles:
        print("No articles found.")

    # The JSON file is written even when nothing was collected (as "[]").
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    with open("articles.json", "w", encoding="utf-8") as out_file:
        json.dump(articles, out_file, ensure_ascii=False, indent=4)

    print(f"Saved {len(articles)} articles to articles.json")


Found 10 articles on page 1.
Fetching article: Russian Offensive Campaign Assessment, March 17, 2025 (https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-march-17-2025)
Fetching article: Russian Offensive Campaign Assessment, March 16, 2025 (https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-march-16-2025)
Fetching article: Russian Offensive Campaign Assessment, March 15, 2025 (https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-march-15-2025)
Fetching article: Russian Offensive Campaign Assessment, March 14, 2025 (https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-march-14-2025)
Fetching article: Russian Offensive Campaign Assessment, March 13, 2025 (https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-march-13-2025)
Fetching article: Russian Offensive Campaign Assessment, March 12, 2025 (https://www.understandingwar.org/backg