In [None]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import csv
import time

In [7]:
def find_date_in_text(text):
    """Extract the first parseable 'Month D, YYYY' date from the text.

    Every '<letters> <1-2 digits>, <4 digits>' candidate is tried in order,
    first as a full month name ('December') and then as an abbreviated one
    ('Dec'). Candidates that do not parse (e.g. 'Chapter 12, 2023' or
    'February 30, 2023') are skipped instead of raising ValueError.

    Args:
        text: String to search. None or an empty string is accepted.

    Returns:
        A datetime for the first candidate that parses, or None when the
        text is empty or contains no parseable date.
    """
    if not text:
        return None

    for candidate in re.finditer(r'[a-zA-Z]+ \d{1,2}, \d{4}', text):
        for date_format in ('%B %d, %Y', '%b %d, %Y'):
            try:
                return datetime.strptime(candidate.group(), date_format)
            except ValueError:
                # Not a month name / not a valid calendar day: try the next
                # format, then the next candidate in the text.
                continue
    return None

def scrape_article(url):
    """Scrape the article text and publication date from the given URL.

    Args:
        url: Address of the article page to download.

    Returns:
        A tuple (article_date, article_text).
        article_date is whatever find_date_in_text returns for the text of
        the 'entry-meta' div, or None when that div is missing or empty.
        article_text is the text of the 'entry-content clear' div, or the
        placeholder "Could not find the article text." when it is absent.
        (None, "") is returned when the page cannot be fetched (network
        error, timeout, or a non-200 status code).
    """
    try:
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as error:
        print(f"Request failed for {url}: {error}")
        return None, ""
    if response.status_code != 200:
        return None, ""
    soup = BeautifulSoup(response.text, 'html.parser')

    # target the div containing the article text
    content_div = soup.find('div', class_='entry-content clear')
    if content_div:
        # extract the text, using 'separator' to add spaces where tags are removed
        article_text = content_div.get_text(separator=' ', strip=True)
    else:
        article_text = "Could not find the article text."
        print(f"Could not extract text for: {url}")

    # the publication date lives in the separate 'entry-meta' div
    date_container = soup.find('div', class_='entry-meta')
    article_date_text = date_container.get_text(strip=True) if date_container else None
    article_date = find_date_in_text(article_date_text) if article_date_text else None

    return article_date, article_text

def crawl_articles(start_date, end_date, base_url, csv_filename, start_page=3, end_page=12):
    """Crawl listing pages and save the articles published in a date range.

    Each listing page from start_page to end_page (inclusive) is fetched,
    every linked article is scraped with scrape_article, and the articles
    whose date falls between start_date and end_date (inclusive) are written
    to a CSV file with the columns Title, Date, Link and Text. The crawl
    sleeps 1 second before every request to be polite to the server.

    Args:
        start_date: First date to keep, formatted like '13 December 2022'.
        end_date: Last date to keep, in the same format.
        base_url: Listing URL prefix; the page number and '/' are appended.
        csv_filename: Path of the CSV file to create (overwritten if present).
        start_page: First listing page to crawl.
        end_page: Last listing page to crawl.

    Raises:
        ValueError: If start_date or end_date does not match '%d %B %Y'.
    """
    range_start = datetime.strptime(start_date, '%d %B %Y')
    range_end = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Title', 'Date', 'Link', 'Text'])  # Header row

        for page in range(start_page, end_page + 1):
            print(f"Scraping page {page}...")
            time.sleep(1)  # Wait for 1 second before making each request to be polite
            url = f"{base_url}{page}/"  # Append the page number to the base URL
            try:
                # A timeout keeps one stalled page from hanging the whole crawl.
                response = requests.get(url, timeout=30)
            except requests.RequestException as error:
                print(f"Failed to fetch {url}: {error}")
                continue
            if response.status_code != 200:
                print(f"Failed to fetch {url}")
                continue
            soup = BeautifulSoup(response.text, 'html.parser')

            # 'entry-title ast-blog-single-element' is the class for article titles
            headings = soup.find_all('h2', {'class': 'entry-title ast-blog-single-element'})

            for heading in headings:
                anchor = heading.find('a')
                article_url = anchor.get('href') if anchor is not None else None
                if not article_url:
                    # A title without a link cannot be scraped; skip it
                    # rather than crash and lose the rest of the crawl.
                    continue

                time.sleep(1)  # Polite delay between requests
                article_date, article_text = scrape_article(article_url)

                if article_date and range_start <= article_date <= range_end:
                    title = heading.text.strip()
                    writer.writerow([title, article_date.strftime('%B %d, %Y'), article_url, article_text])
                    print(f"Saved article: {title}")

# Run the crawl: blog listing pages of migrantsrights.org.uk, keeping the
# articles dated from 13 December 2022 to 12 December 2023 (inclusive).
base_url = 'https://migrantsrights.org.uk/category/blog/page/'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles(
    start_date='13 December 2022',
    end_date='12 December 2023',
    base_url=base_url,
    csv_filename=csv_filename,
)


Scraping page 3...
Saved article: New migration measures reinforce classism and racism
Scraping page 4...
Saved article: Digital Hostile Environment: Passports and Facial Recognition
Saved article: Deprivation of citizenship is Islamophobic
Saved article: MAP: Third Workshop
Saved article: Digitisation of the UK border: EVisas
Saved article: Data-sharing and immigration enforcement
Saved article: Suella’s horrendous legacy: her worst moments
Saved article: Digitisation of the UK border: Electronic Travel Authorisation (ETA)
Saved article: International Day Against Fascism + Antisemitism
Saved article: Silent genocides: Congo, Armenia + Sudan
Saved article: Desensitisation to the Global Majority’s Suffering
Scraping page 5...
Saved article: Islamophobia Awareness Month 2023
Saved article: Blog: Bibby Stockholm
Saved article: “We are pioneers and innovators”.
Saved article: “We made ourselves strong”.
Saved article: “Celebrating our Blackness in its entirety”.
Saved article: Right to Wor