In [15]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import csv
import time
import os

Scraping blog articles for **Migrants' Rights Network**:

In [7]:
def find_date_in_text(text):
    """Extract the first 'Month D, YYYY' date found in *text*.

    Args:
        text: Free text that may contain a date such as 'December 5, 2023'
            (full month name) or 'Dec 5, 2023' (abbreviated month name).

    Returns:
        A ``datetime`` for the first candidate that parses as a real date, or
        ``None`` when *text* is empty or holds no parseable date.
    """
    if not text:
        return None
    pattern = re.compile(r'[a-zA-Z]+ \d{1,2}, \d{4}')
    # The pattern accepts any word in front of the day number, so a candidate
    # such as 'Updated 5, 2023' can match without being a date. strptime
    # rejects those with ValueError; we then try the next candidate instead
    # of letting the exception abort the crawl.
    for match in pattern.finditer(text):
        for date_format in ('%B %d, %Y', '%b %d, %Y'):
            try:
                return datetime.strptime(match.group(), date_format)
            except ValueError:
                continue
    return None

def scrape_article(url):
    """Fetch one article page and extract its publication date and body text.

    Args:
        url: URL of the article page.

    Returns:
        A ``(article_date, article_text)`` tuple. ``article_date`` is the
        result of ``find_date_in_text`` applied to the 'entry-meta' div, or
        ``None`` when that div is missing or empty. ``article_text`` is the
        text of the 'entry-content clear' div, or a placeholder sentence when
        that div is absent. ``(None, "")`` is returned when the page cannot be
        fetched.
    """
    try:
        # A timeout keeps a single unresponsive request from stalling the crawl.
        response = requests.get(url, timeout=30)
    except requests.RequestException as error:
        print(f"Error fetching article: {url} ({error})")
        return None, ""
    if response.status_code != 200:
        return None, ""
    soup = BeautifulSoup(response.text, 'html.parser')

    # target the div containing the article text
    article_text = soup.find('div', class_='entry-content clear')

    if article_text:
        # extract the text, using 'separator' to add spaces where tags are removed
        article_text = article_text.get_text(separator=' ', strip=True)
    else:
        article_text = "Could not find the article text."
        print(f"Could not extract text for: {url}")

    # the publication date is read separately from the 'entry-meta' div
    date_container = soup.find('div', class_='entry-meta')
    article_date_text = date_container.get_text(strip=True) if date_container else None
    article_date = find_date_in_text(article_date_text) if article_date_text else None

    return article_date, article_text

def crawl_articles(start_date, end_date, base_url, csv_filename, start_page=3, end_page=12):
    """Crawl listing pages and save the articles published within a date range.

    Requests every listing page from ``start_page`` to ``end_page`` (inclusive),
    follows each article title link, and writes one CSV row (organisation,
    title, date, link, text) per article whose publication date lies between
    ``start_date`` and ``end_date`` (inclusive). The CSV file is overwritten
    and begins with a header row. One second is slept before every request to
    be polite to the server.

    Args:
        start_date: First accepted publication date, formatted 'D Month YYYY'.
        end_date: Last accepted publication date, formatted 'D Month YYYY'.
        base_url: Listing URL prefix; the page number and '/' are appended.
        csv_filename: Path of the CSV file to create.
        start_page: First listing page to request.
        end_page: Last listing page to request.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not 'D Month YYYY'.
    """
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Organisation', 'Title', 'Date', 'Link', 'Text'])  # Header row

        for page in range(start_page, end_page + 1):
            print(f"Scraping page {page}...")
            time.sleep(1)  # Polite delay before each listing request
            url = f"{base_url}{page}/"  # Append the page number to the base URL
            try:
                response = requests.get(url, timeout=30)
            except requests.RequestException:
                print(f"Failed to fetch {url}")
                continue
            if response.status_code != 200:
                print(f"Failed to fetch {url}")
                continue
            soup = BeautifulSoup(response.text, 'html.parser')

            # 'entry-title ast-blog-single-element' is the class for article titles
            headings = soup.find_all('h2', {'class': 'entry-title ast-blog-single-element'})

            for link in headings:
                # Skip headings without a usable link instead of crashing on them
                anchor = link.find('a')
                article_url = anchor.get('href') if anchor else None
                if not article_url:
                    continue

                time.sleep(1)  # Polite delay between requests
                article_date, article_text = scrape_article(article_url)

                if article_date and start_date <= article_date <= end_date:
                    title = link.text.strip()
                    writer.writerow(["Migrants' Rights Network", title, article_date.strftime('%d %B %Y'), article_url, article_text])
                    print(f"Saved article: {title}")

# execution
# Listing-page URL prefix; crawl_articles appends '<page number>/' to it.
base_url = 'https://migrantsrights.org.uk/category/blog/page/'
# NOTE(review): absolute, machine-specific output path - adjust before running elsewhere.
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
# Keep articles dated 13 December 2022 to 12 December 2023 (inclusive), using the
# default page range. This crawl_articles opens the CSV in 'w' mode, so it recreates the file.
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)


Scraping page 3...
Saved article: New migration measures reinforce classism and racism
Scraping page 4...
Saved article: Digital Hostile Environment: Passports and Facial Recognition
Saved article: Deprivation of citizenship is Islamophobic
Saved article: MAP: Third Workshop
Saved article: Digitisation of the UK border: EVisas
Saved article: Data-sharing and immigration enforcement
Saved article: Suella’s horrendous legacy: her worst moments
Saved article: Digitisation of the UK border: Electronic Travel Authorisation (ETA)
Saved article: International Day Against Fascism + Antisemitism
Saved article: Silent genocides: Congo, Armenia + Sudan
Saved article: Desensitisation to the Global Majority’s Suffering
Scraping page 5...
Saved article: Islamophobia Awareness Month 2023
Saved article: Blog: Bibby Stockholm
Saved article: “We are pioneers and innovators”.
Saved article: “We made ourselves strong”.
Saved article: “Celebrating our Blackness in its entirety”.
Saved article: Right to Wor

Scraping blog articles for **Freedom from Torture**:

In [18]:
def find_date_in_text(text):
    """Extract the first 'D Month YYYY' date found in *text*.

    Args:
        text: Free text that may contain a date such as '13 December 2022'
            (full month name) or '13 Dec 2022' (abbreviated month name).

    Returns:
        A ``datetime`` for the first candidate that parses as a real date, or
        ``None`` when *text* is empty or holds no parseable date.
    """
    if not text:
        return None
    pattern = re.compile(r'\d{1,2} [a-zA-Z]+ \d{4}')
    # The pattern accepts any word between the day and the year, so a
    # candidate such as '12 items 2023' can match without being a date.
    # strptime rejects those with ValueError; we then try the next candidate
    # instead of letting the exception abort the crawl.
    for match in pattern.finditer(text):
        for date_format in ('%d %B %Y', '%d %b %Y'):
            try:
                return datetime.strptime(match.group(), date_format)
            except ValueError:
                continue
    return None

def scrape_article(url):
    """Fetch one article page and extract its publication date and body text.

    Args:
        url: URL of the article page.

    Returns:
        A ``(article_date, article_text)`` tuple. ``article_date`` is the
        result of ``find_date_in_text`` applied to the
        'field--field-published-date--item' div, or ``None`` when that div is
        missing or empty. ``article_text`` is the text of the 'last-unspace'
        div, or a placeholder sentence when that div is absent.
        ``(None, "")`` is returned when the page cannot be fetched.
    """
    try:
        # A timeout keeps a single unresponsive request from stalling the crawl.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        print(f"Error fetching article: {url}")
        return None, ""
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        # Must be a 2-tuple: the caller unpacks exactly (date, text).
        return None, ""
    soup = BeautifulSoup(response.text, 'html.parser')

    article_content = soup.find('div', {'class': 'last-unspace'})
    article_text = article_content.get_text(separator=' ', strip=True) if article_content else "Could not find the article text."

    date_container = soup.find('div', class_='field--field-published-date--item')
    article_date_text = date_container.get_text(strip=True) if date_container else None
    article_date = find_date_in_text(article_date_text) if article_date_text else None

    return article_date, article_text

def strip_date_from_title(title_with_date):
    """Return the title without a leading 'D Month YYYY' date.

    Only a date at the very start of the string is removed, together with the
    whitespace that follows it; the result is stripped at both ends.
    """
    leading_date = re.compile(r'^\d{1,2} [a-zA-Z]+ \d{4}\s*[\n\r\s]*')
    return leading_date.sub('', title_with_date).strip()

def crawl_articles(start_date, end_date, base_url, csv_filename, start_page=0, end_page=0):
    """Crawl listing pages and append the articles published within a date range.

    Requests every listing page from ``start_page`` to ``end_page`` (inclusive)
    as ``<base_url>?page=<n>``, follows the first link of each 'mb-4' div, and
    appends one CSV row (organisation, title, date, link, text) per article
    whose publication date lies between ``start_date`` and ``end_date``
    (inclusive). A header row is written only when the CSV file is new or
    empty. One second is slept before every request to be polite.

    Args:
        start_date: First accepted publication date, formatted 'D Month YYYY'.
        end_date: Last accepted publication date, formatted 'D Month YYYY'.
        base_url: URL of the listing; '?page=<n>' is appended.
        csv_filename: Path of the CSV file to append to.
        start_page: First listing page to request.
        end_page: Last listing page to request.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not 'D Month YYYY'.
    """
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    # Look at the existing file so the first new row is neither glued onto an
    # unterminated last line nor preceded by a blank row (csv rows already end
    # with a line terminator).
    file_is_empty = True
    ends_with_newline = True
    if os.path.exists(csv_filename) and os.path.getsize(csv_filename) > 0:
        file_is_empty = False
        with open(csv_filename, 'rb') as existing:
            existing.seek(-1, os.SEEK_END)
            ends_with_newline = existing.read(1) in (b'\n', b'\r')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:  # 'a' mode for appending
        if not ends_with_newline:
            file.write('\n')  # Ensure the first new row starts on its own line

        writer = csv.writer(file)
        if file_is_empty:
            writer.writerow(['Organisation', 'Title', 'Date', 'Link', 'Text'])  # Header row

        for page in range(start_page, end_page + 1):
            print(f"Scraping page {page}...")
            time.sleep(1)  # Polite delay before each listing request
            page_url = f"{base_url}?page={page}"
            try:
                response = requests.get(page_url, timeout=30)
            except requests.RequestException:
                print(f"Failed to fetch {page_url}")
                continue
            if response.status_code != 200:
                print(f"Failed to fetch {page_url}")
                continue
            soup = BeautifulSoup(response.text, 'html.parser')

            for link in soup.find_all('div', {'class': 'mb-4'}):
                a_tag = link.find('a')
                # .get() avoids a KeyError for anchors that carry no href
                article_url = a_tag.get('href') if a_tag else None
                if not article_url:
                    continue
                # Relative links are resolved against the site root
                if article_url.startswith('/'):
                    article_url = f"https://www.freedomfromtorture.org{article_url}"

                time.sleep(1)  # Polite delay between requests
                article_date, article_text = scrape_article(article_url)

                if article_date and start_date <= article_date <= end_date:
                    # The div text holds both the date and the title; keep the title only
                    article_title = strip_date_from_title(link.text.strip())
                    writer.writerow(["Freedom from Torture", article_title, article_date.strftime('%d %B %Y'), article_url, article_text])
                    print(f"Saved article: {article_title}")


# execution
# Listing URL; crawl_articles appends '?page=<n>' to it.
base_url = 'https://www.freedomfromtorture.org/news'
# NOTE(review): absolute, machine-specific output path - adjust before running elsewhere.
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
# Keep articles dated 13 December 2022 to 12 December 2023 (inclusive). With the default
# start_page=0 and end_page=0 only page 0 is requested; rows are appended to the CSV.
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)



Scraping page 0...
Saved article: Supreme Court rules plan to send refugees to Rwanda ‘unlawful’
Saved article: Freedom from Torture’s statement on Israel and the Occupied Palestinian Territories
Saved article: Bibby Stockholm: Why refugees and torture survivors shouldn’t be housed on floating prisons
Saved article: My heart aches for young women imprisoned and suffering in Iran today
Saved article: Sunak’s heartless proposal to force refugees to live on barges is a mental and physical health catastrophe waiting to happen
Saved article: 'Illegal Migration' Act - Everything you need to know
Saved article: Refugee Ban Bill will effectively extinguish the right to seek asylum in the UK
Saved article: Where does torture happen around the world?
Saved article: What is torture?
Saved article: Plan to send refugees to Rwanda ‘unlawful’ – A vital win as the Court of Appeal rules on the Government’s plan
Saved article: Banned: A peaceful protest to stand up for refugees
Saved article: Freedom f

Scraping blog articles for **Rainbow Migration**:

In [24]:

def find_date_in_text(text):
    """Extract the first date found in *text*.

    Two notations are recognised: 'dd/mm/yyyy' (e.g. '12/12/2023') and
    'Month D, YYYY' (e.g. 'December 12, 2023', full or abbreviated month).
    Previously the regex looked for the second notation while strptime parsed
    the first, so every match raised ValueError.

    Args:
        text: Free text that may contain a date.

    Returns:
        A ``datetime`` for the first candidate that parses as a real date, or
        ``None`` when *text* is empty or holds no parseable date.
    """
    if not text:
        return None
    # Each regex is paired with the strptime formats that fit what it matches.
    notations = (
        (re.compile(r'\d{1,2}/\d{1,2}/\d{4}'), ('%d/%m/%Y',)),
        (re.compile(r'[a-zA-Z]+ \d{1,2}, \d{4}'), ('%B %d, %Y', '%b %d, %Y')),
    )
    for pattern, date_formats in notations:
        for match in pattern.finditer(text):
            for date_format in date_formats:
                try:
                    return datetime.strptime(match.group(), date_format)
                except ValueError:
                    # Looked like a date but is not one; try the next candidate
                    continue
    return None

def scrape_article(url):
    """Fetch one article page and return its body text.

    Text is collected from the <p> and <h3> tags inside the
    'post-content style-light double-bottom-padding' div (if present) and
    inside every 'uncode_text_column' div, joined with single spaces.

    Args:
        url: URL of the article page.

    Returns:
        The collected text as one string; an empty string when the page cannot
        be fetched or none of the containers hold text.
    """
    try:
        # A timeout keeps a single unresponsive request from stalling the crawl.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        print(f"Error fetching article: {url}")
        return ""
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        # Return a plain string: the caller writes this value straight into
        # the CSV 'Text' column, so a tuple would end up in the data.
        return ""
    soup = BeautifulSoup(response.text, 'html.parser')

    segments = []

    # Check for 'post-content style-light double-bottom-padding' container
    main_content = soup.find('div', class_='post-content style-light double-bottom-padding')
    if main_content:
        segments.extend(
            segment.get_text(strip=True) for segment in main_content.find_all(['p', 'h3'])
        )

    # Check for 'uncode_text_column' containers
    # NOTE(review): if such a div sits inside the post-content container its
    # text is collected twice - confirm against the page markup.
    for column in soup.find_all('div', class_='uncode_text_column'):
        segments.extend(
            segment.get_text(strip=True) for segment in column.find_all(['p', 'h3'])
        )

    return " ".join(segments).strip()

def crawl_articles(start_date, end_date, base_url, csv_filename, start_page=3, end_page=11):
    """Crawl listing pages and append the articles published within a date range.

    Requests every listing page from ``start_page`` to ``end_page`` (inclusive)
    as ``<base_url>?upage=<n>``. For each 't-entry' div the date is read from
    its 't-entry-date' span; only entries dated between ``start_date`` and
    ``end_date`` (inclusive) are downloaded and appended to the CSV as one row
    (organisation, title, date, link, text). A header row is written only when
    the CSV file is new or empty. One second is slept before every request to
    be polite.

    Args:
        start_date: First accepted publication date, formatted 'D Month YYYY'.
        end_date: Last accepted publication date, formatted 'D Month YYYY'.
        base_url: URL of the listing; '?upage=<n>' is appended.
        csv_filename: Path of the CSV file to append to.
        start_page: First listing page to request.
        end_page: Last listing page to request.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not 'D Month YYYY'.
    """
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    # Look at the existing file so the first new row is neither glued onto an
    # unterminated last line nor preceded by a blank row (csv rows already end
    # with a line terminator).
    file_is_empty = True
    ends_with_newline = True
    if os.path.exists(csv_filename) and os.path.getsize(csv_filename) > 0:
        file_is_empty = False
        with open(csv_filename, 'rb') as existing:
            existing.seek(-1, os.SEEK_END)
            ends_with_newline = existing.read(1) in (b'\n', b'\r')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:  # 'a' mode for appending
        if not ends_with_newline:
            file.write('\n')  # Ensure the first new row starts on its own line

        writer = csv.writer(file)
        if file_is_empty:
            writer.writerow(['Organisation', 'Title', 'Date', 'Link', 'Text'])  # Header row

        # Loop through the pages from start_page to end_page
        for page_num in range(start_page, end_page + 1):
            time.sleep(1)  # Polite delay before each listing request
            page_url = f"{base_url}?upage={page_num}"
            try:
                response = requests.get(page_url, timeout=30)
            except requests.RequestException:
                print(f"Failed to fetch {page_url}")
                continue
            if response.status_code != 200:
                print(f"Failed to fetch {page_url}")
                continue
            soup = BeautifulSoup(response.text, 'html.parser')

            # Loop through the article entries on the page
            for link in soup.find_all('div', {'class': 't-entry'}):
                a_tag = link.find('a')
                if not a_tag or 'href' not in a_tag.attrs:
                    continue

                # The date is on the listing itself, so filter before
                # downloading: out-of-range articles cost no request.
                date_span = link.find('span', class_='t-entry-date')
                article_date = find_date_in_text(date_span.text) if date_span else None
                if article_date is None or not (start_date <= article_date <= end_date):
                    continue

                article_url = a_tag['href']
                time.sleep(1)  # Polite delay between requests
                article_text = scrape_article(article_url)

                article_title = a_tag.text.strip()
                writer.writerow(["Rainbow Migration", article_title, article_date.strftime('%d %B %Y'), article_url, article_text])
                print(f"Saved article: {article_title}")

# execution
# Listing URL; crawl_articles appends '?upage=<n>' to it.
base_url = 'https://www.rainbowmigration.org.uk/news/'
# NOTE(review): absolute, machine-specific output path - adjust before running elsewhere.
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
# Keep articles dated 13 December 2022 to 12 December 2023 (inclusive), using the
# default page range; rows are appended to the CSV.
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Saved article: Stop the Rwanda Bill!
Saved article: “I will have to hide my identity in my own room”
Saved article: Apply for a trainee solicitor position at Wilson’s
Saved article: Joint civil society statement on the Supreme Court ruling on the Rwanda Plan
Saved article: Joint Statement: LGBTQI+ people seeking safety here will not be sent to Rwanda
Saved article: We are hiring: Legal and Support Services Assistant
Saved article: A video guide to intersex asylum claims
Saved article: LGBTQI+ people shouldn’t be moved to a floating prison
Saved article: Enough: Trans people are people too
Saved article: Letter to the PM: Respect the lives of LGBTQI+ people and women seeking asylum in the UK
Saved article: Our response to the Home Secretary who thinks “being gay isn’t reason enough for asylum”
Saved article: “To have our human rights respected and protected has changed so many lives forever.”
Saved article: Brook House Inquiry findings: A gay man faced verbal homophobic abuse and was ou

Scraping *latest news* for **Women for Refugee Women**:

In [30]:
def find_date_in_text(text):
    """Extract the first 'Month D, YYYY' date found in *text*.

    Args:
        text: Free text that may contain a date such as 'December 5, 2023'
            (full month name) or 'Dec 5, 2023' (abbreviated month name).

    Returns:
        A ``datetime`` for the first candidate that parses as a real date, or
        ``None`` when *text* is empty or holds no parseable date.
    """
    if not text:
        return None
    pattern = re.compile(r'[a-zA-Z]+ \d{1,2}, \d{4}')
    # Any word followed by 'D, YYYY' matches the pattern, so a candidate can
    # match without being a date. strptime rejects those with ValueError; we
    # then try the next candidate rather than aborting the crawl.
    for match in pattern.finditer(text):
        for date_format in ('%B %d, %Y', '%b %d, %Y'):
            try:
                return datetime.strptime(match.group(), date_format)
            except ValueError:
                continue
    return None

def scrape_article(url):
    """Fetch one article page and return its body text.

    Args:
        url: URL of the article page.

    Returns:
        The text of the 'post-content style-light double-bottom-padding' div
        with tags replaced by single spaces; a placeholder sentence when that
        div is absent; an empty string when the page cannot be fetched.
    """
    try:
        # A timeout keeps a single unresponsive request from stalling the crawl.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        print(f"Error fetching article: {url}")
        return ""
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        return ""
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the container div for the article text
    article_content = soup.find('div', class_='post-content style-light double-bottom-padding')
    if not article_content:
        print("Article content container not found.")
        return "Could not find the article text."

    # Extract and return all text within the container
    return article_content.get_text(separator=" ", strip=True)

def crawl_articles(start_date, end_date, base_url, csv_filename):
    """Crawl one listing page and append the articles published within a date range.

    Requests ``base_url`` once. For each 't-entry' div the date is read from
    its 't-entry-date' span; only entries dated between ``start_date`` and
    ``end_date`` (inclusive) are downloaded and appended to the CSV as one row
    (organisation, title, date, link, text). A header row is written only when
    the CSV file is new or empty. One second is slept before every article
    request to be polite.

    Args:
        start_date: First accepted publication date, formatted 'D Month YYYY'.
        end_date: Last accepted publication date, formatted 'D Month YYYY'.
        base_url: URL of the listing page.
        csv_filename: Path of the CSV file to append to.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not 'D Month YYYY'.
    """
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    # Look at the existing file so the first new row is neither glued onto an
    # unterminated last line nor preceded by a blank row (csv rows already end
    # with a line terminator).
    file_is_empty = True
    ends_with_newline = True
    if os.path.exists(csv_filename) and os.path.getsize(csv_filename) > 0:
        file_is_empty = False
        with open(csv_filename, 'rb') as existing:
            existing.seek(-1, os.SEEK_END)
            ends_with_newline = existing.read(1) in (b'\n', b'\r')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:  # 'a' mode for appending
        if not ends_with_newline:
            file.write('\n')  # Ensure the first new row starts on its own line

        writer = csv.writer(file)
        if file_is_empty:
            writer.writerow(['Organisation', 'Title', 'Date', 'Link', 'Text'])  # Header row

        try:
            response = requests.get(base_url, timeout=30)
        except requests.RequestException:
            print(f"Failed to fetch {base_url}")
            return
        if response.status_code != 200:
            print(f"Failed to fetch {base_url}")
            return
        soup = BeautifulSoup(response.text, 'html.parser')

        for link in soup.find_all('div', {'class': 't-entry'}):
            a_tag = link.find('a')
            if not a_tag or 'href' not in a_tag.attrs:
                continue

            # The date is on the listing itself, so filter before downloading:
            # out-of-range articles cost no request.
            date_span = link.find('span', class_='t-entry-date')
            article_date = find_date_in_text(date_span.text) if date_span else None
            if article_date is None or not (start_date <= article_date <= end_date):
                continue

            article_url = a_tag['href']
            time.sleep(1)  # Polite delay between requests
            article_text = scrape_article(article_url)

            article_title = a_tag.text.strip()
            writer.writerow(["Women for Refugee Women", article_title, article_date.strftime('%d %B %Y'), article_url, article_text])
            print(f"Saved article: {article_title}")

# execution
# Listing page that crawl_articles requests as-is (no page number is appended).
base_url = 'https://www.refugeewomen.co.uk/news/'
# NOTE(review): absolute, machine-specific output path - adjust before running elsewhere.
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
# Keep articles dated 13 December 2022 to 12 December 2023 (inclusive); rows are appended to the CSV.
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Saved article: The Supreme Court ruled the Rwanda Plan unlawful
Saved article: Suella Braverman’s speech: How it harms women and LGBTQ+ people
Saved article: A huge congratulations to Agnes and Loraine for winning the Pioneer 20 award!
Saved article: Passage of the ‘Illegal’ Migration Act
Saved article: Great news! The 72-hour time limit on the detention of pregnant women is maintained
Saved article: Putting Ourselves in the Picture: Rainbow Sisters Virtual Gallery!
Saved article: Campaign win! All legal advice surgeries in immigration detention must now take place face-to-face.
Saved article: Updated May 2023 – Joint briefing on the ‘Illegal Migration Bill’: Take action against the proposed new powers to detain pregnant women indefinitely
Saved article: We are recruiting a Campaigns and Advocacy Manager!
Saved article: See Us, Believe Us, Stand With Us
Saved article: Our Year: 2022


Scraping *blog articles* for **Women for Refugee Women**:

In [31]:
def find_date_in_text(text):
    """Extract the first 'Month D, YYYY' date found in *text*.

    Args:
        text: Free text that may contain a date such as 'December 5, 2023'
            (full month name) or 'Dec 5, 2023' (abbreviated month name).

    Returns:
        A ``datetime`` for the first candidate that parses as a real date, or
        ``None`` when *text* is empty or holds no parseable date.
    """
    if not text:
        return None
    pattern = re.compile(r'[a-zA-Z]+ \d{1,2}, \d{4}')
    # A match is only a candidate: the leading word need not be a month name.
    # strptime raises ValueError for non-dates, and we fall through to the
    # next candidate so a stray match cannot abort the crawl.
    for match in pattern.finditer(text):
        for date_format in ('%B %d, %Y', '%b %d, %Y'):
            try:
                return datetime.strptime(match.group(), date_format)
            except ValueError:
                continue
    return None

def scrape_article(url):
    """Fetch one blog article page and return its body text.

    Args:
        url: URL of the article page.

    Returns:
        The text of the 'post-content style-light double-bottom-padding' div
        (including text inside nested tags such as <span>) with tags replaced
        by single spaces; a placeholder sentence when that div is absent; an
        empty string when the page cannot be fetched.
    """
    try:
        # A timeout keeps a single unresponsive request from stalling the crawl.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        print(f"Error fetching article: {url}")
        return ""
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        return ""
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the container div for the article text
    article_content = soup.find('div', class_='post-content style-light double-bottom-padding')
    if not article_content:
        print("Article content container not found.")
        return "Could not find the article text."

    # Extract and return all text within the container. This includes text within <span> tags.
    return article_content.get_text(separator=" ", strip=True)

def crawl_articles(start_date, end_date, base_url, csv_filename):
    """Crawl one blog listing page and append the articles published within a date range.

    Requests ``base_url`` once. For each 't-entry' div the date is read from
    its 't-entry-date' span; only entries dated between ``start_date`` and
    ``end_date`` (inclusive) are downloaded and appended to the CSV as one row
    (organisation, title, date, link, text). A header row is written only when
    the CSV file is new or empty. One second is slept before every article
    request to be polite.

    Args:
        start_date: First accepted publication date, formatted 'D Month YYYY'.
        end_date: Last accepted publication date, formatted 'D Month YYYY'.
        base_url: URL of the listing page.
        csv_filename: Path of the CSV file to append to.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not 'D Month YYYY'.
    """
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    # Look at the existing file so the first new row is neither glued onto an
    # unterminated last line nor preceded by a blank row (csv rows already end
    # with a line terminator).
    file_is_empty = True
    ends_with_newline = True
    if os.path.exists(csv_filename) and os.path.getsize(csv_filename) > 0:
        file_is_empty = False
        with open(csv_filename, 'rb') as existing:
            existing.seek(-1, os.SEEK_END)
            ends_with_newline = existing.read(1) in (b'\n', b'\r')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:  # 'a' mode for appending
        if not ends_with_newline:
            file.write('\n')  # Ensure the first new row starts on its own line

        writer = csv.writer(file)
        if file_is_empty:
            writer.writerow(['Organisation', 'Title', 'Date', 'Link', 'Text'])  # Header row

        try:
            response = requests.get(base_url, timeout=30)
        except requests.RequestException:
            print(f"Failed to fetch {base_url}")
            return
        if response.status_code != 200:
            print(f"Failed to fetch {base_url}")
            return
        soup = BeautifulSoup(response.text, 'html.parser')

        for link in soup.find_all('div', {'class': 't-entry'}):
            a_tag = link.find('a')
            if not a_tag or 'href' not in a_tag.attrs:
                continue

            # The date is on the listing itself, so filter before downloading:
            # out-of-range articles cost no request.
            date_span = link.find('span', class_='t-entry-date')
            article_date = find_date_in_text(date_span.text) if date_span else None
            if article_date is None or not (start_date <= article_date <= end_date):
                continue

            article_url = a_tag['href']
            time.sleep(1)  # Polite delay between requests
            article_text = scrape_article(article_url)

            article_title = a_tag.text.strip()
            writer.writerow(["Women for Refugee Women", article_title, article_date.strftime('%d %B %Y'), article_url, article_text])
            print(f"Saved article: {article_title}")

# execution
# Blog listing page that crawl_articles requests as-is (no page number is appended).
base_url = 'https://www.refugeewomen.co.uk/news/blog/'
# NOTE(review): absolute, machine-specific output path - adjust before running elsewhere.
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
# Keep articles dated 13 December 2022 to 12 December 2023 (inclusive); rows are appended to the CSV.
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Article content container not found.
Saved article: Welcome Every Woman: A Festive Celebration
Saved article: Celebrating this year’s empowerment activities
Saved article: Hiba’s Story: Pride 2023
Saved article: Ange’s Story: Pride 2023
Saved article: A Collaboration between Women for Refugee Women and The Five Points Brewing Co.
Saved article: Guest blog: Our Mothers Ourselves
Saved article: We held our first in-person Christmas party since 2019
