In [2]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime, timedelta
import csv
import time
import os

Scraping articles from **Migrants' Rights Network**:

In [7]:
def find_date_in_text(text):
    """Define and apply a regex pattern to extract the date form of 'month date, year' from the text."""
    pattern = re.compile(r'[a-zA-Z]+ \d{1,2}, \d{4}')
    match = pattern.search(text)
    if match:
        return datetime.strptime(match.group(), '%B %d, %Y')
    return None

def scrape_article(url):
    """Scrape the article text and publication date from the given URL."""
    response = requests.get(url)
    if response.status_code != 200:
        return None, ""
    soup = BeautifulSoup(response.text, 'html.parser')

    # target the div containing the article text
    article_text = soup.find('div', class_='entry-content clear')
    
    if article_text:
        # extract the text, using 'separator' to add spaces where tags are removed
        article_text = article_text.get_text(separator=' ', strip=True)
    else:
        article_text = "Could not find the article text."
        print(f"Could not extract text for: {url}")
    
    # separately finding and extracting the article publication date from the 'entry-meta' div
    date_container = soup.find('div', class_='entry-meta')
    article_date_text = date_container.get_text(strip=True) if date_container else None
    article_date = find_date_in_text(article_date_text) if article_date_text else None

    return article_date, article_text

def crawl_articles(start_date, end_date, base_url, csv_filename, start_page=3, end_page=12):
    """Crawl the articles from page 3 to page 12 that were published between the start and end dates."""
    """Save the organisation name, article title, date, link and text to a CSV file."""
    """Delay 1 second between requests to be polite."""
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Organisation', 'Title', 'Date', 'Link', 'Text'])  # Header row

        for page in range(start_page, end_page + 1):  # Loop from page 3 to page 12
            print(f"Scraping page {page}...")
            time.sleep(1)  # Wait for 1 second before making each request to be polite
            url = f"{base_url}{page}/"  # Append the page number to the base URL
            response = requests.get(url)
            if response.status_code != 200:
                print(f"Failed to fetch {url}")
                continue
            soup = BeautifulSoup(response.text, 'html.parser')

            # 'entry-title ast-blog-single-element' is the class for article titles
            article_links = soup.findAll('h2', {'class': 'entry-title ast-blog-single-element'})

            for link in article_links:
                time.sleep(1)  # Polite delay between requests
                article_url = link.find('a')['href']
                article_date, article_text = scrape_article(article_url)
                
                if article_date and start_date <= article_date <= end_date:
                    writer.writerow(["Migrants' Rights Network", link.text.strip(), article_date.strftime('%d %B %Y'), article_url, article_text])
                    print(f"Saved article: {link.text.strip()}")

# execution
base_url = 'https://migrantsrights.org.uk/category/blog/page/'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)


Scraping page 3...
Saved article: New migration measures reinforce classism and racism
Scraping page 4...
Saved article: Digital Hostile Environment: Passports and Facial Recognition
Saved article: Deprivation of citizenship is Islamophobic
Saved article: MAP: Third Workshop
Saved article: Digitisation of the UK border: EVisas
Saved article: Data-sharing and immigration enforcement
Saved article: Suella’s horrendous legacy: her worst moments
Saved article: Digitisation of the UK border: Electronic Travel Authorisation (ETA)
Saved article: International Day Against Fascism + Antisemitism
Saved article: Silent genocides: Congo, Armenia + Sudan
Saved article: Desensitisation to the Global Majority’s Suffering
Scraping page 5...
Saved article: Islamophobia Awareness Month 2023
Saved article: Blog: Bibby Stockholm
Saved article: “We are pioneers and innovators”.
Saved article: “We made ourselves strong”.
Saved article: “Celebrating our Blackness in its entirety”.
Saved article: Right to Wor

Scraping articles from **Freedom from Torture**:

In [18]:
def find_date_in_text(text):
    """Define and apply a regex pattern to extract the date form of 'date month year' from the text."""
    pattern = re.compile(r'\d{1,2} [a-zA-Z]+ \d{4}')
    match = pattern.search(text)
    if match:
        return datetime.strptime(match.group(), '%d %B %Y')
    return None

def scrape_article(url):
    """Scrape the article text and publication date from the given URL."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        return None, "", ""
    soup = BeautifulSoup(response.text, 'html.parser')
    
    article_content = soup.find('div', {'class': 'last-unspace'})
    article_text = article_content.get_text(separator=' ', strip=True) if article_content else "Could not find the article text."
    
    date_container = soup.find('div', class_='field--field-published-date--item')
    article_date_text = date_container.get_text(strip=True) if date_container else None
    article_date = find_date_in_text(article_date_text) if article_date_text else None

    return article_date, article_text

def strip_date_from_title(title_with_date):
    """Delete the date at the start of the title and any following newlines/spaces."""
    pattern = re.compile(r'^\d{1,2} [a-zA-Z]+ \d{4}\s*[\n\r\s]*')
    # Replace the matched date and following whitespace/newlines with an empty string
    title_without_date = re.sub(pattern, '', title_with_date).strip()
    return title_without_date

def crawl_articles(start_date, end_date, base_url, csv_filename, start_page=0, end_page=0):
    """Crawl the articles on page0 that were published between the start and end dates."""
    """Save the organisation name, article title, date, link and text to a CSV file."""
    """Delay 1 second between requests to be polite."""
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:  # 'a' mode for appending
        file.seek(0, os.SEEK_END)
        if file.tell() != 0:  # File is not empty
            file.write('\n')  # Ensure starts on a new line
        
        writer = csv.writer(file)

        for page in range(start_page, end_page + 1):
            print(f"Scraping page {page}...")
            time.sleep(1)
            page_url = f"{base_url}?page={page}" 
            response = requests.get(page_url)
            if response.status_code != 200:
                print(f"Failed to fetch {page_url}")
                continue
            soup = BeautifulSoup(response.text, 'html.parser')

            article_links = soup.findAll('div', {'class': 'mb-4'})

            for link in article_links:
                time.sleep(1)  # Polite delay between requests
                a_tag = link.find('a')
                if a_tag and a_tag['href']:
                    article_url = a_tag['href']
                    # Check if the URL is relative and prepend the base URL if necessary
                    if article_url.startswith('/'):
                        article_url = f"https://www.freedomfromtorture.org{article_url}"
                    article_date, article_text = scrape_article(article_url)
                                
                    if article_date and start_date <= article_date <= end_date:
                        article_title_with_date = link.text.strip()  # Original text containing both title and date
                        article_title = strip_date_from_title(article_title_with_date)  # Stripped title
                        writer.writerow(["Freedom from Torture", article_title, article_date.strftime('%d %B %Y'), article_url, article_text])
                        print(f"Saved article: {article_title}")


# execution
base_url = 'https://www.freedomfromtorture.org/news'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)



Scraping page 0...
Saved article: Supreme Court rules plan to send refugees to Rwanda ‘unlawful’
Saved article: Freedom from Torture’s statement on Israel and the Occupied Palestinian Territories
Saved article: Bibby Stockholm: Why refugees and torture survivors shouldn’t be housed on floating prisons
Saved article: My heart aches for young women imprisoned and suffering in Iran today
Saved article: Sunak’s heartless proposal to force refugees to live on barges is a mental and physical health catastrophe waiting to happen
Saved article: 'Illegal Migration' Act - Everything you need to know
Saved article: Refugee Ban Bill will effectively extinguish the right to seek asylum in the UK
Saved article: Where does torture happen around the world?
Saved article: What is torture?
Saved article: Plan to send refugees to Rwanda ‘unlawful’ – A vital win as the Court of Appeal rules on the Government’s plan
Saved article: Banned: A peaceful protest to stand up for refugees
Saved article: Freedom f

Scraping articles from **Rainbow Migration**:

In [24]:

def find_date_in_text(text):
    """Define and apply a regex pattern to extract the date form of 'date/month/year' from the text."""
    pattern = re.compile(r'[a-zA-Z]+ \d{1,2}, \d{4}')
    match = pattern.search(text)
    if match:
        return datetime.strptime(match.group(), '%d/%m/%Y')
    return None

def scrape_article(url):
    """Scrape the article text from the given URL."""
    """Investigate different tags and classes to find the main article text."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        return "", ""
    soup = BeautifulSoup(response.text, 'html.parser')
    
    article_text = ""
    
    # Check for 'post-content style-light double-bottom-padding' container
    main_content = soup.find('div', class_='post-content style-light double-bottom-padding')
    if main_content:
        # Extract all text from <p> and <h3> tags within this container
        for segment in main_content.find_all(['p', 'h3']):
            article_text += segment.get_text(strip=True) + " "
    
    # Check for 'uncode_text_column' containers
    uncode_columns = soup.findAll('div', class_='uncode_text_column')
    for column in uncode_columns:
        # Extract all text from <p> and <h3> tags within each 'uncode_text_column' container
        for segment in column.find_all(['p', 'h3']):
            article_text += segment.get_text(strip=True) + " "
    
    return article_text.strip()

def crawl_articles(start_date, end_date, base_url, csv_filename, start_page=3, end_page=11):
    """Crawl the articles between page3 and page11 that were published between the start and end dates."""
    """Save the organisation name, article title, date, link and text to a CSV file."""
    """Delay 1 second between requests to be polite."""
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:  # 'a' mode for appending
        file.seek(0, os.SEEK_END)
        if file.tell() != 0:  # File is not empty
            file.write('\n')  # Ensure starts on a new line
            
        writer = csv.writer(file)

        # Loop through the pages from start_page to end_page
        for page_num in range(start_page, end_page + 1):
            page_url = f"{base_url}?upage={page_num}"
            response = requests.get(page_url)
            if response.status_code != 200:
                print(f"Failed to fetch {page_url}")
                continue
            soup = BeautifulSoup(response.text, 'html.parser')
            article_links = soup.findAll('div', {'class': 't-entry'})

            # Loop through the article links on the page
            for link in article_links:
                time.sleep(1)
                a_tag = link.find('a')
                if not a_tag or 'href' not in a_tag.attrs:
                    continue
                article_url = a_tag['href']
                article_text = scrape_article(article_url)
                # Extract the date from the 't-entry-date' span if it exists
                date_span = link.find('span', class_='t-entry-date')
                if date_span:
                    date_text = date_span.text
                    article_date = find_date_in_text(date_text)
                else:
                    article_date = None

                if article_date and start_date <= article_date <= end_date:
                    article_title = a_tag.text.strip() 
                    writer.writerow(["Rainbow Migration", article_title, article_date.strftime('%d %B %Y'), article_url, article_text])
                    print(f"Saved article: {article_title}")

# execution
base_url = 'https://www.rainbowmigration.org.uk/news/'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Saved article: Stop the Rwanda Bill!
Saved article: “I will have to hide my identity in my own room”
Saved article: Apply for a trainee solicitor position at Wilson’s
Saved article: Joint civil society statement on the Supreme Court ruling on the Rwanda Plan
Saved article: Joint Statement: LGBTQI+ people seeking safety here will not be sent to Rwanda
Saved article: We are hiring: Legal and Support Services Assistant
Saved article: A video guide to intersex asylum claims
Saved article: LGBTQI+ people shouldn’t be moved to a floating prison
Saved article: Enough: Trans people are people too
Saved article: Letter to the PM: Respect the lives of LGBTQI+ people and women seeking asylum in the UK
Saved article: Our response to the Home Secretary who thinks “being gay isn’t reason enough for asylum”
Saved article: “To have our human rights respected and protected has changed so many lives forever.”
Saved article: Brook House Inquiry findings: A gay man faced verbal homophobic abuse and was ou

Scraping *latest news* from **Women for Refugee Women**:

In [30]:
def find_date_in_text(text):
    """Define and apply a regex pattern to extract the date from of 'month date, year' from the text."""
    pattern = re.compile(r'[a-zA-Z]+ \d{1,2}, \d{4}')
    match = pattern.search(text)
    if match:
        return datetime.strptime(match.group(), '%B %d, %Y')
    return None

def scrape_article(url):
    """Scrape the article text from the given URL."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        return ""
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the container div for the article text
    article_content = soup.find('div', class_='post-content style-light double-bottom-padding')
    if not article_content:
        print("Article content container not found.")
        return "Could not find the article text."
    
    # Extract and return all text within the container
    article_text = article_content.get_text(separator=" ", strip=True)
    return article_text

def crawl_articles(start_date, end_date, base_url, csv_filename):
    """Crawl the articles between the start and end dates."""
    """Save the organisation name, article title, date, link and text to a CSV file."""
    """Delay 1 second between requests to be polite."""
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:  # 'a' mode for appending
        file.seek(0, os.SEEK_END)
        if file.tell() != 0:  # File is not empty
            file.write('\n')  # Ensure starts on a new line
            
        writer = csv.writer(file)

        response = requests.get(base_url)
        if response.status_code != 200:
            print(f"Failed to fetch {base_url}")
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        article_links = soup.findAll('div', {'class': 't-entry'})

        for link in article_links:
            time.sleep(1)
            a_tag = link.find('a')
            if not a_tag or 'href' not in a_tag.attrs:
                continue
            article_url = a_tag['href']
            article_text = scrape_article(article_url)
            # Extract the date from the 't-entry-date' span if it exists
            date_span = link.find('span', class_='t-entry-date')
            if date_span:
                date_text = date_span.text
                article_date = find_date_in_text(date_text)
            else:
                article_date = None

            if article_date and start_date <= article_date <= end_date:
                article_title = a_tag.text.strip()
                writer.writerow(["Women for Refugee Women", article_title, article_date.strftime('%d %B %Y'), article_url, article_text])
                print(f"Saved article: {article_title}")

# execution
base_url = 'https://www.refugeewomen.co.uk/news/'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Saved article: The Supreme Court ruled the Rwanda Plan unlawful
Saved article: Suella Braverman’s speech: How it harms women and LGBTQ+ people
Saved article: A huge congratulations to Agnes and Loraine for winning the Pioneer 20 award!
Saved article: Passage of the ‘Illegal’ Migration Act
Saved article: Great news! The 72-hour time limit on the detention of pregnant women is maintained
Saved article: Putting Ourselves in the Picture: Rainbow Sisters Virtual Gallery!
Saved article: Campaign win! All legal advice surgeries in immigration detention must now take place face-to-face.
Saved article: Updated May 2023 – Joint briefing on the ‘Illegal Migration Bill’: Take action against the proposed new powers to detain pregnant women indefinitely
Saved article: We are recruiting a Campaigns and Advocacy Manager!
Saved article: See Us, Believe Us, Stand With Us
Saved article: Our Year: 2022


Scraping *blog articles* from **Women for Refugee Women**:

In [31]:
def find_date_in_text(text):
    """Define and apply a regex pattern to extract the date from of 'month date, year' from the text."""
    pattern = re.compile(r'[a-zA-Z]+ \d{1,2}, \d{4}')
    match = pattern.search(text)
    if match:
        return datetime.strptime(match.group(), '%B %d, %Y')
    return None

def scrape_article(url):
    """Scrape the article text from the given URL."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        return ""
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the container div for the article text
    article_content = soup.find('div', class_='post-content style-light double-bottom-padding')
    if not article_content:
        print("Article content container not found.")
        return "Could not find the article text."
    
    # Extract and return all text within the container. This includes text within <span> tags.
    article_text = article_content.get_text(separator=" ", strip=True)
    return article_text

def crawl_articles(start_date, end_date, base_url, csv_filename):
    """Crawl the articles between the start and end dates."""
    """Save the organisation name, article title, date, link and text to a CSV file."""
    """Delay 1 second between requests to be polite."""
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:  # 'a' mode for appending
        file.seek(0, os.SEEK_END)
        if file.tell() != 0:  # File is not empty
            file.write('\n')  # Ensure starts on a new line
            
        writer = csv.writer(file)

        response = requests.get(base_url)
        if response.status_code != 200:
            print(f"Failed to fetch {base_url}")
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        article_links = soup.findAll('div', {'class': 't-entry'})

        for link in article_links:
            time.sleep(1)
            a_tag = link.find('a')
            if not a_tag or 'href' not in a_tag.attrs:
                continue
            article_url = a_tag['href']
            article_text = scrape_article(article_url)
            # Extract the date from the 't-entry-date' span if it exists
            date_span = link.find('span', class_='t-entry-date')
            if date_span:
                date_text = date_span.text
                article_date = find_date_in_text(date_text)
            else:
                article_date = None

            if article_date and start_date <= article_date <= end_date:
                article_title = a_tag.text.strip()
                writer.writerow(["Women for Refugee Women", article_title, article_date.strftime('%d %B %Y'), article_url, article_text])
                print(f"Saved article: {article_title}")

# execution
base_url = 'https://www.refugeewomen.co.uk/news/blog/'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Article content container not found.
Saved article: Welcome Every Woman: A Festive Celebration
Saved article: Celebrating this year’s empowerment activities
Saved article: Hiba’s Story: Pride 2023
Saved article: Ange’s Story: Pride 2023
Saved article: A Collaboration between Women for Refugee Women and The Five Points Brewing Co.
Saved article: Guest blog: Our Mothers Ourselves
Saved article: We held our first in-person Christmas party since 2019


Scraping articles from **Young Roots**:

In [25]:
def find_date_in_text(text):
    """Define and apply a regex pattern to extract the date from of 'date/month/year' from the text."""
    pattern = re.compile(r'\b\d{2}/\d{2}/\d{4}\b')
    match = pattern.search(text)
    if match:
        return datetime.strptime(match.group(), '%d/%m/%Y')
    return None

def scrape_article(url):
    """Scrape the article text from the given URL."""
    # Concatenate base URL with the adjusted relative URL
    url = base_url.rstrip('/') + url
    # Remove any duplicate 'blog' in the URL
    url = url.replace('/blog/blog/', '/blog/')

    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        return ""
    soup = BeautifulSoup(response.text, 'html.parser')
    
    article_text = ""
    
    # Check for 'post-content style-light double-bottom-padding' container
    main_content = soup.find('div', class_='post-content style-light double-bottom-padding')
    if main_content:
        # Extract all text from <p> and <h3> tags within this container
        for segment in main_content.find_all(['p', 'h3']):
            article_text += segment.get_text(strip=True) + " "

def crawl_articles(start_date, end_date, base_url, csv_filename):
    """Crawl the articles between the start and end dates."""
    """Save the organisation name, article title, date, link and text to a CSV file."""
    """Delay 1 second between requests to be polite."""
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:  # 'a' mode for appending
        file.seek(0, os.SEEK_END)
        if file.tell() != 0:  # File is not empty
            file.write('\n')  # Ensure starts on a new line
            
        writer = csv.writer(file)

        response = requests.get(base_url)
        if response.status_code != 200:
            print(f"Failed to fetch {base_url}")
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        article_links = soup.findAll('div', {'class': 'blog-basic-grid--text'})

        for link in article_links:
            time.sleep(1)
            a_tag = link.find('a')
            if not a_tag or 'href' not in a_tag.attrs:
                continue
            article_url = a_tag['href']
            article_text = scrape_article(article_url)
            # Extract the date from the time class if it exists
            date_time = link.find('time', class_='blog-date')
            if date_time:
                date_text = date_time.text
                article_date = find_date_in_text(date_text)
            else:
                article_date = None

            if article_date and start_date <= article_date <= end_date:
                article_title = a_tag.text.strip()
                full_url = '≈' + article_url
                writer.writerow(["Young Roots", article_title, article_date.strftime('%d %B %Y'), full_url, article_text])
                print(f"Saved article: {article_title}")

# execution
base_url = 'https://www.youngroots.org.uk/blog'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Saved article: Young refugees are at unprecedented risk of homelessness
Saved article: Introducing our new Chief Executive, Paola Uccellari
Saved article: Young Roots Comedy Night


Scraping articles from **Sanctuary in Chichester**:

In [29]:
def scrape_article(url):
    """Scrape the article text from the given URL."""
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        article_content = soup.find('div', class_='entry-content')
        if article_content:
            return article_content.get_text(separator=" ", strip=True)
    print(f"Error or no content at {url}")
    return None

def generate_date_urls(start_date, end_date):
    """Generate URLs for each day between start_date and end_date."""
    current_date = start_date
    while current_date <= end_date:
        yield f"https://sanctuaryinchichester.org/{current_date.strftime('%Y/%m/%d/')}"
        current_date += timedelta(days=1)

def find_articles_on_page(url):
    """Fetch all article links from a given date URL."""
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = []
        for entry in soup.find_all('h2', class_='entry-title'):
            a_tag = entry.find('a', href=True)
            if a_tag and 'href' in a_tag.attrs:
                articles.append((a_tag['href'], a_tag.get_text(strip=True).replace(u'\xa0', u' ')))  # Replace non-breaking spaces
        return articles
    return []

def extract_date_from_url(url):
    """Extract the date from the article URL based on format '/YYYY/MM/DD/'."""
    date_match = re.search(r'/(\d{4}/\d{2}/\d{2})/', url)
    if date_match:
        return datetime.strptime(date_match.group(1), '%Y/%m/%d').strftime('%d %B %Y')
    return "Unknown Date"  # Fallback if no date found

def crawl_articles(start_date, end_date, csv_filename):
    """Crawl articles between the start and end dates."""
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:
        file.seek(0, os.SEEK_END)
        if file.tell() != 0:  # File is not empty
            file.write('\n')  # Ensure starts on a new line

        writer = csv.writer(file)
        for date_url in generate_date_urls(start_date, end_date):
            time.sleep(1)  # Delay to respect the server
            article_data = find_articles_on_page(date_url)
            for article_url, article_title in article_data:
                time.sleep(1)  # Respectful delay
                article_text = scrape_article(article_url)
                if article_text:
                    article_date = extract_date_from_url(article_url)
                    writer.writerow(["Sanctuary in Chichester", article_title, article_date, article_url, article_text])
                    print(f"Saved article: {article_title}")


csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', csv_filename)


Saved article: New schools liaison project
Saved article: Staff changes
Saved article: How Not to Drown – theatre fundraiser
Saved article: From Adversity to University
Saved article: Drop-in art
Saved article: 2023 New Year party
Saved article: Give Peace a Chance! – Benefit concert, featuring Ukrainian Volya choir
Saved article: New English support sessions for driving tests
Saved article: Sculptures and art cathedral exhibition celebrating resilience and values for Refugee Week
Saved article: Subjects of sculpture exhibition describe their experience of the project…
Saved article: A Culinary Journey: Exploring Syria Through Cuisine
Saved article: ‘Resilience in Clay’ Sculpture Subjects Describe Their Experiences at Cathedral Event
Saved article: Giving Back: Beneficiaries’ Empowerment through Volunteering with UK Harvest and St Wilfrid’s Hospice
Saved article: Celebrating Togetherness at Our Annual Summer Party &
Saved article: Report on Our Summer Intensive English Course
Saved art

Scraping articles from **STAR (Student Action for Refugees)**:

In [40]:
def find_date_in_text(text):
    """Define and apply a regex pattern to extract the date form of 'date month year' from the text."""
    pattern = re.compile(r'\d{1,2} [a-zA-Z]+ \d{4}')
    match = pattern.search(text)
    if match:
        return datetime.strptime(match.group(), '%d %B %Y')
    return None

def scrape_article(url):
    """Scrape the article text and title from the given URL."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        return None, "Could not fetch article"

    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract the title from the <h1> tag
    title_tag = soup.find('h1', class_='ast-advanced-headers-title')
    article_title = title_tag.get_text(strip=True) if title_tag else "Title Not Found"

    # Extract the content from the <div> class
    article_content = soup.find('div', {'class': 'entry-content'})
    article_text = article_content.get_text(separator=' ', strip=True) if article_content else "Could not find the article text."

    return article_text, article_title

def crawl_articles(start_date, end_date, base_url, csv_filename, start_page=1, end_page=3):
    """Crawl the articles from page 1 to 3 that were published between the start and end dates."""
    """Save the organisation name, article title, date, link and text to a CSV file."""
    """Delay 1 second between requests to be polite."""
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:  # 'a' mode for appending
        file.seek(0, os.SEEK_END)
        if file.tell() != 0:  # File is not empty
            file.write('\n')  # Ensure starts on a new line
        
        writer = csv.writer(file)

        for page in range(start_page, end_page + 1):
            page_url = f"{base_url}{page}/"
            response = requests.get(page_url)
            if response.status_code != 200:
                print(f"Failed to fetch {page_url}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            article_links = soup.findAll('div', {'class': 'post-content ast-grid-common-col'})

            for link in article_links:
                time.sleep(1)
                a_tag = link.find('a')
                if not a_tag or 'href' not in a_tag.attrs:
                    continue
                article_url = a_tag['href']
                article_text, article_title = scrape_article(article_url)
                # Extract the date from the span class if it exists
                date_time = link.find('span', class_='published')
                if date_time:
                    date_text = date_time.text
                    article_date = find_date_in_text(date_text)
                else:
                    article_date = None

                if article_date and start_date <= article_date <= end_date:
                    writer.writerow(["STAR", article_title, article_date, article_url, article_text])
                    print(f"Saved article: {article_title}")


# execution
base_url = 'https://star-network.org.uk/category/news/page/'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Saved article: Sign up for an info session to find out more about how to apply to university in the UK as a Refugee/Asylum seeker
Saved article: Open up safe routes to the UK for those seeking refuge from the Gaza/ Israel conflict
Saved article: No Dad Jokes Without Dad
Saved article: Why you should join STAR in 2023-2024
Saved article: Good news for refugees wanting to study in the UK
Saved article: STAR’s new stickers: A big thank you to FastPrint!
Saved article: Statement on the passage of the Illegal Migration Act
Saved article: University students and staff oppose the Illegal Migration Bill
Saved article: STAR’s AGM and National Meetup 2023!
Saved article: Refugee Week
Saved article: A 1,000 mile journey for STAR!
Saved article: Celebrating STARs this Volunteers Week – Jiayi
Saved article: Celebrating STARs this Volunteers Week – Peter
Saved article: Celebrating STARs this Volunteers Week – Shehany
Saved article: How to keep fighting the Refugee Ban Bill
Saved article: Looking for

Scraping articles from **Racial Justice Network**:

In [49]:
def find_date_in_text(text):
    """Define and apply a regex pattern to extract the date form of 'month/date/year' from the text."""
    pattern = re.compile(r'\b\d{2}/\d{2}/\d{4}\b')
    match = pattern.search(text)
    if match:
        return datetime.strptime(match.group(), '%m/%d/%Y')
    return None

def scrape_article(url):
    """Scrape the article text from the given URL."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        return ""
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the container div for the article text
    article_content = soup.find('div', class_='helpo_content_wrapper')
    if not article_content:
        print("Article content container not found.")
        return "Could not find the article text."
    
    # Extract and return all text within the container
    article_text = article_content.get_text(separator=" ", strip=True)
    return article_text

def crawl_articles(start_date, end_date, base_url, csv_filename):
    """Crawl the articles between the start and end dates."""
    """Save the organisation name, article title, date, link and text to a CSV file."""
    """Delay 1 second between requests to be polite."""
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:  # 'a' mode for appending
        file.seek(0, os.SEEK_END)
        if file.tell() != 0:  # File is not empty
            file.write('\n')  # Ensure starts on a new line
            
        writer = csv.writer(file)

        response = requests.get(base_url)
        if response.status_code != 200:
            print(f"Failed to fetch {base_url}")
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        article_links = soup.findAll('div', {'class': 'elementor-post__text'})

        for link in article_links:
            time.sleep(1)
            a_tag = link.find('a')
            if not a_tag or 'href' not in a_tag.attrs:
                continue
            article_url = a_tag['href']
            article_text = scrape_article(article_url)
            # Extract the date from the span class if it exists
            date_time = link.find('span', class_='elementor-post-date')
            if date_time:
                date_text = date_time.text
                article_date = find_date_in_text(date_text)
            else:
                article_date = None

            if article_date and start_date <= article_date <= end_date:
                article_title = a_tag.text.strip()
                writer.writerow(["Racial Justice Network", article_title, article_date.strftime('%d %B %Y'), article_url, article_text])
                print(f"Saved article: {article_title}")

# execution
base_url = 'https://racialjusticenetwork.co.uk/blogs-and-articles/'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Saved article: Exploring Racial Justice: The ‘Rwanda Policy’ Debacle and the Constraints of Our Electoral-Democratic System
Saved article: Standing in solidarity with our siblings in Palestine, Sudan, the Democratic Republic of Congo and Yemen, against continued colonisation
Saved article: Annual Gather Up Celebration 2023
Saved article: Celebrating and Reflecting – International Symposium with Elder Ngũgĩ wa Thiong’o
Saved article: Press Release – International Symposium
Saved article: URC Reflections: Wealth Distribution
Saved article: Stop The Data Discrimination:
Saved article: Reflections on Unlearning Racism
Saved article: The Bounce Back: Storytelling as a Form of Resilience
Saved article: Resisting The Migration Bill
Saved article: EVENT: Solidarity With Congo
Saved article: Event; Stop The Bill
Saved article: The Hola Massacre: the last straw that toppled colonial Kenya
Saved article: Celebrating 7 years of RJN
Saved article: Stop the Criminalisation of Our Earth Defenders!


Scraping articles from **Hastings Community of Sanctuary**:

In [4]:
def scrape_article(url):
    """Scrape the article text from the given URL."""
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        article_content = soup.find('section', class_='entry-content clearfix')
        if article_content:
            return article_content.get_text(separator=" ", strip=True)
    print(f"Error or no content at {url}")
    return None

def generate_date_urls(start_date, end_date):
    """Generate URLs for each day between start_date and end_date."""
    current_date = start_date
    while current_date <= end_date:
        yield f"https://hastings.cityofsanctuary.org/{current_date.strftime('%Y/%m/%d/')}"
        current_date += timedelta(days=1)

def find_articles_on_page(url):
    """Fetch all article links from a given URL."""
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = []
        article_links = soup.find_all('a', class_='post-link')
        for article_link in article_links:
            # The title is within an h4 tag, nested inside a div with class 'content'
            title_tag = article_link.find('div', class_='content').find('h4', class_='title')
            title = title_tag.get_text(strip=True) if title_tag else 'No Title Found'

            # The link is the href attribute of the a tag
            link = article_link.get('href')

            if link:  # Only append if there's a valid link
                articles.append((link, title))
            else:
                print(f"Missing link for article titled '{title}' at {url}")

        return articles
    else:
        return []  # Return an empty list if the status code is not 200

def extract_date_from_url(url):
    """Extract the date from the article URL based on format '/YYYY/MM/DD/'."""
    date_match = re.search(r'/(\d{4}/\d{2}/\d{2})/', url)
    if date_match:
        return datetime.strptime(date_match.group(1), '%Y/%m/%d').strftime('%d %B %Y')
    return "Unknown Date"  # Fallback if no date found

def crawl_articles(start_date, end_date, csv_filename):
    """Crawl articles between the start and end dates."""
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:
        file.seek(0, os.SEEK_END)
        if file.tell() != 0:  # File is not empty
            file.write('\n')  # Ensure starts on a new line

        writer = csv.writer(file)
        for date_url in generate_date_urls(start_date, end_date):
            time.sleep(1)  # Delay to respect the server
            article_data = find_articles_on_page(date_url)
            for article_url, article_title in article_data:
                time.sleep(1)  # Respectful delay
                article_text = scrape_article(article_url)
                if article_text:
                    article_date = extract_date_from_url(article_url)
                    writer.writerow(["Hastings Community of Sanctuary", article_title, article_date, article_url, article_text])
                    print(f"Saved article: {article_title}")


csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', csv_filename)

Saved article: Joint Statement on Northeye: 4th September 2023
Saved article: Why we oppose the plans for Northeye
Saved article: East Sussex Library & Information Service gains Sanctuary Award


Scraping articles from **Scottish Refugee Council**:

In [82]:
def find_date_in_text(text):
    """Define and apply a regex pattern to extract the date in the format 'month date, year' from the text."""
    pattern = re.compile(r'\b([a-zA-Z]+) (\d{1,2}), (\d{4})\b')
    match = pattern.search(text)
    if match:
        date_str = f"{match.group(1)} {match.group(2)}, {match.group(3)}"  # Reformats the date to 'Month day, Year'
        return datetime.strptime(date_str, '%B %d, %Y')
    return None

def scrape_article(url):
    """Scrape the article text and publication date from the given URL."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        return None, "", ""

    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract the main article content
    article_content = soup.find('article', {'class': 'post-content'})
    article_text = article_content.get_text(separator=' ', strip=True) if article_content else "Could not find the article text."

    # Extract the article title
    title_container = soup.find('div', class_='my-3')
    article_title = title_container.find('h1').get_text(strip=True) if title_container and title_container.find('h1') else "Title Not Found"

    # Locate the date container, targeting the structure provided
    date_container = soup.find('h6', class_='contforcat')
    if date_container:
        article_date_text = date_container.get_text(strip=True)
        # Remove the prefix "Posted on " to isolate the date
        article_date_text = article_date_text.replace('Posted on ', '')
        article_date = find_date_in_text(article_date_text)
    else:
        article_date = None

    return article_date, article_text, article_title

def crawl_articles(start_date, end_date, base_url, csv_filename, start_page=2, end_page=9):
    """Crawl the articles from page 2 to 9 that were published between the start and end dates."""
    """Save the organisation name, article title, date, link and text to a CSV file."""
    """Delay 1 second between requests to be polite."""
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:  # 'a' mode for appending
        file.seek(0, os.SEEK_END)
        if file.tell() != 0:  # File is not empty
            file.write('\n')  # Ensure starts on a new line
        
        writer = csv.writer(file)

        for page in range(start_page, end_page + 1):
            print(f"Scraping page {page}...")
            time.sleep(1)
            page_url = f"{base_url}page/{page}" 
            response = requests.get(page_url)
            if response.status_code != 200:
                print(f"Failed to fetch {page_url}")
                continue
            soup = BeautifulSoup(response.text, 'html.parser')

            article_links = soup.findAll('div', {'class': 'card'})

            for link in article_links:
                time.sleep(1)  # Polite delay between requests
                a_tag = link.find('a')
                if a_tag and a_tag['href']:
                    article_url = a_tag['href']
                    article_date, article_text, article_title = scrape_article(article_url)
                                
                    if article_date and start_date <= article_date <= end_date:
                        writer.writerow(["Scottish Refugee Council", article_title, article_date.strftime('%d %B %Y'), article_url, article_text])
                        print(f"Saved article: {article_title}")


# execution
base_url = 'https://scottishrefugeecouncil.org.uk/news/'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Scraping page 2...
Saved article: Rwanda Bill: We urge MPs to reject legislation
Saved article: Fair Begins Here: it’s time to make the asylum system work for everyone
Saved article: Refugee Councils chosen for Guardian Charity Appeal
Saved article: Housing emergency must end
Saved article: Help refugees in need this winter
Saved article: Rise: Refugee Festival Scotland 2024
Saved article: Empowering Practitioners: Meet Our New Training Officer
Scraping page 3...
Saved article: Join us to celebrate Aref Ghorbani’s new EP No More Shadows
Saved article: Rwanda ruling: Supreme Court rules against UK government
Saved article: Hopes increase for free bus travel for people seeking asylum
Saved article: Hotels are no place for children to live – calls to stop housing families and children in hotel rooms
Saved article: Human rights and asylum in Scotland – government must act on findings in new report
Saved article: A helping hand for families
Saved article: Supporting people in crisis
Saved a

Scraping *blogs* from **Jesuit Refugee Service UK**:

In [11]:
def find_date_in_text(text):
    """Define and apply a regex pattern to extract the date in the format 'date month year' from the text."""
    pattern = re.compile(r'\d{1,2} [a-zA-Z]+ \d{4}')
    match = pattern.search(text)
    if match:
        return datetime.strptime(match.group(), '%d %B %Y')
    return None

def scrape_article(url):
    """Scrape the article text and publication date from the given URL."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        return None, ""

    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract the main article content
    article_content = soup.find('div', {'class': 'col100 singlearticletext'})
    if article_content:
        # Retrieve all text content from <p> tags within the article content
        paragraphs = article_content.find_all('p')
        article_text = ' '.join(p.get_text(strip=True) for p in paragraphs) if paragraphs else "Could not find any text content."
    else:
        article_text = "Could not find the article text."

    # Extract the article title from the designated div class and h1
    title_container = soup.find('div', class_='pageheadercolumn1row1content whitebg posrel')
    article_title = title_container.find('h1').get_text(strip=True) if title_container and title_container.find('h1') else "Title Not Found"

    return article_text, article_title

def crawl_articles(start_date, end_date, base_url, csv_filename, start_page=2, end_page=7):
    """Crawl the articles from page 2 to page 7 that were published between the start and end dates."""
    """Save the organisation name, article title, date, link and text to a CSV file."""
    """Delay 1 second between requests to be polite."""
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:  # 'a' mode for appending
        file.seek(0, os.SEEK_END)
        if file.tell() != 0:  # File is not empty
            file.write('\n')  # Ensure starts on a new line
        
        writer = csv.writer(file)

        for page in range(start_page, end_page + 1):
            print(f"Scraping page {page}...")
            time.sleep(1)
            page_url = f"{base_url}page/{page}" 
            response = requests.get(page_url)
            if response.status_code != 200:
                print(f"Failed to fetch {page_url}")
                continue
            soup = BeautifulSoup(response.text, 'html.parser')

            article_links = soup.findAll('div', {'class': 'newsitem lightbluebg'})

            for link in article_links:
                time.sleep(1)  # Polite delay between requests
                # Find the 'newsitemtext' container within the 'newsitem lightbluebg' block
                newsitem_text = link.find('div', class_='newsitemtext')
                
                if newsitem_text:
                    # Find 'a' tag within 'h2' which is inside 'newsitemtext'
                    a_tag = newsitem_text.find('h2').find('a') if newsitem_text.find('h2') else None
                    if a_tag and 'href' in a_tag.attrs:
                        article_url = a_tag['href']
                        article_text, article_title = scrape_article(article_url)
                        # Output or processing logic for the fetched article details

                    # Find 'h4' within 'newsitemtext' for the date
                    date_h4 = newsitem_text.find('h4')
                    if date_h4:
                        date_text = date_h4.get_text(strip=True)
                        article_date = find_date_in_text(date_text)
                    else:
                        article_date = None
                                
                    if article_date and start_date <= article_date <= end_date:
                        writer.writerow(["Jesuit Refugee Service UK", article_title, article_date.strftime('%d %B %Y'), article_url, article_text])
                        print(f"Saved article: {article_title}")


# execution
base_url = 'https://www.jrsuk.net/blog/'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Scraping page 2...
Saved article: At Home with the Jesuit Young Adult Ministry
Saved article: Two years of impactful collaboration: JRS & World Medicine
Saved article: “I’ve discovered friendship, resilience, good humour and triumph over adversity”
Saved article: “Step back into time”
Saved article: Volunteering with Acupuncture
Saved article: £10 to stay connected
Saved article: Why it’s important to observe World Day of Migrants and Refugees
Saved article: Two months and already feeling At Home…
Saved article: “To escape from our tumultuous lives and spend some time with Mother Nature”
Scraping page 3...
Saved article: Countering the hostile environment through hospitality
Saved article: “It is better to light a candle than to curse the darkness”
Saved article: My experience at JRS was nothing short of amazing, invaluable, and extremely eye opening.
Saved article: From the Himalayas to Wapping with love
Saved article: Creating community with the help of volunteers
Saved article: “A d

Scraping *news* from **Jesuit Refugee Service UK**:

In [12]:
def find_date_in_text(text):
    """Define and apply a regex pattern to extract the date in the format 'date month year' from the text."""
    pattern = re.compile(r'\d{1,2} [a-zA-Z]+ \d{4}')
    match = pattern.search(text)
    if match:
        return datetime.strptime(match.group(), '%d %B %Y')
    return None

def scrape_article(url):
    """Scrape the article text and publication date from the given URL."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        return None, ""

    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract the main article content
    article_content = soup.find('div', {'class': 'col100 singlearticletext'})
    if article_content:
        # Retrieve all text content from <p> tags within the article content
        paragraphs = article_content.find_all('p')
        article_text = ' '.join(p.get_text(strip=True) for p in paragraphs) if paragraphs else "Could not find any text content."
    else:
        article_text = "Could not find the article text."

    # Extract the article title from the designated div class and h1
    title_container = soup.find('div', class_='pageheadercolumn1row1content whitebg posrel')
    article_title = title_container.find('h1').get_text(strip=True) if title_container and title_container.find('h1') else "Title Not Found"

    return article_text, article_title

def crawl_articles(start_date, end_date, base_url, csv_filename, start_page=1, end_page=4):
    """Crawl the articles from page 1 to page 4 that were published between the start and end dates."""
    """Save the organisation name, article title, date, link and text to a CSV file."""
    """Delay 1 second between requests to be polite."""
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:  # 'a' mode for appending
        file.seek(0, os.SEEK_END)
        if file.tell() != 0:  # File is not empty
            file.write('\n')  # Ensure starts on a new line
        
        writer = csv.writer(file)

        for page in range(start_page, end_page + 1):
            print(f"Scraping page {page}...")
            time.sleep(1)
            page_url = f"{base_url}page/{page}" 
            response = requests.get(page_url)
            if response.status_code != 200:
                print(f"Failed to fetch {page_url}")
                continue
            soup = BeautifulSoup(response.text, 'html.parser')

            article_links = soup.findAll('div', {'class': 'newsitem blue3bg whitetext'})

            for link in article_links:
                time.sleep(1)  # Polite delay between requests
                # Find the 'newsitemtext' container within the 'newsitem blue3bg whitetext' block
                newsitem_text = link.find('div', class_='newsitemtext')
                
                if newsitem_text:
                    # Find 'a' tag within 'h2' which is inside 'newsitemtext'
                    a_tag = newsitem_text.find('h2').find('a') if newsitem_text.find('h2') else None
                    if a_tag and 'href' in a_tag.attrs:
                        article_url = a_tag['href']
                        article_text, article_title = scrape_article(article_url)
                        # Output or processing logic for the fetched article details

                    # Find 'h4' within 'newsitemtext' for the date
                    date_h4 = newsitem_text.find('h4')
                    if date_h4:
                        date_text = date_h4.get_text(strip=True)
                        article_date = find_date_in_text(date_text)
                    else:
                        article_date = None
                                
                    if article_date and start_date <= article_date <= end_date:
                        writer.writerow(["Jesuit Refugee Service UK", article_title, article_date.strftime('%d %B %Y'), article_url, article_text])
                        print(f"Saved article: {article_title}")


# execution
base_url = 'https://www.jrsuk.net/news/'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Scraping page 1...
Saved article: JRS UK calls for Bibby Stockholm to be closed following reported suicide
Scraping page 2...
Saved article: JRS UK welcomes Supreme Court Ruling on Rwanda
Saved article: Write a Christmas Card for people held in immigration detention
Saved article: Advent Service 2023
Saved article: JRS UK Opens Amani House for Male Refugee Friends
Saved article: JRS responds to the reports of the contents of the Home Secretary’s speech to the American Enterprise Institute later today
Saved article: The Jesuit Refugee Service (JRS) UK renews calls for an end immigration detention in wake of report on abuse in Brook House IRC
Saved article: People have now been moved onto the Bibby Stockholm
Saved article: Illegal Migration Bill becomes UK Law
Saved article: Rwanda Scheme found to be UNLAWFUL!
Saved article: “My thoughts are my secret power”: You can see me, but I don’t exist
Scraping page 3...
Saved article: JRS UK renews calls for government to abandon Illegal Migratio

Scraping articles from **ECPAT UK**:

In [34]:
def find_date_in_html_element(soup):
    """Extracts the date from a 'time' HTML element's 'datetime' attribute within the soup."""
    date_element = soup.find('time', id='time')
    if date_element and 'datetime' in date_element.attrs:
        try:
            date = datetime.strptime(date_element['datetime'], '%Y-%m-%d').strftime('%d %B %Y')
            return date
        except ValueError:
            return None
    return None

def scrape_article(url):
    """Scrape the article text from the given URL."""
    # Ensure URL starts with http:// or https://
    if not url.startswith(('http://', 'https://')):
        url = base_url.rstrip('/') + '/' + url.lstrip('/')
    
    # Correcting potential duplicate path issues
    url = url.replace('/news/news/', '/news/')
    
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching article: {url}, Status Code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    article_content = soup.find('div', class_='content postContent newsContent')
    if not article_content:
        print("Article content container not found.")
        return None

    return article_content.get_text(separator=" ", strip=True)


def crawl_articles(start_date, end_date, base_url, csv_filename):
    """Crawl the articles between the start and end dates."""
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:  # 'a' mode for appending
        file.seek(0, os.SEEK_END)
        if file.tell() != 0:  # File is not empty
            file.write('\n')  # Ensure starts on a new line
            
        writer = csv.writer(file)
        response = requests.get(base_url)
        if response.status_code != 200:
            print(f"Failed to fetch {base_url}")
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.findAll('div', {'class': 'listedPostText'})

        for block in articles:
            a_tag = block.find('a')
            if not a_tag or 'href' not in a_tag.attrs:
                continue

            article_url = a_tag['href']
            article_text = scrape_article(article_url)
            article_title = a_tag.get_text(strip=True)

            # Attempt to find the date element by navigating to the common parent
            details_wrapper = block.parent.find('div', class_='publishDetailsWrapper')
            if details_wrapper:
                article_date = find_date_in_html_element(details_wrapper)
            else:
                article_date = None

            if article_date:
                article_date_obj = datetime.strptime(article_date, '%d %B %Y')
                if start_date <= article_date_obj <= end_date:
                    full_url = 'https://www.ecpat.org.uk' + article_url
                    writer.writerow(["ECPAT UK", article_title, article_date, full_url, article_text])
                    print(f"Saved article: {article_title}")

# execution
base_url = 'https://www.ecpat.org.uk/news'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Saved article: Home Affairs Committee speaks out against Home Office treatment of migrant children and calls for increased care for child victims of trafficking in new report.
Saved article: Judge rules that the Home Secretary acted irrationally in the case brought by Kent County Council on the operation of the National Transfer Scheme
Saved article: King’s Speech announces new laws to tackle online child exploitation whilst reconfirming the UK government’s desire to remove the rights of child victims
Saved article: ECPAT UK welcomes the announcement of a new Independent Anti-Slavery Commissioner
Saved article: High Court orders Home Office to transfer unaccompanied children from hotels into care
Saved article: High Court retains oversight of the treatment of unaccompanied children
Saved article: ECPAT UK wins legal challenge on the unlawful accommodation of unaccompanied children
Saved article: ECPAT UK joined court hearing for its legal challenge to Kent County Council and the Home S

Scraping articles from **Migrants Organise**:

In [73]:
def find_date_in_text(text):
    """Define and apply a regex pattern to extract the date in the format 'Month dayth, year' from the text."""
    # Updated regex to handle different day suffixes and potential extra spaces
    pattern = re.compile(r'\b([a-zA-Z]+)\s+(\d{1,2})(st|nd|rd|th),\s+(\d{4})\b')
    match = pattern.search(text)
    if match:
        month, day, _, year = match.groups()
        # Construct date string without the ordinal suffix and handle possible leading zeros in day
        date_str = f"{month} {int(day)} {year}"
        # Parse the date in the desired output format
        try:
            parsed_date = datetime.strptime(date_str, '%B %d %Y').strftime('%d %B %Y')
            return parsed_date
        except ValueError as e:
            print(f"Date parsing error: {e} with date_str: {date_str}")
    else:
        print(f"No date match found in text: {text}")
    return None

def scrape_article(url):
    """Scrape the article text and publication date from the given URL."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        return None, ""

    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract the main article content
    article_content = soup.find('div', {'class': 'entry-content'})
    article_text = article_content.get_text(separator=' ', strip=True) if article_content else "Could not find the article text."

    # Extract the article title from the designated header class
    title_container = soup.find('h1', class_='entry-title')
    article_title = title_container.get_text(strip=True) if title_container else "Title Not Found"

    return article_text, article_title

def crawl_articles(start_date, end_date, base_url, csv_filename, start_page=2, end_page=5):
    """Crawl the articles from page 2 to page 5 that were published between the start and end dates."""
    """Save the organisation name, article title, date, link and text to a CSV file."""
    """Delay 1 second between requests to be polite."""
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:  # 'a' mode for appending
        file.seek(0, os.SEEK_END)
        if file.tell() != 0:  # File is not empty
            file.write('\n')  # Ensure starts on a new line
        
        writer = csv.writer(file)

        for page in range(start_page, end_page + 1):
            page_url = f"{base_url}page/{page}"
            print(f"Scraping page {page} at {page_url}")
            response = requests.get(page_url)
            if response.status_code != 200:
                print(f"Failed to fetch {page_url}")
                continue
            soup = BeautifulSoup(response.text, 'html.parser')
            article_links = soup.findAll('div', class_='card-body p-0 latest-news')

            for link in article_links:
                time.sleep(1)
                h4 = link.find('h4')
                if not h4:
                    print("No h4 element found within article link div.")
                    continue
                a_tag = h4.find('a')
                if not a_tag or 'href' not in a_tag.attrs:
                    print("No valid a tag found or missing href.")
                    continue
                article_url = a_tag['href']
                print(f"Scraping article at {article_url}")

                article_text, article_title = scrape_article(article_url)
                date_container = link.find_next_sibling('div', class_='card-footer px-0')
                if not date_container:
                    print("No date container found for this article.")
                    continue

                date_text = date_container.get_text(strip=True)
                article_date = find_date_in_text(date_text)
                if not article_date:
                    print("No valid date extracted.")
                    continue

                if article_date and start_date <= datetime.strptime(article_date, '%d %B %Y') <= end_date:
                    writer.writerow(["Migrants Organise", article_title, article_date, article_url, article_text])
                    print(f"Saved article: {article_title}")


# execution
base_url = 'https://www.migrantsorganise.org/news/'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Scraping page 2 at https://www.migrantsorganise.org/news/page/2
Scraping article at https://www.migrantsorganise.org/homes4all/
Scraping article at https://www.migrantsorganise.org/hope-a-migrants-organise-exhibition/
Scraping article at https://www.migrantsorganise.org/group-activities-nourishment-and-care/
Scraping article at https://www.migrantsorganise.org/our-2023-round-up/
Scraping article at https://www.migrantsorganise.org/wewontstop/
Scraping article at https://www.migrantsorganise.org/new-guide-challenging-notice-to-move-to-the-bibby-stockholm-barge/
Saved article: New Guide: Challenging Notice to Move to the Bibby Stockholm Barge
Scraping article at https://www.migrantsorganise.org/celebrating-30-years-of-migrants-organise/
Saved article: Celebrating 30 Years of Migrants Organise
Scraping article at https://www.migrantsorganise.org/endcolonialcomplicity-palestine-solidarity-statement/
Saved article: #EndColonialComplicity: Palestine Solidarity Statement
Scraping article at h

Scraping articles from **Safe Passage**:

In [95]:
def find_date_in_text(text):
    """Define and apply a regex pattern to extract the date in the format 'day Month year'."""
    pattern = re.compile(r'\b(\d{1,2})\s([A-Z][a-z]+)\s(\d{4})\b')
    match = pattern.search(text)
    if match:
        # Format the date as 'day Month Year' and return
        return datetime.strptime(match.group(0), '%d %B %Y').strftime('%d %B %Y')
    return None

def scrape_article(url):
    """Scrape the article text from the given URL."""
    if not url.startswith(('http://', 'https://')):
        url = base_url.rstrip('/') + '/' + url.lstrip('/')
    # Correcting potential duplicate path issues
        url = url.replace('/news/news/', '/news/')
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        return ""
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the container div for the article text
    article_content = soup.find('div', class_='sqs-layout sqs-grid-12 columns-12')
    if not article_content:
        print("Article content container not found.")
        return "Could not find the article text."
    
    # Extract and return all text within the container
    article_text = article_content.get_text(separator=" ", strip=True)
    return article_text

def crawl_articles(start_date, end_date, base_url, csv_filename):
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:
        if file.tell() != 0:  # Check if the file is not empty
            file.write('\n')  # Ensure starts on a new line
        writer = csv.writer(file)

        response = requests.get(base_url)
        if response.status_code != 200:
            print(f"Failed to fetch {base_url}")
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        article_links = soup.findAll('a', {'class': 'BlogList-item-title'})

        for a_tag in article_links:
            article_title = a_tag.get_text(strip=True)
            article_url = a_tag['href']

            article_title = a_tag.text.strip()
            full_url = 'https://www.safepassage.org.uk' + article_url

            date_tag = a_tag.find_next('time', class_='Blog-meta-item--date')
            if date_tag:
                date_text = date_tag.get_text(strip=True)
                article_date = find_date_in_text(date_text)
                if article_date and start_date <= datetime.strptime(article_date, '%d %B %Y') <= end_date:
                    article_text = scrape_article(full_url)
                    writer.writerow(["Safe Passage", article_title, article_date, full_url, article_text])
                    print(f"Saved article: {article_title}")
            else:
                print("No date tag found for this article.")

# execution
base_url = 'https://www.safepassage.org.uk/news/'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Saved article: A #MessageInABottle for refugees
Saved article: Our reaction to supreme court ruling on Rwanda plan


Scrape articles from **Safe Passage**:

In [98]:
def find_date_in_text(text):
    """Define and apply a regex pattern to extract the date in the format 'day Month year'."""
    pattern = re.compile(r'\b(\d{1,2})\s([A-Z][a-z]+)\s(\d{4})\b')
    match = pattern.search(text)
    if match:
        # Format the date as 'day Month Year' and return
        return datetime.strptime(match.group(0), '%d %B %Y').strftime('%d %B %Y')
    return None

def scrape_article(url):
    """Scrape the artitextcle  from the given URL."""
    base_url_cleaned = base_url.split('?')[0]  # Remove query parameters
    url = base_url_cleaned + url
    # Correcting potential duplicate path issues
    url = url.replace('/news/news/', '/news/')
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        return ""
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the container div for the article text
    article_content = soup.find('div', class_='sqs-layout sqs-grid-12 columns-12')
    if not article_content:
        print("Article content container not found.")
        return "Could not find the article text."
    
    # Extract and return all text within the container
    article_text = article_content.get_text(separator=" ", strip=True)
    return article_text

def crawl_articles(start_date, end_date, base_url, csv_filename):
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:
        if file.tell() != 0:  # Check if the file is not empty
            file.write('\n')  # Ensure starts on a new line
        writer = csv.writer(file)

        response = requests.get(base_url)
        if response.status_code != 200:
            print(f"Failed to fetch {base_url}")
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        article_links = soup.findAll('a', {'class': 'BlogList-item-title'})

        for a_tag in article_links:
            article_title = a_tag.get_text(strip=True)
            article_url = a_tag['href']

            article_title = a_tag.text.strip()
            full_url = 'https://www.safepassage.org.uk' + article_url

            date_tag = a_tag.find_next('time', class_='Blog-meta-item--date')
            if date_tag:
                date_text = date_tag.get_text(strip=True)
                article_date = find_date_in_text(date_text)
                if article_date and start_date <= datetime.strptime(article_date, '%d %B %Y') <= end_date:
                    article_text = scrape_article(article_url)
                    writer.writerow(["Safe Passage", article_title, article_date, full_url, article_text])
                    print(f"Saved article: {article_title}")
            else:
                print("No date tag found for this article.")

# execution
base_url = 'https://www.safepassage.org.uk/news?offset=1700045454524'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Saved article: Statement on the Palestine/Israel conflict
Saved article: Empty promises, shattered lives: the truth about Afghan resettlement schemes
Saved article: Routes to safety: a new approach to people crossing the Channel
Saved article: Safe Passage responds to Suella Braverman's statement on refugee protection
Saved article: Responding to Labour's promise to restore the right to asylum
Saved article: More loss of life at sea in search of safety
Saved article: Responding to tragic loss of life in the Channel
Saved article: Our response to the Illegal Migration Bill becoming law
Saved article: Our response to tragic shipwreck off Greece
Saved article: Announcement on leadership at Safe Passage


In [99]:
base_url = 'https://www.safepassage.org.uk/news?offset=1685633008001'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Saved article: Government's new Bill to stop Channel crossings will not work
Saved article: Campaigning to #ReuniteAfghanFamilies
Saved article: Dear Prime Minister, help Afghan families reunite
Saved article: Campaigner Spotlight: Farhad's Dissertation


Scraping articles from **Kent Refugee Action Network**:

In [104]:
def find_date_in_text(text):
    """Define and apply a regex pattern to extract the date from of 'date/month/year' from the text."""
    pattern = re.compile(r'\b\d{2}/\d{2}/\d{4}\b')
    match = pattern.search(text)
    if match:
        return datetime.strptime(match.group(), '%d/%m/%Y').strftime('%d %B %Y')
    return None

def scrape_article(url):
    """Scrape the artitextcle  from the given URL."""
    base_url_cleaned = base_url.split('?')[0]  # Remove query parameters
    url = base_url_cleaned + url
    # Correcting potential duplicate path issues
    url = url.replace('/lateststories/lateststories/', '/lateststories/')
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching article: {url}")
        return ""
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the container div for the article text
    article_content = soup.find('div', class_='sqs-html-content')
    if not article_content:
        print("Article content container not found.")
        return "Could not find the article text."
    
    # Extract and return all text within the container. This includes text within <span> tags.
    article_text = article_content.get_text(separator=" ", strip=True)
    return article_text

def crawl_articles(start_date, end_date, base_url, csv_filename):
    start_date = datetime.strptime(start_date, '%d %B %Y')
    end_date = datetime.strptime(end_date, '%d %B %Y')

    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:
        if file.tell() != 0:  # Check if the file is not empty
            file.write('\n')  # Ensure starts on a new line
        writer = csv.writer(file)

        response = requests.get(base_url)
        if response.status_code != 200:
            print(f"Failed to fetch {base_url}")
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        article_links = soup.findAll('div', {'class': 'blog-item-text'})

        for link in article_links:
            time.sleep(2)
            h1_tag = link.find('h1', class_='blog-title')  # Assuming this is where the link is
            if h1_tag:
                a_tag = h1_tag.find('a')
                if a_tag and 'href' in a_tag.attrs:
                    article_url = a_tag['href']
                    full_url = 'https://kran.org.uk' + article_url
                    article_title = a_tag.get_text(strip=True)

                    # Finding the date in the neighboring meta section
                    meta_section = link.parent.find('div', class_='blog-meta-section')
                    if meta_section:
                        date_tag = meta_section.find('time', class_='blog-date')
                        if date_tag:
                            date_text = date_tag.get_text(strip=True)
                            article_date = find_date_in_text(date_text)
                            if article_date and start_date <= datetime.strptime(article_date, '%d %B %Y') <= end_date:
                                article_text = scrape_article(article_url)
                                writer.writerow(["Kent Refugee Action Network", article_title, article_date, full_url, article_text])
                                print(f"Saved article: {article_title}")
                        else:
                            print("No date tag found for this article.")
                    else:
                        print("No meta section found for this article.")

# execution
base_url = 'https://kran.org.uk/lateststories?offset=1702996817564'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Saved article: making it all worthwhile
Saved article: Leading the way to achievement
Saved article: more than just a mentor
Saved article: statement on RWANDA ruling
Saved article: “kindness is WHAT kran DOES”
Saved article: WHY collaboration is key
Saved article: struggle hits home
Saved article: from couch to 26.2 miles!
Saved article: DOING IT FOR OURSELVES
Saved article: loach: hope not hate
Saved article: gone but never forgotten
Saved article: Giving peace a chance
Saved article: degree dream come true
Saved article: wheelie big success!
Saved article: DoAA means business
Saved article: sharing Dr King’s dream
Saved article: No end to their talents!


In [105]:
base_url = 'https://kran.org.uk/lateststories?offset=1692700106397'
csv_filename = '/Users/yijingxiao/Desktop/ASDS dissertation/dissertation-data/article_text.csv'
crawl_articles('13 December 2022', '12 December 2023', base_url, csv_filename)

Saved article: looking beyond labels
Saved article: be part of real change
Saved article: left with no choice
Saved article: there is always hope
Saved article: bill’s DEVASTATING IMPACT
Saved article: first-class time at kran
Saved article: this journey is difficult
Saved article: new life and hope
Saved article: helping to reignite hope
Saved article: noteworthy partnership
Saved article: One team, one dream
Saved article: Mental health matters
Saved article: Happy Holidays!
Saved article: KRAN Bike Project
