Student id - 24205067  
Dataset - Dublin Rentals - http://mlg.ucd.ie/modules/python/assignment1/rental/index.html

In [20]:
import urllib.request
import time
import csv
from bs4 import BeautifulSoup

### Prepare the URL for web scraping -

In [21]:
# Root address under which every rental page of the dataset is served
BASE_URL = 'http://mlg.ucd.ie/modules/python/assignment1/rental/'
# Landing page that links to the first page of each quarter
MAIN_PAGE_URL = f'{BASE_URL}index.html'

### Use `urllib` to fetch a page and return its HTML content as a string -

In [22]:
def get_html(url, *, timeout=30):
    """Download the resource at *url* and return its body as text.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.
        timeout: Seconds to wait on blocking network operations before
            giving up, so a stalled server cannot hang the whole scrape.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the resource cannot be retrieved.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the connection even when read/decode fails,
    # which matters because this helper is called once per scraped page.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode()

### 1. `get_quarter_links` function
This function scrapes the **main page URL** to find links for each quarter’s first page.

- Downloads the HTML of the main page
- Finds the relevant `<div>` blocks and anchor tags
- Builds a dictionary mapping each quarter identifier (e.g., `Q1`, `Q2`, etc.) to the quarter’s first page URL

This dictionary will be used later to start scraping for each quarter.

In [23]:
def get_quarter_links(main_page_url):
    """Map each quarter identifier to the URL of that quarter's first page.

    Args:
        main_page_url: URL of the index page that lists the quarters.

    Returns:
        A dict such as ``{"Q1": ".../Q1-page01.html", ...}``. The key is the
        part of the link's ``href`` before the first ``-``. The dict is empty
        when no matching block or link is found.
    """
    # urljoin is documented in urllib.parse; urllib.request only exposes it
    # as a side effect of its own internal imports, so import it properly.
    from urllib.parse import urljoin

    html = get_html(main_page_url)
    soup = BeautifulSoup(html, 'html.parser')
    quarter_links = {}

    # All <div id="content" class="col-md-12"> elements are candidates; per the
    # page source, one of them holds the links to the different quarters.
    content_divs = soup.find_all("div", {"id": "content", "class": "col-md-12"})

    # The block containing the quarter links is recognised by its <h4> tag.
    for div in content_divs:
        if not div.find("h4"):
            continue
        # Each inner <div> is expected to wrap one anchor to a quarter page.
        for sub_div in div.find_all("div"):
            a_tag = sub_div.find("a")
            if a_tag and a_tag.get("href"):
                href = a_tag["href"]
                # Quarter identifier from the link, e.g. "Q1" from "Q1-page01.html"
                quarter = href.split("-")[0]
                # Resolve the (possibly relative) href against the index URL
                quarter_links[quarter] = urljoin(main_page_url, href)
        break  # only the first block with an <h4> is used
    return quarter_links

### 2. `get_total_pages` function
For each quarter’s first page, this function figures out how many pages of data exist.

- Loads the first page of the quarter
- Looks for a header (e.g., `Rentals: Q1 — Page 1 of 26`)
- Extracts the total number of pages from the text

This page count is used in a later step to build each page URL dynamically and scrape them all.

In [24]:
def get_total_pages(quarter_first_page_url):
    """Return the number of pages announced on a quarter's first page.

    The count is read from the first ``<h2>`` element, whose text is expected
    to look like ``Rentals: Q1 — Page 1 of 26``. Whatever follows the last
    ``of`` is parsed as an integer.

    Args:
        quarter_first_page_url: URL of the first page of a quarter.

    Returns:
        The page count, or ``1`` when there is no ``<h2>``, its text has no
        ``of``, or the trailing part is not an integer.
    """
    fallback = 1
    soup = BeautifulSoup(get_html(quarter_first_page_url), "html.parser")

    heading = soup.find("h2")
    if not heading:
        return fallback

    heading_text = heading.get_text(strip=True)
    # rpartition isolates everything after the final "of"
    _, separator, tail = heading_text.rpartition("of")
    if not separator:
        return fallback

    try:
        return int(tail.strip())
    except ValueError:
        return fallback

### 3. `scrape_rental_page` function
Given a single page URL -

- Downloads the HTML
- Finds each `<li>` block that corresponds to a rental record
- Extracts the date and table of rental details
- Compiles them into a list of dictionaries, one dictionary per record

We’ll call this repeatedly for each page within a quarter.


In [25]:
def scrape_rental_page(page_url):
    """Extract every rental record found on a single listing page.

    A record is an ``<li>`` element that contains both a
    ``<span class="record">`` (the listing date) and a
    ``<table class="rental">`` (the details). List items missing either one
    are skipped.

    Args:
        page_url: URL of the page to scrape.

    Returns:
        A list of dicts, one per record. Each dict has a ``"Date"`` key plus
        one key per table row, named after the row's ``field`` cell with any
        colons removed, holding the text of the row's second cell.
    """
    soup = BeautifulSoup(get_html(page_url), "html.parser")
    parsed_records = []

    for item in soup.find_all("li"):
        # Listing date lives in the span with class "record"
        date_tag = item.find("span", {"class": "record"})
        if not date_tag:
            continue

        # Remaining attributes live in the table with class "rental"
        details_table = item.find("table", {"class": "rental"})
        if not details_table:
            continue

        entry = {"Date": date_tag.text.strip()}
        for table_row in details_table.find_all("tr"):
            label_cell = table_row.find("td", {"class": "field"})
            row_cells = table_row.find_all("td")
            # A usable row has a labelled cell and at least two cells in total
            if not label_cell or len(row_cells) < 2:
                continue
            # Field names carry a ':' that is dropped; the value is the 2nd cell
            label = label_cell.text.strip().replace(":", "")
            entry[label] = row_cells[1].text.strip()

        parsed_records.append(entry)

    return parsed_records

### 4. `scrape_quarter` function
This function loops over all pages for a given quarter. For each page:

- Builds a page-specific URL (e.g., `Q1-page01.html`, `Q1-page02.html`, etc.)
- Scrapes the page with `scrape_rental_page`
- Accumulates all records in a single list

In the end, it returns a comprehensive list of records for that entire quarter.


In [26]:
def scrape_quarter(quarter, total_pages):
    """Scrape pages 1 through *total_pages* of one quarter.

    Args:
        quarter: Quarter identifier used as the page filename prefix
            (e.g. ``"Q1"`` yields ``Q1-page01.html``).
        total_pages: Number of pages to fetch for this quarter.

    Returns:
        A single list with the records of every page, in page order.
    """
    collected = []
    for page_number in range(1, total_pages + 1):
        # Page numbers are zero-padded to two digits (e.g. Q1-page04.html)
        page_url = "{}{}-page{:02d}.html".format(BASE_URL, quarter, page_number)
        print(f"Scraping {page_url} ...")
        collected += scrape_rental_page(page_url)
        # Short pause after each request to avoid overwhelming the server
        time.sleep(0.5)
    return collected

### 5. `save_to_csv` function
Once we’ve gathered all records, we can save them to a CSV. This function:

- Takes a list of dictionaries (the rental records)
- Identifies all the keys from the first dictionary
- Writes out a CSV file with those keys as column headers

Useful for storing the final combined dataset.


In [27]:
def save_to_csv(records, filename):
    """Write a list of record dicts to *filename* as a UTF-8 CSV file.

    The header is the union of the keys of all records, in first-seen order,
    so a record with a field the first record lacks no longer makes
    ``csv.DictWriter`` raise ``ValueError``. Fields missing from a record are
    written as empty cells.

    Args:
        records: List of dicts to write. When empty, nothing is written and
            a message is printed instead.
        filename: Path of the CSV file to create or overwrite.
    """
    if not records:
        print("No records to save.")
        return
    # dict.fromkeys de-duplicates while preserving insertion order
    fieldnames = list(dict.fromkeys(key for record in records for key in record))
    with open(filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(records)
    print(f"Records successfully saved to {filename}")

### 6. Main entry point
Finally, we tie everything together:

1. Get all quarter links from the main page
2. For each quarter, figure out the total pages
3. Scrape every page within that quarter, appending all results
4. Tag each record with the quarter label
5. Save everything to a single CSV

If, for any reason, no quarter links are found, we exit early.


In [28]:
if __name__ == "__main__":
    # Step 1: Scrape the main page for quarter links
    quarter_links = get_quarter_links(MAIN_PAGE_URL)
    if not quarter_links:
        print("No quarter links found on the main page.")
        # Raise SystemExit directly: the bare `exit` helper is injected by the
        # `site` module for interactive use and is not guaranteed to exist.
        raise SystemExit(1)

    all_records_combined = []
    # Step 2: For each quarter, determine the page count and scrape its records
    for quarter, first_page_url in quarter_links.items():
        print(f"\nProcessing {quarter} from {first_page_url}")
        total_pages = get_total_pages(first_page_url)
        print(f"Total pages for {quarter}: {total_pages}")
        records = scrape_quarter(quarter, total_pages)
        # Tag every record with a 'Quarter' column so the combined data can
        # be segregated by quarter later
        for record in records:
            record["Quarter"] = quarter
        all_records_combined.extend(records)
        print(f"Total records for {quarter}: {len(records)}")

    # Step 3: Save all records, with their quarter label, into one CSV file
    save_to_csv(all_records_combined, "all_rental_records.csv")


Processing Q1 from http://mlg.ucd.ie/modules/python/assignment1/rental/Q1-page01.html
Total pages for Q1: 26
Scraping http://mlg.ucd.ie/modules/python/assignment1/rental/Q1-page01.html ...
Scraping http://mlg.ucd.ie/modules/python/assignment1/rental/Q1-page02.html ...
Scraping http://mlg.ucd.ie/modules/python/assignment1/rental/Q1-page03.html ...
Scraping http://mlg.ucd.ie/modules/python/assignment1/rental/Q1-page04.html ...
Scraping http://mlg.ucd.ie/modules/python/assignment1/rental/Q1-page05.html ...
Scraping http://mlg.ucd.ie/modules/python/assignment1/rental/Q1-page06.html ...
Scraping http://mlg.ucd.ie/modules/python/assignment1/rental/Q1-page07.html ...
Scraping http://mlg.ucd.ie/modules/python/assignment1/rental/Q1-page08.html ...
Scraping http://mlg.ucd.ie/modules/python/assignment1/rental/Q1-page09.html ...
Scraping http://mlg.ucd.ie/modules/python/assignment1/rental/Q1-page10.html ...
Scraping http://mlg.ucd.ie/modules/python/assignment1/rental/Q1-page11.html ...
Scraping h