In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from typing import List
import requests

In [2]:
def data_retrieval(url: str) -> BeautifulSoup:
    """
    Download a web page and parse it into a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body parsed with the built-in ``html.parser``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status (4xx/5xx).
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=30)
    # Fail loudly on an error status rather than parsing an error page as data.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

# Build an empty DataFrame whose columns come from a table's <th> cells
def create_soup_df(table: BeautifulSoup) -> pd.DataFrame:
    """
    Return an empty DataFrame with one column per ``<th>`` element in the table.

    Each column label is the whitespace-stripped text of the corresponding
    ``<th>`` element; no rows are added here.
    """
    return pd.DataFrame(
        columns=[header_cell.text.strip() for header_cell in table.find_all('th')]
    )

# Collect every <tr> element contained in a table
def find_rows(table: BeautifulSoup) -> List[BeautifulSoup]:
    """
    Return every ``<tr>`` element found inside the given table.
    """
    table_rows = table.find_all('tr')
    return table_rows

In [3]:
# Fetch and parse the Wikipedia list of Saskatchewan population centres.
soup = data_retrieval(
    url='https://en.wikipedia.org/wiki/List_of_population_centres_in_Saskatchewan'
)

In [4]:
# Work with the first <table> element on the page; limit=1 stops the search
# after that first match, so indexing [0] yields the same element.
table = soup.find_all('table', limit=1)[0]

In [5]:
# Start from an empty frame whose columns are taken from the table's <th> cells.
pop_df = create_soup_df(table=table)

In [6]:
# Keep only the columns at positions 1, 2 and 3, as an independent copy.
column_indices_to_select = list(range(1, 4))
pop_df = pop_df.iloc[:, column_indices_to_select].copy()

In [7]:
# Name the three retained columns by position rather than by their scraped
# header text. The header text carries footnote markers such as "[2]", and
# DataFrame.rename silently ignores labels it cannot find, so a renumbered
# footnote would leave the raw header names in place.
pop_df.columns = ['population_centre', 'classification', 'population']

In [8]:
# Gather every <tr> element of the selected table.
rows = find_rows(table=table)

In [9]:
# Positions of the <td> cells copied from each row into pop_df's three columns.
indices = [1, 2, 3]
for row in rows[1:]:  # the first <tr> is skipped (presumably the header row)
    single_row_data = [cell.text.strip() for cell in row.find_all('td')]
    # A row with too few <td> cells cannot be indexed at the positions above;
    # skip it instead of aborting the whole scrape with an IndexError.
    if len(single_row_data) <= max(indices):
        continue
    # Append at the next integer label, i.e. the current number of rows.
    pop_df.loc[len(pop_df)] = [single_row_data[x] for x in indices]

In [10]:
# Write the collected rows to disk; index=False leaves the row index out of the file.
pop_df.to_csv(
    path_or_buf='sask_pop_density.csv',
    index=False,
)