In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from typing import List

In [2]:
def data_retrieval(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """
    Downloads the page at the given URL and returns it parsed as a
    BeautifulSoup object.

    Parameters:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up
            (defaults to 30).

    Returns:
        The response body parsed with the 'html.parser' backend.

    Raises:
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx
            status code.
        requests.exceptions.Timeout: the server did not respond within
            `timeout` seconds.
    """
    # requests applies no timeout unless one is passed, so a stalled server
    # would otherwise block the notebook indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page as
    # if it were the expected document.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

# Build an empty, header-only DataFrame from a BeautifulSoup table
def create_soup_df(table: BeautifulSoup) -> pd.DataFrame:
    """
    Returns a DataFrame with no rows whose column labels are the
    whitespace-stripped texts of every <th> element found in the table,
    in document order.
    """
    column_names = []
    for header_cell in table.find_all('th'):
        column_names.append(header_cell.text.strip())
    return pd.DataFrame(columns=column_names)

# Collect every <tr> element contained in a BeautifulSoup table
def find_rows(table: BeautifulSoup) -> List[BeautifulSoup]:
    """
    Returns the result of searching the given table for all 'tr' elements.
    """
    row_tag = 'tr'
    all_rows = table.find_all(row_tag)
    return all_rows

In [3]:
# Source page for the election results, kept in a named constant so it is easy to find.
ELECTIONS_URL = 'https://en.wikipedia.org/wiki/List_of_Saskatchewan_general_elections'
soup = data_retrieval(ELECTIONS_URL)

In [4]:
# Work with the third <table> element on the page (index 2).
# NOTE(review): this is a position-based lookup — confirm the index still
# points at the intended table if the page layout changes.
all_tables = soup.find_all('table')
table = all_tables[2]

In [5]:
# Start from an empty frame whose columns are the texts of every <th> in the
# table. Because all <th> elements are collected, the column list is wider
# than the eight columns that are kept in the next step.
voting_df = create_soup_df(table)
voting_df

Unnamed: 0,Year,Seats,Conservative[B],Liberal,NDP[A],SK Party,Independent,Other parties,Unnamed: 9,Unnamed: 10,...,1982,1986,1991,1995,1999,2003,2007,2011,2016,2020


In [6]:
# Keep only the first eight header columns (positions 0 through 7) and take
# an independent copy so later edits do not touch the original selection.
column_indices_to_select = list(range(8))
leading_columns = voting_df.iloc[:, column_indices_to_select]
voting_df = leading_columns.copy()

In [7]:
# Confirm that only the first eight columns remain; the frame still has no rows.
voting_df

Unnamed: 0,Year,Seats,Conservative[B],Liberal,NDP[A],SK Party,Independent,Other parties


In [8]:
# Every <tr> element of the table; the loop that fills voting_df starts
# from the fourth one (rows[3:]).
rows = find_rows(table)

In [9]:
# Fill voting_df with one record per data row of the table.
# The first three <tr> elements are skipped (presumably header rows —
# confirm against the page if the table layout changes).
#
# Positions of the <td> cells to keep, one per voting_df column. A row with
# fewer than 13 <td> cells raises IndexError.
indices = [0, 1, 2, 4, 6, 8, 10, 12]

extracted_rows = []
for row in rows[3:]:
    single_row_data = [cell.text.strip() for cell in row.find_all('td')]
    extracted_rows.append([single_row_data[x] for x in indices])

# Build the frame in one step instead of appending with .loc inside the loop:
# this avoids repeatedly enlarging the frame, and re-running the cell yields
# the same rows rather than appending duplicates.
voting_df = pd.DataFrame(extracted_rows, columns=voting_df.columns)

In [10]:
# Drop the 'Other parties' column and give the remaining columns cleaner
# labels (footnote markers removed, seat total named explicitly).
party_column_renames = {
    'Seats': 'total_seats',
    'Conservative[B]': 'Conservative',
    'NDP[A]': 'NDP',
}
voting_df = (
    voting_df
    .drop(columns='Other parties')
    .rename(columns=party_column_renames)
)

In [11]:
# Inspect the frame after the rename. Year is still blank at this point (it is
# assigned in a later cell) and some seat counts still carry footnote markers
# such as '24[C]'.
voting_df

Unnamed: 0,Year,total_seats,Conservative,Liberal,NDP,SK Party,Independent
0,,25,9,16,,,
1,,41,14,27,,,
2,,53,8,45,,,
3,,59,7,51,,,1.0
4,,63,2,45,,,6.0
5,,63,3,51[D],,,7.0
6,,63,24[C],28,,,5.0
7,,55,0,50,5.0,,
8,,52,0,38,10.0,,4.0
9,,52,0,5,47.0,,


In [12]:
# Some scraped seat counts carry a bracketed footnote marker (for example
# '24[C]' and '51[D]'). Strip any such marker with a regular expression
# instead of matching those two exact strings, so the clean-up keeps working
# if the counts or the footnote letters on the page change.
footnote_marker = r'\s*\[[^\]]*\]'
for party_column in ['Conservative', 'Liberal']:
    voting_df[party_column] = voting_df[party_column].replace(footnote_marker, '', regex=True)

In [13]:
# Election years, listed by hand in the same order as the rows of voting_df.
# NOTE(review): the list must contain exactly one entry per row; a length
# mismatch makes the assignment raise ValueError.
years = [
    1905, 1908, 1912, 1917, 1921, 1925, 1929, 1934, 1938, 1944,
    1948, 1952, 1956, 1960, 1964, 1967, 1971, 1975, 1978, 1982,
    1986, 1991, 1995, 1999, 2003, 2007, 2011, 2016, 2020,
]
voting_df = voting_df.assign(Year=years)

In [14]:
# Keep the elections from 1971 onward. No upper bound is applied, so the last
# year in the result is simply the last year present in voting_df.
from_1971_onward = voting_df['Year'] >= 1971
voting_1971_2020 = voting_df.loc[from_1971_onward]

In [15]:
# Renumber the rows from 0 after filtering. drop=True discards the old index
# instead of inserting it as a column; the parameter expects a boolean.
# The result is rebound to the name rather than modified in place, so a frame
# obtained by filtering voting_df is never mutated.
voting_1971_2020 = voting_1971_2020.reset_index(drop=True)

In [18]:
# Write the filtered results to a CSV file (path is relative to the current
# working directory), leaving the row index out of the file.
output_csv = 'voting_sask.csv'
voting_1971_2020.to_csv(output_csv, index=False)