In [2]:
%matplotlib notebook

# generic packages
import sys
import re, numpy as np, pandas as pd
from pprint import pprint
from matplotlib.ticker import PercentFormatter

# Display settings: remove every truncation limit so frames are shown in full,
# and render floats with a single decimal place.
for _unlimited_option in ('display.max_rows',
                          'display.max_columns',
                          'display.width',
                          'display.max_colwidth'):
    pd.set_option(_unlimited_option, None)
pd.set_option('display.precision', 1)

import warnings

# Suppress all warnings for the rest of the session. The second filter
# (DeprecationWarning only) is already covered by the blanket one above it.
warnings.simplefilter("ignore")
warnings.simplefilter("ignore", category=DeprecationWarning)


# graphing, vis stuff
import matplotlib.pyplot as plt

# additional libraries
from collections import Counter



# 1 Region and Domain

# 2 Research Question

"Which city division in the City of Toronto spent the most money in 2021 (and how does that compare with the distribution of its approved operating budget for 2021)?"



# 3 Links

In [3]:
# competitive contracts - City of Toronto
# Both URLs address the same "callawards.nsf/postedawards" view; the second
# one adds a `Start=3.8.1` query parameter.
# NOTE(review): `Start` presumably selects where the listing begins so that the
# two requests together cover the whole view - confirm against the site.

CC_URL = "https://wx.toronto.ca/inter/pmmd/callawards.nsf/postedawards?OpenView"
CC_URL2 = "https://wx.toronto.ca/inter/pmmd/callawards.nsf/postedawards?OpenView&Start=3.8.1"

# non-competitive contracts - City of Toronto
# Three requests against the same "solesource.nsf/posted" view, differing only
# in `Start` (1, 1000, 1999).
# NOTE(review): looks like paging through the view in chunks; whether the
# chunks overlap or leave gaps cannot be told from here - confirm.
NC_URL = "https://wx.toronto.ca/inter/pmmd/solesource.nsf/posted?OpenView&Start=1&ExpandView"
NC_URL2 = "https://wx.toronto.ca/inter/pmmd/solesource.nsf/posted?OpenView&Start=1000&ExpandView"
NC_URL3 = "https://wx.toronto.ca/inter/pmmd/solesource.nsf/posted?OpenView&Start=1999&ExpandView"



In [4]:
# WORK THROUGH ... https://realpython.com/beautiful-soup-web-scraper-python/#step-2-scrape-html-content-from-a-page

from bs4 import BeautifulSoup
import requests

# Download every listing page and parse it. Order matters: later cells index
# into these lists by position (competitive pages first, then non-competitive).
urls = [CC_URL, CC_URL2, NC_URL, NC_URL2, NC_URL3]

# Without a timeout a stalled server would hang the kernel indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

pages = []
for url in urls:
    page = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail here, with the offending URL in the error, rather than parsing an
    # HTTP error page and failing later on a missing table.
    page.raise_for_status()
    pages.append(page)

soups = [BeautifulSoup(page.text, "html.parser") for page in pages]

In [5]:
# One entry per downloaded page: the list of every <table> element found in it.
tables_per_url = [soup.find_all('table') for soup in soups]




In [6]:
# next: try using get_tables function here: https://towardsdatascience.com/a-guide-to-scraping-html-tables-with-pandas-and-beautifulsoup-7fc24c331cf7

def get_table(table_html):
    """Convert a parsed HTML table into a DataFrame of cell text.

    Parameters
    ----------
    table_html : object exposing ``.children``
        Typically a BeautifulSoup ``<table>`` tag. Every child is iterated as
        a row, and every item inside a child is treated as a cell.

    Returns
    -------
    pandas.DataFrame
        The first non-empty row supplies the column names and the remaining
        non-empty rows are the data. Newlines are removed from cell text.
        An empty DataFrame is returned when no row yields any cell text.
    """
    rows = []
    for child in table_html.children:
        row = []
        for td in child:
            try:
                row.append(td.text.replace('\n', ''))
            except AttributeError:
                # Items without a usable ``.text`` (e.g. the characters of a
                # bare string sitting between row tags) are not cells.
                continue
        if row:
            rows.append(row)

    # Nothing parsed: return an empty frame instead of failing on rows[0].
    if not rows:
        return pd.DataFrame()

    return pd.DataFrame(rows[1:], columns=rows[0])
    
# Non-competitive pages are entries 2-4 of tables_per_url; the table at
# position 8 on each page is parsed.
# NOTE(review): position 8 (and 7 below) is presumably where the results
# table sits in the page layout - verify if the site markup changes.
ncc_dfs = []
for page_no in (2, 3, 4):
    ncc_dfs.append(get_table(tables_per_url[page_no][8]))
ncc_all = pd.concat(ncc_dfs)

# Competitive pages are entries 0-1; the table at position 7 is parsed.
cc_dfs = []
for page_no in (0, 1):
    cc_dfs.append(get_table(tables_per_url[page_no][7]))
cc_all = pd.concat(cc_dfs)


In [7]:
# Split the competitive listing on whether a contract amount is present.
cc_has_amount = cc_all["Contract Amount"].notna()
cc_money_rows = cc_all[cc_has_amount]
cc_dirty_rows = cc_all[~cc_has_amount]

# Non-competitive listing: keep only the rows that carry an amount.
ncc_has_amount = ncc_all["Contract Amount $"].notna()
ncc_money_rows = ncc_all[ncc_has_amount]

In [16]:
# Source column -> common column name. The same mapping drives both the column
# selection (in this order) and the rename, so the two cannot drift apart.
NCC_COLUMN_NAMES = {
    "Sole Source Reference": "ref_id",
    "Client Division": "client_division",
    "Description": "description",
    "Vendor Name": "vendor",
    "Contract Amount $": "contract_value",
    "Contract Date": "date",
}
CC_COLUMN_NAMES = {
    "Call Number": "ref_id",
    "Client Division/Contact": "client_division",
    "Description": "description",
    "Successful Bidder": "vendor",
    "Contract Amount": "contract_value",
    "Date awarded": "date",
}

ncc_cleaned = ncc_money_rows[list(NCC_COLUMN_NAMES)].rename(columns=NCC_COLUMN_NAMES)

cc_cleaned = cc_money_rows[list(CC_COLUMN_NAMES)].rename(columns=CC_COLUMN_NAMES)

In [63]:
# Tag each frame with its procurement type before combining them.
ncc_cleaned["type"] = "non-competitive"
cc_cleaned["type"] = "competitive"

# Every shared column is a join key, including "type", whose value differs
# between the two frames - so no row of one frame can match a row of the
# other and the outer merge keeps all rows from both.
merge_keys = [
    "ref_id",
    "client_division",
    "description",
    "vendor",
    "contract_value",
    "type",
    "date",
]
all_contracts = ncc_cleaned.merge(cc_cleaned, how="outer", on=merge_keys)
# clean up NA values in contracts
to_NaN = [r'^\s*$', r'n\/a', r'N\/A', r'--', r'NA']

# With regex=True and a non-string replacement, pandas replaces the WHOLE cell
# as soon as the pattern is found anywhere inside it.
#
# contract_value: a cell containing any of these markers is not a usable
# amount, so the "found anywhere" behaviour is kept as it was.
for pattern in to_NaN:
    all_contracts["contract_value"] = all_contracts["contract_value"].replace(
        pattern, np.nan, regex=True
    )

# vendor: the markers must make up the entire cell (optionally padded with
# whitespace). Unanchored, 'NA' or '--' would also blank real vendor names
# that merely contain those characters (e.g. "... CANADA ...").
vendor_placeholder = r'^\s*(?:' + '|'.join(to_NaN[1:]) + r')?\s*$'
all_contracts["vendor"] = all_contracts["vendor"].replace(
    vendor_placeholder, np.nan, regex=True
)

all_contracts["contract_value"] = all_contracts["contract_value"].replace(
    r'Cancelled', np.nan, regex=False
)

# Keep only rows that still have an amount. The explicit copy makes the
# column assignment below act on an independent frame, not a slice.
all_contracts = all_contracts[all_contracts["contract_value"].notna()].copy()
all_contracts['currency'] = "CAD"

# reformat to accommodate USD entries
# Strip dollar signs and thousands separators first.
all_contracts["contract_value"] = (
    all_contracts["contract_value"].replace(r'[\$,]', '', regex=True).str.strip()
)

# Flag every row whose amount is quoted in USD. The rows are located by
# content rather than by a fixed row label, so the flag lands on the right
# rows whatever the current scrape looks like.
usd_mask = all_contracts["contract_value"].str.contains('USD', regex=False, na=False)
usd_rows = all_contracts[usd_mask]
all_contracts.loc[usd_mask, 'currency'] = 'USD'

# Drop the currency marker so only the number is left.
# NOTE: no exchange-rate conversion is applied in this step; the numeric value
# is kept exactly as published.
all_contracts['contract_value'] = (
    all_contracts['contract_value'].str.replace('USD', '', regex=False).str.strip()
)

# convert money column to float
all_contracts["contract_value"] = all_contracts["contract_value"].astype('float')

# convert time column to year only
all_contracts['date'] = pd.DatetimeIndex(all_contracts['date']).year

# remove 2022 as it is incomplete
INCOMPLETE_YEAR = 2022
# Explicit copy: the frame is modified cell-by-cell afterwards, which should
# act on an independent frame rather than on a slice of the previous one.
all_contracts = all_contracts[all_contracts["date"] != INCOMPLETE_YEAR].copy()

# fill in rows with incomplete 'client_division', based on ref_ID documents available here: https://wx.toronto.ca/inter/pmmd/callawards.nsf/postedawards?OpenView
# Each entry is [row label in all_contracts, corrected division name].
# NOTE(review): these labels identify rows by their position in one particular
# scrape; keying the corrections by ref_id would be robust to new listings -
# TODO confirm the labels still point at the intended rows.
corrections = [
    [2247, "Purchasing & Materials Management"],
    [2368, "Purchasing & Materials Management"],
    [2370, "Purchasing & Materials Management"],
    [2379, "Purchasing & Materials Management"],
    [2380, "Purchasing & Materials Management"],
    [2384, "Parks, Forestry & Recreation"],
    [2388, "Solid Waste Management Services"],
    [2409, "Purchasing & Materials Management"],
    [2420, "Purchasing & Materials Management"],
    [2422, "Parks, Forestry & Recreation"],
    [2423, "Parks, Forestry & Recreation"],
    [2431, "Parks, Forestry & Recreation"],
    [2433, "Parks, Forestry & Recreation"],
    [2436, "Parks, Forestry & Recreation"],
]

# Assigning through .at to a label that is not in the index would silently
# append a new, mostly-empty row, so only existing rows are corrected and
# every label that could not be found is reported.
missing_labels = []
for idx, division in corrections:
    if idx in all_contracts.index:
        all_contracts.at[idx, "client_division"] = division
    else:
        missing_labels.append(idx)

# also delete duplicate
DUPLICATE_LABEL = 2434
if DUPLICATE_LABEL in all_contracts.index:
    all_contracts = all_contracts.drop(DUPLICATE_LABEL)
else:
    missing_labels.append(DUPLICATE_LABEL)

if missing_labels:
    print(f"Row labels not found in all_contracts (correction skipped): {missing_labels}")

# Keep only the contracts whose year is 2021.
is_2021 = all_contracts["date"].eq(2021)
contracts_2021 = all_contracts.loc[is_2021]

In [69]:
# Total 2021 contract value per client division, largest first.
by_division = (
    contracts_2021
    .groupby("client_division")["contract_value"]
    .sum()
    .sort_values(ascending=False)
)
by_division



client_division
Engineering & Construction Services - Capital Works Delivery    2.6e+08
Transportation Services                                         1.1e+08
Parks, Forestry & Recreation                                    8.2e+07
Toronto Water                                                   8.0e+07
Information & Technology                                        6.3e+07
Purchasing & Materials Management                               5.6e+07
Real Estate Services                                            5.0e+07
Accounting Services                                             4.4e+07
Fleet Services                                                  4.1e+07
Engineering & Construction Services - Engineering Services      3.9e+07
Facilities Design & Construction                                2.6e+07
Social Development, Finance & Administration                    2.0e+07
Fire Services                                                   1.9e+07
Solid Waste Management Services                 

In [74]:
# Pie chart, where the slices will be ordered and plotted counter-clockwise:

# by_division is sorted from largest to smallest, so the first TOP_N rows are
# the TOP_N biggest spenders (exactly ten, matching the variable name).
TOP_N = 10
top10_spenders = by_division.head(TOP_N)


fig1, ax1 = plt.subplots()
ax1.pie(top10_spenders, labels=top10_spenders.index, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
# Percentages are shares of the plotted slices only, not of all divisions.
ax1.set_title(f"Top {TOP_N} divisions by 2021 contract value (share of top-{TOP_N} total)")

plt.show()

<IPython.core.display.Javascript object>

TypeError: bar() missing 1 required positional argument: 'height'

# 4 Image

# 5 Discussion