In [None]:
# import a library for working with tabular data, pandas
# we are importing it using the shorthand "pd" to more easily call its functions

import pandas as pd

In [None]:
# Load the CSV export into a pandas DataFrame named canada_data.
# The file name uses underscores instead of spaces (it was renamed before
# uploading - computers do not like white space!).
# dtype=str reads every column as text, because the data is mixed and pandas
# needs to be told how to interpret it. If you plan to work with the data
# extensively in Python, ideally give each column its own dtype instead.

# NOTE: this cell may take some time to execute

DATA_FILE = "Canada_DB_2024.csv"
canada_data = pd.read_csv(DATA_FILE, dtype=str)

In [None]:
# count the rows in the dataframe (len() of a DataFrame is its number of rows)
len(canada_data)

In [None]:
# optional - lift pandas' default display limits by setting them to None
# with both limits removed, previews show as many columns & rows as you'd like
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [None]:
# show the first 10 rows of the full dataset
canada_data.head(n=10)

In [None]:
# keep only the rows where STCITY is exactly "Toronto"
# always assign the filtered result to a new variable. Otherwise the
# change is not saved as a new object you can run further analysis on!

# CHANGE FILTER AS NEEDED - UPDATE COLUMN AND FILTER PARAMETERS

# True/False marker for every row: does the city match?
is_toronto = canada_data["STCITY"].eq("Toronto")
toronto_data = canada_data.loc[is_toronto]

# row count of the new dataframe, to confirm the filter was applied
len(toronto_data)

In [None]:
# show the first 10 rows of the filtered dataframe
toronto_data.head(n=10)

In [None]:
# several filters can be combined in one step
# a numerical filter only works on a numeric column, so the column's
# data type has to be changed from text to numeric first

# for example, changing LOCEMP to numeric
# (errors='coerce' turns any value that cannot be read as a number into NaN)
# NOTE: this overwrites the LOCEMP column of canada_data itself; dataframes
# filtered out before this cell runs (such as toronto_data) keep LOCEMP as text

canada_data["LOCEMP"] = pd.to_numeric(canada_data["LOCEMP"], errors='coerce')

# one True/False marker per condition, combined with & (AND):
# city is Toronto AND employee size is greater than 200

in_toronto = canada_data["STCITY"] == "Toronto"
over_200_employees = canada_data["LOCEMP"] > 200
toronto_data_size = canada_data[in_toronto & over_200_employees]

# row count of the new dataframe, to confirm the filters were applied
len(toronto_data_size)

In [None]:
# collect every column name of the filtered dataframe in a list called "variables"
variables = list(toronto_data.columns)
print(variables)

In [None]:
# look at the first few values of one column to confirm it
# contains what we think it does

locemp_sample = toronto_data["LOCEMP"].head()
print(locemp_sample)

In [None]:
# keep only the columns we are interested in

# names of the columns to keep
# CHANGE AS NEEDED
selected_columns = ["CONAME", "STADDR", "SUITE", "STCITY", "LOCEMP"]

# build a new DataFrame holding all rows, but only the selected columns
final_toronto = toronto_data.loc[:, selected_columns]

# print the first 10 rows of the new DataFrame
final_preview = final_toronto.head(n=10)
print(final_preview)

In [None]:
# write the selected columns to a csv file
# your csv will now be available from your Jupyter Notebooks home page
# select the checkbox beside the filename, and "download" from the top menu bar
# index=False leaves out the DataFrame's row labels, which would otherwise be
# written as an extra, unnamed first column; remove it if you want them kept
final_toronto.to_csv('torontodata.csv', index=False)