In [None]:
# GOOGLE COLAB SETUP (optional)
# Uncomment the two lines below to mount Google Drive when running on Google Colab.
# from google.colab import drive
# drive.mount('/content/drive')

This is the beginning of the notebook.

## ADM4142-A Fundamentals of Data Science <br>
The goal of this notebook is to retrieve and stage the source datasets into the format used in the dimensional model for analysis.

This notebook generates the Economy_dimension of the weather/tourism/economy data frame.

### The Economy dimension is structured as follows:

    Economy_key (PK)
    Location_key (FK)
    Date_key (FK)
    GDP: float
    GDP per capita: float
    GDP growth rate: float

* Economy_key (PK): integer enumeration of entries from oldest to latest<br>
* Location_key (FK): location for which this data applies. <br>
* Date_key (FK): year for which the data applies. Redundant, as this is already present in the corresponding Location_key entry. <br>
* GDP: adjusted annual GDP, in Chained (2017) CAD. <br>
* GDP per capita: adjusted annual GDP per capita, calculated with GDP/population <br>
* GDP growth rate: percentage change from previous year. <br>

      E.g. the 1990 growth rate is calculated as (1990 GDP - 1989 GDP) / 1989 GDP * 100%



In [None]:
import pandas as pd


Load and combine datasets

In [None]:
# Raw GitHub locations of the annual provincial GDP extracts, one CSV per industry grouping
urls = [
    'https://raw.githubusercontent.com/noobstang/cscsi4142-project-datasets/master/data/3610040201_databaseLoadingData-annualProvincialGDP-allIndustries.csv',
    'https://raw.githubusercontent.com/noobstang/cscsi4142-project-datasets/master/data/3610040201_databaseLoadingData-annualProvincialGDP-airTransport.csv',
    'https://raw.githubusercontent.com/noobstang/cscsi4142-project-datasets/master/data/3610040201_databaseLoadingData-annualProvincialGDP-groundTransport.csv',
    'https://raw.githubusercontent.com/noobstang/cscsi4142-project-datasets/master/data/3610040201_databaseLoadingData-annualProvincialGDP-waterTransport.csv',
    'https://raw.githubusercontent.com/noobstang/cscsi4142-project-datasets/master/data/3610040201_databaseLoadingData-annualProvincialGDP-TransportationWarehousing.csv'
]

# Read every CSV into its own DataFrame, keeping the same order as `urls`
dataframes = [pd.read_csv(source_url) for source_url in urls]

# Stack the per-file frames into one DataFrame with a fresh 0..n-1 index
combined_df = pd.concat(dataframes, ignore_index=True)


In [None]:
# Add national ("Canada") GDP totals to combined_df by summing 'VALUE' across every
# region for each (year, industry) pair. These totals are used for analysis in later stages.
naics_col = 'North American Industry Classification System (NAICS)'

# Exclude any rows already labelled 'Canada' before summing. Without this, re-running
# the cell would fold the previously appended national totals back into the sum and
# double them on every execution.
regional_df = combined_df[combined_df['GEO'] != 'Canada']

# One row per (REF_DATE, industry) holding the sum of the regional values
canada_aggregated = regional_df.groupby(['REF_DATE', naics_col])['VALUE'].sum().reset_index()

# Label the totals as 'Canada' and order the columns like the source data
canada_aggregated['GEO'] = 'Canada'
canada_aggregated = canada_aggregated[['REF_DATE', 'GEO', naics_col, 'VALUE']]

# Append the national totals below the regional rows; columns that exist only in the
# regional data are left empty (NaN) on the 'Canada' rows
combined_df = pd.concat([regional_df, canada_aggregated], ignore_index=True)

combined_df.tail(15)

Data filtering and transformation

In [None]:
# Keep only the regions used in the analysis and shorten the NAICS industry labels.
naics_col = 'North American Industry Classification System (NAICS)'
selected_regions = ['Canada', 'Alberta', 'British Columbia', 'Ontario', 'Quebec']

# Take an explicit copy of the filtered rows: assigning a column on a boolean-indexed
# slice of combined_df triggers pandas' SettingWithCopyWarning and makes it ambiguous
# whether the write reaches the intended frame.
filtered_df = combined_df[combined_df['GEO'].isin(selected_regions)].copy()

# Mapping from the full NAICS label in the source data to the short sector name.
# Labels not listed here are left unchanged by replace().
naics_transformations = {
    'All industries (except cannabis sector) [T020]': 'All industries',
    'Transportation and warehousing [48-49]': 'All transportation',
    'Air transportation [481]': 'Air transportation',
    'Transit, ground passenger and scenic and sightseeing transportation [48Z]': 'Ground transportation',
    'Water transportation [483]': 'Water Transportation'
}

filtered_df[naics_col] = filtered_df[naics_col].replace(naics_transformations)


In [None]:
# Preview the last 30 rows of the filtered and relabelled data
filtered_df.tail(30)

## Filtering, transforming, and feature engineering

Take the needed columns with the correct column headings.

Engineer the 'GDP per capita' feature using the population values from the location data, and the 'GDP growth rate' feature from the year-over-year change in GDP.

Load the location and date dimensions to prepare for joining

In [None]:
# Source CSVs for the two dimension tables that the economy data is joined against
location_url = 'https://raw.githubusercontent.com/noobstang/cscsi4142-project-datasets/master/dimension/location.csv'
date_url = 'https://raw.githubusercontent.com/noobstang/cscsi4142-project-datasets/master/dimension/date.csv'

# Read the location dimension first, then the date dimension
location_df = pd.read_csv(location_url)
date_df = pd.read_csv(date_url)



Join the Economy dimension with the Date dimension by Jan. 1st of the year value.

In [None]:
# Keep only the January 1st row of each year from the date dimension
is_jan_first = date_df['month'].eq(1) & date_df['day'].eq(1)
jan_1st_entries = date_df[is_jan_first]

# Attach 'Date_key' (and 'year') to the filtered economy data by matching 'REF_DATE',
# which is assumed to hold the year, against the 'year' of the January 1st entries.
# This is a left join: every row of filtered_df is kept, and a year with no January 1st
# entry in the date dimension gets a missing 'Date_key' instead of being dropped.
year_keys = jan_1st_entries[['year', 'Date_key']]
combined_df = pd.merge(filtered_df, year_keys, how='left', left_on='REF_DATE', right_on='year')



After joining the Date dimension, join the economy data with the location data, matching on the location name and on the Date_key of the corresponding year.

In [None]:
# Requires location_df and the date-joined combined_df from the cells above.
# NOTE(review): this cell reassigns combined_df from itself, so it should be run once per
# kernel session; re-running it would merge the location columns in a second time.

# Match the economy 'GEO' name to the location dimension's 'location' name for the same
# 'Date_key'. The left join keeps economy rows that have no matching location entry.
economy_keys = ['GEO', 'Date_key']
location_keys = ['location', 'Date_key']
combined_df = pd.merge(combined_df, location_df, how='left', left_on=economy_keys, right_on=location_keys)



In [None]:
# Engineer the 'GDP per capita' and 'GDP growth rate' columns on combined_df.
naics_col = 'North American Industry Classification System (NAICS)'
series_cols = ['GEO', naics_col]  # one GDP time series per region and sector

# GDP per capita. 'VALUE' is taken to be in millions of dollars, hence the 1e6 factor;
# 'population' is assumed to be a head count of individuals.
combined_df['GDP per capita'] = (combined_df['VALUE'] * 1e6) / combined_df['population']

# Order each (region, sector) series chronologically so that the previous row within a
# group is the previous available year.
combined_df = combined_df.sort_values(by=series_cols + ['REF_DATE'])

# Year-over-year percentage change within each (region, sector) series.
# fill_method=None turns off pct_change's deprecated implicit forward-fill, which would
# otherwise report a 0% change across a year whose GDP value is missing.
growth_rate = combined_df.groupby(series_cols)['VALUE'].pct_change(fill_method=None) * 100

# The first year of every series has no predecessor and so no growth rate; store 0 there.
# Any other undefined rate (e.g. next to a missing GDP value) is also stored as 0.
# A plain fillna gives the same result as filling group by group, without a Python
# callback per group.
combined_df['GDP growth rate'] = growth_rate.fillna(0)


Final tuning and export

In [None]:
# Number the rows 1..n in their current order to form the surrogate key 'Economy_key'
naics_col = 'North American Industry Classification System (NAICS)'
combined_df = combined_df.reset_index(drop=True)
combined_df['Economy_key'] = combined_df.index + 1

# Pick the output columns in their final order, then give the two source-named columns
# their dimension names; all other columns keep their existing names.
output_columns = ['Economy_key', 'Location_key', 'Date_key', 'year', naics_col, 'VALUE', 'GDP per capita', 'GDP growth rate']
output_names = {naics_col: 'Sector', 'VALUE': 'GDP'}
final_economy_df = combined_df[output_columns].rename(columns=output_names)

final_economy_df.head(40)


Export

In [None]:
# Write the finished economy dimension to a CSV file in the working directory,
# leaving out the DataFrame index
output_path = 'economy.csv'
final_economy_df.to_csv(output_path, index=False)