In [1]:
# GOOGLE COLAB SETUP
# from google.colab import drive
# drive.mount('/content/drive')

This is the beginning of the notebook.

ADM4142-A Fundamentals of Data Science <br>
The goal of this notebook is to retrieve and stage the source datasets into the format used in the dimensional model for analysis.

This notebook generates the Location_dimension of the weather/tourism/economy data frame.




Rationale: <br>

Location dimension:

    Location_key (PK)
    Date_key (FK)
    location: string
    population: int

Location_key: integer enumeration of entries <br>
Date_key: foreign key into the Date dimension, pointing to the Jan. 1st row of the year corresponding to each entry
(only the year value is meaningful). <br>
location: 5 values (Canada, Alberta, British Columbia, Ontario, Quebec) <br>
population: annual population associated with the location and date above. <br>



In [2]:
# imports
import pandas as pd

In [3]:
# Stage the annual population extracts (one CSV per location)

# All source files sit in the same folder and differ only by region name
DATA_ROOT = 'https://raw.githubusercontent.com/noobstang/cscsi4142-project-datasets/master/data'
REGIONS = ['Canada', 'BritishColumbia', 'Alberta', 'Ontario', 'Quebec']
urls = [
    f'{DATA_ROOT}/1710000501_databaseLoadingData-annual{region}-population.csv'
    for region in REGIONS
]

# One filtered frame per source file, in the same order as `urls`
dataframes = []

for source_url in urls:
    raw = pd.read_csv(source_url)

    # Strip double-quote characters from the cell values (regex replace)
    unquoted = raw.replace({"\"": ""}, regex=True)

    # Keep 1990-2023 (both ends included) and only the 'All ages' rows
    in_period = unquoted['REF_DATE'].between(1990, 2023)
    all_ages = unquoted['Age group'] == 'All ages'
    dataframes.append(unquoted[in_period & all_ages])

# Stack the per-location frames under a fresh 0..n-1 index
combined_df = pd.concat(dataframes, ignore_index=True)

combined_df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Gender,Age group,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1990,Canada,2016A000011124,Total - gender,All ages,Persons,249,units,0,v466668,1.1.1,27691138.0,,,,0
1,1991,Canada,2016A000011124,Total - gender,All ages,Persons,249,units,0,v466668,1.1.1,28037420.0,,,,0
2,1992,Canada,2016A000011124,Total - gender,All ages,Persons,249,units,0,v466668,1.1.1,28371264.0,,,,0
3,1993,Canada,2016A000011124,Total - gender,All ages,Persons,249,units,0,v466668,1.1.1,28684764.0,,,,0
4,1994,Canada,2016A000011124,Total - gender,All ages,Persons,249,units,0,v466668,1.1.1,29000663.0,,,,0


Next, build a lookup that maps each REF_DATE year to the Date_key of that year's Jan. 1st row in date.csv. (date.csv is the finished Date dimension produced by another notebook.)


In [4]:
# Load the finished Date dimension
date_url = 'https://raw.githubusercontent.com/noobstang/cscsi4142-project-datasets/master/dimension/date.csv'
date_df = pd.read_csv(date_url)

# Rows for Jan. 1st of every year from 1990 through 2023 (both ends included)
is_jan_first = date_df['month'].eq(1) & date_df['day'].eq(1)
year_in_range = date_df['year'].between(1990, 2023)
jan_1st_entries = date_df.loc[is_jan_first & year_in_range]

# Lookup table: year -> Date_key of that year's Jan. 1st row
date_key_map = jan_1st_entries.set_index('year')['Date_key'].to_dict()


Data transformation

In [5]:
# Build the Location dimension from the staged population rows.
# Inputs from earlier cells:
#   combined_df  - one row per location and year (REF_DATE, GEO, VALUE, ...)
#   date_key_map - dict mapping a year to the Date_key of its Jan. 1st row in date.csv

# Canonical location names. This is currently an identity mapping and is not
# applied below; it is kept so source spellings can be normalised here if the
# GEO labels ever diverge from the names used in the dimension.
location_map = {
    'Canada': 'Canada',
    'British Columbia': 'British Columbia',
    'Alberta': 'Alberta',
    'Ontario': 'Ontario',
    'Quebec': 'Quebec'
}

# Surrogate key: one sequential integer per (location, year) row, starting at 1
combined_df['Location_key'] = range(1, len(combined_df) + 1)

# Foreign key into the Date dimension, looked up by year
combined_df['Date_key'] = combined_df['REF_DATE'].map(date_key_map)

# Fail fast instead of staging rows whose foreign key could not be resolved
unmapped_years = sorted(
    combined_df.loc[combined_df['Date_key'].isna(), 'REF_DATE'].unique().tolist()
)
if unmapped_years:
    raise ValueError(f"No Date_key found in date.csv for year(s): {unmapped_years}")

# Keep only the dimension's columns, rename them to the target schema, and
# store population as an integer (nullable, so a missing VALUE stays missing
# instead of breaking the cast)
final_df = (
    combined_df[['Location_key', 'Date_key', 'GEO', 'VALUE']]
    .rename(columns={'GEO': 'location', 'VALUE': 'population'})
    .astype({'population': 'Int64'})
)


Testing

In [6]:
# Spot-check: display the last 25 rows of the staged Location dimension
final_df.tail(25)

Unnamed: 0,Location_key,Date_key,location,population
145,146,3288,Quebec,7323250.0
146,147,3653,Quebec,7356951.0
147,148,4019,Quebec,7396014.0
148,149,4384,Quebec,7441305.0
149,150,4749,Quebec,7485488.0
150,151,5114,Quebec,7535483.0
151,152,5480,Quebec,7581467.0
152,153,5845,Quebec,7631901.0
153,154,6210,Quebec,7692400.0
154,155,6575,Quebec,7761614.0


In [8]:
# export to csv
#final_df.to_csv('location.csv', index=False)
