This is the beginning of the notebook.

## ADM4142-A Fundamentals of Data Science <br>
The goal of this notebook is to retrieve and stage the source datasets into the format used in the dimensional model for analysis.

This notebook generates the Weather_dimension of the weather/tourism/economy data frame.

The code below gets the data from the dataset URLs and puts it in the database. Update the URLs before using it, or else you might get an error because the URLs will expire. Only the mean temperature is missing.

Iterate over URLs and create the database.

In [None]:
import pandas as pd


In [None]:
# Read the shared date dimension from the project repository.
date_url = 'https://raw.githubusercontent.com/noobstang/cscsi4142-project-datasets/master/dimension/date.csv'
date_df = pd.read_csv(date_url)

# Keep only the rows for day 1 of a month whose year falls in 1990-2023 (inclusive);
# these are the rows the monthly weather records are joined against.
date_df_filtered = date_df.loc[
    lambda dates: dates['year'].between(1990, 2023) & dates['day'].eq(1)
]


In [None]:
# URLs of the per-city weather CSVs. Every file lives in the same repository
# folder and follows the weatherstats_<city>_normal_monthly.csv naming pattern,
# so the list is generated from the city names.
urls = [
    'https://raw.githubusercontent.com/noobstang/cscsi4142-project-datasets/master/data/'
    f'weatherstats_{city_name}_normal_monthly.csv'
    for city_name in (
        'banff',
        'calgary',
        'edmonton',
        'montreal',
        'ottawa',
        'quebec',
        'toronto',
        'vancouver',
        'victoria',
        'whistler',
        # Add more cities here as needed
    )
]

In [None]:
def _stage_city_weather(url, date_dim, first_year=1990, last_year=2023):
    """Load one city's monthly weather CSV and shape it for the weather table.

    Parameters
    ----------
    url : str
        Location of the CSV. The city name is taken from the file name, as the
        second underscore-separated token (``weatherstats_<city>_...``).
    date_dim : pandas.DataFrame
        Date dimension rows to join against. Must provide the ``year``,
        ``month`` and ``Date_key`` columns.
    first_year, last_year : int
        Inclusive range of years to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date_key``, ``date``, ``max_temperature_v``,
        ``min_temperature_v``, ``avg_temperature_v``, ``precipitation_v``,
        ``snow_v`` and ``location``.
    """
    city = url.split('/')[-1].split('_')[1]

    weather = pd.read_csv(url)
    weather['date'] = pd.to_datetime(weather['date'])

    # .copy() detaches the filtered rows from the original frame so the column
    # assignments below do not raise SettingWithCopyWarning.
    weather = weather[weather['date'].dt.year.between(first_year, last_year)].copy()

    # Average temperature is the midpoint of the max and min values.
    weather['avg_temperature_v'] = (weather['max_temperature_v'] + weather['min_temperature_v']) / 2
    weather['location'] = city

    # Left join: every weather row is kept and picks up the Date_key of the
    # date-dimension row with the same year and month.
    weather = weather.merge(
        date_dim,
        left_on=[weather['date'].dt.year, weather['date'].dt.month],
        right_on=['year', 'month'],
        how='left',
    )

    return weather[['Date_key', 'date', 'max_temperature_v', 'min_temperature_v',
                    'avg_temperature_v', 'precipitation_v', 'snow_v', 'location']]


# Stage every city, then concatenate once (concatenating inside the loop would
# re-copy the accumulated frame on every iteration).
city_frames = [_stage_city_weather(url, date_df_filtered) for url in urls]
result_df = pd.concat(city_frames, ignore_index=True)

# Add a placeholder for 'Location_key' (to be filled later)
result_df['Location_key'] = ''

# Sort by location and date (oldest to newest) and renumber the rows from 0
result_df = result_df.sort_values(by=['location', 'Date_key']).reset_index(drop=True)

# Add 'Weather_key' as an enumeration starting from 1
result_df.insert(0, 'Weather_key', range(1, 1 + len(result_df)))

# Final column order adjustment
final_columns = ['Weather_key', 'Location_key', 'Date_key', 'date', 'max_temperature_v', 'min_temperature_v', 'avg_temperature_v', 'precipitation_v', 'snow_v', 'location']
result_df = result_df[final_columns]

In [None]:
# Preview the first rows of the combined weather frame.
result_df.head()

Populate the Location_key column with the corresponding province associated with the city in 'location'.

In [None]:
# Read the location dimension from the project repository.
location_url = 'https://raw.githubusercontent.com/noobstang/cscsi4142-project-datasets/master/dimension/location.csv'
location_df = pd.read_csv(location_url)

# Read the date dimension (this always re-downloads it, replacing any earlier date_df).
date_url = 'https://raw.githubusercontent.com/noobstang/cscsi4142-project-datasets/master/dimension/date.csv'
date_df = pd.read_csv(date_url)

# Keep only the rows for day 1 of a month. Unlike the 1990-2023 filter used when
# staging the weather data, no year restriction is applied here.
date_df_filtered = date_df.loc[date_df['day'].eq(1)]


In [None]:
# Mapping from city (as it appears in result_df['location']) to province
city_to_province = {
    "banff": "Alberta",
    "calgary": "Alberta",
    "edmonton": "Alberta",
    "montreal": "Quebec",
    "quebec": "Quebec",
    "ottawa": "Ontario",
    "toronto": "Ontario",
    "vancouver": "British Columbia",
    "victoria": "British Columbia",
    "whistler": "British Columbia"
}

# Fail fast if a city has no province: .map() would otherwise fill it with NaN
# without any error, and that NaN would carry through to the Location_key lookup.
unmapped_cities = sorted(set(result_df['location'].dropna()) - set(city_to_province))
if unmapped_cities:
    raise ValueError(f"No province mapping for cities: {unmapped_cities}")

# Add a 'location_province' column holding each row's province
# (the original 'location' column is left untouched).
result_df['location_province'] = result_df['location'].map(city_to_province)


In [None]:
# Preview result_df, which now includes the 'location_province' column.
result_df.head()

In [None]:
# Inspect the first 50 rows of the filtered date dimension.
date_df_filtered.head(50)


In [None]:
# Preview the location dimension.
location_df.head()

In [None]:
# NOTE: before joining, make sure 'location' in location_df refers to the province;
# if it does not, location_df needs to be adjusted first.

# Rebind date_df_filtered to a new frame (so the filtered slice itself is never
# written to, avoiding SettingWithCopyWarning) with 'date_iso' parsed as datetime.
date_df_filtered = date_df_filtered.assign(
    date_iso=pd.to_datetime(date_df_filtered['date_iso'])
)

# Calendar year of each weather row, used as a join key further down.
result_df['year'] = pd.to_datetime(result_df['date']).dt.year



In [None]:
# Work on a separate copy of the location dimension.
location_df = location_df.copy()

# Add a 'year' column to the location rows by looking up each row's Date_key in
# the date dimension. The left join keeps every location row.
location_merged_df = location_df.merge(
    date_df[['Date_key', 'year']],
    how='left',
    on='Date_key',
)

location_merged_df.head()

In [None]:
# Preview result_df before merging it with the location data. The 'year'
# columns are used as-is; no conversion to datetime is applied.
result_df.head()

In [None]:
# Match every weather row to the location-dimension row with the same province
# and year. The left join keeps all weather rows. Column names present in both
# frames come back with _x (weather side) and _y (location side) suffixes.
merged_temp_df = result_df.merge(
    location_merged_df,
    how='left',
    left_on=['location_province', 'year'],
    right_on=['location', 'year'],
)

merged_temp_df.head()

In [None]:
# The assignment below aligns the two frames on their row index, so it is only
# correct when merged_temp_df has exactly one row per result_df row. A left
# merge repeats a weather row whenever its (province, year) pair matches more
# than one location row, which would silently put keys on the wrong rows.
if len(merged_temp_df) != len(result_df):
    raise ValueError(
        f"merged_temp_df has {len(merged_temp_df)} rows but result_df has "
        f"{len(result_df)}; the location join is not one-to-one per weather row"
    )

# Overwrite the placeholder 'Location_key' with the key found in the location dimension
result_df['Location_key'] = merged_temp_df['Location_key_y']


result_df.head()

In [None]:
# Build the final weather table by selecting the output columns in their final
# order (no columns are renamed). .copy() makes it independent of result_df.
final_weather_df = result_df.loc[:, [
    'Weather_key',
    'Location_key',
    'Date_key',
    'max_temperature_v',
    'min_temperature_v',
    'avg_temperature_v',
    'precipitation_v',
    'snow_v',
    'location',
]].copy()

# Export is currently disabled; uncomment to write the CSV.
#final_weather_df.to_csv('weather_final.csv', index=False)


Test results

In [None]:
# Inspect the first 30 rows of the final weather table.
final_weather_df.head(30)

In [None]:
# Spot-check the location rows whose Location_key is 60 or higher.
filtered_df = location_merged_df[location_merged_df['Location_key'] >= 60]

# NOTE: despite its name, this holds up to the first 50 matching rows, not 10.
# The name is kept so any later reference to it still works.
first_10_entries = filtered_df.head(50)

# Leave the frame as the cell's last expression so Jupyter renders it as a
# table; print() would only give a plain-text dump.
first_10_entries


Testing


In [None]:
# Export the final DataFrame
#final_weather_df.to_csv('weather_final.csv', index=False)