## DATA-604

In [30]:
import pandas as pd
import sqlalchemy as sq

In [31]:
# Load the datasets into individual dataframe
avg_rent_df = pd.read_csv("datasets/average_rent_municipality.csv")
avg_rent_df = avg_rent_df.rename(
    columns={
        "CSD": "municipality",
        "Period": "year",
        "Rental Unit Type": "rental_type",
        "OriginalValue": "value",
    }
)
avg_rent_df = avg_rent_df.drop("CSDUID", axis=1)
avg_rent_df = avg_rent_df.drop("IndicatorSummaryDescription", axis=1)
avg_rent_df = avg_rent_df.drop("UnitOfMeasure", axis=1)
avg_rent_df["municipality"] = avg_rent_df["municipality"].str.lower()
avg_rent_df["rental_type"] = avg_rent_df["rental_type"].str.lower()

avg_rent_df.head()

# Load others

Unnamed: 0,municipality,year,rental_type,value
0,red deer,1987,1 - bedroom,377.0
1,red deer,1988,1 - bedroom,379.0
2,red deer,1989,1 - bedroom,388.0
3,red deer,1990,1 - bedroom,413.0
4,red deer,1991,1 - bedroom,429.0


In [32]:
# Load the datasets into individual dataframe
oil_production_df = pd.read_csv("datasets/oil_production_municipality.csv")
oil_production_df = oil_production_df.rename(
    columns={
        "CSD": "municipality",
        "Period": "year",
        "OriginalValue": "value",
    }
)
oil_production_df = oil_production_df.drop("CSDUID", axis=1)
oil_production_df = oil_production_df.drop("IndicatorSummaryDescription", axis=1)
oil_production_df = oil_production_df.drop("UnitOfMeasure", axis=1)
oil_production_df["municipality"] = oil_production_df["municipality"].str.lower()
oil_production_df.head()




Unnamed: 0,municipality,year,value
0,drumheller,2003,3138.0
1,drumheller,2004,3291.5
2,drumheller,2005,5311.0
3,drumheller,2006,5141.0
4,drumheller,2007,5477.0


In [33]:
# Load the datasets into individual dataframe
natural_gas_production_df = pd.read_csv("datasets/natural_gas_production_municipality.csv")
natural_gas_production_df = natural_gas_production_df.rename(
    columns={
        "CSD": "municipality",
        "Period": "year",
        "OriginalValue": "value",
    }
)
natural_gas_production_df = natural_gas_production_df.drop("CSDUID", axis=1)
natural_gas_production_df = natural_gas_production_df.drop("IndicatorSummaryDescription", axis=1)
natural_gas_production_df = natural_gas_production_df.drop("UnitOfMeasure", axis=1)
natural_gas_production_df["municipality"] = natural_gas_production_df["municipality"].str.lower()
natural_gas_production_df.head()



Unnamed: 0,municipality,year,value
0,drumheller,2003,104493.2
1,drumheller,2004,105486.4
2,drumheller,2005,130930.0
3,drumheller,2006,128564.0
4,drumheller,2007,124354.0


In [34]:
# Load the datasets into individual dataframe
natural_gas_price_df = pd.read_csv("datasets/natural_gas_price.csv", encoding='utf-16', sep='\t').T
natural_gas_price_df = natural_gas_price_df.reset_index()
natural_gas_price_df = natural_gas_price_df.rename(columns={'index': 'year'})
cleaned_natural_gas_data = []
for index, row in natural_gas_price_df.iterrows():
    if index == 0:
        continue
    year = int(row[0]) if not pd.isna(row[0]) else 0
    price = row[1] if not pd.isna(row[1]) else 0.0
    cleaned_natural_gas_data.append({'year': year, 'price': price})
natural_gas_price_df = pd.DataFrame(cleaned_natural_gas_data)
natural_gas_price_df.head()

Unnamed: 0,year,price
0,2002,3.95
1,2003,6.16
2,2004,6.31
3,2005,8.23
4,2006,6.43


In [35]:
# Load the datasets into individual dataframe
census_employment_df = pd.read_csv("datasets/census_employment_municipality.csv")
census_employment_df = census_employment_df.rename(
    columns={
        "CSD": "municipality",
        "Period": "year",
        "OriginalValue": "rate",
        "Gender": "gender",
        "IndicatorSummaryDescription": "category"
    }
)
census_employment_df = census_employment_df.drop("CSDUID", axis=1)
census_employment_df = census_employment_df.drop("UnitOfMeasure", axis=1)
census_employment_df["category"] = census_employment_df["category"].str.lower()
census_employment_df["gender"] = census_employment_df["gender"].str.lower()
census_employment_df["municipality"] = census_employment_df["municipality"].str.lower()
census_employment_df.head()

Unnamed: 0,municipality,year,category,gender,rate
0,sedgewick,1981,participation rate,female,0.389
1,sedgewick,1986,participation rate,female,0.397
2,sedgewick,1991,participation rate,female,0.576
3,sedgewick,1996,participation rate,female,0.431
4,sedgewick,2001,participation rate,female,0.587


In [36]:
# Load the datasets into individual dataframe
well_count_df = pd.read_csv("datasets/well_count_municipality.csv")
well_count_df = well_count_df.rename(
    columns={
        "CSD": "municipality",
        "Period": "year",
        "OriginalValue": "value",
    }
)
well_count_df = well_count_df.drop("CSDUID", axis=1)
well_count_df = well_count_df.drop("IndicatorSummaryDescription", axis=1)
well_count_df = well_count_df.drop("UnitOfMeasure", axis=1)
well_count_df["municipality"] = well_count_df["municipality"].str.lower()

well_count_df.head()

Unnamed: 0,municipality,year,value
0,drumheller,2003,10.0
1,drumheller,2004,35.0
2,drumheller,2005,21.0
3,drumheller,2006,17.0
4,drumheller,2007,11.0


In [37]:
# create a set of municipalites
dfs_with_municipalities = [avg_rent_df, oil_production_df, natural_gas_production_df, census_employment_df, well_count_df]

municipalities_df = (
    pd.concat([df['municipality'] for df in dfs_with_municipalities])
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame(name='municipality')
)

municipalities_df = municipalities_df.sort_values(by='municipality').reset_index(drop=True)
municipalities_df['id'] = range(1, len(municipalities_df) + 1)
municipalities_df.head()

Unnamed: 0,municipality,id
0,acadia no. 34,1
1,acme,2
2,airdrie,3
3,alberta beach,4
4,alexander 134,5


In [38]:
# Load the datasets into individual dataframe
oil_price_df = pd.read_csv("datasets/oil_price.csv")
oil_price_df = oil_price_df.rename(
    columns={
        "Date": "date",
        "Value": "value",
    }
)
oil_price_df = oil_price_df.drop("Series", axis=1)
oil_price_df = oil_price_df.drop("labels", axis=1)
oil_price_df.head()

Unnamed: 0,date,value
0,2005-01-01,29.42
1,2005-02-01,28.44
2,2005-03-01,36.5
3,2005-04-01,31.02
4,2005-05-01,27.46


In [39]:
if 'municipality' in avg_rent_df.columns:
    avg_rent_df = avg_rent_df.merge(
        municipalities_df,
        on='municipality',
        how='left'
    )
    avg_rent_df = avg_rent_df.rename(columns={'id': 'municipality_id'})
    avg_rent_df = avg_rent_df.drop("municipality", axis=1)

if 'municipality' in oil_production_df.columns:
    oil_production_df = oil_production_df.merge(
        municipalities_df,
        on='municipality',
        how='left'
    )
    oil_production_df = oil_production_df.rename(columns={'id': 'municipality_id'})
    oil_production_df = oil_production_df.drop("municipality", axis=1)

if 'municipality' in natural_gas_production_df.columns:
    natural_gas_production_df = natural_gas_production_df.merge(
        municipalities_df,
        on='municipality',
        how='left'
    )
    natural_gas_production_df = natural_gas_production_df.rename(columns={'id': 'municipality_id'})
    natural_gas_production_df = natural_gas_production_df.drop("municipality", axis=1)

if 'municipality' in census_employment_df.columns:
    census_employment_df = census_employment_df.merge(
        municipalities_df,
        on='municipality',
        how='left'
    )
    census_employment_df = census_employment_df.rename(columns={'id': 'municipality_id'})
    census_employment_df = census_employment_df.drop("municipality", axis=1)
    
if 'municipality' in well_count_df.columns:
    well_count_df = well_count_df.merge(
        municipalities_df,
        on='municipality',
        how='left'
    )
    well_count_df = well_count_df.rename(columns={'id': 'municipality_id'})
    well_count_df = well_count_df.drop("municipality", axis=1)

municipalities_df = municipalities_df.rename(
    columns={
        "municipality": "name"
    }
)

In [52]:
# define sql connection here
usersname = 'root'
password = 'root'
database_name = 'project_604'
port = 3306

In [54]:
engine = sq.create_engine(
    f"mysql+mysqlconnector://{usersname}:{password}@localhost:{port}/{database_name}"
)

In [56]:
# Drop and recreate the tables
drop_table_query = """
DROP TABLE IF EXISTS `municipalities_rent`;
"""

create_municipalities_rent_table_query = """
CREATE TABLE IF NOT EXISTS `municipalities_rent` (
    `id` BIGINT NOT NULL AUTO_INCREMENT,
    `municipality_id` BIGINT NOT NULL,
    `year` YEAR NOT NULL,
    `rental_type` ENUM('2 - bedroom', '3 - bedroom', 'bachelor', '1 - bedroom') NOT NULL,
    `value` FLOAT NOT NULL,
    PRIMARY KEY (`id`)
);
"""

with engine.connect() as connection:
    connection.execute(sq.text(drop_table_query))
    connection.execute(sq.text(create_municipalities_rent_table_query))
    connection.commit()


In [58]:
# Drop and recreate the tables
drop_table_query = """
DROP TABLE IF EXISTS `municipalities_oil_production`;
"""

create_municipalities_oil_production_query = """
CREATE TABLE IF NOT EXISTS `municipalities_oil_production` (
    `id` BIGINT NOT NULL AUTO_INCREMENT,
    `municipality_id` BIGINT NOT NULL,
    `year` YEAR NOT NULL,
    `value` FLOAT NOT NULL,
    PRIMARY KEY (`id`)
);
"""



with engine.connect() as connection:
    connection.execute(sq.text(drop_table_query))
    connection.execute(sq.text(create_municipalities_oil_production_query))
    connection.commit()

In [60]:
# Drop and recreate the tables
drop_table_query = """
DROP TABLE IF EXISTS `municipalities_natural_gas_production`;
"""

create_municipalities_natural_gas_production_query = """
CREATE TABLE IF NOT EXISTS `municipalities_natural_gas_production`(
    `id` BIGINT NOT NULL AUTO_INCREMENT,
    `municipality_id` BIGINT NOT NULL,
    `year` YEAR NOT NULL,
    `value` FLOAT NOT NULL,
    PRIMARY KEY (`id`)
);
"""



with engine.connect() as connection:
    connection.execute(sq.text(drop_table_query))
    connection.execute(sq.text(create_municipalities_natural_gas_production_query))
    connection.commit()



In [62]:
# Drop and recreate the tables
drop_table_query = """
DROP TABLE IF EXISTS `municipalities_well_count`;
"""

create_municipalities_well_count_query = """
CREATE TABLE IF NOT EXISTS `municipalities_well_count` (
    `id` BIGINT NOT NULL AUTO_INCREMENT,
    `municipality_id` BIGINT NOT NULL,
    `year` YEAR NOT NULL,
    `value` INT NOT NULL,
    PRIMARY KEY (`id`)
);
"""

with engine.connect() as connection:
    connection.execute(sq.text(drop_table_query))
    connection.execute(sq.text(create_municipalities_well_count_query))
    connection.commit()

In [64]:
# Drop and recreate the tables
drop_table_query = """
DROP TABLE IF EXISTS `natural_gas_price`;
"""

create_natural_gas_price_query = """
CREATE TABLE IF NOT EXISTS `natural_gas_price` (
    `id` BIGINT NOT NULL AUTO_INCREMENT,
    `year` YEAR NOT NULL,
    `price` FLOAT NOT NULL,
    PRIMARY KEY (`id`)
);
"""

with engine.connect() as connection:
    connection.execute(sq.text(drop_table_query))
    connection.execute(sq.text(create_natural_gas_price_query))
    connection.commit()

In [66]:
# Drop and recreate the tables
drop_table_query = """
DROP TABLE IF EXISTS `census_employment_rate`;
"""

create_census_employment_rate_query = """
CREATE TABLE IF NOT EXISTS `census_employment_rate`(
    `id` BIGINT NOT NULL AUTO_INCREMENT,
    `municipality_id` BIGINT NOT NULL,
    `year` YEAR NOT NULL,
    `gender` ENUM('male', 'both', 'female', '') NOT NULL,
    `category` ENUM('unemployment rate', 'employment rate', 'participation rate', '') NOT NULL,
    `rate` FLOAT NOT NULL,
    PRIMARY KEY (`id`)
);
"""



with engine.connect() as connection:
    connection.execute(sq.text(drop_table_query))
    connection.execute(sq.text(create_census_employment_rate_query))
    connection.commit()

In [68]:
# Drop and recreate the tables
drop_table_query = """
DROP TABLE IF EXISTS `oil_price`;
"""

create_oil_price_query = """
CREATE TABLE IF NOT EXISTS `oil_price` (
    `id` BIGINT NOT NULL AUTO_INCREMENT,
    `date` DATE NOT NULL,
    `value` FLOAT NOT NULL,
    PRIMARY KEY (`id`)
);
"""

with engine.connect() as connection:
    connection.execute(sq.text(drop_table_query))
    connection.execute(sq.text(create_oil_price_query))
    connection.commit()

In [70]:
# Drop and recreate the tables
drop_table_query = """
DROP TABLE IF EXISTS `municipalities`;
"""

create_municipalities_query = """
CREATE TABLE IF NOT EXISTS `municipalities` (
    `id` BIGINT NOT NULL AUTO_INCREMENT,
    `name` VARCHAR(200) NOT NULL,
    PRIMARY KEY (`id`)
);
"""

with engine.connect() as connection:
    connection.execute(sq.text(drop_table_query))
    connection.execute(sq.text(create_municipalities_query))
    connection.commit()

In [76]:
# Load the dataframe into the database
avg_rent_df.to_sql('municipalities_rent', engine, if_exists='append', index=False, chunksize = 1000)
oil_production_df.to_sql('municipalities_oil_production', engine, if_exists='append', index=False, chunksize = 1000)
natural_gas_production_df.to_sql('municipalities_natural_gas_production', engine, if_exists='append', index=False, chunksize = 1000)
well_count_df.to_sql('municipalities_well_count', engine, if_exists='append', index=False, chunksize = 1000)
census_employment_df.to_sql('census_employment_rate', engine, if_exists='append', index=False, chunksize = 1000)
natural_gas_price_df.to_sql('natural_gas_price', engine, if_exists='append', index=False, chunksize = 1000)
oil_price_df.to_sql('oil_price', engine, if_exists='append', index=False, chunksize = 1000)
municipalities_df.to_sql('municipalities', engine, if_exists='append', index=False,chunksize=1000)

-1

In [95]:
#Query to get the top 5 municipalities with the lowest female employment rate for each year

query4 = '''SELECT municipalities.name AS municipality, gender, category, year
            FROM census_employment_rate
            JOIN municipalities ON (census_employment_rate.municipality_id = municipalities.id)
            WHERE gender = 'female' AND category = 'employment rate'
            ORDER BY year, rate ASC
            LIMIT 5;
            '''
lowest_employment_rate_female = pd.read_sql_query(query4,engine)
lowest_employment_rate_female.head()





Unnamed: 0,municipality,gender,category,year
0,silver sands,female,employment rate,1981
1,rosalind,female,employment rate,1981
2,west cove,female,employment rate,1981
3,duncan's 151a,female,employment rate,1981
4,tall cree 173,female,employment rate,1981


In [83]:
#Query to get the total volume of oil for each municipality during the COVID Pandemic (2020 - 2022)

query = '''SELECT municipalities.name, SUM(value) AS total_volume
            FROM municipalities_oil_production
            JOIN municipalities ON (municipalities_oil_production.municipality_id = municipalities.id)
            WHERE year IN ('2020', '2021','2022')
            GROUP BY municipalities.name; 
            '''
total_oil_volume = pd.read_sql_query(query,engine)
total_oil_volume.head()



Unnamed: 0,name,total_volume
0,acadia no. 34,6081.8
1,athabasca county,788043.8
2,barrhead county no. 11,46613.8
3,beaver county,547377.2
4,big lakes county,7207276.0


In [87]:
#Query to get the total volume of oil for each municipality for the years 2014 - 16

query1 = '''SELECT municipalities.name, SUM(value) AS total_volume
            FROM municipalities_oil_production
            JOIN municipalities ON (municipalities_oil_production.municipality_id = municipalities.id)
            WHERE year IN ('2014', '2015', '2016')
            GROUP BY municipalities.name;
            '''
total_oil_crisis_volume = pd.read_sql_query(query1,engine)
total_oil_crisis_volume.head()

Unnamed: 0,name,total_volume
0,acadia no. 34,4370.6
1,athabasca county,10914.8
2,barrhead county no. 11,90373.8
3,beaver county,840989.8
4,big lakes county,9508549.0


In [89]:
#Query to get the total volume of natural gas for each municipality during the COVID Pandemic (2020 - 2022)

query2 = '''SELECT municipalities.name, SUM(value) AS total_gas
            FROM municipalities_natural_gas_production
            JOIN municipalities ON (municipalities_natural_gas_production.municipality_id = municipalities.id)
            WHERE year IN ('2020', '2021', '2022')
            GROUP BY municipalities.name;
            '''
total_oil_pandemic_volume = pd.read_sql_query(query2,engine)
total_oil_pandemic_volume.head()



Unnamed: 0,name,total_gas
0,acadia no. 34,23654.8
1,athabasca county,425975.4
2,barrhead county no. 11,538336.4
3,beaver county,1111449.0
4,big lakes county,1128556.0


In [93]:
#Query to get the top 5 municipalities by well count

query3 = '''SELECT municipalities.name, SUM(value) AS total_wells 
            FROM municipalities_well_count
            JOIN municipalities ON (municipalities_well_count.municipality_id = municipalities.id)
            GROUP BY municipalities.name
            ORDER BY total_wells desc
            LIMIT 5
            '''
well_count = pd.read_sql_query(query3,engine)
well_count.head()


Unnamed: 0,name,total_wells
0,wood buffalo,60186.0
1,cypress county,25916.0
2,greenview no. 16,24994.0
3,bonnyville no. 87,24646.0
4,newell county,20794.0
