<br><br>

## Connecting to the Database

---

In [1]:
# Import statements
import os
from getpass import getpass

import oracledb
import pandas as pd

In [2]:
# Connecting to the database
# Credentials are taken from the ORACLE_USER / ORACLE_PASSWORD environment
# variables, falling back to an interactive prompt, so that no secret is stored
# in the notebook (where it would leak through sharing or version control).
# NOTE(review): the password previously committed in this cell should be rotated.
dsn = oracledb.makedsn("localhost", 1522, service_name="stu")
db_user = os.environ.get("ORACLE_USER") or input("Oracle username: ")
db_password = os.environ.get("ORACLE_PASSWORD") or getpass("Oracle password: ")
connection = oracledb.connect(user=db_user, password=db_password, dsn=dsn)

In [3]:
# Extract data from the database tables
def fetch_rows(query):
    """Run ``query`` on the open ``connection`` and return every row.

    Parameters
    ----------
    query : str
        SQL statement to execute.

    Returns
    -------
    list
        All result rows, in the order the database returned them.

    The cursor is used as a context manager so it is closed even when the
    query raises, instead of relying on an explicit ``close()`` afterwards.
    """
    with connection.cursor() as cur:
        cur.execute(query)
        return cur.fetchall()


Province = fetch_rows("select * from Province")
Industry = fetch_rows("select * from Industry")
IndustryMapping = fetch_rows("select * from IndustryMapping")
Emissions = fetch_rows("select * from Emissions")


In [4]:
# Store the data from the database as dataframes
# Note the naming: the frame names describe the content, not the source table.
#   Province table        -> population
#   Industry table        -> employment
#   IndustryMapping table -> industry
#   Emissions table       -> emissions
population = pd.DataFrame(
    data=Province,
    columns=['province', 'year', 'Population'],
)

employment = pd.DataFrame(
    data=Industry,
    columns=['Good_Type', 'province', 'year', 'No_Employed'],
)

industry = pd.DataFrame(
    data=IndustryMapping,
    columns=['IndustryCode', 'IndustryName', 'NAICS', 'IOIC', 'Durable'],
)

emissions = pd.DataFrame(
    data=Emissions,
    columns=['province', 'IndustryName', 'IOIC', 'year', 'emissions', 'industry_code'],
)

<br><br>

## Tidying the Data

---

In [5]:
# TIDY THE EMISSIONS DATASET AND CALCULATE EMISSIONS PER INDUSTRY TYPE
# industry codes that mark household (non-industry) emission rows
# NOTE(review): compared as strings; assumes industry_code is stored as text -- confirm
HOUSEHOLD_INDUSTRY_CODES = ['99999', '99998']

# join emissions and industry together (default inner join on the industry code)
emissions_industry = pd.merge(emissions, industry, left_on="industry_code", right_on="IndustryCode")

# filter out canada and keep only the columns needed downstream (this also drops
# the duplicated IndustryName / IOIC columns produced by the merge).
# .copy() gives an independent frame, so adding a column below does not raise
# SettingWithCopyWarning.
emissions_industry = emissions_industry.loc[
    emissions_industry["province"] != "Canada",
    ['province', 'year', 'emissions', 'industry_code', 'Durable'],
].copy()

# create a new column called ProvinceYearDurable in emissions_industry
emissions_industry["ProvinceYearDurable"] = (
    emissions_industry["province"]
    + emissions_industry["year"].astype(str)
    + emissions_industry["Durable"].astype(str)
)

# remove households, put in separate dataframe (mask computed once, used for both halves)
is_household = emissions_industry["industry_code"].isin(HOUSEHOLD_INDUSTRY_CODES)
emissions_household = emissions_industry[is_household].copy()
emissions_industry = emissions_industry[~is_household].copy()

emissions_household_tidy = emissions_household.drop(columns=["industry_code"])

# group by Province and Year to calculate the total household emissions
grouped_emissions_hh = (
    emissions_household_tidy
    .groupby(['province', 'year'])['emissions']
    .sum()
    .reset_index()
)
grouped_emissions_hh["ProvinceYear"] = (
    grouped_emissions_hh["province"] + grouped_emissions_hh["year"].astype(str)
)

# group by Province, Year, and Durable to calculate the total emissions for each good type
grouped_emissions = (
    emissions_industry
    .groupby(['province', 'year', 'Durable'])['emissions']
    .sum()
    .reset_index()
)
grouped_emissions["ProvinceYearDurable"] = (
    grouped_emissions["province"]
    + grouped_emissions["year"].astype(str)
    + grouped_emissions["Durable"].astype(str)
)

<br>

In [12]:
# TIDY THE EMPLOYMENT DATASET AND CALCULATE EMISSIONS PER EMPLOYEE
# No_Employed is stored scaled down by this factor
EMPLOYMENT_SCALE = 1000

# remove rows where Good_Type = Manufacturing or province = Canada
employment_reduced = employment[~((employment["Good_Type"] == "Manufacturing") | (employment["province"] == "Canada"))]

# create a binary variable from Good_Type
durable_mapping = {'Durables': 1, 'Non-durables': 0}


def assign_durable(row):
    """Return the durable flag for one employment row.

    Looks up ``row['Good_Type']`` in ``durable_mapping`` and returns -1 when the
    value is not a known good type. Row-level equivalent of the vectorised
    mapping used below; retained in case a later cell calls it.
    """
    good_type_value = row['Good_Type']
    return durable_mapping.get(good_type_value, -1)


# build the tidy frame from the *filtered* rows only: map Good_Type to the durable
# flag (-1 for unmapped values), then add the ProvinceYearDurable join key
employment_tidy = (
    employment_reduced[['province', 'year', 'No_Employed']]
    .assign(durable=employment_reduced["Good_Type"].map(durable_mapping).fillna(-1).astype(int))
    .assign(
        ProvinceYearDurable=lambda df: df["province"] + df["year"].astype(str) + df["durable"].astype(str)
    )
)

# join the grouped_emissions with employment_tidy, then drop the merge key and the
# duplicate province/year/durable columns that came from employment_tidy
emissions_industry_employment = (
    pd.merge(grouped_emissions, employment_tidy, on="ProvinceYearDurable")
    .drop(columns=["ProvinceYearDurable", "province_y", "year_y", "durable"])
    .rename(columns={'province_x': 'province', 'year_x': 'year'})
)

# calculate the emissions per employee by dividing emissions by No_Employed
# No_Employed is first rescaled, as the current data is scaled down by 1000
emissions_industry_employment["no_employed_rescaled"] = (
    emissions_industry_employment["No_Employed"] * EMPLOYMENT_SCALE
)
emissions_industry_employment["emissions_per_employee"] = (
    emissions_industry_employment["emissions"] / emissions_industry_employment["no_employed_rescaled"]
)

emissions_industry_employment.head()

Unnamed: 0,province,year,Durable,emissions,No_Employed,no_employed_rescaled,emissions_per_employee
0,Alberta,2012,0,27513,50.6,50600.0,0.543735
1,Alberta,2012,1,4249,88.8,88800.0,0.047849
2,Alberta,2013,0,28619,59.9,59900.0,0.47778
3,Alberta,2013,1,4921,82.8,82800.0,0.059432
4,Alberta,2014,0,27933,56.5,56500.0,0.494389


<br>

In [None]:
# TIDY THE POPULATION DATASET AND CALCULATE EMISSIONS PER CAPITA FOR HOUSEHOLDS
# remove rows where province is Canada; .copy() makes an independent frame so the
# key column added below does not raise SettingWithCopyWarning
population_reduced = population[population['province'] != "Canada"].copy()

# make a Province Year key
population_reduced["ProvinceYear"] = population_reduced["province"] + population_reduced["year"].astype(str)

# join the grouped_emissions_hh with population_reduced, then drop the key.
# Both inputs carry province and year, so the result has province_x / year_x
# (from the emissions side) and province_y / year_y (from the population side).
# NOTE(review): the employment cell drops the *_y duplicates and renames *_x;
# consider doing the same here once no later cell relies on the suffixed names.
emissions_household_population = (
    pd.merge(grouped_emissions_hh, population_reduced, on="ProvinceYear")
    .drop(columns="ProvinceYear")
)

# calculate the emissions per capita by dividing emissions by population
emissions_household_population["emissions_per_capita"] = (
    emissions_household_population["emissions"] / emissions_household_population["Population"]
)

In [11]:
# Preview the first rows of the household emissions-per-capita table
emissions_household_population.head()

Unnamed: 0,province_x,year_x,emissions,province_y,year_y,Population,emissions_per_capita
0,Alberta,2012,18637,Alberta,2012,3897746,0.004781
1,Alberta,2013,19916,Alberta,2013,4008421,0.004969
2,Alberta,2014,20041,Alberta,2014,4103702,0.004884
3,Alberta,2015,18754,Alberta,2015,4163048,0.004505
4,Alberta,2016,19015,Alberta,2016,4207139,0.00452
