In [176]:
#Importing Dependencies
%matplotlib inline
import pandas as pd
import numpy as np
from numpy import random
import tensorflow as tf
import matplotlib.pyplot as plt
import plotly.express as px
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn import utils
from functools import reduce
from sklearn.cluster import KMeans
import psycopg2
from sqlalchemy import create_engine
from config import DB_HOST, DB_USER, DB_PASS, DB_PORT

In [177]:
#Creating engine to access online databse
engine = create_engine(f'postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/us_emissions_db')

In [178]:
#reading merged table and year averages table
merged_table = pd.read_sql_table('gdp_pop_sector_emissions', con=engine)
year_avg = pd.read_sql_table('year_averages', con=engine)
year_avg.columns = ['year', 'avgpop', 'avggdp', 'avgghg']

In [179]:
#displaying merged table and year table
display(merged_table.head())
display(year_avg.head())

Unnamed: 0,index,Country,State,Year,GDP,Population,Sector,allghg,CH4,CO2,N2O,F-Gas,outcomes
0,0,US,AL,1997,37247870000.0,4367935,Agriculture,6.889241,3.392917,0.016045,4.028508,1.12829,0
1,1,US,AL,1998,39368060000.0,4404701,Agriculture,6.954451,3.137611,0.016269,3.75163,1.290703,1
2,2,US,AL,1999,41513660000.0,4430141,Agriculture,6.560038,3.074048,0.013043,3.880403,1.376542,0
3,3,US,AL,2000,42583370000.0,4452173,Agriculture,6.298967,2.959365,0.00242,3.600673,1.521127,0
4,4,US,AL,2001,43348580000.0,4467634,Agriculture,6.374233,2.89973,0.007422,3.399237,1.600098,1


Unnamed: 0,year,avgpop,avggdp,avgghg
0,1997,5346018.0,63360210000.0,19.7233
1,1998,5408904.0,65290910000.0,19.659627
2,1999,5471376.0,69399200000.0,19.800945
3,2000,5532596.0,72151530000.0,20.297083
4,2001,5587627.0,74771590000.0,19.961042


In [229]:
# Encode State, Sector, and Year into Labels
le = LabelEncoder()
encoded_table = merged_table.copy()
encoded_table['StateEncoded'] = le.fit_transform(encoded_table['State'])
# encoded_table['Year'] = le.fit_transform(encoded_table['Year'])
encoded_table['SectorEncoded'] = le.fit_transform(encoded_table['Sector'])
# encoded_table_no_lucf = encoded_table.loc[encoded_table['Sector'] != 'Land-Use Change and Forestry']
encoded_table = encoded_table.copy()
encoded_table

Unnamed: 0,index,Country,State,Year,GDP,Population,Sector,allghg,CH4,CO2,N2O,F-Gas,outcomes,StateEncoded,SectorEncoded
0,0,US,AL,1997,3.724787e+10,4367935,Agriculture,6.889241,3.392917,0.016045,4.028508,1.128290,0,1,0
1,1,US,AL,1998,3.936806e+10,4404701,Agriculture,6.954451,3.137611,0.016269,3.751630,1.290703,1,1,0
2,2,US,AL,1999,4.151366e+10,4430141,Agriculture,6.560038,3.074048,0.013043,3.880403,1.376542,0,1,0
3,3,US,AL,2000,4.258337e+10,4452173,Agriculture,6.298967,2.959365,0.002420,3.600673,1.521127,0,1,0
4,4,US,AL,2001,4.334858e+10,4467634,Agriculture,6.374233,2.899730,0.007422,3.399237,1.600098,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13459,13459,US,WY,2014,1.827150e+10,583159,Waste,0.449731,0.000000,0.000000,0.000000,0.000000,1,50,11
13460,13460,US,WY,2015,1.651612e+10,586389,Waste,0.456765,0.000000,0.000000,0.000000,0.000000,1,50,11
13461,13461,US,WY,2016,1.516490e+10,585243,Waste,0.463380,0.000000,0.000000,0.000000,0.000000,1,50,11
13462,13462,US,WY,2017,1.597636e+10,579994,Waste,0.469695,0.000000,0.000000,0.000000,0.000000,1,50,11


# Random Forest Regressor Model

In [244]:
#Creating target and features for Random Forest Regressor Model to be used on the Avg Table
# X = year_avg[['avgpop','avggdp']]
# y = year_avg['avgghg']
X = encoded_table[['StateEncoded','SectorEncoded','GDP', 'Population', 'allghg']]
y = encoded_table['outcomes']

In [245]:
#Scaling the data
X_train, X_test, y_train, y_test = train_test_split(X, y)
# scaler = StandardScaler().fit(X_train)
# X_train_scaled = scaler.transform(X_train)
# X_test_scaled = scaler.transform(X_test)

In [246]:
#Train and test data
clf = RandomForestRegressor(random_state = 42, n_estimators=200).fit(X_train, y_train)

print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')



Training Score: 0.8715572277852635
Testing Score: 0.06323928274139812




In [247]:
#Create function to predict ghg based on Population & GDP per state
def rfr_model(state, sector):
    model_data = encoded_table.loc[(encoded_table['State']==state) & (encoded_table['Sector']==sector)]
    X = model_data[['StateEncoded','SectorEncoded','GDP', 'Population', 'allghg']]
    y = model_data['outcomes']
    scaler = StandardScaler().fit(X)
#     X_test_scaled = scaler.transform(X)
    print(f'Testing Score: {clf.score(X, y)}')

In [251]:
rfr_model('PA', 'Agriculture')

Testing Score: 0.7082226495726496


