In [51]:
#Importing Dependencies
%matplotlib inline
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import plotly.express as px
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn import utils
from functools import reduce
from sklearn.cluster import KMeans
import psycopg2
from sqlalchemy import create_engine
from config import DB_HOST, DB_USER, DB_PASS, DB_PORT

In [52]:
# Create the SQLAlchemy engine used to access the online database.
# NOTE(review): the credentials are interpolated into the URL verbatim; a
# password containing characters such as '@', ':' or '/' would presumably
# need URL-escaping first -- confirm against the values stored in config.
connection_url = f'postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/us_emissions_db'
engine = create_engine(connection_url)

In [53]:
# Read the merged table and the year-averages table from the database.
year_avg_columns = ['year', 'avgpop', 'avggdp', 'avgghg']

merged_table = pd.read_sql_table(table_name='gdp_pop_sector_emissions', con=engine)
year_avg = pd.read_sql_table(table_name='year_averages', con=engine)

# Positional rename: assumes 'year_averages' returns exactly these four
# columns in this order -- TODO confirm against the database schema.
year_avg.columns = year_avg_columns

In [54]:
# Preview the first five rows of the merged table and of the year-averages table.
display(merged_table.head(5), year_avg.head(5))

Unnamed: 0,Country,State,Year,GDP,Population,Sector,allghg,CH4,CO2,N2O,F-Gas
0,US,AL,1997,37247870000.0,4367935,Agriculture,6.889241,3.392917,0.016045,4.028508,1.12829
1,US,AL,1998,39368060000.0,4404701,Agriculture,6.954451,3.137611,0.016269,3.75163,1.290703
2,US,AL,1999,41513660000.0,4430141,Agriculture,6.560038,3.074048,0.013043,3.880403,1.376542
3,US,AL,2000,42583370000.0,4452173,Agriculture,6.298967,2.959365,0.00242,3.600673,1.521127
4,US,AL,2001,43348580000.0,4467634,Agriculture,6.374233,2.89973,0.007422,3.399237,1.600098


Unnamed: 0,year,avgpop,avggdp,avgghg
0,1997,5346018.0,63360210000.0,19.7233
1,1998,5408904.0,65290910000.0,19.659627
2,1999,5471376.0,69399200000.0,19.800945
3,2000,5532596.0,72151530000.0,20.297083
4,2001,5587627.0,74771590000.0,19.961042


# Random Forest Regressor Model

In [55]:
# Features and target for the Random Forest Regressor fitted on the yearly
# averages table: population and GDP averages in, GHG average out.
feature_columns = ['avgpop', 'avggdp']
target_column = 'avgghg'

X = year_avg[feature_columns]
y = year_avg[target_column]

In [56]:
# Split into train/test sets, then standardize the features with a scaler
# fitted on the training split only.
# NOTE(review): the model cell that follows fits on the unscaled X_train /
# X_test; the scaled arrays are not consumed in the visible part of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [57]:
# Fit the random forest on the (unscaled) training split, then report the
# value of `score` on the training and testing splits.
clf = RandomForestRegressor(n_estimators=50, random_state=1)
clf.fit(X_train, y_train)

train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)

print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')

Training Score: 0.963617877979699
Testing Score: 0.6901170803933703


In [119]:
#Create function to train and score a GHG model on Population & GDP per sector & state
def rfr_model(state, sector, n_estimators=50, random_state=1):
    """Fit a Random Forest Regressor for one state/sector and print its scores.

    Rows of the module-level ``merged_table`` whose ``State`` and ``Sector``
    match the arguments are selected, ``allghg`` is regressed on
    ``Population`` and ``GDP``, and the training and testing scores returned
    by ``RandomForestRegressor.score`` are printed.

    Parameters
    ----------
    state : value compared against ``merged_table['State']`` (e.g. ``'NJ'``).
    sector : value compared against ``merged_table['Sector']``
        (e.g. ``'Agriculture'``).
    n_estimators : int, default 50
        Number of trees passed to ``RandomForestRegressor``.
    random_state : int, default 1
        Seed used for both the train/test split and the forest.

    Returns
    -------
    None
        The scores are only printed.

    Raises
    ------
    ValueError
        If fewer than two rows match ``state`` and ``sector``, since the data
        could not be split into non-empty train and test sets.
    """
    is_selected = (merged_table['State'] == state) & (merged_table['Sector'] == sector)
    model_data = merged_table.loc[is_selected]

    # Fail early with an actionable message instead of letting the split
    # raise a generic error about an empty train set.
    if len(model_data) < 2:
        raise ValueError(
            f'Need at least 2 rows to build train/test sets, found {len(model_data)} '
            f'for state={state!r}, sector={sector!r}'
        )

    X = model_data[['Population', 'GDP']]
    y = model_data['allghg']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    # No feature scaling here: the previous StandardScaler output was never
    # passed to the model, and tree splits are threshold-based, so the forest
    # is fitted directly on the raw features exactly as before.
    clf = RandomForestRegressor(random_state=random_state, n_estimators=n_estimators).fit(X_train, y_train)
    print(f'Training Score: {clf.score(X_train, y_train)}')
    print(f'Testing Score: {clf.score(X_test, y_test)}')

In [121]:
# Train and score the model for state 'NJ' in the 'Agriculture' sector.
rfr_model(state='NJ', sector='Agriculture')

Training Score: 0.9493439960319076
Testing Score: 0.6027889163029333


