In [1]:
import pandas as pd
import os
import statsmodels.api as sm
from sklearn import linear_model
import numpy as np

## This notebooks explores the relationship between a state's number of vehicle registrations, population, GDP, GDP by industry,  and C02 emissions on it's energy consumption within different regions of the US. 

### The goal is to model energy consuption for state's within different regions of the US by using the data listed above. With this model we can make energy consuption predictions and understand what leads to high energy consuption.

### The contents of the notebook include
- #### Data Gathering
    - read in the dataframes that have been cleaned by data_gathering_and_cleaning notebook
- #### Data analysis 
    - create a multiple linear regression model for energy consuption
- #### Conclusion
    - Discuss what we discovered and draw conclusions
    
Note: If there are no files in the Data/cleaned diretory, you will need to run the 'data_gathering_and_cleaning" notebook to clwan and write out the files to that directory.

### Data Gathering
#### This section of the notebooks reads in the data files and stores them im pandas dataframes.
The dataframes frames in this section all have columns of represting years ranging from [1967-2020] and rows for each state.

In [2]:
csv_path = os.path.join(os.getcwd(), "data/cleaned/csv")
excel_path = os.path.join(os.getcwd(), "data/cleaned/excel")

In [3]:
#Read in all datasets here 

vehicle_registration_df = pd.read_csv(os.path.join(csv_path, "vehicle_registrations_by_state.csv"))
energy_consumption_per_real_gdp_df = pd.read_csv(os.path.join(csv_path, "energy_consumption_per_real_gdp.csv"))
current_dollar_gdp_df = pd.read_csv(os.path.join(csv_path, "Current_dollar_GDP.csv")) #in millions
total_consuption_df = pd.read_csv(os.path.join(csv_path, "total_consuption.csv")) #in billion Btu
industy_gdp_by_state_df = pd.read_csv(os.path.join(csv_path, "industy_gdp_by_state.csv"))
total_population_df = pd.read_csv(os.path.join(csv_path, "total_population.csv"))
real_gdp_df = pd.read_csv(os.path.join(csv_path, "real_GDP.csv")) #in millions
co2_emissions_df = pd.read_excel(os.path.join(excel_path, "co2_emissions.xlsx"))
tavg_df =  pd.read_csv(os.path.join(csv_path + '/NOA', "TAVG.csv"))
wind_df =  pd.read_csv(os.path.join(csv_path + '/NOA', "DYHF.csv"))
tmax_df =  pd.read_csv(os.path.join(csv_path + '/NOA', "TMAX.csv"))
tmin_df =  pd.read_csv(os.path.join(csv_path + '/NOA', "TMIN.csv"))
precip_df = pd.read_csv(os.path.join(csv_path + '/NOA', "PRCP.csv"))
snow_df = pd.read_csv(os.path.join(csv_path + '/NOA', "SNOW.csv"))

In [4]:
#Use the columns that are in each dataframe after columns with empty values have been dropped. 
columns_to_evaluate = list(set(vehicle_registration_df.columns).intersection(total_population_df.columns).intersection(total_consuption_df.columns).intersection(real_gdp_df.columns).intersection(industy_gdp_by_state_df.columns).intersection(co2_emissions_df.columns).intersection(tavg_df.columns).intersection(wind_df.columns).intersection(tmax_df.columns).intersection(tmin_df.columns).intersection(precip_df.columns).intersection(snow_df.columns))
columns_to_evaluate

['2016',
 '2015',
 '2010',
 '2008',
 '2011',
 '2018',
 '2019',
 '2007',
 '2014',
 'Unnamed: 0',
 '2009',
 '2012',
 '2013',
 '2017']

In [5]:
#ensure each column we are going to evaluate has the same number of values 
for col in columns_to_evaluate:
    if(not (len(vehicle_registration_df[col]) == len(total_consuption_df[col]) == len(total_population_df[col]) == len(real_gdp_df[col])== len(industy_gdp_by_state_df[col]) == len(co2_emissions_df[col]) == len(tavg_df[col])== len(wind_df[col])== len(tmax_df[col])== len(tmin_df[col])== len(precip_df[col])== len(snow_df[col]))):
        print("unequal entries for column:" + col)

In [6]:
west = ["California", "Hawaii", "Nevada", "Colorado", "Idaho", "Montana", "Utah", "Wyoming", "Oregon", "Washington", "Alaska"]
south_west = ["New Mexico", "Arizona", "Texas", "Oklahoma"]
mid_west = ["Iowa", "Kansas", "Missouri", "Nebraska", "North Dakota", "South Dakota", "Illinois", "Indiana", "Michigan", "Minnesota", "Ohio", "Wisconsin"]
south_east = ["Alabama", "Florida", "Georgia", "Mississippi", "South Carolina", "Arkansas", "Louisiana", "Delaware", "Kentucky", "Maryland", "North Carolina", "Tennessee", "Virginia", "West Virginia"]
north_east = ["New Jersey", "New York", "Pennsylvania", "Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island", "Vermont"]

In [7]:
west_abr = ["CA", "HI", "NV", "CO", "ID", "MT", "UT", "WY", "OR", "WA", "AK"]
southwest_abr = ["NM", "AZ", "TX", "OK"]
midwest_abr = ["IA", "KS", "MO", "NE", "ND", "SD", "IL", "IN", "MI", "MN", "OH", "WI"]
southeast_abr = ["AL", "FL", "GA", "MS", "SC", "AR", "LA", "DE", "KY", "MD", "NC", "TN", "VA", "WV"]
northeast_abr = ["NJ", "NY", "PA", "CT", "ME", "MA", "NH", "RI", "VT"]

### Data Analysis

#### This section of the notebooks creates a multiple linear regression model for a state's energy consuption.


##### In the model summary each variable is represented by the following 
- x1: Vehicle regisrations
- x2: Population
- x3: GDP per capita
- x4: Industry GDP per capita 
- x5: C02 emissions
- x6: Average tempature
- x7: Average wind speed
- x8: Maximum tempature
- x9: Minimum tempature
- x10: Total precipitation
- x11: Total snow fall

There are some other values in the summary that give us a good indication as to how well our model fits energy consuption such at the r squared value and F statistic.

In [9]:
# loop through the data frames and add each value to data_point_pairs array. 
# The data_point_pairs array will be the [vehicle registration, population, GDP, Industry GDP] value for each year and each state
# The total_consumption_vals will be the cooresponding energy consuption value 
# for the [vehicle registration, population, GDP, Industry GDP, C02 emissions, average tempature, average wind speed, max temperature, min tempature, total precipitation, total snowfall] data point
west_data_point_pairs = []
west_total_consumption_vals = []

southwest_data_point_pairs = []
southwest_total_consumption_vals = []

midwest_data_point_pairs = []
midwest_total_consumption_vals = []

southeast_data_point_pairs = []
southeast_total_consumption_vals = []

northeast_data_point_pairs = []
northeast_total_consumption_vals = []
for col in columns_to_evaluate:
    for i in range(0,50):
        pair = [vehicle_registration_df.iloc[i][col], total_population_df.iloc[i][col], real_gdp_df.iloc[i][col], industy_gdp_by_state_df.iloc[i][col], co2_emissions_df.iloc[i][col], tavg_df.iloc[i][col],wind_df.iloc[i][col],tmax_df.iloc[i][col],tmin_df.iloc[i][col],precip_df.iloc[i][col],snow_df.iloc[i][col]]
        if(total_consuption_df.iloc[i]['State'] in west_abr) :
            west_data_point_pairs.append(pair)
            west_total_consumption_vals.append(total_consuption_df.iloc[i][col]) 
        
        if(total_consuption_df.iloc[i]['State'] in southwest_abr) :
            southwest_data_point_pairs.append(pair)
            southwest_total_consumption_vals.append(total_consuption_df.iloc[i][col]) 
        
        if(total_consuption_df.iloc[i]['State'] in midwest_abr) :
            midwest_data_point_pairs.append(pair)
            midwest_total_consumption_vals.append(total_consuption_df.iloc[i][col]) 
        
        if(total_consuption_df.iloc[i]['State'] in southeast_abr) :
            southeast_data_point_pairs.append(pair)
            southeast_total_consumption_vals.append(total_consuption_df.iloc[i][col]) 
        
        if(total_consuption_df.iloc[i]['State'] in northeast_abr) :
            northeast_data_point_pairs.append(pair)
            northeast_total_consumption_vals.append(total_consuption_df.iloc[i][col]) 

### Multiple linear regression model for energy consuption of states in the western region of US 
#### California, Hawaii, Nevada, Colorado, Idaho, Montana, Utah, Wyoming, Oregon, Washington, Alaska

In [10]:
X = west_data_point_pairs
y = west_total_consumption_vals
lm = linear_model.LinearRegression()
model = lm.fit(X,y)


# predict energy consuption for vehicle registration = 4610845 , population =699 (10,000), GDP = 55911, 
# Industry GDP = 9717, C02 emissions = 121, Average tempature = 6.7, Average Wind Speed = 2.5
# Maximim tempature = 14.07, Mimimum tempature = -0.44, Total Precipitation = 47, Total snowfall: 190 
predictions = lm.predict([[4610845, 699, 55911, 9717, 121,  6.7, 2.5, 14.07, -0.44, 47, 190]])
print("Predicted energy consumpion: ", predictions )

model = sm.OLS(y, X).fit()
model.summary()

Predicted energy consumpion:  [623040.32644597]


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.996
Model:,OLS,Adj. R-squared (uncentered):,0.995
Method:,Least Squares,F-statistic:,3094.0
Date:,"Sat, 04 Dec 2021",Prob (F-statistic):,4.3000000000000005e-164
Time:,17:41:32,Log-Likelihood:,-2060.8
No. Observations:,154,AIC:,4144.0
Df Residuals:,143,BIC:,4177.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0233,0.007,3.208,0.002,0.009,0.038
x2,177.1333,17.453,10.149,0.000,142.635,211.632
x3,0.3589,0.314,1.142,0.255,-0.262,0.980
x4,-6.2872,1.869,-3.364,0.001,-9.981,-2.593
x5,490.5079,322.373,1.522,0.130,-146.724,1127.740
x6,1.158e+05,3.17e+05,0.365,0.716,-5.12e+05,7.43e+05
x7,2.32e+04,5794.074,4.003,0.000,1.17e+04,3.46e+04
x8,-2.892e+04,1.59e+05,-0.182,0.856,-3.43e+05,2.85e+05
x9,-1.108e+05,1.58e+05,-0.699,0.486,-4.24e+05,2.02e+05

0,1,2,3
Omnibus:,15.764,Durbin-Watson:,2.43
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17.331
Skew:,0.769,Prob(JB):,0.000172
Kurtosis:,3.581,Cond. No.,302000000.0


### Multiple linear regression model for energy consuption of states in the south western region of US 
#### New Mexico, Arizona, Texas, Oklahoma

In [10]:
X = southwest_data_point_pairs
y = southwest_total_consumption_vals
lm = linear_model.LinearRegression()
model = lm.fit(X,y)

# predict energy consuption for vehicle registration = 4610845 , population =699 (10,000), GDP = 55911, 
# Industry GDP = 9717, C02 emissions = 121, Average tempature = 6.7, Average Wind Speed = 2.5
# Maximim tempature = 14.07, Mimimum tempature = -0.44, Total Precipitation = 47, Total snowfall: 190 
predictions = lm.predict([[4610845, 699, 55911, 9717, 121,  6.7, 2.5, 14.07, -0.44, 47, 190]])
print("Predicted energy consumpion: ", predictions )

model = sm.OLS(y, X).fit()
model.summary()

Predicted energy consumpion for 
vehicle registration = 4610845 , population = 699 (10,000), GDP = 55911, Industry GDP = 9717, C02 emissions = 121 
 [525843.69655571]


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.996
Model:,OLS,Adj. R-squared (uncentered):,0.995
Method:,Least Squares,F-statistic:,2269.0
Date:,"Tue, 30 Nov 2021",Prob (F-statistic):,1.29e-58
Time:,18:24:34,Log-Likelihood:,-803.04
No. Observations:,56,AIC:,1616.0
Df Residuals:,51,BIC:,1626.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0141,0.022,0.635,0.528,-0.030,0.059
x2,-205.7570,54.021,-3.809,0.000,-314.208,-97.306
x3,11.1064,0.933,11.908,0.000,9.234,12.979
x4,-4.3540,2.605,-1.671,0.101,-9.584,0.876
x5,2490.9157,572.492,4.351,0.000,1341.591,3640.240

0,1,2,3
Omnibus:,1.631,Durbin-Watson:,1.743
Prob(Omnibus):,0.442,Jarque-Bera (JB):,1.259
Skew:,0.367,Prob(JB):,0.533
Kurtosis:,2.999,Cond. No.,113000.0


### Multiple linear regression model for energy consuption of states in the mid western region of US 
#### Iowa, Kansas, Missouri, Nebraska, North Dakota, South Dakota, Illinois, Indiana, Michigan, Minnesota, Ohio, Wisconsin

In [11]:
X = midwest_data_point_pairs
y = midwest_total_consumption_vals
lm = linear_model.LinearRegression()
model = lm.fit(X,y)

#predict energy consuption for vehicle registration = 4610845 , population =699 (10,000), GDP = 55911, Industry GDP = 9717, C02 emissions = 121
predictions = lm.predict([[4610845, 699, 55911, 9717, 121]])
print("Predicted energy consumpion for \nvehicle registration = 4610845 , population = 699 (10,000), GDP = 55911, Industry GDP = 9717, C02 emissions = 121 \n", predictions )

model = sm.OLS(y, X).fit()
model.summary()

Predicted energy consumpion for 
vehicle registration = 4610845 , population = 699 (10,000), GDP = 55911, Industry GDP = 9717, C02 emissions = 121 
 [446014.2046418]


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.981
Model:,OLS,Adj. R-squared (uncentered):,0.98
Method:,Least Squares,F-statistic:,1674.0
Date:,"Tue, 30 Nov 2021",Prob (F-statistic):,4.35e-138
Time:,18:24:35,Log-Likelihood:,-2357.2
No. Observations:,168,AIC:,4724.0
Df Residuals:,163,BIC:,4740.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0337,0.017,2.012,0.046,0.001,0.067
x2,297.2743,37.067,8.020,0.000,224.081,370.467
x3,0.6464,0.646,1.001,0.318,-0.629,1.922
x4,-7.1175,4.918,-1.447,0.150,-16.828,2.593
x5,-683.2683,659.274,-1.036,0.302,-1985.087,618.550

0,1,2,3
Omnibus:,2.032,Durbin-Watson:,2.354
Prob(Omnibus):,0.362,Jarque-Bera (JB):,1.928
Skew:,0.261,Prob(JB):,0.381
Kurtosis:,2.956,Cond. No.,131000.0


### Multiple linear regression model for energy consuption of states in the south eastern region of US 
#### Alabama, Florida, Georgia, Mississippi, South Carolina, Arkansas, Louisiana, Delaware, Kentucky, Maryland, North Carolina, Tennessee, Virginia, West Virginia

In [12]:
X = southeast_data_point_pairs
y = southeast_total_consumption_vals
lm = linear_model.LinearRegression()
model = lm.fit(X,y)

#predict energy consuption for vehicle registration = 4610845 , population =699 (10,000), GDP = 55911, Industry GDP = 9717, C02 emissions = 121
predictions = lm.predict([[4610845, 699, 55911, 9717, 121]])
print("Predicted energy consumpion for \nvehicle registration = 4610845 , population = 699 (10,000), GDP = 55911, Industry GDP = 9717, C02 emissions = 121 \n", predictions )

model = sm.OLS(y, X).fit()
model.summary()

Predicted energy consumpion for 
vehicle registration = 4610845 , population = 699 (10,000), GDP = 55911, Industry GDP = 9717, C02 emissions = 121 
 [1389339.54799597]


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.937
Model:,OLS,Adj. R-squared (uncentered):,0.935
Method:,Least Squares,F-statistic:,566.1
Date:,"Tue, 30 Nov 2021",Prob (F-statistic):,1.9599999999999999e-112
Time:,18:24:35,Log-Likelihood:,-2874.8
No. Observations:,196,AIC:,5760.0
Df Residuals:,191,BIC:,5776.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.0317,0.021,-1.544,0.124,-0.072,0.009
x2,-104.7517,54.275,-1.930,0.055,-211.807,2.304
x3,7.3677,1.101,6.690,0.000,5.195,9.540
x4,-27.3487,6.842,-3.997,0.000,-40.844,-13.853
x5,1.146e+04,794.447,14.426,0.000,9893.714,1.3e+04

0,1,2,3
Omnibus:,92.23,Durbin-Watson:,1.72
Prob(Omnibus):,0.0,Jarque-Bera (JB):,493.859
Skew:,1.742,Prob(JB):,5.75e-108
Kurtosis:,9.952,Cond. No.,111000.0


### Multiple linear regression model for energy consuption of states in the north eastern region of US 
#### New Jersey, New York, Pennsylvania, Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island, Vermont

In [13]:
X = northeast_data_point_pairs
y = northeast_total_consumption_vals
lm = linear_model.LinearRegression()
model = lm.fit(X,y)

#predict energy consuption for vehicle registration = 4610845 , population =699 (10,000), GDP = 55911, Industry GDP = 9717, C02 emissions = 121
predictions = lm.predict([[4610845, 699, 55911, 9717, 121]])
print("Predicted energy consumpion for \nvehicle registration = 4610845 , population = 699 (10,000), GDP = 55911, Industry GDP = 9717, C02 emissions = 121 \n", predictions )

model = sm.OLS(y, X).fit()
model.summary()

Predicted energy consumpion for 
vehicle registration = 4610845 , population = 699 (10,000), GDP = 55911, Industry GDP = 9717, C02 emissions = 121 
 [120164.43573636]


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.99
Model:,OLS,Adj. R-squared (uncentered):,0.99
Method:,Least Squares,F-statistic:,2412.0
Date:,"Tue, 30 Nov 2021",Prob (F-statistic):,2.3999999999999998e-119
Time:,18:24:36,Log-Likelihood:,-1714.1
No. Observations:,126,AIC:,3438.0
Df Residuals:,121,BIC:,3452.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0209,0.012,1.769,0.079,-0.002,0.044
x2,505.5517,27.304,18.516,0.000,451.496,559.607
x3,-4.4268,0.402,-11.019,0.000,-5.222,-3.631
x4,-2.0219,2.235,-0.905,0.367,-6.446,2.403
x5,-40.5176,616.668,-0.066,0.948,-1261.375,1180.340

0,1,2,3
Omnibus:,21.162,Durbin-Watson:,2.2
Prob(Omnibus):,0.0,Jarque-Bera (JB):,59.583
Skew:,0.557,Prob(JB):,1.15e-13
Kurtosis:,6.18,Cond. No.,172000.0


### Conclusion

#### This section of the notebooks discusses the results