In [1]:
import pandas as pd
import os
import statsmodels.api as sm
from sklearn import linear_model
import numpy as np

## This notebooks explores the relationship between a state's number of vehicle registrations, population, GDP per capita, GDP per capita by industry,  C02 emissions, average yearly tempature, average yearly windspeed, minimum yearly tempature, maximim yearly tempature, total yearly precipitation , and total yearly snowfall on it's energy consumption within different regions of the US. 

### The goal is to model energy consuption for state's within different regions of the US by using the data listed above. With this model we can make energy consuption predictions and understand what leads to high energy consuption.

### The contents of the notebook include
- #### Data Gathering
    - read in the dataframes that have been cleaned by data_gathering_and_cleaning notebook
- #### Data analysis 
    - create a multiple linear regression model for energy consuption
- #### Conclusion
    - Discuss what we discovered and draw conclusions
    
Note: If there are no files in the Data/cleaned diretory, you will need to run the 'data_gathering_and_cleaning" notebook to clwan and write out the files to that directory.

### Data Gathering
#### This section of the notebooks reads in the data files and stores them im pandas dataframes.
The dataframes frames in this section all have columns of represting years ranging from [1967-2020] and rows for each state.

In [2]:
csv_path = os.path.join(os.getcwd(), "data/cleaned/csv")
excel_path = os.path.join(os.getcwd(), "data/cleaned/excel")

In [3]:
#Read in all datasets here 

vehicle_registration_df = pd.read_csv(os.path.join(csv_path, "vehicle_registrations_by_state.csv"))
energy_consumption_per_real_gdp_df = pd.read_csv(os.path.join(csv_path, "energy_consumption_per_real_gdp.csv"))
current_dollar_gdp_df = pd.read_csv(os.path.join(csv_path, "Current_dollar_GDP.csv")) #in millions
total_consuption_df = pd.read_csv(os.path.join(csv_path, "total_consuption.csv")) #in billion Btu
industy_gdp_by_state_df = pd.read_csv(os.path.join(csv_path, "industy_gdp_by_state.csv"))
total_population_df = pd.read_csv(os.path.join(csv_path, "total_population.csv"))
real_gdp_df = pd.read_csv(os.path.join(csv_path, "real_GDP.csv")) #in millions
co2_emissions_df = pd.read_excel(os.path.join(excel_path, "co2_emissions.xlsx"))
tavg_df =  pd.read_csv(os.path.join(csv_path + '/NOA', "TAVG.csv"))
wind_df =  pd.read_csv(os.path.join(csv_path + '/NOA', "DYHF.csv"))
tmax_df =  pd.read_csv(os.path.join(csv_path + '/NOA', "TMAX.csv"))
tmin_df =  pd.read_csv(os.path.join(csv_path + '/NOA', "TMIN.csv"))
precip_df = pd.read_csv(os.path.join(csv_path + '/NOA', "PRCP.csv"))
snow_df = pd.read_csv(os.path.join(csv_path + '/NOA', "SNOW.csv"))

In [4]:
#Use the columns that are in each dataframe after columns with empty values have been dropped. 
columns_to_evaluate = list(set(vehicle_registration_df.columns).intersection(total_population_df.columns).intersection(total_consuption_df.columns).intersection(real_gdp_df.columns).intersection(industy_gdp_by_state_df.columns).intersection(co2_emissions_df.columns).intersection(tavg_df.columns).intersection(wind_df.columns).intersection(tmax_df.columns).intersection(tmin_df.columns).intersection(precip_df.columns).intersection(snow_df.columns))
columns_to_evaluate

['2016',
 '2015',
 '2010',
 '2008',
 '2011',
 '2018',
 '2019',
 '2007',
 '2014',
 'Unnamed: 0',
 '2009',
 '2012',
 '2013',
 '2017']

In [5]:
#ensure each column we are going to evaluate has the same number of values 
for col in columns_to_evaluate:
    if(not (len(vehicle_registration_df[col]) == len(total_consuption_df[col]) == len(total_population_df[col]) == len(real_gdp_df[col])== len(industy_gdp_by_state_df[col]) == len(co2_emissions_df[col]) == len(tavg_df[col])== len(wind_df[col])== len(tmax_df[col])== len(tmin_df[col])== len(precip_df[col])== len(snow_df[col]))):
        print("unequal entries for column:" + col)

In [6]:
west = ["California", "Hawaii", "Nevada", "Colorado", "Idaho", "Montana", "Utah", "Wyoming", "Oregon", "Washington", "Alaska"]
south_west = ["New Mexico", "Arizona", "Texas", "Oklahoma"]
mid_west = ["Iowa", "Kansas", "Missouri", "Nebraska", "North Dakota", "South Dakota", "Illinois", "Indiana", "Michigan", "Minnesota", "Ohio", "Wisconsin"]
south_east = ["Alabama", "Florida", "Georgia", "Mississippi", "South Carolina", "Arkansas", "Louisiana", "Delaware", "Kentucky", "Maryland", "North Carolina", "Tennessee", "Virginia", "West Virginia"]
north_east = ["New Jersey", "New York", "Pennsylvania", "Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island", "Vermont"]

In [7]:
west_abr = ["CA", "HI", "NV", "CO", "ID", "MT", "UT", "WY", "OR", "WA", "AK"]
southwest_abr = ["NM", "AZ", "TX", "OK"]
midwest_abr = ["IA", "KS", "MO", "NE", "ND", "SD", "IL", "IN", "MI", "MN", "OH", "WI"]
southeast_abr = ["AL", "FL", "GA", "MS", "SC", "AR", "LA", "DE", "KY", "MD", "NC", "TN", "VA", "WV"]
northeast_abr = ["NJ", "NY", "PA", "CT", "ME", "MA", "NH", "RI", "VT"]

### Data Analysis

#### This section of the notebooks creates a multiple linear regression model for a state's energy consuption.


##### In the model summary each variable is represented by the following 
- x1: Vehicle regisrations
- x2: Population
- x3: GDP per capita
- x4: Industry GDP per capita 
- x5: C02 emissions
- x6: Average tempature
- x7: Average wind speed
- x8: Maximum tempature
- x9: Minimum tempature
- x10: Total precipitation
- x11: Total snow fall

There are some other values in the summary that give us a good indication as to how well our model fits energy consuption such at the r squared value and F statistic.

In [9]:
# loop through the data frames and add each value to data_point_pairs array. 
# The data_point_pairs array will be the 
# [vehicle registration, population, GDP, Industry GDP, C02 emissions, average tempature, average wind speed, max temperature, min tempature, total precipitation, total snowfall] 
# value for each year and each state
# The total_consumption_vals will be the cooresponding energy consuption value 
# for the data point pairs item 
west_data_point_pairs = []
west_total_consumption_vals = []

southwest_data_point_pairs = []
southwest_total_consumption_vals = []

midwest_data_point_pairs = []
midwest_total_consumption_vals = []

southeast_data_point_pairs = []
southeast_total_consumption_vals = []

northeast_data_point_pairs = []
northeast_total_consumption_vals = []
for col in columns_to_evaluate:
    for i in range(0,50):
        pair = [vehicle_registration_df.iloc[i][col], total_population_df.iloc[i][col], real_gdp_df.iloc[i][col], industy_gdp_by_state_df.iloc[i][col], co2_emissions_df.iloc[i][col], tavg_df.iloc[i][col],wind_df.iloc[i][col],tmax_df.iloc[i][col],tmin_df.iloc[i][col],precip_df.iloc[i][col],snow_df.iloc[i][col]]
        if(total_consuption_df.iloc[i]['State'] in west_abr) :
            west_data_point_pairs.append(pair)
            west_total_consumption_vals.append(total_consuption_df.iloc[i][col]) 
        
        if(total_consuption_df.iloc[i]['State'] in southwest_abr) :
            southwest_data_point_pairs.append(pair)
            southwest_total_consumption_vals.append(total_consuption_df.iloc[i][col]) 
        
        if(total_consuption_df.iloc[i]['State'] in midwest_abr) :
            midwest_data_point_pairs.append(pair)
            midwest_total_consumption_vals.append(total_consuption_df.iloc[i][col]) 
        
        if(total_consuption_df.iloc[i]['State'] in southeast_abr) :
            southeast_data_point_pairs.append(pair)
            southeast_total_consumption_vals.append(total_consuption_df.iloc[i][col]) 
        
        if(total_consuption_df.iloc[i]['State'] in northeast_abr) :
            northeast_data_point_pairs.append(pair)
            northeast_total_consumption_vals.append(total_consuption_df.iloc[i][col]) 

### Multiple linear regression model for energy consuption of states in the western region of US 
#### California, Hawaii, Nevada, Colorado, Idaho, Montana, Utah, Wyoming, Oregon, Washington, Alaska

In [10]:
X = west_data_point_pairs
y = west_total_consumption_vals
lm = linear_model.LinearRegression()
model = lm.fit(X,y)


# predict energy consuption for vehicle registration = 4610845 , population =699 (10,000), GDP = 55911, 
# Industry GDP = 9717, C02 emissions = 121, Average tempature = 6.7, Average Wind Speed = 2.5
# Maximim tempature = 14.07, Mimimum tempature = -0.44, Total Precipitation = 47, Total snowfall: 190 
predictions = lm.predict([[4610845, 699, 55911, 9717, 121,  6.7, 2.5, 14.07, -0.44, 47, 190]])
print("Predicted energy consumpion: ", predictions )

model = sm.OLS(y, X).fit()
model.summary()

Predicted energy consumpion:  [623040.32644597]


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.996
Model:,OLS,Adj. R-squared (uncentered):,0.995
Method:,Least Squares,F-statistic:,3094.0
Date:,"Sat, 04 Dec 2021",Prob (F-statistic):,4.3000000000000005e-164
Time:,17:41:32,Log-Likelihood:,-2060.8
No. Observations:,154,AIC:,4144.0
Df Residuals:,143,BIC:,4177.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0233,0.007,3.208,0.002,0.009,0.038
x2,177.1333,17.453,10.149,0.000,142.635,211.632
x3,0.3589,0.314,1.142,0.255,-0.262,0.980
x4,-6.2872,1.869,-3.364,0.001,-9.981,-2.593
x5,490.5079,322.373,1.522,0.130,-146.724,1127.740
x6,1.158e+05,3.17e+05,0.365,0.716,-5.12e+05,7.43e+05
x7,2.32e+04,5794.074,4.003,0.000,1.17e+04,3.46e+04
x8,-2.892e+04,1.59e+05,-0.182,0.856,-3.43e+05,2.85e+05
x9,-1.108e+05,1.58e+05,-0.699,0.486,-4.24e+05,2.02e+05

0,1,2,3
Omnibus:,15.764,Durbin-Watson:,2.43
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17.331
Skew:,0.769,Prob(JB):,0.000172
Kurtosis:,3.581,Cond. No.,302000000.0


### Multiple linear regression model for energy consuption of states in the south western region of US 
#### New Mexico, Arizona, Texas, Oklahoma

In [12]:
X = southwest_data_point_pairs
y = southwest_total_consumption_vals
lm = linear_model.LinearRegression()
model = lm.fit(X,y)

# predict energy consuption for vehicle registration = 4610845 , population =699 (10,000), GDP = 55911, 
# Industry GDP = 9717, C02 emissions = 121, Average tempature = 6.7, Average Wind Speed = 2.5
# Maximim tempature = 14.07, Mimimum tempature = -0.44, Total Precipitation = 47, Total snowfall: 190 
predictions = lm.predict([[4610845, 699, 55911, 9717, 121,  6.7, 2.5, 14.07, -0.44, 47, 190]])
print("Predicted energy consumpion: ", predictions )

model = sm.OLS(y, X).fit()
model.summary()

Predicted energy consumpion:  [-326940.97201463]


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.998
Model:,OLS,Adj. R-squared (uncentered):,0.997
Method:,Least Squares,F-statistic:,1991.0
Date:,"Sat, 04 Dec 2021",Prob (F-statistic):,1.2700000000000002e-56
Time:,17:43:14,Log-Likelihood:,-781.19
No. Observations:,56,AIC:,1584.0
Df Residuals:,45,BIC:,1607.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0272,0.020,1.340,0.187,-0.014,0.068
x2,43.0576,83.430,0.516,0.608,-124.980,211.095
x3,6.8649,1.381,4.971,0.000,4.083,9.647
x4,-0.6825,2.060,-0.331,0.742,-4.831,3.466
x5,1417.3311,477.335,2.969,0.005,455.929,2378.733
x6,4.244e+06,1.45e+06,2.926,0.005,1.32e+06,7.16e+06
x7,-5.832e+04,3.68e+04,-1.587,0.120,-1.32e+05,1.57e+04
x8,-2.194e+06,7.2e+05,-3.046,0.004,-3.65e+06,-7.43e+05
x9,-1.996e+06,7.36e+05,-2.711,0.009,-3.48e+06,-5.13e+05

0,1,2,3
Omnibus:,2.154,Durbin-Watson:,1.935
Prob(Omnibus):,0.341,Jarque-Bera (JB):,1.421
Skew:,0.364,Prob(JB):,0.491
Kurtosis:,3.278,Cond. No.,487000000.0


### Multiple linear regression model for energy consuption of states in the mid western region of US 
#### Iowa, Kansas, Missouri, Nebraska, North Dakota, South Dakota, Illinois, Indiana, Michigan, Minnesota, Ohio, Wisconsin

In [13]:
X = midwest_data_point_pairs
y = midwest_total_consumption_vals
lm = linear_model.LinearRegression()
model = lm.fit(X,y)

#predict energy consuption for vehicle registration = 4610845 , population =699 (10,000), GDP = 55911, Industry GDP = 9717, C02 emissions = 121
# predict energy consuption for vehicle registration = 4610845 , population =699 (10,000), GDP = 55911, 
# Industry GDP = 9717, C02 emissions = 121, Average tempature = 6.7, Average Wind Speed = 2.5
# Maximim tempature = 14.07, Mimimum tempature = -0.44, Total Precipitation = 47, Total snowfall: 190 
predictions = lm.predict([[4610845, 699, 55911, 9717, 121,  6.7, 2.5, 14.07, -0.44, 47, 190]])
print("Predicted energy consumpion: ", predictions )

model = sm.OLS(y, X).fit()
model.summary()

Predicted energy consumpion:  [275943.27785106]


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.991
Model:,OLS,Adj. R-squared (uncentered):,0.991
Method:,Least Squares,F-statistic:,1644.0
Date:,"Sat, 04 Dec 2021",Prob (F-statistic):,5.46e-156
Time:,17:43:23,Log-Likelihood:,-2290.2
No. Observations:,168,AIC:,4602.0
Df Residuals:,157,BIC:,4637.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0016,0.012,0.130,0.897,-0.022,0.026
x2,310.9513,27.709,11.222,0.000,256.220,365.682
x3,0.2821,0.467,0.604,0.547,-0.640,1.205
x4,-13.7263,4.434,-3.096,0.002,-22.484,-4.968
x5,-179.5936,491.987,-0.365,0.716,-1151.360,792.173
x6,4.604e+05,4.49e+05,1.025,0.307,-4.27e+05,1.35e+06
x7,-1.705e+04,7536.953,-2.262,0.025,-3.19e+04,-2160.025
x8,-2.365e+05,2.25e+05,-1.052,0.294,-6.8e+05,2.07e+05
x9,-2.102e+05,2.25e+05,-0.936,0.351,-6.54e+05,2.34e+05

0,1,2,3
Omnibus:,31.349,Durbin-Watson:,2.012
Prob(Omnibus):,0.0,Jarque-Bera (JB):,44.347
Skew:,1.052,Prob(JB):,2.35e-10
Kurtosis:,4.38,Cond. No.,160000000.0


### Multiple linear regression model for energy consuption of states in the south eastern region of US 
#### Alabama, Florida, Georgia, Mississippi, South Carolina, Arkansas, Louisiana, Delaware, Kentucky, Maryland, North Carolina, Tennessee, Virginia, West Virginia

In [14]:
X = southeast_data_point_pairs
y = southeast_total_consumption_vals
lm = linear_model.LinearRegression()
model = lm.fit(X,y)

# predict energy consuption for vehicle registration = 4610845 , population =699 (10,000), GDP = 55911, 
# Industry GDP = 9717, C02 emissions = 121, Average tempature = 6.7, Average Wind Speed = 2.5
# Maximim tempature = 14.07, Mimimum tempature = -0.44, Total Precipitation = 47, Total snowfall: 190 
predictions = lm.predict([[4610845, 699, 55911, 9717, 121,  6.7, 2.5, 14.07, -0.44, 47, 190]])
print("Predicted energy consumpion: ", predictions )

model = sm.OLS(y, X).fit()
model.summary()

Predicted energy consumpion:  [1136557.42067592]


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.957
Model:,OLS,Adj. R-squared (uncentered):,0.954
Method:,Least Squares,F-statistic:,373.5
Date:,"Sat, 04 Dec 2021",Prob (F-statistic):,6.05e-120
Time:,17:43:34,Log-Likelihood:,-2837.2
No. Observations:,196,AIC:,5696.0
Df Residuals:,185,BIC:,5732.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.0430,0.018,-2.427,0.016,-0.078,-0.008
x2,-46.0848,53.964,-0.854,0.394,-152.548,60.379
x3,5.6108,1.119,5.016,0.000,3.404,7.818
x4,-21.4662,6.398,-3.355,0.001,-34.090,-8.843
x5,8864.5938,768.363,11.537,0.000,7348.714,1.04e+04
x6,3.049e+04,6.21e+05,0.049,0.961,-1.19e+06,1.26e+06
x7,1.609e+04,1.23e+04,1.313,0.191,-8086.631,4.03e+04
x8,9091.1942,3.13e+05,0.029,0.977,-6.07e+05,6.26e+05
x9,-6.966e+04,3.08e+05,-0.227,0.821,-6.76e+05,5.37e+05

0,1,2,3
Omnibus:,82.689,Durbin-Watson:,1.872
Prob(Omnibus):,0.0,Jarque-Bera (JB):,357.88
Skew:,1.614,Prob(JB):,1.94e-78
Kurtosis:,8.779,Cond. No.,126000000.0


### Multiple linear regression model for energy consuption of states in the north eastern region of US 
#### New Jersey, New York, Pennsylvania, Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island, Vermont

In [15]:
X = northeast_data_point_pairs
y = northeast_total_consumption_vals
lm = linear_model.LinearRegression()
model = lm.fit(X,y)

# predict energy consuption for vehicle registration = 4610845 , population =699 (10,000), GDP = 55911, 
# Industry GDP = 9717, C02 emissions = 121, Average tempature = 6.7, Average Wind Speed = 2.5
# Maximim tempature = 14.07, Mimimum tempature = -0.44, Total Precipitation = 47, Total snowfall: 190 
predictions = lm.predict([[4610845, 699, 55911, 9717, 121,  6.7, 2.5, 14.07, -0.44, 47, 190]])
print("Predicted energy consumpion: ", predictions )

model = sm.OLS(y, X).fit()
model.summary()

Predicted energy consumpion:  [198725.24000714]


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.992
Model:,OLS,Adj. R-squared (uncentered):,0.991
Method:,Least Squares,F-statistic:,1270.0
Date:,"Sat, 04 Dec 2021",Prob (F-statistic):,1.62e-114
Time:,17:43:45,Log-Likelihood:,-1701.8
No. Observations:,126,AIC:,3426.0
Df Residuals:,115,BIC:,3457.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0135,0.012,1.122,0.264,-0.010,0.037
x2,417.1920,34.891,11.957,0.000,348.080,486.304
x3,-2.9518,0.516,-5.726,0.000,-3.973,-1.931
x4,0.3347,2.389,0.140,0.889,-4.397,5.066
x5,-148.0390,650.365,-0.228,0.820,-1436.287,1140.209
x6,5.243e+05,3.9e+05,1.344,0.182,-2.49e+05,1.3e+06
x7,1876.0353,5865.678,0.320,0.750,-9742.743,1.35e+04
x8,-2.461e+05,1.94e+05,-1.266,0.208,-6.31e+05,1.39e+05
x9,-2.784e+05,1.96e+05,-1.418,0.159,-6.67e+05,1.11e+05

0,1,2,3
Omnibus:,10.875,Durbin-Watson:,1.965
Prob(Omnibus):,0.004,Jarque-Bera (JB):,27.702
Skew:,0.086,Prob(JB):,9.65e-07
Kurtosis:,5.291,Cond. No.,143000000.0


### Conclusion

#### This section of the notebooks discusses the results