In [1]:
import pandas as pd
import os
import statsmodels.api as sm
from sklearn import linear_model
import numpy as np

## This notebooks explores the relationship between a state's number of vehicle registrations, population, GDP, GDP by industry,  and C02 emissions on it's energy consumption. 

### The goal is to model a US state's energy consuption by using the data listed above. With this model we can make energy consuption predictions and understand what leads to high energy consuption.

### The contents of the notebook include
- #### Data Gathering
    - read in the dataframes that have been cleaned by data_gathering_and_cleaning notebook
- #### Data analysis 
    - create a multiple linear regression model for energy consuption
- #### Conclusion
    - Discuss what we discovered and draw conclusions
    
Note: If there are no files in the Data/cleaned diretory, you will need to run the 'data_gathering_and_cleaning" notebook to clwan and write out the files to that directory.

### Data Gathering
#### This section of the notebooks reads in the data files and stores them im pandas dataframes.
The dataframes frames in this section all have columns of represting years ranging from [1967-2020] and rows for each state. 

In [2]:
csv_path = os.path.join(os.getcwd(), "data/cleaned/csv")
excel_path = os.path.join(os.getcwd(), "data/cleaned/excel")

In [3]:
#Read in all datasets here 

vehicle_registration_df = pd.read_csv(os.path.join(csv_path, "vehicle_registrations_by_state.csv"))
energy_consumption_per_real_gdp_df = pd.read_csv(os.path.join(csv_path, "energy_consumption_per_real_gdp.csv"))
current_dollar_gdp_df = pd.read_csv(os.path.join(csv_path, "Current_dollar_GDP.csv")) #in millions
total_consuption_df = pd.read_csv(os.path.join(csv_path, "total_consuption.csv")) #in billion Btu
industy_gdp_by_state_df = pd.read_csv(os.path.join(csv_path, "industy_gdp_by_state.csv"))
total_population_df = pd.read_csv(os.path.join(csv_path, "total_population.csv"))
real_gdp_df = pd.read_csv(os.path.join(csv_path, "real_GDP.csv")) #in millions
co2_emissions_df = pd.read_excel(os.path.join(excel_path, "co2_emissions.xlsx"))


In [6]:
#Use the columns that are in each dataframe after columns with empty values have been dropped. 
columns_to_evaluate = list(set(vehicle_registration_df.columns).intersection(total_population_df.columns).intersection(total_consuption_df.columns).intersection(real_gdp_df.columns).intersection(industy_gdp_by_state_df.columns).intersection(co2_emissions_df.columns))
columns_to_evaluate

['Unnamed: 0',
 '2015',
 '2019',
 '2016',
 '2013',
 '2011',
 '2010',
 '2007',
 '2017',
 '2008',
 '2012',
 '2018',
 '2009',
 '2014']

In [7]:
#ensure each column we are going to evaluate has the same number of values 
for col in columns_to_evaluate:
    if(not (len(vehicle_registration_df[col]) == len(total_consuption_df[col]) == len(total_population_df[col]) == len(real_gdp_df[col])== len(industy_gdp_by_state_df[col]) == len(co2_emissions_df[col]))):
        print("unequal entries for column:" + col)

### Data Analysis

#### This section of the notebooks creates a multiple linear regression model for a state's energy consuption.

We will need to add the climate data.

In the model summary, x1 represents vehicle regisration, x2 represents population and x3 represents GDP, x4 represents Industry GDP, x5 represents C02 emissions. There are some other values in the summary that give us a good indication as to how well our model fits energy consuption such at the r squared value and F statistic.

In [8]:
# loop through the data frames and add each value to data_point_pairs array. 
# The data_point_pairs array will be the [vehicle registration, population, GDP, Industry GDP] value for each year and each state
# The total_consumption_vals will be the cooresponding energy consuption value 
# for the [vehicle registration, population, GDP, Industry GDP, C02 emissions] data point
data_point_pairs = []
total_consumption_vals = []
for col in columns_to_evaluate:
    for i in range(0,50):
        pair = [vehicle_registration_df.iloc[i][col], total_population_df.iloc[i][col], real_gdp_df.iloc[i][col], industy_gdp_by_state_df.iloc[i][col], co2_emissions_df.iloc[i][col]]
        data_point_pairs.append(pair)
        
        total_consumption_vals.append(total_consuption_df.iloc[i][col]) 

In [9]:
print("vehicle registration:" , data_point_pairs[0][0])
print("population: ", data_point_pairs[0][1])
print("GDP: ", data_point_pairs[0][2])
print("Industry GDP: ", data_point_pairs[0][3])
print("C02 emissions: ", data_point_pairs[0][4])
print("total energy consuption:" ,total_consumption_vals[0])

vehicle registration: 10
population:  0
GDP:  0
Industry GDP:  0
C02 emissions:  4
total energy consuption: 0


In [10]:
# A potential library we can use for regression analysis 
# normalize population

X = data_point_pairs
y = total_consumption_vals
lm = linear_model.LinearRegression()
model = lm.fit(X,y)

#predict energy consuption for vehicle registration = 4610845 , population =699 (10,000), GDP = 55911, Industry GDP = 9717, C02 emissions = 121
predictions = lm.predict([[4610845, 699, 55911, 9717, 121]])
print("Predicted energy consumpion for \nvehicle registration = 4610845 , population = 699 (10,000), GDP = 55911, Industry GDP = 9717, C02 emissions = 121 \n", predictions )

model = sm.OLS(y, X).fit()
model.summary()



Predicted energy consumpion for 
vehicle registration = 4610845 , population = 699 (10,000), GDP = 55911, Industry GDP = 9717, C02 emissions = 121 
 [975165.04327189]


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.93
Model:,OLS,Adj. R-squared (uncentered):,0.93
Method:,Least Squares,F-statistic:,1850.0
Date:,"Tue, 16 Nov 2021",Prob (F-statistic):,0.0
Time:,12:17:10,Log-Likelihood:,-10440.0
No. Observations:,700,AIC:,20890.0
Df Residuals:,695,BIC:,20910.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.0548,0.013,-4.146,0.000,-0.081,-0.029
x2,210.7417,23.754,8.872,0.000,164.105,257.379
x3,0.3225,0.387,0.834,0.405,-0.437,1.082
x4,-8.3741,2.479,-3.378,0.001,-13.241,-3.507
x5,9335.0498,427.329,21.845,0.000,8496.038,1.02e+04

0,1,2,3
Omnibus:,684.076,Durbin-Watson:,1.746
Prob(Omnibus):,0.0,Jarque-Bera (JB):,67045.337
Skew:,4.072,Prob(JB):,0.0
Kurtosis:,50.248,Cond. No.,112000.0


### Conclusion

#### This section of the notebooks discusses the results