In [25]:
import pandas as pd
import os
import statsmodels.api as sm
from sklearn import linear_model
import numpy as np

## This notebooks explores the relationship between a state's number of vehicle registrations, population, GDP per capita, GDP per capita by industry,  C02 emissions, average yearly tempature, average yearly windspeed, minimum yearly tempature, maximim yearly tempature, total yearly precipitation , and total yearly snowfall on it's energy consumption. 

### The goal is to model a US state's energy consuption by using the data listed above. With this model we can make energy consuption predictions and understand what leads to high energy consuption.

### The contents of the notebook include
- #### Data Gathering
    - read in the dataframes that have been cleaned by data_gathering_and_cleaning notebook
- #### Data analysis 
    - create a multiple linear regression model for energy consuption
- #### Conclusion
    - Discuss what we discovered and draw conclusions
    
Note: If there are no files in the Data/cleaned diretory, you will need to run the 'data_gathering_and_cleaning" notebook to clwan and write out the files to that directory.

### Data Gathering
#### This section of the notebooks reads in the data files and stores them im pandas dataframes.
The dataframes frames in this section all have columns of represting years ranging from [1967-2020] and rows for each state. 

In [26]:
csv_path = os.path.join(os.getcwd(), "data/cleaned/csv")
excel_path = os.path.join(os.getcwd(), "data/cleaned/excel")

In [27]:
#Read in all datasets here 
vehicle_registration_df = pd.read_csv(os.path.join(csv_path, "vehicle_registrations_by_state.csv"))
energy_consumption_per_real_gdp_df = pd.read_csv(os.path.join(csv_path, "energy_consumption_per_real_gdp.csv"))
current_dollar_gdp_df = pd.read_csv(os.path.join(csv_path, "Current_dollar_GDP.csv")) #in millions
total_consuption_df = pd.read_csv(os.path.join(csv_path, "total_consuption.csv")) #in billion Btu
industy_gdp_by_state_df = pd.read_csv(os.path.join(csv_path, "industy_gdp_by_state.csv"))
total_population_df = pd.read_csv(os.path.join(csv_path, "total_population.csv"))
real_gdp_df = pd.read_csv(os.path.join(csv_path, "real_GDP.csv")) #in millions
co2_emissions_df = pd.read_excel(os.path.join(excel_path, "co2_emissions.xlsx"))
tavg_df =  pd.read_csv(os.path.join(csv_path + '/NOA', "TAVG.csv"))
wind_df =  pd.read_csv(os.path.join(csv_path + '/NOA', "DYHF.csv"))
tmax_df =  pd.read_csv(os.path.join(csv_path + '/NOA', "TMAX.csv"))
tmin_df =  pd.read_csv(os.path.join(csv_path + '/NOA', "TMIN.csv"))
precip_df = pd.read_csv(os.path.join(csv_path + '/NOA', "PRCP.csv"))
snow_df = pd.read_csv(os.path.join(csv_path + '/NOA', "SNOW.csv"))

In [33]:
co2_emissions_df.columns

Index(['Unnamed: 0', 'State', '2000', '2001', '2002', '2003', '2004', '2005',
       '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
       '2015', '2016', '2017', '2018', '2019', '2020'],
      dtype='object')

In [28]:
#Use the columns that are in each dataframe after columns with empty values have been dropped. 
columns_to_evaluate = list(set(vehicle_registration_df.columns).intersection(total_population_df.columns).intersection(total_consuption_df.columns).intersection(real_gdp_df.columns).intersection(industy_gdp_by_state_df.columns).intersection(co2_emissions_df.columns).intersection(tavg_df.columns).intersection(wind_df.columns).intersection(tmax_df.columns).intersection(tmin_df.columns).intersection(precip_df.columns).intersection(snow_df.columns))
columns_to_evaluate

['2013',
 '2017',
 '2019',
 '2012',
 '2011',
 '2015',
 '2016',
 '2009',
 '2008',
 '2007',
 '2014',
 '2018',
 '2010',
 'Unnamed: 0']

In [29]:
#ensure each column we are going to evaluate has the same number of values 
for col in columns_to_evaluate:
    if(not (len(vehicle_registration_df[col]) == len(total_consuption_df[col]) == len(total_population_df[col]) == len(real_gdp_df[col])== len(industy_gdp_by_state_df[col]) == len(co2_emissions_df[col]) == len(tavg_df[col])== len(wind_df[col])== len(tmax_df[col])== len(tmin_df[col])== len(precip_df[col])== len(snow_df[col]))):
        print("unequal entries for column:" + col)

### Data Analysis

#### This section of the notebooks creates a multiple linear regression model for a state's energy consuption.


##### In the model summary each variable is represented by the following 
- x1: Vehicle regisrations
- x2: Population
- x3: GDP per capita
- x4: Industry GDP per capita 
- x5: C02 emissions
- x6: Average tempature
- x7: Average wind speed
- x8: Maximum tempature
- x9: Minimum tempature
- x10: Total precipitation
- x11: Total snow fall

There are some other values in the summary that give us a good indication as to how well our model fits energy consuption such at the r squared value and F statistic.

In [30]:
# loop through the data frames and add each value to data_point_pairs array. 
# The data_point_pairs array will be the 
# [vehicle registration, population, GDP, Industry GDP, C02 emissions, average tempature, average wind speed, max temperature, min tempature, total precipitation, total snowfall] 
# value for each year and each state
# The total_consumption_vals will be the cooresponding energy consuption value 
# for the data point pairs item 
data_point_pairs = []
total_consumption_vals = []
for col in columns_to_evaluate:
    for i in range(0,50):
        pair = [vehicle_registration_df.iloc[i][col], total_population_df.iloc[i][col], real_gdp_df.iloc[i][col], industy_gdp_by_state_df.iloc[i][col], co2_emissions_df.iloc[i][col],tavg_df.iloc[i][col],wind_df.iloc[i][col],tmax_df.iloc[i][col],tmin_df.iloc[i][col],precip_df.iloc[i][col],snow_df.iloc[i][col]]
        data_point_pairs.append(pair)
        
        total_consumption_vals.append(total_consuption_df.iloc[i][col]) 

In [31]:
print("vehicle registration:" , data_point_pairs[0][0])
print("population: ", data_point_pairs[0][1])
print("GDP: ", data_point_pairs[0][2])
print("Industry GDP: ", data_point_pairs[0][3])
print("C02 emissions: ", data_point_pairs[0][4])
print("Average tempature:" ,data_point_pairs[0][5])
print("Average Wind Speed:" ,data_point_pairs[0][6])
print("Maximim tempature:" ,data_point_pairs[0][7])
print("Mimimum tempature:" ,data_point_pairs[0][8])
print("Total Precipitation:" ,data_point_pairs[0][9])
print("Total snowfall:" ,data_point_pairs[0][10])
print("total energy consuption:" ,total_consumption_vals[0])


vehicle registration: 4787219.0
population:  738.0
GDP:  54748.0
Industry GDP:  11241.679347826086
C02 emissions:  121.1630059889289
Average tempature: 6.774734157214605
Average Wind Speed: 2.545909090909091
Maximim tempature: 14.073909594750376
Mimimum tempature: -0.4421591745467444
Total Precipitation: 46.89005639838973
Total snowfall: 190.3120019711779
total energy consuption: 597975.0


In [32]:
X = data_point_pairs
y = total_consumption_vals
lm = linear_model.LinearRegression()
model = lm.fit(X,y)

#predict energy consuption for vehicle registration = 4610845 , population =699 (10,000), GDP = 55911, 
#Industry GDP = 9717, C02 emissions = 121, Average tempature = 6.7, Average Wind Speed = 2.5
#Maximim tempature = 14.07, Mimimum tempature = -0.44, Total Precipitation = 47, Total snowfall: 190 
predictions = lm.predict([[4610845, 699, 55911, 9717, 121, 6.7, 2.5, 14.07, -0.44, 47, 190]])
print("Predicted energy consumpion:", predictions )

model = sm.OLS(y, X).fit()
model.summary()



Predicted energy consumpion: [853392.04040656]


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.933
Model:,OLS,Adj. R-squared (uncentered):,0.932
Method:,Least Squares,F-statistic:,872.5
Date:,"Sun, 05 Dec 2021",Prob (F-statistic):,0.0
Time:,18:21:35,Log-Likelihood:,-10425.0
No. Observations:,700,AIC:,20870.0
Df Residuals:,689,BIC:,20920.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.0597,0.013,-4.524,0.000,-0.086,-0.034
x2,205.9536,24.643,8.358,0.000,157.570,254.338
x3,0.3241,0.399,0.813,0.417,-0.459,1.107
x4,-8.2697,2.488,-3.323,0.001,-13.155,-3.384
x5,9260.8033,432.453,21.415,0.000,8411.720,1.01e+04
x6,2.281e+05,5.88e+05,0.388,0.698,-9.27e+05,1.38e+06
x7,-1.4e+04,8625.538,-1.623,0.105,-3.09e+04,2936.245
x8,-1.177e+05,2.95e+05,-0.399,0.690,-6.96e+05,4.61e+05
x9,-1.1e+05,2.93e+05,-0.375,0.708,-6.86e+05,4.66e+05

0,1,2,3
Omnibus:,706.724,Durbin-Watson:,1.763
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75777.324
Skew:,4.277,Prob(JB):,0.0
Kurtosis:,53.249,Cond. No.,192000000.0


### Conclusion

#### This section of the notebooks discusses the results