In [1]:
import pandas as pd
import os
import statsmodels.api as sm
from sklearn import linear_model
import numpy as np

### Data Gathering
#### This section of the notebooks reads in the data files and stores them im pandas dataframes.
The dataframes frames in this section all have colums of [1960 - 2019] and rows for each state execpt for industy_gdp_by_state_df which goes from [1997-2020] 

In [2]:
csv_path = os.path.join(os.getcwd(), "data/csv")

In [3]:
#Read in all datasets here 

vehicle_registration_df = pd.read_csv(os.path.join(csv_path, "vehicle_registrations_by_state.csv"))
energy_consumption_per_real_gdp_df = pd.read_csv(os.path.join(csv_path, "energy_consumption_per_real_gdp.csv"))
current_dollar_gdp_df = pd.read_csv(os.path.join(csv_path, "Current_dollar_GDP.csv")) #in millions
total_consuption_df = pd.read_csv(os.path.join(csv_path, "total_consuption.csv")) #in billion Btu
industy_gdp_by_state_df = pd.read_csv(os.path.join(csv_path, "industy_gdp_by_state.csv"))
total_population_df = pd.read_csv(os.path.join(csv_path, "total_population.csv"))
real_gdp_df = pd.read_csv(os.path.join(csv_path, "real_GDP.csv")) #in millions

### Data Cleaning

#### This section of the notebook cleans the data frames. 

We start by dropping all the unneeded columns so that each data frame has the same colums. Then we drop any columns that have missing values. This only leaves us with a few columns, so it might be a better idea in the future to replace empty values with the mean value for that year or something similar. 

This section is only evaluting vehicle registration, population and GDP against energy consuption as a proof of concept. We will need to clean and add all the other dataframes to this model.  

In [4]:
unnamed_columns_to_drop = ['Unnamed: 91', 'Unnamed: 92', 'Unnamed: 93', 'Unnamed: 94',
       'Unnamed: 95', 'Unnamed: 96', 'Unnamed: 97', 'Unnamed: 98',
       'Unnamed: 99', 'Unnamed: 100','Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64', 'Unnamed: 65',
       'Unnamed: 66', 'Unnamed: 67', 'Unnamed: 68', 'Unnamed: 69',
       'Unnamed: 70', 'Unnamed: 71', 'Unnamed: 72', 'Unnamed: 73',
       'Unnamed: 74', 'Unnamed: 75', 'Unnamed: 76', 'Unnamed: 77',
       'Unnamed: 78', 'Unnamed: 79', 'Unnamed: 80', 'Unnamed: 81',
       'Unnamed: 82', 'Unnamed: 83', 'Unnamed: 84', 'Unnamed: 85',
       'Unnamed: 86', 'Unnamed: 87', 'Unnamed: 88', 'Unnamed: 89',
       'Unnamed: 90', 'Unnamed: 61']

In [5]:
def clean_dataframe(df):
    unnamed_to_drop = list(set(df.columns).intersection(unnamed_columns_to_drop))
    df = df.drop(columns = unnamed_to_drop)
    
    null_values_allowed_before_column_is_dropped = 40
    columns_to_drop = []
    
    for col in df.columns[1:]:
        if(df[col].isna().sum() > null_values_allowed_before_column_is_dropped):
            columns_to_drop.append(col)
        else:
            df[col].fillna(value=df[col].mean(), inplace=True)
    df = df.drop( columns = columns_to_drop)
    return df

In [6]:
vehicle_registration_df.drop(index = [0,1,2,3,4,5,6,7,8,9,61,62,64,63], inplace = True)
vehicle_registration_df = clean_dataframe(vehicle_registration_df)
vehicle_registration_df.head()
vehicle_registration_columns = vehicle_registration_df.columns

In [7]:
total_population_df.drop(index = [51], inplace = True)
total_population_df = clean_dataframe(total_population_df)
total_population_columns = total_population_df.columns

In [8]:
total_consuption_df.drop(index = [51], inplace = True)
total_consuption_df = clean_dataframe(total_consuption_df)
total_consuption_df_columns = total_consuption_df.columns

In [9]:
real_gdp_df.drop(index = [51], inplace = True)
real_gdp_df = clean_dataframe(real_gdp_df)
real_gdp_df_columns = real_gdp_df.columns

In [10]:
#Use the columns that are in each dataframe after columns with empty values have been dropped. 
columns_to_evaluate = list(set(vehicle_registration_columns).intersection(total_population_columns).intersection(total_consuption_df_columns).intersection(real_gdp_df_columns))
columns_to_evaluate

['2014',
 '2016',
 '2009',
 '2019',
 '1998',
 '2008',
 '2017',
 '2010',
 '2015',
 '2018',
 '2013',
 '2012',
 '2011',
 '2007']

In [11]:
#ensure each column we are going to evaluate has the same number of values 
for col in columns_to_evaluate:
    if(not (len(vehicle_registration_df[col]) == len(total_consuption_df[col]) == len(total_population_df[col]) == len(real_gdp_df[col]))):
        print("unequal entries for column:" + col)

In [12]:
# loop through the data frames and add each value to data_point_pairs array. 
# The data_point_pairs array will be the [vehicle registration, population, GDP] value for each year and each state
# The total_consumption_vals will be the cooresponding energy consuption value 
# for the [vehicle registration, population, GDP] data point
data_point_pairs = []
total_consumption_vals = []
for col in columns_to_evaluate:
    for i in range(0,51):
        pair = [vehicle_registration_df.iloc[i][col], total_population_df.iloc[i][col], real_gdp_df.iloc[i][col]]
        data_point_pairs.append(pair)
        
        total_consumption_vals.append(total_consuption_df.iloc[i][col]) 

In [13]:
print("vehicle registration:" , data_point_pairs[0][0])
print("population: ", data_point_pairs[0][1])
print("GDP: ", data_point_pairs[0][2])
print("total energy consuption:" ,total_consumption_vals[0])

vehicle registration: 5366844.0
population:  737
GDP:  53481
total energy consuption: 594764


### Data Analysis

#### This section of the notebooks creates a linear regression moel for energy consuption.

Right now, the model is only using population, vehicle registration and GDP to predict energy comnsuption. We will need to include the other dataframes.

In the model summary, x1 represents vehicle regisration, x2 represents population and x3 represents GDP. There are some other values in the summary that give us a good indication as to how well our model fits energy consuption such at the r squared value and F statistic.

In [14]:
# A potential library we can use for regression analysis 

X = data_point_pairs
y = total_consumption_vals
lm = linear_model.LinearRegression()
model = lm.fit(X,y)

#predict energy consuption for vehicle registration = 400000 , population =  800 (million), GDP = 45828.2
predictions = lm.predict([[400000, 800, 45828.2]])
print("Predicted energy consumpion for \nvehicle registration = 400000 , population =  800 (million), GDP = 45828.2\n", predictions )

model = sm.OLS(y, X).fit()
model.summary()



Predicted energy consumpion for 
vehicle registration = 400000 , population =  800 (million), GDP = 45828.2
 [429774.22121854]


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.879
Model:,OLS,Adj. R-squared (uncentered):,0.879
Method:,Least Squares,F-statistic:,1729.0
Date:,"Thu, 04 Nov 2021",Prob (F-statistic):,0.0
Time:,10:56:29,Log-Likelihood:,-10861.0
No. Observations:,714,AIC:,21730.0
Df Residuals:,711,BIC:,21740.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0163,0.009,1.787,0.074,-0.002,0.034
x2,445.8743,24.748,18.017,0.000,397.286,494.462
x3,-3.1190,0.441,-7.074,0.000,-3.985,-2.253

0,1,2,3
Omnibus:,426.823,Durbin-Watson:,1.868
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6843.147
Skew:,2.361,Prob(JB):,0.0
Kurtosis:,17.412,Cond. No.,4960.0
