In [1]:
# Data Wrangling
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 50)

In [2]:
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Model Creation
from prophet import Prophet
from xgboost import XGBRegressor
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score

In [None]:
# Load the monthly combined-variables export, indexed by its 'Month' column
sunoco = pd.read_csv(
    'Sunoco Project Data  - Combined Variables Monthly Data.csv',
    index_col='Month',
)

# Drop the monthly columns that are replaced by the weekly data imported later
columns_to_remove = ['Automobile Sales (In millions)']
sunoco = sunoco.drop(columns=columns_to_remove)

# COLUMN Categorizations
# All of the known columns from the data set
all_column_names = list(sunoco.columns)
# Columns which can be interpolated linearly to get weekly data from monthly
linear_columns = ['U.S. Demographics (in millions)']
# Columns which we have raw weekly data for and so we don't need to interpolate at all
weekly_data_columns = [
    'Regular Gasoline Prices (dollars per gallon)',
    'Crude oil prices (U.S.) (Dollars per barrel)',
    'Federal Funds Effective Rate',
    'Implied Gasoline Demand',
    'US Gross inputs to refineries (in thousands)',
    'US Percent Utilization of Refinery Operable Capacity',
    'Automobile Sales (In millions)',
    'Total Ridership (in thousands)',
    'Weekly Jobless Claims'
]
# Columns which we don't have weekly data for or cannot interpolate (e.g. because we don't have the data or they are index fields).
# Built with an order-preserving filter rather than set arithmetic: set iteration
# order for strings can change between kernel sessions, which would reorder the
# columns of every frame selected with `other_columns`.
_categorized_columns = set(weekly_data_columns) | set(linear_columns)
other_columns = [name for name in all_column_names if name not in _categorized_columns]



# Parse the index into UTC timestamps. `utc` takes a boolean; the string 'True'
# only behaved like True because any non-empty string is truthy.
sunoco.index = pd.to_datetime(sunoco.index, utc=True)
# Keep a (timezone-aware) copy of the timestamps as a regular column
sunoco['Date'] = sunoco.index
# Strip the timezone from the index of the monthly frame
sunoco = sunoco.tz_localize(None)

# Weekly frame for the columns that are NOT linearly interpolated: the monthly
# value is carried forward to every day, then sampled at weekly frequency.
sunoco_no_interpolation = sunoco.copy()
sunoco_no_interpolation.index = pd.to_datetime(sunoco_no_interpolation.index, utc=True)
sunoco_no_interpolation['Date'] = sunoco_no_interpolation.index
sunoco_no_interpolation = sunoco_no_interpolation.drop(columns=linear_columns)
sunoco_no_interpolation = sunoco_no_interpolation.resample('D').ffill().resample('W').ffill()

sunoco_no_interpolation.head(10)

In [None]:

# Load the weekly export, indexed by 'Week', keeping only the columns for which
# raw weekly observations exist
sunoco_weekly_data = pd.read_csv(
    'Sunoco Project Data  - Combined Variables Weekly Data.csv',
    index_col='Week',
)[weekly_data_columns]

# `utc` takes a boolean (the string 'True' only worked because it is truthy)
sunoco_weekly_data.index = pd.to_datetime(sunoco_weekly_data.index, utc=True)
# Keep the timestamps as a regular 'Week' column as well
sunoco_weekly_data['Week'] = sunoco_weekly_data.index
# Forward-fill to daily, then sample at weekly frequency
sunoco_weekly_data = sunoco_weekly_data.resample('D').ffill().resample('W').ffill()


In [None]:
# Show which columns the weekly frame ended up with (the selected weekly columns plus the added 'Week' column)
print(sunoco_weekly_data.columns)

In [None]:
# Change to weekly: upsample the linearly-interpolable monthly columns to daily
# values, then sample/interpolate at weekly frequency
sunoco_linear = sunoco[linear_columns]
sunoco_linear = sunoco_linear.resample('D').interpolate(method='linear')
sunoco_linear = sunoco_linear.resample('W').interpolate(method='linear')

# `utc` takes a boolean (the string 'True' only worked because it is truthy)
sunoco_linear.index = pd.to_datetime(sunoco_linear.index, utc=True)
# 'Date' is the key column used by the joins in the following cells
sunoco_linear['Date'] = sunoco_linear.index
sunoco_linear

In [None]:
# Combine the linearly interpolated columns with the forward-filled ones.
# `on='Date'` matches sunoco_linear's 'Date' column against the index of the
# right-hand frame; how='inner' keeps only dates present in both.
# The suffixes only take effect if the two frames share a column name.
sunoco_joined = sunoco_linear.join(sunoco_no_interpolation[other_columns], on='Date', how='inner', lsuffix = 'Month', rsuffix = 'Date')
sunoco_joined

In [None]:
# TODO figure out why 57 rows are not being joined when using a 'left' join
# Attach the raw weekly columns, then re-key the result on 'Week': index by
# 'Week', drop the 'Date' join key, sort chronologically and move 'Week' back
# into a regular column.
sunoco_weekly_joined = (
    sunoco_joined
    .join(sunoco_weekly_data, on='Date', how='inner', lsuffix='Week', rsuffix='Date')
    .set_index('Week')
    .drop(columns=['Date'])
    .sort_values('Week')
    .reset_index('Week')
)
sunoco_weekly_joined['Week'] = pd.to_datetime(sunoco_weekly_joined['Week'], unit='D')
sunoco_weekly_joined

In [None]:
# Drop the timezone from the 'Week' column so it holds timezone-naive timestamps
sunoco_weekly_joined['Week'] = sunoco_weekly_joined['Week'].dt.tz_localize(None)
sunoco_weekly_joined

In [None]:
# List every column of the joined weekly frame, one name per line
for column_name in sunoco_weekly_joined.columns:
    print(column_name)

In [None]:
def convert_percent_str(val):
    """Turn a Series of percent strings (e.g. "12.5%") into float fractions.

    Trailing "%" characters are stripped, the remainder is cast to float and
    divided by 100. If `val` does not support the `.str` accessor (for
    example a numeric Series), it is returned unchanged.
    """
    try:
        as_fraction = val.str.rstrip("%").astype(float) / 100
    except AttributeError:
        # Not string data: nothing to convert
        as_fraction = val
    return as_fraction

def convert_dollar_str(val):
    """Turn a Series of dollar strings (e.g. " $1,234.50") into floats.

    Leading spaces and "$" characters are stripped and thousands separators
    (",") are removed before casting to float, so values such as "$1,234.50"
    convert instead of raising ValueError. If `val` does not support the
    `.str` accessor (for example a numeric Series), it is returned unchanged.
    """
    try:
        cleaned = val.str.lstrip(" $").str.replace(",", "", regex=False)
        return cleaned.astype(float)
    except AttributeError:
        # Not string data: nothing to convert
        return val
# Columns that are passed through the string-to-number converters above
# (presumably stored as text such as "12%" or "$1234" in the CSV exports — TODO confirm).

percentage_columns = [
    'IE (25th Percentile)',
    'IE (75th Percentile)',
    'IE (Median)',
    'HME (25th Percentile)',
    'HME (75th percentile)',
    'HME (Median)',
    'Labor costs (Govt) (in percents)',
    'Labor costs (Private) (in percents)',
]

dollar_columns = [
    'Employee Earnings'
]
# DataFrame.apply hands each listed column to the converter as a Series
sunoco_weekly_joined[percentage_columns] = sunoco_weekly_joined[percentage_columns].apply(convert_percent_str)
sunoco_weekly_joined[dollar_columns] = sunoco_weekly_joined[dollar_columns].apply(convert_dollar_str)


In [None]:
# Data Cleaning - handle missing values in weekly joined dataset
missing_backward_vals_cols = [
    'Employee Earnings',
    'HME (25th Percentile)',
    'HME (Median)',
    'HME (75th percentile)',
    'IE (25th Percentile)',
    'IE (Median)',
    'IE (75th Percentile)',
    'EV Regulations (total)',
    'BEV Sales (U.S.)',
    'US Dollar Index',
    'Consumer Credit Data',
    'Consumer loans (Dollars)',
    'Labor costs (Private) (in percents)',
    'Labor costs (Govt) (in percents)',
]
missing_forward_vals_cols = [
    'BEV Sales (U.S.)',
    'Labor costs (Private) (in percents)',
    'Labor costs (Govt) (in percents)',
    'Employee Earnings',
    'U.S. Crude Oil and Natural Gas Rotary Rigs in Operation (Count)',
    'US GDP (in billions)',
    'Consumer Credit Data',
    'EV Regulations (total)',
]
# First pass: back-fill, so each gap takes the next available observation
sunoco_weekly_joined[missing_backward_vals_cols] = (
    sunoco_weekly_joined[missing_backward_vals_cols].bfill()
)
# Second pass: forward-fill whatever is still missing (gaps at the end of a series)
sunoco_weekly_joined[missing_forward_vals_cols] = (
    sunoco_weekly_joined[missing_forward_vals_cols].ffill()
)
# From here on the frame is indexed by the 'Week' timestamp
sunoco_weekly_joined['Week'] = pd.to_datetime(sunoco_weekly_joined['Week'])
sunoco_weekly_joined = sunoco_weekly_joined.set_index('Week')
sunoco_weekly_joined

In [None]:
# Further data cleaning: report dtype / NA counts for the raw weekly series and
# mean-impute the ones that have gaps.
gross_inputs_col = 'US Gross inputs to refineries (in thousands)'
utilization_col = 'US Percent Utilization of Refinery Operable Capacity'
auto_sales_col = 'Automobile Sales (In millions)'
ridership_col = 'Total Ridership (in thousands)'
jobless_col = 'Weekly Jobless Claims'

# US Gross inputs to refineries: fill NAs with the column mean
print(f"Nas: {sunoco_weekly_joined[gross_inputs_col].isna().sum()}")
mean_3 = sunoco_weekly_joined[gross_inputs_col].mean()
sunoco_weekly_joined[gross_inputs_col] = sunoco_weekly_joined[gross_inputs_col].fillna(mean_3)
print(f"Nas after imputing mean: {sunoco_weekly_joined[gross_inputs_col].isna().sum()}")

# US Percent Utilization of Refinery Operable Capacity: fill NAs with the column mean
print(sunoco_weekly_joined[utilization_col].dtype)
print(sunoco_weekly_joined[utilization_col].isna().sum())
mean_4 = sunoco_weekly_joined[utilization_col].mean()
sunoco_weekly_joined[utilization_col] = sunoco_weekly_joined[utilization_col].fillna(mean_4)
print(sunoco_weekly_joined[utilization_col].isna().sum())

# Automobile Sales: dtype and NA count only (no imputation is applied)
print(sunoco_weekly_joined[auto_sales_col].dtype)
print(sunoco_weekly_joined[auto_sales_col].isna().sum())

# Total Ridership: NAs from April 23 to Sept 23 - imputed with the mean of the 2023 rows
print(sunoco_weekly_joined[ridership_col].dtype)
print(sunoco_weekly_joined[ridership_col].isna().sum())
ridership_2023 = sunoco_weekly_joined[sunoco_weekly_joined.index.year == 2023]
mean_2023 = ridership_2023[ridership_col].mean()
sunoco_weekly_joined[ridership_col] = sunoco_weekly_joined[ridership_col].fillna(mean_2023)
print(sunoco_weekly_joined[ridership_col].isna().sum())

# Weekly Jobless Claims: dtype and NA count only (no imputation is applied)
print(sunoco_weekly_joined[jobless_col].dtype)
print(sunoco_weekly_joined[jobless_col].isna().sum())

In [None]:
# 7-row rolling mean of the target. The frame was resampled with 'W' upstream,
# so each row is a week and this is a 7-week (not 7-day) window.
# NOTE(review): this column is derived from 'Implied Gasoline Demand' itself and
# stays in the feature matrix when only the target column is dropped later —
# confirm that using it as a predictor is intended.
sunoco_weekly_joined['Moving Average'] = sunoco_weekly_joined['Implied Gasoline Demand'].rolling(7).mean()
sunoco_weekly_joined.head()

In [None]:
#Use prophet to predict implied gasoline demand
def index_to_column(data):
    """Reshape a 'Week'-indexed frame into the column layout Prophet expects.

    The index is moved into a 'Week' column, parsed as datetimes and used to
    sort the rows; 'Week' is then renamed to 'ds' and
    'Implied Gasoline Demand' to 'y'. All other columns are kept as they are.
    The input frame is not modified.
    """
    prophet_names = {'Week': 'ds', 'Implied Gasoline Demand': 'y'}
    return (
        data.reset_index()
        .assign(Week=lambda frame: pd.to_datetime(frame['Week']))
        .sort_values('Week')
        .rename(columns=prophet_names)
    )

In [None]:
# Chronological hold-out: weeks before 2021-01-01 train the model, the rest test it
in_train_period = sunoco_weekly_joined.index < '2021-01-01'
in_test_period = sunoco_weekly_joined.index >= '2021-01-01'
sunoco_weekly_joined_train = sunoco_weekly_joined[in_train_period]
sunoco_weekly_joined_test = sunoco_weekly_joined[in_test_period]
print('Train:\t', len(sunoco_weekly_joined_train))
print('Test:\t', len(sunoco_weekly_joined_test))

# Prophet-shaped ('ds' / 'y') versions of both partitions
prophet_train = index_to_column(sunoco_weekly_joined_train)
prophet_test = index_to_column(sunoco_weekly_joined_test)
prophet_test

In [None]:

# Prophet model with a 95% uncertainty interval, fit on the training period only
prophet_model = Prophet(interval_width=0.95)

prophet_model.fit(prophet_train)
# Predict for the dates of the test period (only the 'ds' column is passed in)
prophet_pred = prophet_model.predict(prophet_test[['ds']])
prophet_pred

In [None]:
# Score the hold-out forecast and plot it against the actual values
mae = round(mean_absolute_error(prophet_test['y'], prophet_pred['yhat']), 3)
print(mae)
plt.figure(figsize=(20,8), dpi=100)
plt.plot(prophet_test['ds'], prophet_test['y'], label='Actual')
plt.plot(prophet_pred['ds'], prophet_pred['yhat'], label='Predicted')
# A single title call: a second plt.title on the same axes replaces the first
plt.title('Testing Set Forecast', weight='bold', fontsize=25)
plt.legend()
plt.show()

In [None]:
# Visualise the chronological train / test partition of the target series
split_date = pd.to_datetime('2021-01-01')
partitions = (
    (sunoco_weekly_joined_train, "Training set"),
    (sunoco_weekly_joined_test, "Test set"),
)

plt.figure(figsize=(10, 8))
for partition, partition_label in partitions:
    plt.plot(partition.index, partition['Implied Gasoline Demand'], label=partition_label)
# Dashed vertical marker and label at the split date
plt.axvline(split_date, color='black', ls='--', lw=2)
plt.text(split_date, y=40, s='Split', fontsize=10, fontweight='bold')
plt.title('Data Splitting', weight='bold', fontsize=20)
plt.legend()


7-period and 30-period forecasts with Prophet (the periods are weeks, since the future frame uses freq='W'), fit on the full data set (training and test combined)

In [None]:
# This time, we will use all data (train and test) to train our model:
# reshape the complete weekly frame into Prophet's 'ds' / 'y' layout
new_sunoco_weekly_joined = index_to_column(sunoco_weekly_joined)
new_sunoco_weekly_joined

In [None]:
# Second Prophet model: multiplicative seasonality, fit on the full data set
prophet_model2 = Prophet(interval_width=0.95, seasonality_mode="multiplicative")
prophet_model2.fit(new_sunoco_weekly_joined)

# freq='W' means the 7 future periods are weeks, not days
future_dates = prophet_model2.make_future_dataframe(periods=7, freq='W')
prophet_pred2 = prophet_model2.predict(future_dates)

# Prophet.plot creates its own figure, so the size is passed to it directly;
# a preceding plt.figure(...) would only leave an extra empty figure behind.
fig = prophet_model2.plot(prophet_pred2, uncertainty=True, figsize=(20, 8))
ax = fig.gca()
# NOTE(review): the upper x-limit is fixed at 2023-09-07; if the data runs up to
# or past that date the forecast weeks fall outside the visible range — confirm.
ax.set_xlim(pd.to_datetime(['2021-01-01', '2023-09-07']))
plt.title('7 Weeks Forecast', weight='bold', fontsize=25)
plt.show()

In [None]:
# Decompose the forecast with the model that produced it: prophet_pred2 comes
# from prophet_model2 (multiplicative, fit on all data), not from prophet_model.
prophet_model2.plot_components(prophet_pred2)
plt.show()

In [None]:
# Index the forecast by date so it can be joined onto the 'Week'-indexed frame.
# Not re-runnable as-is: a second run fails because 'ds' is no longer a column.
prophet_pred2 = prophet_pred2.set_index('ds')

In [None]:
# Add Prophet's fitted value (yhat) as a column; the inner join keeps only the
# weeks present in both frames, so the future forecast rows are dropped.
sunoco_weekly_joined_with_yhat = sunoco_weekly_joined.join(prophet_pred2['yhat'], how = 'inner')
sunoco_weekly_joined_with_yhat

In [None]:
# freq='W' means the 30 future periods are weeks, not days
future_dates2 = prophet_model2.make_future_dataframe(periods=30, freq='W')
prophet_pred3 = prophet_model2.predict(future_dates2)

# Prophet.plot creates its own figure, so the size is passed to it directly;
# a preceding plt.figure(...) would only leave an extra empty figure behind.
fig = prophet_model2.plot(prophet_pred3, uncertainty=True, figsize=(20, 8))
ax = fig.gca()
# NOTE(review): the upper x-limit is fixed at 2023-09-15; if the data runs up to
# or past that date the forecast weeks fall outside the visible range — confirm.
ax.set_xlim(pd.to_datetime(['2021-01-01', '2023-09-15']))
plt.title('30 Weeks Forecast', weight='bold', fontsize=25)
plt.show()

XGBoost and Random Forest Predictions

In [None]:
# Inspect the index of the frame that includes yhat
sunoco_weekly_joined_with_yhat.index

In [None]:
# Summary statistics for every numeric column
sunoco_weekly_joined_with_yhat.describe()

In [None]:
# Column dtypes and non-null counts
sunoco_weekly_joined_with_yhat.info()

In [None]:
# Label the index 'Week' on the joined frame
sunoco_weekly_joined_with_yhat.index.name = 'Week'
sunoco_weekly_joined_with_yhat.head()

In [None]:
# Percentage of missing values per column, rounded to two decimals
null_counts = sunoco_weekly_joined_with_yhat.isnull().sum()
null_pct = (null_counts / len(sunoco_weekly_joined_with_yhat)) * 100
round(null_pct, 2)

In [None]:
# Column dtypes and non-null counts for the frame without yhat
sunoco_weekly_joined.info()

In [None]:
# Correlation heatmap for the frame without yhat
heat_map_features = sunoco_weekly_joined
# Pairwise correlations, rounded to one decimal so the annotations stay readable
rounded_corr = heat_map_features.corr().round(1)

plt.figure(figsize=(15, 40))
sns.heatmap(rounded_corr, annot=True, cmap='Blues', linewidth=0.9)
plt.show()

In [None]:
# Pearson correlation of every column with the target, strongest positive first
correlations = sunoco_weekly_joined.corr(method='pearson')
demand_correlations = correlations['Implied Gasoline Demand'].sort_values(ascending=False)
print(demand_correlations.to_string())

In [None]:
# Target variable
target = 'Implied Gasoline Demand'

# Feature matrix: every column except the target; target vector: the target column
X = sunoco_weekly_joined.drop(columns=target)
y = sunoco_weekly_joined[target]

# 80 / 20 train / validation split with a fixed seed.
# NOTE(review): train_test_split shuffles rows by default, so validation weeks
# are interleaved with training weeks, and X still contains 'Moving Average',
# which is computed from the target — confirm this is acceptable for a time series.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=42)

In [None]:
# Inspect the training target vector
y_train

In [None]:
# Naive baseline: predict the training-set mean of the target for every training row
mean_baseline_pred = y_train.mean()
y_pred = [mean_baseline_pred] * len(y_train)
baseline_mae = mean_absolute_error(y_train, y_pred)
# RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error is
# deprecated and removed in recent scikit-learn releases; this form works on all versions.
baseline_rmse = np.sqrt(mean_squared_error(y_train, y_pred))

# Print statement to show all baseline values
print('Mean Implied Gasoline Demand Baseline Pred:', mean_baseline_pred)
print('-------------------------------------------------------------------')
print('Baseline Mae:',baseline_mae)
print('-------------------------------------------------------------------')
print('Baseline RMSE:',baseline_rmse)

In [None]:
# Fit the ordinal encoder on the training features only, then apply the same
# fitted mapping to both partitions
ordinal = OrdinalEncoder()
ordinal_fit = ordinal.fit(X_train)
XT_train, XT_val = (ordinal.transform(partition) for partition in (X_train, X_val))
XT_train

In [None]:
# Mean-impute remaining NaNs; the means are learned from the training partition
# only and reused for the validation partition
simp = SimpleImputer(strategy='mean')
simp_fit = simp.fit(XT_train)
XT_train, XT_val = (simp.transform(partition) for partition in (XT_train, XT_val))
# Spot-check one transformed validation row
XT_val[2]

In [None]:
# Assigning model variables. Both estimators are stochastic, so they are seeded
# to make the reported metrics reproducible across runs.
model_rfr = RandomForestRegressor(random_state=42)
model_xgbr = XGBRegressor(random_state=42)

# Fitting models
model_rfr.fit(XT_train, y_train)
model_xgbr.fit(XT_train, y_train)


def check_metrics(model):
    """Print training MAE, validation MAE and validation R2 for a fitted model.

    Reads XT_train, y_train, XT_val and y_val from the notebook's global
    namespace at call time, so it always scores against the most recently
    prepared partitions.
    """
    print(model)
    print('===================================================================')
    print('Training MAE:', mean_absolute_error(y_train,model.predict(XT_train)))
    print('-------------------------------------------------------------------')
    print('Validation MAE:', mean_absolute_error(y_val,model.predict(XT_val)))
    print('-------------------------------------------------------------------')
    print('Validation R2 score:', model.score(XT_val,y_val))
    print('===================================================================')


model = [model_xgbr, model_rfr]
for m in model:
    check_metrics(m)

Without the y-hat feature, the Random Forest Regressor performs slightly better than the XGBoost Regressor. Its Validation R2 score is approximately 0.862, whereas the Validation R2 score of the XGBoost Regressor is approximately 0.854. The validation mean absolute error of the Random Forest Regressor is also smaller than that of XGBoost without y-hat.

Time Series Analysis with y-hat

In [None]:
# Correlation heatmap for the frame that includes yhat
heat_map_features = sunoco_weekly_joined_with_yhat
# Pairwise correlations, rounded to one decimal so the annotations stay readable
rounded_corr = heat_map_features.corr().round(1)

plt.figure(figsize=(15, 12.5))
sns.heatmap(rounded_corr, annot=True, cmap='Blues', linewidth=0.9)
plt.show()

In [None]:
# Pearson correlation of every column (yhat included) with the target, strongest positive first
correlations = sunoco_weekly_joined_with_yhat.corr(method='pearson')
demand_correlations = correlations['Implied Gasoline Demand'].sort_values(ascending=False)
print(demand_correlations.to_string())

In [None]:

# Target variable
target = 'Implied Gasoline Demand'

# Feature matrix: every column except the target (yhat included); target vector: the target column
X = sunoco_weekly_joined_with_yhat.drop(columns=target)
y = sunoco_weekly_joined_with_yhat[target]

# 80 / 20 train / validation split with a fixed seed.
# NOTE(review): train_test_split shuffles rows by default, and the yhat column
# comes from a Prophet model fit on the full data set, so validation rows are
# not unseen by that feature — confirm this is acceptable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=42)

In [None]:
# Inspect the training target vector of the split that includes yhat
y_train

In [None]:
# Naive baseline: predict the training-set mean of the target for every training row
mean_baseline_pred = y_train.mean()
y_pred = [mean_baseline_pred] * len(y_train)
baseline_mae = mean_absolute_error(y_train, y_pred)
# RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error is
# deprecated and removed in recent scikit-learn releases; this form works on all versions.
baseline_rmse = np.sqrt(mean_squared_error(y_train, y_pred))

# Print statement to show all baseline values
print('Mean Implied Gasoline Demand Baseline Pred:', mean_baseline_pred)
print('-------------------------------------------------------------------')
print('Baseline Mae:',baseline_mae)
print('-------------------------------------------------------------------')
print('Baseline RMSE:',baseline_rmse)

In [None]:
# Ordinal-encode the features (encoder fit on the training partition only).
# NOTE(review): the following one-hot cell recomputes XT_train / XT_val from
# X_train / X_val, so these ordinal-encoded frames are overwritten before any
# model sees them. Only the last expression (XT_val) is displayed.
ordinal = OrdinalEncoder()
ordinal_fit = ordinal.fit(X_train)
XT_train = ordinal.transform(X_train)
XT_val = ordinal.transform(X_val)
XT_train
XT_val

In [None]:
# One-hot encode the features (encoder fit on the training partition only).
# NOTE(review): the original comment mentions a 'Seasons' column, but no such
# column appears in the visible cells — confirm which columns are encoded.
# `onehot_fit` is reused later to recover feature names for permutation importance.
onehot = OneHotEncoder()
onehot_fit = onehot.fit(X_train)
XT_train = onehot.transform(X_train)
XT_val = onehot.transform(X_val)
XT_val

In [None]:
# Mean-impute remaining NaNs; the means are learned from the training partition
# only and reused for the validation partition
simp = SimpleImputer(strategy='mean')
simp_fit = simp.fit(XT_train)
XT_train, XT_val = (simp.transform(partition) for partition in (XT_train, XT_val))
# Spot-check one transformed validation row
XT_val[2]

In [None]:
# Assigning model variables. Both estimators are stochastic, so they are seeded
# to make the reported metrics reproducible across runs.
model_rfr = RandomForestRegressor(random_state=42)
model_xgbr = XGBRegressor(random_state=42)

# Fitting models on the partitions that include yhat
model_rfr.fit(XT_train, y_train)
model_xgbr.fit(XT_train, y_train)

# check_metrics is defined once, in the earlier model-fitting cell. It reads
# XT_train / y_train / XT_val / y_val from the global namespace when called, so
# it scores against the partitions prepared in this section; redefining an
# identical copy here is unnecessary.
model = [model_xgbr, model_rfr]
for m in model:
    check_metrics(m)

With the y-hat variable, the Random Forest Regressor again performs better than the XGBoost Regressor: its Validation R2 score is 0.87768, compared to 0.8713 for XGBoost. However, the difference in performance between XGBoost and the Random Forest Regressor is smaller with y-hat than without it. The validation mean absolute error of the Random Forest Regressor is 187.7, which is smaller than its MAE without the y-hat variable. Additionally, while the XGBoost validation mean absolute error is still greater than that of Random Forest even with y-hat, it is reduced from 211.7 without y-hat to 194.8 with y-hat.

In [None]:
%autosave 15


Both XGBoost and Random Forest Regressor Time Series Analysis perform better with the y-hat variable compared to without the y-hat variable. 

In [None]:
#TODO: 3 Tuning in the Random Forest Regressor and XGB Regressor (data cleaning - use wrangle function)


#4 Feature Importance

In [None]:
# Permutation importance of the XGBoost model on the validation partition.
# permutation_importance is already imported in the model-creation import cell
# at the top of the notebook, so it is not re-imported here.
result = permutation_importance(
    model_xgbr, XT_val, y_val,
    n_repeats=30,
    random_state=0,
)
# One row of mean importances, labelled with the one-hot encoder's output feature names
imdf = pd.DataFrame(result.importances_mean).T
importances = imdf.set_axis(onehot_fit.get_feature_names_out(), axis=1)
# Most important features first
importances.T[0].sort_values(ascending=False)

#5. Conclusions/major findings regarding IV's importance to impacting DV:
1. yhat is important when predicting Implied Gasoline Demand
2. Crude oil prices, Employee Earnings, and Regular Gasoline appear to be the most significant predictors of Implied Gasoline Demand (as those predictors have most prominent feature importance)
3. Consumer loans, Housing Market Expectations, Federal Funds Effective Rate, Personal Consumption Expenditures, US GDP, US Dollar Index, Inflationary Expectations, Rig Count, U.S. Demographics, and the total EV Regulations show moderate feature importance, meaning that they have a moderate impact on gasoline demand.
4. US Spot Market, Labor Costs, Refinery Util, and Automobile Sales appear to have little to no impact on predicting gasoline demand as their feature importance is small. 

#6. Are our hypotheses supported or rejected?
1. Our hypothesis concerning the effect of U.S. Demographics on Implied Gasoline Demand is rejected as there is a positive (not a negative) correlation between U.S. population and Implied Gasoline Demand. Additionally, our hypothesized magnitude is rejected as the correlation coefficient indicates a moderate/medium correlation (as opposed to less than 0.3).
2. Our Hypothesis regarding EV Sales is wrong as our hypothesis suggests that EV Sales have a high impact on Implied Gasoline demand, while the correlation coefficient produced suggests that this variable has no magnitude/impact on the DV (even with a negative value). 
3. EV Regulations does appear to have a meaningful magnitude on Gasoline Demand (supporting the magnitude side of the hypothesis). However, it does not have a negative correlation with Implied Gasoline Demand as the correlation coefficient is positive (meaning that the initial hypothesis is partially rejected).
4. Our Hypothesis concerning Regular Gasoline Prices appears to be partially supported: the magnitude is correct, with the correlation coefficient indicating a medium correlation between Regular Gasoline Prices and Implied Gasoline Demand. However, the hypothetical direction is incorrect as the correlation coefficient suggests a positive impact on Implied Gasoline Demand (not a negative one).
5. Our hypothesis concerning Crude Oil Prices is partially supported (our magnitude is supported, but the hypothetical direction is rejected). Crude Oil Prices have a moderate effect on Implied Gasoline Demand (given that the correlation coefficient is greater than 0.3). However, given that the correlation coefficient is positive, the hypothetical direction is rejected. 
6. Our hypothesis regarding U.S. GDP appears to be partially correct. The correlation coefficient between US GDP and Implied Gasoline Demand rejects the hypothesized magnitude, as the coefficient (0.495) suggests that US GDP has a medium magnitude impact on Implied Gasoline Demand. However, the direction in which U.S. GDP impacts the DV appears to be supported, as the correlation coefficient is positive (which agrees with our hypothesis).
7. Hypothesis regarding Rig Count appears to be supported by our findings. The correlation coefficient between Rig Count and Implied Gasoline Demand supports that this predictor has a moderate impact and a positive correlation on Gasoline Demand.
8. Hypothesis relating to Federal Funds Effective Rate and how it impacts gasoline demand is partially correct as it does have a medium significance on predicting Implied Gasoline Demand, but the direction of the relationship is  positive, not negative as previously predicted.
9. Hypotheses relating to the impact and direction of Housing Market Expectations and Inflationary Expectations on Implied Gasoline Demand appears to be correct as both have a low impact on gasoline demand and are inversely related to the DV.
10. Our hypothesis regarding consumer credit data and US Spot market and their impact on Implied Gasoline demand is rejected as the Feature Importance does indicate that these variables have low (if any) impact on predictions of Gasoline Demand. 
11. Our hypothesis about total ridership was rejected, as total ridership is positively correlated with Gasoline Demand, and the magnitude is low, as it has a low importance value. 
12. Our hypothesis about refinery utilization was partially correct. This was a relatively important feature, but its magnitude was not high. However, both variables - US gross inputs to refineries and US percent utilization of refineries - are positively correlated with Gasoline Demand.
13. Our hypothesis about jobless claims was partially correct, as it is negatively correlated. However, the magnitude of this variable is not high, as it has a low importance value. 
14. Our hypothesis about automobile sales was partially correct, as it is positively correlated to Implied Gasoline Demand. However, the magnitude of this variable is not high, as it is not a strong correlation. 

#7. Implications of my conclusions
1. As U.S. population increases in the near future, we can predict that future Gasoline Demand will increase alongside it (especially with more Americans being at least 16 years of age (driving age)). 
2. As more EV Regulations are passed in the future, Gasoline Demand is expected to increase. EV Sales, however, will have a very low impact on gasoline demand. 
3. As employee earnings are constantly increasing, we are likely to see an increase in gasoline demand as there is more revenue for the consumer to spend.

In [1]:
# Persist the frame that includes Prophet's yhat.
# NOTE(review): this cell must run after the cell that builds
# sunoco_weekly_joined_with_yhat — the recorded NameError below comes from
# running it before that. The output path has no '.csv' extension; confirm
# that is intended.
sunoco_weekly_joined_with_yhat.to_csv('sunoco_weekly_joined_with_yhat')

NameError: name 'sunoco_weekly_joined_with_yhat' is not defined