In [1]:
#Load Packages
import datetime as dt
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
from sqlalchemy import create_engine
from config import db_password

In [2]:
connection_string = f"postgresql://postgres:{db_password}@localhost:5432/gas_stock"

In [3]:
# Using sqlalchemy
engine = create_engine(connection_string)

In [4]:
from sqlalchemy import inspect
insp = inspect(engine)
insp.get_table_names()

['sp500',
 'usgasstorage',
 'usregulargasprice',
 'gasdata',
 'draftdata',
 'oilcompaniesstockdata',
 'finalmaindata']

In [5]:
df = pd.read_sql('SELECT * from finalmaindata', engine)

In [6]:
# Following Processes are change the week date column into excel general format

#1 separate Year Month and Day
df['Year'] = pd.to_datetime(df['weekly_date']).dt.year
df['Month'] = pd.to_datetime(df['weekly_date']).dt.month
df['Day'] = pd.to_datetime(df['weekly_date']).dt.day


#2 write a function to make format changed
# datetime.datetime to excel serial date number
 
# Importing date module from datetime
from datetime import date
 
# Taking the parameter from the calling function
def convert_date_to_excel_ordinal(year, month, day):
 
    # Specifying offset value i.e.,
    # the date value for the date of 1900-01-00
    offset = 693594
    current = date(year, month, day)
 
    # Calling the toordinal() function to get
    # the excel serial date number in the form
    # of date values
    n = current.toordinal()
    return (n - offset)

In [7]:
#change dates to excel general number format
mylist = []
i =0
while i < len(df['Year']):
    a = convert_date_to_excel_ordinal(df['Year'][i],df['Month'][i],df['Day'][i])
    mylist.append(a)
    i += 1

In [8]:
# create a new list for that excel general format
df['Week'] = mylist

In [9]:
# shift column 'C' to first position
first_column = df.pop('Week')
  
# insert column using insert(position,column_name,first_column) function
df.insert(0, 'Week', first_column)

In [10]:
#drop below non using columns
df.drop(['weekly_date', 'Year', 'Month', 'Day','xom','cvx'], axis=1, inplace=True)

In [11]:
#check types
df.dtypes

Week                                       int64
weekly_ending_stocks_of_gasoline         float64
weekly_regular_retail_gasoline_prices    float64
gspc_open                                float64
gspc_high                                float64
gspc_low                                 float64
gspc_close                               float64
gspc_volume                              float64
gspc_adjusted                            float64
dtype: object

In [12]:
#check any NAs
df.isnull().sum()

Week                                     0
weekly_ending_stocks_of_gasoline         0
weekly_regular_retail_gasoline_prices    0
gspc_open                                0
gspc_high                                0
gspc_low                                 0
gspc_close                               0
gspc_volume                              0
gspc_adjusted                            0
dtype: int64

In [13]:
df = df.sort_values(by='Week', ascending=False)
df['Stock_Price_Diff'] = df['gspc_close'] - df['gspc_close'].shift(-1)

In [14]:
df['Gas_Price_Diff'] = df['weekly_regular_retail_gasoline_prices'] - df['weekly_regular_retail_gasoline_prices'].shift(-1)

In [15]:
df['Stock_Perc_change'] = df['Stock_Price_Diff'] / df['gspc_close']*100
df['Gas_Perc_change'] = df['Gas_Price_Diff'] / df['weekly_regular_retail_gasoline_prices'] *100

In [16]:
df['Gas_change'] = np.where(df['Gas_Perc_change'] >= 0, 1, 0)
df['Stock_change'] = np.where(df['Stock_Perc_change'] >= 0, 1, 0)

In [17]:
df.head(20)

Unnamed: 0,Week,weekly_ending_stocks_of_gasoline,weekly_regular_retail_gasoline_prices,gspc_open,gspc_high,gspc_low,gspc_close,gspc_volume,gspc_adjusted,Stock_Price_Diff,Gas_Price_Diff,Stock_Perc_change,Gas_Perc_change,Gas_change,Stock_change
1060,44718,217474.0,4.876,4134.72,4168.78,4109.18,4121.43,3852050000.0,4121.43,147.68,0.283,3.583222,5.803938,1,1
1059,44704,218996.0,4.593,3919.42,3981.88,3909.04,3973.75,3392770000.0,3973.75,-34.26,0.102,-0.862158,2.220771,1,0
1058,44697,219707.0,4.491,4013.02,4046.46,3983.99,4008.01,3824320000.0,4008.01,16.77,0.163,0.418412,3.629481,1,1
1057,44690,220189.0,4.328,4081.27,4081.27,3975.48,3991.24,4746120000.0,3991.24,-164.14,0.146,-4.112506,3.373383,1,0
1056,44683,224968.0,4.182,4130.61,4169.81,4062.51,4155.38,4474060000.0,4155.38,-140.74,0.075,-3.386935,1.7934,1,0
1055,44676,228575.0,4.107,4255.34,4299.02,4200.82,4296.12,4061070000.0,4296.12,-95.57,0.041,-2.224565,0.998296,1,0
1054,44669,230805.0,4.066,4385.63,4410.31,4370.3,4391.69,3509340000.0,4391.69,-20.84,-0.025,-0.474533,-0.614855,0,0
1053,44662,232378.0,4.091,4462.64,4464.35,4408.38,4412.53,3452540000.0,4412.53,-170.11,-0.079,-3.855158,-1.931068,0,0
1052,44655,233139.0,4.17,4547.97,4583.5,4539.21,4582.64,3833500000.0,4582.64,7.12,-0.061,0.155369,-1.46283,0,1
1051,44648,236787.0,4.231,4541.09,4575.65,4517.69,4575.52,3696850000.0,4575.52,114.34,-0.008,2.498951,-0.189081,0,1


In [18]:
# Define features set
X = df.copy()
X = X.drop("Gas_change", axis=1)
X = X.drop("Week", axis=1)
X = X.drop("gspc_adjusted", axis=1)
X = X.drop("Stock_Price_Diff", axis=1)
X = X.drop("Gas_Price_Diff", axis=1)
X = X.drop("gspc_volume", axis=1)
X = X.drop("Gas_Perc_change", axis=1)
X = X.drop("weekly_regular_retail_gasoline_prices", axis=1)

X

Unnamed: 0,weekly_ending_stocks_of_gasoline,gspc_open,gspc_high,gspc_low,gspc_close,Stock_Perc_change,Stock_change
1060,217474.0,4134.72,4168.78,4109.18,4121.43,3.583222,1
1059,218996.0,3919.42,3981.88,3909.04,3973.75,-0.862158,0
1058,219707.0,4013.02,4046.46,3983.99,4008.01,0.418412,1
1057,220189.0,4081.27,4081.27,3975.48,3991.24,-4.112506,0
1056,224968.0,4130.61,4169.81,4062.51,4155.38,-3.386935,0
...,...,...,...,...,...,...,...
4,199593.0,1424.37,1427.15,1413.33,1424.24,2.090940,1
3,200937.0,1360.16,1394.48,1350.14,1394.46,-0.507006,0
2,201447.0,1441.36,1454.09,1395.42,1401.53,-4.000628,0
1,198240.0,1441.47,1464.36,1441.47,1457.60,0.163282,1


In [19]:
#check any NAs
X.isnull().sum()

weekly_ending_stocks_of_gasoline    0
gspc_open                           0
gspc_high                           0
gspc_low                            0
gspc_close                          0
Stock_Perc_change                   1
Stock_change                        0
dtype: int64

In [30]:
X = X.fillna(0.000001)

In [21]:
#X = df.drop(0)

In [31]:
X

Unnamed: 0,weekly_ending_stocks_of_gasoline,gspc_open,gspc_high,gspc_low,gspc_close,Stock_Perc_change,Stock_change
1060,217474.0,4134.72,4168.78,4109.18,4121.43,3.583222,1
1059,218996.0,3919.42,3981.88,3909.04,3973.75,-0.862158,0
1058,219707.0,4013.02,4046.46,3983.99,4008.01,0.418412,1
1057,220189.0,4081.27,4081.27,3975.48,3991.24,-4.112506,0
1056,224968.0,4130.61,4169.81,4062.51,4155.38,-3.386935,0
...,...,...,...,...,...,...,...
4,199593.0,1424.37,1427.15,1413.33,1424.24,2.090940,1
3,200937.0,1360.16,1394.48,1350.14,1394.46,-0.507006,0
2,201447.0,1441.36,1454.09,1395.42,1401.53,-4.000628,0
1,198240.0,1441.47,1464.36,1441.47,1457.60,0.163282,1


In [32]:
# Define target vector
y = df["Gas_change"].values

In [33]:
X.isnull().sum()

weekly_ending_stocks_of_gasoline    0
gspc_open                           0
gspc_high                           0
gspc_low                            0
gspc_close                          0
Stock_Perc_change                   0
Stock_change                        0
dtype: int64

In [34]:
X

Unnamed: 0,weekly_ending_stocks_of_gasoline,gspc_open,gspc_high,gspc_low,gspc_close,Stock_Perc_change,Stock_change
1060,217474.0,4134.72,4168.78,4109.18,4121.43,3.583222,1
1059,218996.0,3919.42,3981.88,3909.04,3973.75,-0.862158,0
1058,219707.0,4013.02,4046.46,3983.99,4008.01,0.418412,1
1057,220189.0,4081.27,4081.27,3975.48,3991.24,-4.112506,0
1056,224968.0,4130.61,4169.81,4062.51,4155.38,-3.386935,0
...,...,...,...,...,...,...,...
4,199593.0,1424.37,1427.15,1413.33,1424.24,2.090940,1
3,200937.0,1360.16,1394.48,1350.14,1394.46,-0.507006,0
2,201447.0,1441.36,1454.09,1395.42,1401.53,-4.000628,0
1,198240.0,1441.47,1464.36,1441.47,1457.60,0.163282,1


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=1)

# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [36]:
### Choose best learning rate
from sklearn.ensemble import GradientBoostingClassifier

# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,
                                            learning_rate=learning_rate,
                                            max_features=5,
                                            max_depth=3,
                                            random_state=0)

    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train_scaled,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test_scaled,
            y_test)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.657
Accuracy score (validation): 0.575

Learning rate:  0.1
Accuracy score (training): 0.693
Accuracy score (validation): 0.560

Learning rate:  0.25
Accuracy score (training): 0.757
Accuracy score (validation): 0.560

Learning rate:  0.5
Accuracy score (training): 0.795
Accuracy score (validation): 0.534

Learning rate:  0.75
Accuracy score (training): 0.825
Accuracy score (validation): 0.556

Learning rate:  1
Accuracy score (training): 0.835
Accuracy score (validation): 0.538



In [37]:
### Create Gradient Boosting Classifier
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=0.5,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
classifier.fit(X_train_scaled, y_train)

# Make Prediction
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,1
2,0,0
3,0,0
4,1,0
5,0,0
6,1,1
7,0,0
8,1,1
9,0,1


In [38]:
#EVALUATE THE MODEL
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.5338345864661654


In [39]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,68,65
Actual 1,59,74


In [40]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.54      0.51      0.52       133
           1       0.53      0.56      0.54       133

    accuracy                           0.53       266
   macro avg       0.53      0.53      0.53       266
weighted avg       0.53      0.53      0.53       266

