In [8]:
import os
import sys
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
spark_version = 'spark-3.5.5'
os.environ['SPARK_VERSION']=spark_version

# Install Spark
!apt-get update
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"
os.environ["PYSPARK_PYTHON"] = sys.executable

sys.path.append(os.path.join(os.environ["SPARK_HOME"], "python"))
sys.path.append(os.path.join(os.environ["SPARK_HOME"], "python", "lib", "py4j-0.10.9.7-src.zip"))  # Adjust the version if needed

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Connecting to archive.ubuntu.com (185.125.190.81)] [1 InRelease 14.2 kB/129 kB 11%] [Connected t                                                                                                    Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.81)] [1 InRelease 48.9 kB/129 kB 38%] [Connecting                                                                                                     Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.81)] [1 InRelease 51.8 kB/129 kB 40%] [Connected t0% [Connecting to archive.ubuntu.com (185.125.190.81)] [Connected to r2u.stat.illinois.edu (192.17.1                                                                                                    Hit:4 http://archive.u

In [10]:
from pyspark.sql import SparkSession
import pandas as pd

# Initialize Spark
spark = SparkSession.builder.appName("GPR_Model").getOrCreate()

# Load data into Spark DataFrame
spark_df = spark.read.csv("cleaned_wti_data.csv", header=True, inferSchema=True)

# Convert Spark DataFrame to Pandas for TensorFlow training
df = spark_df.toPandas()
df.head()

Unnamed: 0,Year,Month,GPR,GPRT,GPRA,GPRC_CAN,GPRC_USA,WTI
0,1986,1,135.36,137.67,166.02,0.56,3.38,22.93
1,1986,2,98.75,84.02,114.82,0.29,2.3,15.46
2,1986,3,98.68,85.13,117.56,0.24,2.43,12.61
3,1986,4,148.31,142.49,182.87,0.63,4.08,12.84
4,1986,5,117.39,126.68,114.2,0.57,2.92,15.38


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# df = pd.get_dummies(df)

# Define Features and Target
X = df.drop(columns=['WTI'])
y = df["WTI"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Define the number of input features
number_input_features = X_train_scaled.shape[1]

# Define the model
nn = tf.keras.models.Sequential()

# Add layers to the model
nn.add(tf.keras.layers.Dense(units=80, activation="relu", input_dim=X_train_scaled.shape[1]))
nn.add(tf.keras.layers.Dense(units=50, activation="relu"))
nn.add(tf.keras.layers.Dense(units=1, activation="linear"))

# Check the structure of the model
nn.summary()

# Complile the model
nn.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [13]:
from tensorflow.keras.callbacks import EarlyStopping

# # Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# # Train the model
history = nn.fit(X_train_scaled, y_train,
                 validation_data=(X_test_scaled, y_test),
                 epochs=100, batch_size=16,
                 verbose=1)

Epoch 1/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - loss: 3341.1472 - mean_absolute_error: 49.3683 - val_loss: 2860.7700 - val_mean_absolute_error: 44.9756
Epoch 2/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 3143.1707 - mean_absolute_error: 47.0194 - val_loss: 2621.3228 - val_mean_absolute_error: 42.2555
Epoch 3/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 2762.7476 - mean_absolute_error: 43.0186 - val_loss: 2171.9202 - val_mean_absolute_error: 36.8511
Epoch 4/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 2369.9536 - mean_absolute_error: 37.6735 - val_loss: 1560.4614 - val_mean_absolute_error: 29.9979
Epoch 5/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1773.7466 - mean_absolute_error: 32.6839 - val_loss: 1051.6885 - val_mean_absolute_error: 24.8556
Epoch 6/100
[1m24/24[0m [32m━━━━━━━━━━━━━

In [14]:
from sklearn.metrics import r2_score

# Evaluate on test data
y_pred = nn.predict(X_test_scaled)

# Compute R-squared
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
loss, mae = nn.evaluate(X_test_scaled, y_test)
print(f"""
Model Performance
-----------------
R² Score: {r2:.2f}
Test Loss: {loss:.4f}
Test MAE: {mae:.4f}
      """)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 152.9231 - mean_absolute_error: 7.9876

Model Performance
-----------------
R² Score: 0.80
Test Loss: 164.6549
Test MAE: 8.2981
      


In [16]:
import csv

# Save model results in CSV
model_results = [
    ["Experiment", "Hidden Layer 1", "Hidden Layer 2", "Batch Size", "Epochs", "Validation Loss", "Test MAE", "R² Score"],
    [1, 80, 50, 16, len(history.history['loss']), min(history.history['val_loss']), mae, r2]
]

# Write to CSV
with open("wti_model_optimization_log.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows(model_results)

print("\n🔍 Model results saved in 'model_optimization_log.csv'")


🔍 Model results saved in 'model_optimization_log.csv'
