In [1]:
import os
import sys
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
spark_version = 'spark-3.5.5'
os.environ['SPARK_VERSION']=spark_version

# Install Spark
!apt-get update
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"
os.environ["PYSPARK_PYTHON"] = sys.executable

sys.path.append(os.path.join(os.environ["SPARK_HOME"], "python"))
sys.path.append(os.path.join(os.environ["SPARK_HOME"], "python", "lib", "py4j-0.10.9.7-src.zip"))  # Adjust the version if needed

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Connecting to security.ubuntu.com (185.125.190.83)] [Connected to cloud.r-project.org (3.171.85.                                                                                                    Hit:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
                                                                                                    Hit:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (3.171.85.15)] [Connected to r2u.stat.ill                                                                                                    Hit:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Waiting for headers] [Connected to r2u.stat.illinois.edu (192.17.190.167)] [Waiting for headers]                                                                                                    Hit:

In [2]:
from pyspark.sql import SparkSession
import pandas as pd

# Initialize Spark
spark = SparkSession.builder.appName("GPR_Model").getOrCreate()

# Load data into Spark DataFrame
spark_df = spark.read.csv("cleaned_data.csv", header=True, inferSchema=True)

# Convert Spark DataFrame to Pandas for TensorFlow training
df = spark_df.toPandas()
df.head()

Unnamed: 0,Year,Month,GPR,GPRT,GPRA,GPRC_CAN,GPRC_USA,WTI
0,1986,1,135.36,137.67,166.02,0.56,3.38,22.93
1,1986,2,98.75,84.02,114.82,0.29,2.3,15.46
2,1986,3,98.68,85.13,117.56,0.24,2.43,12.61
3,1986,4,148.31,142.49,182.87,0.63,4.08,12.84
4,1986,5,117.39,126.68,114.2,0.57,2.92,15.38


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# df = pd.get_dummies(df)

# Define Features and Target
X = df.drop(columns=['WTI'])
y = df["WTI"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
# Define the number of input features
number_input_features = X_train_scaled.shape[1]

# Define the model
nn = tf.keras.models.Sequential()

# Add layers to the model
nn.add(tf.keras.layers.Dense(units=80, activation="relu", input_dim=X_train_scaled.shape[1]))
nn.add(tf.keras.layers.Dense(units=50, activation="relu"))
nn.add(tf.keras.layers.Dense(units=1, activation="linear"))

# Check the structure of the model
nn.summary()

# Complile the model
nn.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [5]:
from tensorflow.keras.callbacks import EarlyStopping

# # Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# # Train the model
history = nn.fit(X_train_scaled, y_train,
                 validation_data=(X_test_scaled, y_test),
                 epochs=100, batch_size=16,
                 verbose=1)

Epoch 1/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - loss: 3141.0969 - mean_absolute_error: 47.9162 - val_loss: 2887.7397 - val_mean_absolute_error: 45.2864
Epoch 2/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 3219.5432 - mean_absolute_error: 47.9031 - val_loss: 2666.3049 - val_mean_absolute_error: 42.8235
Epoch 3/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 2808.6482 - mean_absolute_error: 43.8024 - val_loss: 2271.0979 - val_mean_absolute_error: 38.1751
Epoch 4/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 2324.8127 - mean_absolute_error: 38.1050 - val_loss: 1703.6127 - val_mean_absolute_error: 31.4329
Epoch 5/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - loss: 1971.7231 - mean_absolute_error: 33.5996 - val_loss: 1107.6632 - val_mean_absolute_error: 24.5087
Epoch 6/100
[1m24/24[0m [32m━━━━━━━━━

In [6]:
from sklearn.metrics import r2_score

# Evaluate on test data
y_pred = nn.predict(X_test_scaled)

# Compute R-squared
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
loss, mae = nn.evaluate(X_test_scaled, y_test)
print(f"""
Model Performance
-----------------
R² Score: {r2:.2f}
Test Loss: {loss:.4f}
Test MAE: {mae:.4f}
      """)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 144.1497 - mean_absolute_error: 7.6649

Model Performance
-----------------
R² Score: 0.82 
Test Loss: 151.5137 
Test MAE: 7.9284
      


In [7]:
import csv

# Save model results in CSV
model_results = [
    ["Experiment", "Hidden Layer 1", "Hidden Layer 2", "Batch Size", "Epochs", "Validation Loss", "Test MAE", "R² Score"],
    [1, 80, 50, 16, len(history.history['loss']), min(history.history['val_loss']), mae, r2]
]

# Write to CSV
with open("model_optimization_log.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows(model_results)

print("\n🔍 Model results saved in 'model_optimization_log.csv'")


🔍 Model results saved in 'model_optimization_log.csv'
