In [1]:
import os
import sys
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
spark_version = 'spark-3.5.5'
os.environ['SPARK_VERSION']=spark_version

# Install Spark
!apt-get update
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"
os.environ["PYSPARK_PYTHON"] = sys.executable

sys.path.append(os.path.join(os.environ["SPARK_HOME"], "python"))
sys.path.append(os.path.join(os.environ["SPARK_HOME"], "python", "lib", "py4j-0.10.9.7-src.zip"))  # Adjust the version if needed

# Start a SparkSession
import findspark
findspark.init()

Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,321 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,533 kB]
Get:13 http://archive.ubuntu.com/ubuntu 

In [2]:
from pyspark.sql import SparkSession
import pandas as pd

# Create (or reuse) the Spark session used by this notebook
session_builder = SparkSession.builder.appName("GPR_Model")
spark = session_builder.getOrCreate()

# Read the cleaned dataset with Spark: the first row is treated as the header
# and column types are inferred from the data
csv_path = "cleaned_wcs_data.csv"
spark_df = spark.read.csv(csv_path, header=True, inferSchema=True)

# Collect the Spark DataFrame into pandas for the TensorFlow training below,
# then preview the first rows
df = spark_df.toPandas()
df.head()

Unnamed: 0,Year,Month,GPR,GPRT,GPRA,GPRC_CAN,GPRC_USA,WCS
0,2005,1,100.89,78.2,128.53,0.17,2.46,29.42
1,2005,2,108.98,104.77,108.8,0.2,2.62,28.44
2,2005,3,90.41,87.23,87.88,0.2,2.12,36.5
3,2005,4,87.03,72.53,102.85,0.16,1.94,31.02
4,2005,5,98.21,89.93,104.91,0.16,2.28,27.46


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# df = pd.get_dummies(df)

# Separate the target column from the predictor columns
target_column = 'WCS'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training rows only and
# the same fitted transform is then applied to the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
# Define the number of input features (columns of the scaled training matrix)
number_input_features = X_train_scaled.shape[1]

# Define the model
nn = tf.keras.models.Sequential()

# Declare the input shape with an explicit Input object rather than passing
# `input_dim` to the first Dense layer, which makes Keras emit a warning.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# Add layers to the model: two ReLU hidden layers and a single linear output
# unit for the regression target
nn.add(tf.keras.layers.Dense(units=80, activation="relu"))
nn.add(tf.keras.layers.Dense(units=50, activation="relu"))
nn.add(tf.keras.layers.Dense(units=1, activation="linear"))

# Check the structure of the model
nn.summary()

# Compile the model: MSE is the training loss, MAE is tracked as a metric
nn.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [5]:
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping to prevent overfitting: stop once val_loss has not improved
# for 10 consecutive epochs and roll back to the best weights seen.
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# Train the model. The callback must be passed to fit() to have any effect;
# without it all 100 epochs run and the final (not the best) weights are kept.
# NOTE(review): validation_data is the test split, so early stopping picks
# weights using the same rows that are later reported as test metrics -
# consider carving a separate validation split out of the training data.
history = nn.fit(X_train_scaled, y_train,
                 validation_data=(X_test_scaled, y_test),
                 epochs=100, batch_size=16,
                 callbacks=[early_stopping],
                 verbose=1)

Epoch 1/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - loss: 3322.5764 - mean_absolute_error: 54.0684 - val_loss: 3663.2585 - val_mean_absolute_error: 57.3172
Epoch 2/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 2973.9478 - mean_absolute_error: 50.6940 - val_loss: 3565.9873 - val_mean_absolute_error: 56.4776
Epoch 3/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 2895.1460 - mean_absolute_error: 50.2064 - val_loss: 3437.9856 - val_mean_absolute_error: 55.3561
Epoch 4/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 2997.8025 - mean_absolute_error: 50.4551 - val_loss: 3265.5999 - val_mean_absolute_error: 53.7991
Epoch 5/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 2628.0166 - mean_absolute_error: 47.6785 - val_loss: 3030.0864 - val_mean_absolute_error: 51.6551
Epoch 6/100
[1m12/12[0m [32m━━━━━━━━━━━━━

In [6]:
from sklearn.metrics import r2_score

# Predict the target for the held-out test rows
y_pred = nn.predict(X_test_scaled)

# Coefficient of determination of the predictions against the true values
r2 = r2_score(y_test, y_pred)

# evaluate() returns the compiled loss followed by the compiled metric (MAE)
test_metrics = nn.evaluate(X_test_scaled, y_test)
loss, mae = test_metrics

# Assemble and show the summary report
performance_report = f"""
Model Performance
-----------------
R² Score: {r2:.2f}
Test Loss: {loss:.4f}
Test MAE: {mae:.4f}
      """
print(performance_report)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 381.0224 - mean_absolute_error: 13.9035

Model Performance
-----------------
R² Score: 0.04
Test Loss: 363.7554
Test MAE: 13.6851
      


In [8]:
import csv

# Save model results in CSV
# Single source of truth for the output path, so the confirmation message
# below always names the file that was actually written.
results_log_path = "wcs_model_optimization_log.csv"

# First row is the header; second row records this run: layer sizes, batch
# size, number of epochs actually trained, best validation loss, and the test
# MAE / R² computed in the evaluation cell.
model_results = [
    ["Experiment", "Hidden Layer 1", "Hidden Layer 2", "Batch Size", "Epochs", "Validation Loss", "Test MAE", "R² Score"],
    [1, 80, 50, 16, len(history.history['loss']), min(history.history['val_loss']), mae, r2]
]

# Write to CSV (mode "w" overwrites any previous log). The explicit UTF-8
# encoding keeps the non-ASCII "R²" header from depending on the platform's
# default encoding.
with open(results_log_path, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(model_results)

print(f"\n🔍 Model results saved in '{results_log_path}'")


🔍 Model results saved in 'model_optimization_log.csv'
