WTI price percentage change vs GDP Model

In [2]:
import pandas as pd
from pathlib import Path

# Define file paths for uploaded files
gdp_file = "Cleaned Data Csv/Canada_USA_GDP_2014-2024.csv"
wti_file = "Cleaned Data Csv/WTI_data_year_only_2014_2024.csv"


def _load_and_normalize(csv_path):
    """Read a CSV, drop 'Unnamed' index columns and normalize the headers.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        An independent copy whose column names are stripped of surrounding
        whitespace and lower-cased.
    """
    frame = pd.read_csv(csv_path)
    keep = ~frame.columns.str.contains("Unnamed", case=False, na=False)
    # .copy() so later column assignments never write into a slice of the raw frame
    frame = frame.loc[:, keep].copy()
    frame.columns = frame.columns.str.strip().str.lower()
    return frame


# Load datasets with consistent column names
gdp_df = _load_and_normalize(gdp_file)
wti_df = _load_and_normalize(wti_file)

# The merge below joins on 'year'; fail early with a clear message instead of
# silently skipping the conversion and failing later inside merge()
for dataset_label, frame in (("GDP", gdp_df), ("WTI", wti_df)):
    if "year" not in frame.columns:
        raise KeyError(f"'year' column not found in the {dataset_label} dataset")

# Unparseable years become <NA> (and are dropped further down)
gdp_df["year"] = pd.to_numeric(gdp_df["year"], errors="coerce").astype("Int64")
wti_df["year"] = pd.to_numeric(wti_df["year"], errors="coerce").astype("Int64")

# Clean WTI percent change column (remove '%' and convert to float).
# The .str accessor only exists on text columns, so skip the stripping step
# when pandas has already parsed the column as numbers.
if "percentchange" in wti_df.columns:
    percent_change = wti_df["percentchange"]
    if not pd.api.types.is_numeric_dtype(percent_change):
        # '%' is a literal character, so no regex is needed
        percent_change = percent_change.str.replace("%", "", regex=False)
    wti_df["percentchange"] = percent_change.astype(float)

# Rename columns for clarity (keys that are absent are ignored by rename)
gdp_df = gdp_df.rename(
    columns={"gdp per capita (constant 2015 us$)": "gdp_per_capita"}
)
wti_df = wti_df.rename(
    columns={"price": "wti_price", "percentchange": "wti_percent_change"}
)

# Remove duplicates, then drop any rows that still contain missing values
gdp_df = gdp_df.drop_duplicates().dropna()
wti_df = wti_df.drop_duplicates().dropna()

# Merge cleaned datasets on the year column (only years present in both)
cleaned_merged_df = gdp_df.merge(wti_df, on="year", how="inner")

# Save cleaned dataset to a CSV file for download
cleaned_file_path = "Cleaned Data Csv/Cleaned_GDP_WTI_Data.csv"
cleaned_merged_df.to_csv(cleaned_file_path, index=False)

# Show where the cleaned file was written
cleaned_file_path

'Cleaned Data Csv/Cleaned_GDP_WTI_Data.csv'

In [3]:
# Display the merged GDP/WTI frame produced by the cleaning cell
cleaned_merged_df

Unnamed: 0,country name,country code,year,gdp_per_capita,wti_price,wti_percent_change
0,Canada,CAN,2014,43643.24,97.49,-0.94
1,United States,USA,2014,55817.56,97.49,-0.94
2,Canada,CAN,2015,43594.19,48.24,-9.44
3,United States,USA,2015,57040.21,48.24,-9.44
4,Canada,CAN,2016,43551.34,33.62,-9.23
5,United States,USA,2016,57658.67,33.62,-9.23
6,Canada,CAN,2017,44339.39,52.81,-1.69
7,United States,USA,2017,58703.14,52.81,-1.69
8,Canada,CAN,2018,44907.34,64.73,7.13
9,United States,USA,2018,60127.21,64.73,7.13


Part 2: Trained and initialized the model using Python

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Work on a copy so the merged frame itself is never modified
df = cleaned_merged_df.copy()

# Target is the WTI percent change; every remaining column is a feature
target_column = 'wti_percent_change'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of each partition
for split_label, split_features, split_target in (
    ("Training set:", X_train, y_train),
    ("Testing set:", X_test, y_test),
):
    print(split_label, split_features.shape, split_target.shape)

Training set: (16, 5) (16,)
Testing set: (4, 5) (4,)


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Start from a fresh copy of the merged data
df = cleaned_merged_df.copy()

# Text columns that have to be turned into indicator variables.
# NOTE(review): 'country name' and 'country code' look like two labels for the
# same entity in the displayed data, so their dummies are probably duplicates.
categorical_cols = ['country name', 'country code']

# One-hot encode, dropping the first level of each column
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Target (y) is the WTI percent change; features (X) are all other columns
y = df['wti_percent_change']
X = df.drop(columns=['wti_percent_change'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the shape of transformed datasets
print("Scaled Training set shape:", X_train_scaled.shape)
print("Scaled Testing set shape:", X_test_scaled.shape)


Scaled Training set shape: (16, 5)
Scaled Testing set shape: (4, 5)


Train and Evaluate the Model

In [6]:
import tensorflow as tf

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, and therefore the reported metrics, are repeatable
tf.keras.utils.set_random_seed(42)

# Get number of input features (columns in X_train)
number_input_features = X_train_scaled.shape[1]

# Define the number of nodes for each hidden layer
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 50

# Initialize the neural network
nn = tf.keras.models.Sequential()

# Declare the input shape with an explicit Input object; passing input_dim to
# a Dense layer is deprecated in recent Keras releases and emits a warning
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer for regression (WTI price change prediction)
nn.add(tf.keras.layers.Dense(units=1, activation="linear"))  # Linear for regression

# Check model architecture
nn.summary()

# Compile the model: MSE is the training loss, MAE is reported alongside it
nn.compile(loss="mean_squared_error", optimizer="adam", metrics=["mae"])

# Train the model; the last 20% of the training rows are used for validation
history = nn.fit(X_train_scaled, y_train, epochs=100, batch_size=16, validation_split=0.2, verbose=1)

# Evaluate the model on the held-out test rows
loss, mae = nn.evaluate(X_test_scaled, y_test)
print(f"Test Loss: {loss:.4f}, Test MAE: {mae:.4f}")


2025-03-01 14:01:40.070913: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - loss: 126.8798 - mae: 10.0678 - val_loss: 101.6808 - val_mae: 7.4902
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step - loss: 126.1170 - mae: 10.0377 - val_loss: 101.8041 - val_mae: 7.4781
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step - loss: 125.3561 - mae: 10.0072 - val_loss: 101.9229 - val_mae: 7.4664
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 198ms/step - loss: 124.6164 - mae: 9.9773 - val_loss: 102.1093 - val_mae: 7.4571
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step - loss: 123.9097 - mae: 9.9488 - val_loss: 102.3162 - val_mae: 7.4499
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step - loss: 123.2274 - mae: 9.9216 - val_loss: 102.5242 - val_mae: 7.4441
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

In [7]:
# The model is compiled with metrics=["mae"], so the second value returned by
# evaluate() is the mean absolute error, not an accuracy
model_loss, model_mae = nn.evaluate(X_test_scaled, y_test, verbose=2)
# Old variable name kept as an alias in case another cell still reads it
model_accuracy = model_mae
print(f"Loss: {model_loss}, MAE: {model_mae}")

1/1 - 0s - 44ms/step - loss: 131.6434 - mae: 11.3536
Loss: 131.64340209960938, Accuracy: 11.353561401367188


Part 3: Retrieving the Model using Spark and model optimization

In [8]:
import os
# Find the latest version of spark 3.x  from https://downloads.apache.org/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.5.3'
spark_version = 'spark-3.5.4'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

zsh:1: command not found: apt-get
zsh:1: command not found: apt-get
zsh:1: command not found: wget
tar: Error opening archive: Failed to open 'spark-3.5.4-bin-hadoop3.tgz'


Exception: Unable to find py4j in /content/spark-3.5.4-bin-hadoop3/python, your SPARK_HOME may not be configured correctly

In [None]:
from pyspark.sql import SparkSession

# Initialize Spark (reuses an existing session if one is already running)
spark = SparkSession.builder.appName("WTI_Model").getOrCreate()

# Load data into Spark DataFrame.
# Read the file that the cleaning step writes (Cleaned_GDP_WTI_Data.csv); the
# notebook never creates a file named cleaned_merged_df.csv.
spark_df = spark.read.csv("Cleaned Data Csv/Cleaned_GDP_WTI_Data.csv", header=True, inferSchema=True)

# Convert Spark DataFrame to Pandas for TensorFlow training
df = spark_df.toPandas()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Turn the country identifier columns into indicator variables,
# dropping the first level of each
encoded_columns = ['country name', 'country code']
df = pd.get_dummies(df, columns=encoded_columns, drop_first=True)

# Target is the WTI percent change; the remaining columns are the features
y = df['wti_percent_change']
X = df.drop(columns=['wti_percent_change'])

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both partitions
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

# Seed the Python, NumPy and TensorFlow random generators for repeatable runs
tf.keras.utils.set_random_seed(42)

# Define the number of input features
number_input_features = X_train_scaled.shape[1]

# Initialize the model. The input shape is declared with an explicit Input
# object; passing input_dim to a Dense layer is deprecated in recent Keras.
nn = tf.keras.models.Sequential([
    tf.keras.Input(shape=(number_input_features,)),
    tf.keras.layers.Dense(units=80, activation="relu"),
    tf.keras.layers.Dense(units=50, activation="relu"),
    tf.keras.layers.Dense(units=1, activation="linear")  # Linear for regression
])

# Compile the model
nn.compile(loss="mean_squared_error", optimizer="adam", metrics=["mae"])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# Train the model and log results.
# Validation rows are carved out of the training data rather than taken from
# the test set: early stopping selects the weights with the best val_loss, so
# validating on the test rows would bias the test metrics reported afterwards.
history = nn.fit(X_train_scaled, y_train,
                 validation_split=0.2,
                 epochs=100, batch_size=16,
                 callbacks=[early_stopping], verbose=1)

In [None]:
from sklearn.metrics import r2_score

# Predict the test rows and score the predictions with R-squared
y_pred = nn.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred)

# Loss (MSE) and MAE as computed by Keras on the same rows
loss, mae = nn.evaluate(X_test_scaled, y_test)

# Assemble the report first, then print it in one call
performance_report = f"\n Model Performance \nR² Score: {r2:.2f} \nTest Loss: {loss:.4f} \nTest MAE: {mae:.4f}"
print(performance_report)

Part 4: Display the Model's Overall Performance

In [None]:
import csv

# Column headings for the log file
results_header = ["Experiment", "Hidden Layer 1", "Hidden Layer 2", "Batch Size", "Epochs", "Validation Loss", "Test MAE", "R² Score"]

# One row for this run: epochs actually trained, best validation loss seen,
# and the test metrics computed in the evaluation cell
epochs_run = len(history.history['loss'])
best_val_loss = min(history.history['val_loss'])
results_row = [1, 80, 50, 16, epochs_run, best_val_loss, mae, r2]

model_results = [results_header, results_row]

# Write header and row to CSV (the file is overwritten on every run)
with open("model_optimization_log.csv", "w", newline="") as file:
    writer = csv.writer(file)
    for record in model_results:
        writer.writerow(record)

print("\n🔍 Model results saved in 'model_optimization_log.csv'")