WCS Price percentage change vs GDP Model 

In [4]:
import pandas as pd
from pathlib import Path


#Define file paths for the uploaded datasets
gdp_file = "Cleaned Data Csv/Canada_USA_GDP_2014-2024.csv"
wcs_file = "Cleaned Data Csv/WCS_Oil_Prices_Year_Only.csv"

# Load datasets
gdp_df = pd.read_csv(gdp_file)
wcs_df = pd.read_csv(wcs_file)

# Remove unnecessary columns (such as unnamed index columns if present)
for df in [gdp_df, wcs_df]:
    df.drop(columns=[col for col in df.columns if "Unnamed" in col], inplace=True, errors='ignore')

# Standardize column names by stripping spaces and converting to lowercase
for df in [gdp_df, wcs_df]:
    df.columns = df.columns.str.strip().str.lower()

# Ensure 'year' column exists and convert to integer
for df in [gdp_df, wcs_df]:
    if "year" in df.columns:
        df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")

# Clean WCS percent change column (remove '%' and convert to float if it exists)
if "percentchange" in wcs_df.columns:
    wcs_df["percentchange"] = wcs_df["percentchange"].str.replace("%", "", regex=True).astype(float)

# Rename columns for clarity (if they exist)
gdp_df.rename(columns={"gdp per capita (constant 2015 us$)": "gdp_per_capita"}, inplace=True)
wcs_df.rename(columns={"price": "wcs_price", "percentchange": "wcs_percent_change"}, inplace=True)

# Remove duplicates
for df in [gdp_df, wcs_df]:
    df.drop_duplicates(inplace=True)

# Drop any remaining NaN values
for df in [gdp_df, wcs_df]:
    df.dropna(inplace=True)

# Merge both datasets on the year column
merged_df = gdp_df.merge(wcs_df, on="year", how="inner")

# Save merged dataset to a CSV file for download
merged_file_path = "Cleaned Data Csv/Merged_GDP_WCS_Data.csv"
merged_df.to_csv(merged_file_path, index=False)

# Provide download link
merged_file_path


'Cleaned Data Csv/Merged_GDP_WCS_Data.csv'

In [5]:
merged_df

Unnamed: 0,country name,country code,year,gdp_per_capita,type_,value
0,Canada,CAN,2014,43643.24,WCS,65.69
1,United States,USA,2014,55817.56,WCS,65.69
2,Canada,CAN,2015,43594.19,WCS,30.43
3,United States,USA,2015,57040.21,WCS,30.43
4,Canada,CAN,2016,43551.34,WCS,17.88
5,United States,USA,2016,57658.67,WCS,17.88
6,Canada,CAN,2017,44339.39,WCS,37.19
7,United States,USA,2017,58703.14,WCS,37.19
8,Canada,CAN,2018,44907.34,WCS,42.53
9,United States,USA,2018,60127.21,WCS,42.53


In [6]:
# Rename "value" to "wcs_percent_change"
merged_df.rename(columns={"value": "wcs_percent_change"}, inplace=True)

# Display the updated column names
merged_df.head()


Unnamed: 0,country name,country code,year,gdp_per_capita,type_,wcs_percent_change
0,Canada,CAN,2014,43643.24,WCS,65.69
1,United States,USA,2014,55817.56,WCS,65.69
2,Canada,CAN,2015,43594.19,WCS,30.43
3,United States,USA,2015,57040.21,WCS,30.43
4,Canada,CAN,2016,43551.34,WCS,17.88


Part 2: Trained and intiialized the model using Python

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
df = merged_df.copy()
X = df.drop(columns=['wcs_percent_change'])  # Features
y = df['wcs_percent_change']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set:", X_train.shape, y_train.shape)
print("Testing set:", X_test.shape, y_test.shape)


Training set: (16, 5) (16,)
Testing set: (4, 5) (4,)


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assuming merged_df is the cleaned DataFrame
df = merged_df.copy()

# Identify categorical columns
categorical_cols = ['country name', 'country code', 'type_']

# One-Hot Encode categorical columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Define Features (X) and Target (y)
X = df.drop(columns=['wcs_percent_change'])  # Features
y = df['wcs_percent_change']  # Target variable

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()

# Fit the scaler on training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test set
X_test_scaled = scaler.transform(X_test)

# Display the shape of transformed datasets
print("Scaled Training set shape:", X_train_scaled.shape)
print("Scaled Testing set shape:", X_test_scaled.shape)


Scaled Training set shape: (16, 4)
Scaled Testing set shape: (4, 4)


 Evaluate the Model

In [9]:
import tensorflow as tf

# Get number of input features (columns in X_train)
number_input_features = X_train_scaled.shape[1]  # Use .shape[1] instead of len(X_train[0])

# Define the number of nodes for each hidden layer
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 50

# Initialize the neural network
nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))  # Use "sigmoid" for binary classification, "linear" for regression

# Check model architecture
nn.summary()


2025-03-01 13:56:22.861682: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [10]:
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [11]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=10)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: 1.6801
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.0000e+00 - loss: -0.8122
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 221ms/step - accuracy: 0.0000e+00 - loss: -3.2469
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - accuracy: 0.0000e+00 - loss: -5.6664
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step - accuracy: 0.0000e+00 - loss: -8.0409
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - accuracy: 0.0000e+00 - loss: -10.4021
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.0000e+00 - loss: -12.7876
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - accuracy: 0.0000e+00 - loss: -15.1971
Epoch 9/10
[1m1/1[0m

In [12]:
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1/1 - 0s - 230ms/step - accuracy: 0.0000e+00 - loss: -1.9824e+01
Loss: -19.82416534423828, Accuracy: 0.0


Part 3: Retriveing the Model using Spark and model optimization 

In [None]:
# Update package lists and install Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null

# Download Spark 3.5.1 with Hadoop 3
!wget -q https://downloads.apache.org/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz

# Extract Spark
!tar -xvf spark-3.5.1-bin-hadoop3.tgz
import os

# Set environment variables for Spark
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.1-bin-hadoop3"
!pip install -q findspark
import findspark

# Initialize findspark with the correct SPARK_HOME path
findspark.init("/content/spark-3.5.1-bin-hadoop3")
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("WSC_Model").getOrCreate()

# Check Spark version
print(f"Spark Version: {spark.version}")

zsh:1: command not found: apt-get
zsh:1: command not found: apt-get
zsh:1: command not found: wget
tar: Error opening archive: Failed to open 'spark-3.5.1-bin-hadoop3.tgz'


Exception: Unable to find py4j in /content/spark-3.5.1-bin-hadoop3/python, your SPARK_HOME may not be configured correctly

In [None]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("WSC_Model").getOrCreate()

# Check Spark version
print(f"Spark Version: {spark.version}")

In [None]:
from pyspark.sql import SparkSession

# Initialize Spark
spark = SparkSession.builder.appName("WSC_Model").getOrCreate()

# Load data into Spark DataFrame
spark_df = spark.read.csv("merged_df.csv", header=True, inferSchema=True)

# Convert Spark DataFrame to Pandas for TensorFlow training
df = spark_df.toPandas()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# One-hot encode categorical columns
df = pd.get_dummies(df, columns=['country name', 'country code', 'type_'], drop_first=True)

# Define Features and Target
X = df.drop(columns=['value'])  # Predicting 'value' (WSC price)
y = df['value']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

# Define the number of input features
number_input_features = X_train_scaled.shape[1]

# Initialize the model
nn = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=80, input_dim=number_input_features, activation="relu"),
    tf.keras.layers.Dense(units=50, activation="relu"),
    tf.keras.layers.Dense(units=1, activation="linear")  # Linear for regression
])

# Compile the model
nn.compile(loss="mean_squared_error", optimizer="adam", metrics=["mae"])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# Train the model and log results
history = nn.fit(X_train_scaled, y_train, 
                 validation_data=(X_test_scaled, y_test), 
                 epochs=100, batch_size=16, 
                 callbacks=[early_stopping], verbose=1)

In [None]:
from sklearn.metrics import r2_score

# Evaluate on test data
y_pred = nn.predict(X_test_scaled)

# Compute R-squared
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
loss, mae = nn.evaluate(X_test_scaled, y_test)
print(f"\n Model Performance \nR² Score: {r2:.2f} \nTest Loss: {loss:.4f} \nTest MAE: {mae:.4f}")


Part 4: Display the Models Overall Performance

In [None]:
import csv

# Save model results in CSV
model_results = [
    ["Experiment", "Hidden Layer 1", "Hidden Layer 2", "Batch Size", "Epochs", "Validation Loss", "Test MAE", "R² Score"],
    [1, 80, 50, 16, len(history.history['loss']), min(history.history['val_loss']), mae, r2]
]

# Write to CSV
with open("model_optimization_log.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows(model_results)

print("\n🔍 Model results saved in 'model_optimization_log.csv'")
