In [2]:
import pandas as pd
from pathlib import Path


#Define file paths for the uploaded datasets
gdp_file = "Cleaned Data Csv/Canada_USA_GDP_2014-2024.csv"
wcs_file = "Cleaned Data Csv/WCS_Oil_Prices_Year_Only.csv"

# Load datasets
gdp_df = pd.read_csv(gdp_file)
wcs_df = pd.read_csv(wcs_file)

# Remove unnecessary columns (such as unnamed index columns if present)
for df in [gdp_df, wcs_df]:
    df.drop(columns=[col for col in df.columns if "Unnamed" in col], inplace=True, errors='ignore')

# Standardize column names by stripping spaces and converting to lowercase
for df in [gdp_df, wcs_df]:
    df.columns = df.columns.str.strip().str.lower()

# Ensure 'year' column exists and convert to integer
for df in [gdp_df, wcs_df]:
    if "year" in df.columns:
        df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")

# Clean WCS percent change column (remove '%' and convert to float if it exists)
if "percentchange" in wcs_df.columns:
    wcs_df["percentchange"] = wcs_df["percentchange"].str.replace("%", "", regex=True).astype(float)

# Rename columns for clarity (if they exist)
gdp_df.rename(columns={"gdp per capita (constant 2015 us$)": "gdp_per_capita"}, inplace=True)
wcs_df.rename(columns={"price": "wcs_price", "percentchange": "wcs_percent_change"}, inplace=True)

# Remove duplicates
for df in [gdp_df, wcs_df]:
    df.drop_duplicates(inplace=True)

# Drop any remaining NaN values
for df in [gdp_df, wcs_df]:
    df.dropna(inplace=True)

# Merge both datasets on the year column
merged_df = gdp_df.merge(wcs_df, on="year", how="inner")

# Save merged dataset to a CSV file for download
merged_file_path = "Cleaned Data Csv/Merged_GDP_WCS_Data.csv"
merged_df.to_csv(merged_file_path, index=False)

# Provide download link
merged_file_path


'Cleaned Data Csv/Merged_GDP_WCS_Data.csv'

In [3]:
merged_df

Unnamed: 0,country name,country code,year,gdp_per_capita,type_,value
0,Canada,CAN,2014,43643.24,WCS,65.69
1,United States,USA,2014,55817.56,WCS,65.69
2,Canada,CAN,2015,43594.19,WCS,30.43
3,United States,USA,2015,57040.21,WCS,30.43
4,Canada,CAN,2016,43551.34,WCS,17.88
5,United States,USA,2016,57658.67,WCS,17.88
6,Canada,CAN,2017,44339.39,WCS,37.19
7,United States,USA,2017,58703.14,WCS,37.19
8,Canada,CAN,2018,44907.34,WCS,42.53
9,United States,USA,2018,60127.21,WCS,42.53


In [4]:
# Rename "value" to "wcs_percent_change"
merged_df.rename(columns={"value": "wcs_percent_change"}, inplace=True)

# Display the updated column names
merged_df.head()


Unnamed: 0,country name,country code,year,gdp_per_capita,type_,wcs_percent_change
0,Canada,CAN,2014,43643.24,WCS,65.69
1,United States,USA,2014,55817.56,WCS,65.69
2,Canada,CAN,2015,43594.19,WCS,30.43
3,United States,USA,2015,57040.21,WCS,30.43
4,Canada,CAN,2016,43551.34,WCS,17.88


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
df = merged_df.copy()
X = df.drop(columns=['wcs_percent_change'])  # Features
y = df['wcs_percent_change']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set:", X_train.shape, y_train.shape)
print("Testing set:", X_test.shape, y_test.shape)


Training set: (16, 5) (16,)
Testing set: (4, 5) (4,)


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assuming merged_df is the cleaned DataFrame
df = merged_df.copy()

# Identify categorical columns
categorical_cols = ['country name', 'country code', 'type_']

# One-Hot Encode categorical columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Define Features (X) and Target (y)
X = df.drop(columns=['wcs_percent_change'])  # Features
y = df['wcs_percent_change']  # Target variable

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()

# Fit the scaler on training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test set
X_test_scaled = scaler.transform(X_test)

# Display the shape of transformed datasets
print("Scaled Training set shape:", X_train_scaled.shape)
print("Scaled Testing set shape:", X_test_scaled.shape)


Scaled Training set shape: (16, 4)
Scaled Testing set shape: (4, 4)


Train and Evaluate the Model

In [8]:
import tensorflow as tf

# Get number of input features (columns in X_train)
number_input_features = X_train_scaled.shape[1]  # Use .shape[1] instead of len(X_train[0])

# Define the number of nodes for each hidden layer
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 50

# Initialize the neural network
nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))  # Use "sigmoid" for binary classification, "linear" for regression

# Check model architecture
nn.summary()


2025-02-28 19:45:43.755920: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [9]:
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [10]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: -7.1679
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 253ms/step - accuracy: 0.0000e+00 - loss: -9.8347
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.0000e+00 - loss: -12.5413
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 214ms/step - accuracy: 0.0000e+00 - loss: -15.2804
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.0000e+00 - loss: -18.0662
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.0000e+00 - loss: -20.8961
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - accuracy: 0.0000e+00 - loss: -23.7391
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.0000e+00 - loss: -26.6089
Epoch 9/10

In [11]:
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1/1 - 0s - 180ms/step - accuracy: 0.0000e+00 - loss: -1.4382e+03
Loss: -1438.2081298828125, Accuracy: 0.0
