In [2]:
import pandas as pd
from pathlib import Path

GDP Data Cleaning

In [3]:
file_path = Path("Resources/gdp_data.csv") 


In [4]:
# Reload the dataset with correct header handling
df = pd.read_csv(file_path, skiprows=3)

# Drop unnecessary columns (e.g., 'Unnamed' columns if present)
df = df.dropna(axis=1, how='all')

# Rename columns for clarity
df.rename(columns={df.columns[0]: "Country Name", df.columns[1]: "Country Code",
                   df.columns[2]: "Indicator Name", df.columns[3]: "Indicator Code"}, inplace=True)

# Reshape the data to have 'Year' as a column instead of multiple year-based columns
df = df.melt(id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"], 
             var_name="Year", value_name="GDP per Capita (Constant 2015 US$)")

# Convert year and GDP per capita columns to numeric types
df["Year"] = pd.to_numeric(df["Year"], errors="coerce")
df["GDP per Capita (Constant 2015 US$)"] = pd.to_numeric(df["GDP per Capita (Constant 2015 US$)"], errors="coerce")

# Drop rows with missing values in key columns
df_cleaned = df.dropna(subset=["Country Name", "Year", "GDP per Capita (Constant 2015 US$)"])

# Display cleaned dataset
df_cleaned.head()




Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,GDP per Capita (Constant 2015 US$)
1,Africa Eastern and Southern,AFE,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,1960,1172.316285
3,Africa Western and Central,AFW,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,1960,1110.513849
9,Argentina,ARG,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,1960,7397.109655
13,Australia,AUS,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,1960,19905.313467
14,Austria,AUT,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,1960,11894.744085


In [5]:
# Remove unnecessary columns: "Indicator Name" and "Indicator Code"
df_cleaned = df_cleaned.drop(columns=["Indicator Name", "Indicator Code"], errors="ignore")

# Reset index to remove any unwanted numbering
df_cleaned.reset_index(drop=True, inplace=True)

# Display the updated dataset
df_cleaned.head()

Unnamed: 0,Country Name,Country Code,Year,GDP per Capita (Constant 2015 US$)
0,Africa Eastern and Southern,AFE,1960,1172.316285
1,Africa Western and Central,AFW,1960,1110.513849
2,Argentina,ARG,1960,7397.109655
3,Australia,AUS,1960,19905.313467
4,Austria,AUT,1960,11894.744085


In [6]:
# Filter the dataset to include only Canada and the USA
df_filtered = df_cleaned[df_cleaned["Country Name"].isin(["Canada", "United States"])]

# Display the filtered dataset
df_filtered.head()


Unnamed: 0,Country Name,Country Code,Year,GDP per Capita (Constant 2015 US$)
18,Canada,CAN,1960,15432.471783
138,United States,USA,1960,18991.544603
162,Canada,CAN,1961,15605.523223
289,United States,USA,1961,19108.935365
313,Canada,CAN,1962,16455.753516


In [7]:
# Filter the dataset for the years 2014 to 2024
GDP = df_filtered[(df_filtered["Year"] >= 2014) & (df_filtered["Year"] <= 2024)]
GDP.reset_index(drop=True, inplace=True)  # Removes the first column

# Display the first few rows of the filtered dataset
GDP.head()



Unnamed: 0,Country Name,Country Code,Year,GDP per Capita (Constant 2015 US$)
0,Canada,CAN,2014,43643.235647
1,United States,USA,2014,55817.563247
2,Canada,CAN,2015,43594.194105
3,United States,USA,2015,57040.208214
4,Canada,CAN,2016,43551.342602


In [8]:
# Format the "GDP per Capita (Constant 2015 US$)" column to two decimal places
GDP["GDP per Capita (Constant 2015 US$)"] = GDP["GDP per Capita (Constant 2015 US$)"].round(2)

# Display the updated dataset
GDP.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GDP["GDP per Capita (Constant 2015 US$)"] = GDP["GDP per Capita (Constant 2015 US$)"].round(2)


Unnamed: 0,Country Name,Country Code,Year,GDP per Capita (Constant 2015 US$)
0,Canada,CAN,2014,43643.24
1,United States,USA,2014,55817.56
2,Canada,CAN,2015,43594.19
3,United States,USA,2015,57040.21
4,Canada,CAN,2016,43551.34


In [9]:
GDP.to_csv("Resources/Canada_USA_GDP_2014-2024.csv")

WTI Data Cleaning 

In [10]:
file_path = Path("Resources/WTI_data.csv") 

In [11]:

df_WTI = pd.read_csv(file_path)

# Convert the 'date' column into a proper datetime format
df_WTI['date'] = pd.to_datetime(df_WTI['date'])

# Fill any missing values in 'percentChange' and 'change' with 0
df_WTI[['percentChange', 'change']] = df_WTI[['percentChange', 'change']].fillna(0)

# Rename columns to lowercase and replace spaces with underscores for consistency
df_WTI.columns = df_WTI.columns.str.lower().str.replace(" ", "_")

# Keep only the data from 2014 to 2024
df_filtered = df_WTI[(df_WTI['date'] >= "2014-01-01") & (df_WTI['date'] <= "2024-12-31")]

# Select only the data for January of each year
WTI_df = df_filtered[df_filtered['date'].dt.month == 1]

# Keep only the 'date' and 'percentchange' columns
WTI_df = WTI_df[['date', 'price','percentchange']]

# Convert the percent change values into a readable percentage format (e.g., "2.34%")
WTI_df['percentchange'] = WTI_df['percentchange'].apply(lambda x: f"{x:.2f}%")

# Save the cleaned dataset to a new CSV file
WTI_df.to_csv("WTI_data_cleaned_2014_2024.csv", index=False)


WCS Cleaned Data

In [12]:
import requests
import pandas as pd

# Define the API endpoint (replace with the actual URL)
api_url = "https://api.economicdata.alberta.ca/api/data?code=1da37895-ed56-405e-81de-26231ffc6472"  # Replace with the actual API endpoint

# Fetch the data from the API
response = requests.get(api_url)

# Check if the request was successful
if response.status_code == 200:
    data = response.json()  # Convert the response to JSON
    
    # Convert JSON data into a Pandas DataFrame
    df = pd.DataFrame(data)

    # Rename columns if necessary (adjust based on API response)
    df.rename(columns={"date": "Date", "wcs_price": "WCS Price", "wti_price": "WTI Price"}, inplace=True)

    # Convert 'Date' to datetime format for consistency
    df['Date'] = pd.to_datetime(df['Date'])

    # Save the DataFrame to a CSV file
    df.to_csv("WCS_WTI_Oil_Prices.csv", index=False)

In [13]:
# Standardize column names (lowercase and replace spaces with underscores)
df.columns = df.columns.str.lower().str.replace(" ", "_")

# Convert 'date' column to datetime format
df['date'] = pd.to_datetime(df['date'])

# Remove unnecessary columns ('unit' if it's not needed)
df_cleaned = df.drop(columns=['unit'])

# Sort by date to ensure chronological order
df_cleaned = df_cleaned.sort_values(by='date')

# Save the cleaned data
cleaned_file_path = "Cleaned_WCS_WTI_Oil_Prices.csv"
df_cleaned.to_csv(cleaned_file_path, index=False)

# Provide the download link
cleaned_file_path


'Cleaned_WCS_WTI_Oil_Prices.csv'

In [14]:
 df_cleaned.columns

Index(['date', 'type_', 'value'], dtype='object')

In [15]:
# Filter the data to keep only rows where 'type_' is 'WCS'
df_wcs = df_cleaned[df_cleaned['type_'].str.upper() == 'WCS']  # Ensure case insensitivity

# Save the filtered data
wcs_file_path = "WCS_Oil_Prices.csv"
df_wcs.to_csv(wcs_file_path, index=False)

# Provide the download link
wcs_file_path

'WCS_Oil_Prices.csv'

In [16]:
# Filter data for the years 2014 to 2024
df_wcs_filtered = df_wcs[(df_wcs['date'] >= "2014-01-01") & (df_wcs['date'] <= "2024-12-31")]

# Keep only the first month (January) of each year
df_wcs_january = df_wcs_filtered[df_wcs_filtered['date'].dt.month == 1]

# Save the filtered data
wcs_january_file_path = "WCS_Oil_Prices_2014_2024.csv"
df_wcs_january.to_csv(wcs_january_file_path, index=False)

# Provide the download link
wcs_january_file_path


'WCS_Oil_Prices_2014_2024.csv'

Date and Year Filtered WTI and WSC

In [18]:
# Load the newly uploaded CSV file
file_path = "Cleaned Data Csv/WTI_data_cleaned_2014_2024.csv"
df_wti = pd.read_csv(file_path)

# Standardize column names (lowercase and replace spaces with underscores)
df_wti.columns = df_wti.columns.str.lower().str.replace(" ", "_")

# Convert 'date' column to datetime format
df_wti['date'] = pd.to_datetime(df_wti['date'])

# Replace 'date' column with 'year' by extracting only the year
df_wti.rename(columns={'date': 'year'}, inplace=True)
df_wti['year'] = df_wti['year'].dt.year

# Save the modified data
modified_file_path = "Cleaned Data Csv/WTI_data_year_only_2014_2024.csv"
df_wti.to_csv(modified_file_path, index=False)

# Provide the download link
modified_file_path


'Cleaned Data Csv/WTI_data_year_only_2014_2024.csv'

In [19]:
df_wti

Unnamed: 0,year,price,percentchange
0,2014,97.49,-0.94%
1,2015,48.24,-9.44%
2,2016,33.62,-9.23%
3,2017,52.81,-1.69%
4,2018,64.73,7.13%
5,2019,53.79,18.45%
6,2020,51.56,-15.13%
7,2021,52.2,7.58%
8,2022,86.49,15.51%
9,2023,79.17,-1.59%


In [20]:
# Re-import necessary libraries since execution state was reset
import pandas as pd

# Load the newly uploaded CSV file
file_path = "Cleaned Data CSv/WCS_Oil_Prices_2014_2024.csv"
df_wcs = pd.read_csv(file_path)

# Standardize column names (lowercase and replace spaces with underscores)
df_wcs.columns = df_wcs.columns.str.lower().str.replace(" ", "_")

# Convert 'date' column to datetime format
df_wcs['date'] = pd.to_datetime(df_wcs['date'])

# Replace 'date' column with 'year' by extracting only the year
df_wcs.rename(columns={'date': 'year'}, inplace=True)
df_wcs['year'] = df_wcs['year'].dt.year

# Save the modified data
modified_file_path = "Cleaned Data Csv/WCS_Oil_Prices_Year_Only.csv"
df_wcs.to_csv(modified_file_path, index=False)

# Provide the download link
modified_file_path


'Cleaned Data Csv/WCS_Oil_Prices_Year_Only.csv'

In [21]:
df_wcs

Unnamed: 0,year,type_,value
0,2014,WCS,65.69
1,2015,WCS,30.43
2,2016,WCS,17.88
3,2017,WCS,37.19
4,2018,WCS,42.53
5,2019,WCS,34.3
6,2020,WCS,36.82
7,2021,WCS,40.04
8,2022,WCS,65.6
9,2023,WCS,49.94


Merged all data

GDP vs WTI

In [28]:
# Define file paths for uploaded files
gdp_file = "Cleaned Data Csv/Canada_USA_GDP_2014-2024.csv"
wti_file = "Cleaned Data Csv/WTI_data_year_only_2014_2024.csv"

# Load datasets
gdp_df = pd.read_csv(gdp_file)
wti_df = pd.read_csv(wti_file)

# Remove unnecessary columns (such as unnamed index columns if present)
gdp_df = gdp_df.loc[:, ~gdp_df.columns.str.contains('Unnamed', case=False, na=False)]
wti_df = wti_df.loc[:, ~wti_df.columns.str.contains('Unnamed', case=False, na=False)]

# Standardize column names by stripping spaces and converting to lowercase
gdp_df.columns = gdp_df.columns.str.strip().str.lower()
wti_df.columns = wti_df.columns.str.strip().str.lower()

# Ensure 'year' column exists and convert to integer
if "year" in gdp_df.columns and "year" in wti_df.columns:
    gdp_df["year"] = pd.to_numeric(gdp_df["year"], errors="coerce").astype("Int64")
    wti_df["year"] = pd.to_numeric(wti_df["year"], errors="coerce").astype("Int64")

# Clean WTI percent change column (remove '%' and convert to float)
if "percentchange" in wti_df.columns:
    wti_df["percentchange"] = (
        wti_df["percentchange"]
        .str.replace("%", "", regex=True)
        .astype(float)
    )

# Rename columns for clarity (if they exist)
gdp_df.rename(
    columns={"gdp per capita (constant 2015 us$)": "gdp_per_capita"}, inplace=True
)
wti_df.rename(
    columns={"price": "wti_price", "percentchange": "wti_percent_change"}, inplace=True
)

# Remove duplicates
gdp_df.drop_duplicates(inplace=True)
wti_df.drop_duplicates(inplace=True)

# Drop any remaining NaN values
gdp_df.dropna(inplace=True)
wti_df.dropna(inplace=True)

# Merge cleaned datasets on the year column
cleaned_merged_df = gdp_df.merge(wti_df, on="year", how="inner")

# Save cleaned dataset to a CSV file for download
cleaned_file_path = "Cleaned Data Csv/Cleaned_GDP_WTI_Data.csv"
cleaned_merged_df.to_csv(cleaned_file_path, index=False)

# Provide download link
cleaned_file_path


'Cleaned Data Csv/Cleaned_GDP_WTI_Data.csv'

In [29]:
cleaned_merged_df

Unnamed: 0,country name,country code,year,gdp_per_capita,wti_price,wti_percent_change
0,Canada,CAN,2014,43643.24,97.49,-0.94
1,United States,USA,2014,55817.56,97.49,-0.94
2,Canada,CAN,2015,43594.19,48.24,-9.44
3,United States,USA,2015,57040.21,48.24,-9.44
4,Canada,CAN,2016,43551.34,33.62,-9.23
5,United States,USA,2016,57658.67,33.62,-9.23
6,Canada,CAN,2017,44339.39,52.81,-1.69
7,United States,USA,2017,58703.14,52.81,-1.69
8,Canada,CAN,2018,44907.34,64.73,7.13
9,United States,USA,2018,60127.21,64.73,7.13


GDP vs WSC

In [30]:
# Define file paths for the uploaded datasets
gdp_file = "Cleaned Data Csv/Canada_USA_GDP_2014-2024.csv"
wcs_file = "Cleaned Data Csv/WCS_Oil_Prices_Year_Only.csv"

# Load datasets
gdp_df = pd.read_csv(gdp_file)
wcs_df = pd.read_csv(wcs_file)

# Remove unnecessary columns (such as unnamed index columns if present)
for df in [gdp_df, wcs_df]:
    df.drop(columns=[col for col in df.columns if "Unnamed" in col], inplace=True, errors='ignore')

# Standardize column names by stripping spaces and converting to lowercase
for df in [gdp_df, wcs_df]:
    df.columns = df.columns.str.strip().str.lower()

# Ensure 'year' column exists and convert to integer
for df in [gdp_df, wcs_df]:
    if "year" in df.columns:
        df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")

# Clean WCS percent change column (remove '%' and convert to float if it exists)
if "percentchange" in wcs_df.columns:
    wcs_df["percentchange"] = wcs_df["percentchange"].str.replace("%", "", regex=True).astype(float)

# Rename columns for clarity (if they exist)
gdp_df.rename(columns={"gdp per capita (constant 2015 us$)": "gdp_per_capita"}, inplace=True)
wcs_df.rename(columns={"price": "wcs_price", "percentchange": "wcs_percent_change"}, inplace=True)

# Remove duplicates
for df in [gdp_df, wcs_df]:
    df.drop_duplicates(inplace=True)

# Drop any remaining NaN values
for df in [gdp_df, wcs_df]:
    df.dropna(inplace=True)

# Merge both datasets on the year column
merged_df = gdp_df.merge(wcs_df, on="year", how="inner")

# Save merged dataset to a CSV file for download
merged_file_path = "Cleaned Data Csv/Merged_GDP_WCS_Data.csv"
merged_df.to_csv(merged_file_path, index=False)

# Provide download link
merged_file_path


'Cleaned Data Csv/Merged_GDP_WCS_Data.csv'

In [31]:
merged_df

Unnamed: 0,country name,country code,year,gdp_per_capita,type_,value
0,Canada,CAN,2014,43643.24,WCS,65.69
1,United States,USA,2014,55817.56,WCS,65.69
2,Canada,CAN,2015,43594.19,WCS,30.43
3,United States,USA,2015,57040.21,WCS,30.43
4,Canada,CAN,2016,43551.34,WCS,17.88
5,United States,USA,2016,57658.67,WCS,17.88
6,Canada,CAN,2017,44339.39,WCS,37.19
7,United States,USA,2017,58703.14,WCS,37.19
8,Canada,CAN,2018,44907.34,WCS,42.53
9,United States,USA,2018,60127.21,WCS,42.53


WCS Price percentage change vs GDP Model

In [33]:
# Import necessary libraries
from sklearn.model_selection import train_test_split

# Define the target (y) and features (X)
# Assuming "value" represents WCS price and we want to predict whether it increased
df["wcs_price_up"] = (df["value"].diff() > 0).astype(int)  # 1 if price increased, 0 otherwise

# Select features and drop target column
X = df.drop(columns=["wcs_price_up", "country name", "country code", "type_", "value"]).values
y = df["wcs_price_up"].values

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the datasets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((16, 2), (4, 2), (16,), (4,))

In [34]:
# Import StandardScaler
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler on the training data
X_scaler = scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Display the shapes of the scaled datasets
X_train_scaled.shape, X_test_scaled.shape


((16, 2), (4, 2))