In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path

In [2]:
# Load the GPR export workbook into a DataFrame
gpr = Path("data_gpr_export.xls")
gpr_df = pd.read_excel(io=gpr)

In [3]:
# Preview the first rows of the freshly loaded GPR DataFrame
gpr_df.head()

Unnamed: 0,month,GPR,GPRT,GPRA,GPRH,GPRHT,GPRHA,SHARE_GPR,N10,SHARE_GPRH,...,GPRHC_TUN,GPRHC_TUR,GPRHC_TWN,GPRHC_UKR,GPRHC_USA,GPRHC_VEN,GPRHC_VNM,GPRHC_ZAF,var_name,var_label
0,1900-01-01,,,,87.927849,64.717491,110.453522,,,3.171932,...,0.0,0.03884,0.0,0.0,2.718799,0.051787,0.012947,1.152253,month,Date (year/month)
1,1900-02-01,,,,86.56649,71.936844,96.250488,,,3.122822,...,0.0,0.125471,0.0,0.0,2.732469,0.027882,0.0,1.143176,GPR,Recent GPR (Index: 1985:2019=100)
2,1900-03-01,,,,72.140701,57.475853,84.499428,,,2.602422,...,0.0,0.180366,0.0,0.0,2.151507,0.025767,0.0,0.86318,GPRT,Recent GPR Threats (Index: 1985:2019=100)
3,1900-04-01,,,,54.419449,37.326603,65.858208,,,1.963141,...,0.0,0.066774,0.0,0.0,1.776175,0.0,0.0,0.641026,GPRA,Recent GPR Acts (Index: 1985:2019=100)
4,1900-05-01,,,,64.405197,48.200008,74.373955,,,2.32337,...,0.0,0.081522,0.0,0.0,1.970109,0.013587,0.0,0.788043,GPRH,Historical GPR (Index: 1900:2019=100)


In [4]:
# Keep only the date column and the index columns used in this analysis.
# .copy() turns the selection into an independent DataFrame, so later column
# assignments on gpr_df do not trigger SettingWithCopyWarning.
gpr_df = gpr_df[["month","GPR","GPRT","GPRA","GPRC_CAN","GPRC_USA"]].copy()
gpr_df.head()

Unnamed: 0,month,GPR,GPRT,GPRA,GPRC_CAN,GPRC_USA
0,1900-01-01,,,,,
1,1900-02-01,,,,,
2,1900-03-01,,,,,
3,1900-04-01,,,,,
4,1900-05-01,,,,,


In [5]:
# Is there at least one missing value anywhere in the frame?
gpr_df.isna().to_numpy().any()

True

In [6]:
# Convert 'month' column to datetime format.
# assign() returns a new frame, so nothing is written onto a slice of the
# originally loaded DataFrame.
gpr_df = gpr_df.assign(month=pd.to_datetime(gpr_df["month"]))

# Keep only rows where 'month' is on or after 1986-01-01
# (i.e. filter out everything before that date)
gpr_df = gpr_df[gpr_df["month"] >= "1986-01-01"]

# Reset index so the filtered frame starts from 0 again; the non-inplace form
# yields a fresh DataFrame rather than mutating the filtered selection
gpr_df = gpr_df.reset_index(drop=True)

gpr_df.head()

Unnamed: 0,month,GPR,GPRT,GPRA,GPRC_CAN,GPRC_USA
0,1986-01-01,135.360748,137.66832,166.022629,0.555141,3.380562
1,1986-02-01,98.750031,84.021179,114.815605,0.288965,2.302691
2,1986-03-01,98.676674,85.129463,117.561165,0.237102,2.428256
3,1986-04-01,148.313858,142.493393,182.872787,0.630758,4.075027
4,1986-05-01,117.389244,126.68499,114.202209,0.571849,2.916428


In [7]:
# Count the non-null entries in each column
gpr_df.count()

month       469
GPR         469
GPRT        469
GPRA        469
GPRC_CAN    469
GPRC_USA    469
dtype: int64

In [8]:
# Is there at least one missing value anywhere in the frame?
gpr_df.isna().to_numpy().any()

False

In [9]:
# Confirm which type gpr_df currently is
type(gpr_df)

pandas.core.frame.DataFrame

In [10]:
# Round the index columns to 2 decimal places.
# DataFrame.round with a per-column mapping leaves columns that are not listed
# ('month') untouched and returns a new frame, so the column list is stated once
# and no multi-column assignment onto a filtered frame is needed.
gpr_df = gpr_df.round({"GPR": 2, "GPRT": 2, "GPRA": 2, "GPRC_CAN": 2, "GPRC_USA": 2})
gpr_df.head()

Unnamed: 0,month,GPR,GPRT,GPRA,GPRC_CAN,GPRC_USA
0,1986-01-01,135.36,137.67,166.02,0.56,3.38
1,1986-02-01,98.75,84.02,114.82,0.29,2.3
2,1986-03-01,98.68,85.13,117.56,0.24,2.43
3,1986-04-01,148.31,142.49,182.87,0.63,4.08
4,1986-05-01,117.39,126.68,114.2,0.57,2.92


In [11]:
# Is there at least one missing value anywhere in the frame?
gpr_df.isna().to_numpy().any()

False

In [12]:
# Inspect the dtype of every column in gpr_df
gpr_df.dtypes

month       datetime64[ns]
GPR                float64
GPRT               float64
GPRA               float64
GPRC_CAN           float64
GPRC_USA           float64
dtype: object

In [13]:
# Split the 'month' timestamp column into separate 'Year' and 'Month' columns,
# then drop the original column. assign()/drop() return new frames instead of
# mutating gpr_df in place.
gpr_df = gpr_df.assign(
    Year=gpr_df['month'].dt.year,
    Month=gpr_df['month'].dt.month,
).drop(columns='month')

# Rearrange columns so that 'Year' and 'Month' come first
cols = ['Year', 'Month'] + [col for col in gpr_df if col not in ['Year', 'Month']]
gpr_df = gpr_df[cols]
gpr_df.head()

Unnamed: 0,Year,Month,GPR,GPRT,GPRA,GPRC_CAN,GPRC_USA
0,1986,1,135.36,137.67,166.02,0.56,3.38
1,1986,2,98.75,84.02,114.82,0.29,2.3
2,1986,3,98.68,85.13,117.56,0.24,2.43
3,1986,4,148.31,142.49,182.87,0.63,4.08
4,1986,5,117.39,126.68,114.2,0.57,2.92


In [15]:
# Load the crude oil price table from its CSV file
oil_price_df = pd.read_csv(Path('crude_oil_prices.csv'))

# Preview the top rows to verify the file was parsed as expected
oil_price_df.head()

Unnamed: 0,Year,Month,WCS,WTI
0,1986,1,,22.93
1,1986,2,,15.46
2,1986,3,,12.61
3,1986,4,,12.84
4,1986,5,,15.38


In [16]:
# Combine gpr_df and oil_price_df into one DataFrame by joining on the shared
# Year/Month keys (inner join: only key pairs present in both frames are kept),
# then discard the 'WCS' column
combined_df = (
    gpr_df
    .merge(oil_price_df, on=['Year', 'Month'], how='inner')
    .drop(columns='WCS')
)

combined_df.head()

Unnamed: 0,Year,Month,GPR,GPRT,GPRA,GPRC_CAN,GPRC_USA,WTI
0,1986,1,135.36,137.67,166.02,0.56,3.38,22.93
1,1986,2,98.75,84.02,114.82,0.29,2.3,15.46
2,1986,3,98.68,85.13,117.56,0.24,2.43,12.61
3,1986,4,148.31,142.49,182.87,0.63,4.08,12.84
4,1986,5,117.39,126.68,114.2,0.57,2.92,15.38


In [None]:
# Write the merged table to disk as CSV, leaving out the row index
combined_df.to_csv(path_or_buf='cleaned_wti_data.csv', index=False)