In [46]:
import pandas as pd
from pathlib import Path

In [47]:
# Project layout, resolved from the kernel's current working directory
# (parents[1] climbs two levels up from it).
project_root = Path().resolve().parents[1]
raw_dir = project_root.joinpath('data', 'raw')
processed_dir_to_save = project_root.joinpath('data', 'processed')

# Destination of the merged table; its folder is created when missing.
processed_dir_to_save.mkdir(exist_ok=True, parents=True)
SAVE_PATH = processed_dir_to_save.joinpath('merged_prices_raw.csv')

In [48]:

# Read the three raw price tables from the raw-data folder.
# The Bitcoin file is read without date parsing; the gold and S&P 500
# files have their 'Date' column parsed while reading.
btc = pd.read_csv(raw_dir.joinpath('bitcoin_historical_dataset_raw.csv'))
gold = pd.read_csv(raw_dir.joinpath('gold_prices_raw.csv'), parse_dates=['Date'])
sp500 = pd.read_csv(raw_dir.joinpath('sp500_prices_raw.csv'), parse_dates=['Date'])

In [49]:
# Preview the first five rows of the raw Bitcoin table.
btc.head(n=5)

Unnamed: 0,Start,End,Open,High,Low,Close,Volume,Market Cap
0,01/06/2025,,,,,104709.0,,
1,01/05/2025,,,,,94237.0,,
2,01/04/2025,,,,,82461.0,,
3,01/03/2025,08/03/2025,84307.46,94808.02,81942.81,86832.99,87573800000.0,1740000000000.0
4,01/02/2025,01/03/2025,102318.0,102760.0,78534.47,84501.01,63094370000.0,1900000000000.0


In [50]:
# The raw 'Start' column holds day/month/year strings (e.g. 01/06/2025), so
# the format is passed explicitly instead of relying on inference.
# The result is kept as datetime64 rather than being formatted back to text:
# the merge key has to be a datetime anyway, and a datetime column still
# displays as YYYY-MM-DD.
btc['Start'] = pd.to_datetime(btc['Start'], format='%d/%m/%Y')

In [51]:
# Give the Bitcoin columns the names used in the merged table.
# Assigning the result (instead of inplace=True) keeps the step explicit
# and leaves no frame mutated behind the reader's back.
btc = btc.rename(columns={'Start': 'Date', 'Close': 'BTC_PRICE_$'})

In [52]:
# Keep only the merge key and the price column. `.copy()` makes the result
# an independent frame, so the later `btc['Date'] = ...` assignment cannot be
# flagged as a write to a slice (SettingWithCopyWarning).
btc = btc[['Date', 'BTC_PRICE_$']].copy()

In [53]:
# Preview the last five rows of the trimmed Bitcoin table.
btc.tail(n=5)

Unnamed: 0,Date,BTC_PRICE_$
181,2010-05-01,0.03
182,2010-04-01,0.02
183,2010-03-01,0.01
184,2010-02-01,0.005
185,2010-01-01,0.0


In [54]:
# Preview the first five rows of the gold table.
gold.head(n=5)

Unnamed: 0,Date,Gold_Price_$
0,2010-01-01,1123.6
1,2010-02-01,1159.6
2,2010-03-01,1154.3
3,2010-04-01,1223.2
4,2010-05-01,1260.3


In [55]:
# Make sure the join key is a datetime in all three tables before merging.
for price_table in (btc, gold, sp500):
    price_table['Date'] = pd.to_datetime(price_table['Date'])

In [56]:
# Outer-join the three tables on 'Date' so that a date present in any one
# of them is kept in the result.
merged = (
    btc
    .merge(gold, on='Date', how='outer')
    .merge(sp500, on='Date', how='outer')
)

In [57]:
# Inspect the merged frame (as the cell's last expression it is rendered by the notebook).
merged

Unnamed: 0,Date,BTC_PRICE_$,Gold_Price_$,S&P500_$
0,2010-01-01,0.000,1123.6,1073.9
1,2010-02-01,0.005,1159.6,1104.5
2,2010-03-01,0.010,1154.3,1169.4
3,2010-04-01,0.020,1223.2,1186.7
4,2010-05-01,0.030,1260.3,1089.4
...,...,...,...,...
181,2025-02-01,84501.010,2791.0,5954.5
182,2025-03-01,86832.990,3053.9,5611.9
183,2025-04-01,82461.000,3220.3,5569.1
184,2025-05-01,94237.000,3218.2,5911.7


In [58]:
# Order rows chronologically and renumber the index from 0.
merged_df = merged.sort_values(by='Date', ignore_index=True)

In [59]:
# Inspect the 20 most recent rows of the sorted table.
merged_df.tail(n=20)

Unnamed: 0,Date,BTC_PRICE_$,Gold_Price_$,S&P500_$
166,2023-11-01,37715.02,2001.3,4567.8
167,2023-12-01,42224.09,2026.7,4769.8
168,2024-01-01,42611.24,1997.0,4845.6
169,2024-02-01,61256.01,2006.6,5096.3
170,2024-03-01,71216.98,2180.4,5254.4
171,2024-04-01,60761.95,2246.1,5035.7
172,2024-05-01,67576.21,2282.2,5277.5
173,2024-06-01,62849.53,2279.0,5460.5
174,2024-07-01,64665.98,2402.0,5522.3
175,2024-08-01,58965.98,2451.8,5648.4


In [60]:
# Persist the date-sorted, re-indexed frame (`merged_df`) rather than the
# intermediate `merged`, so the sorting step is reflected in the saved file.
# index=False keeps the positional index out of the CSV.
merged_df.to_csv(SAVE_PATH, index=False)