<a href="https://colab.research.google.com/github/Milsy18/m18-model/blob/main/01_data_ingestion_and_alignment_ipynb1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
%%bash
# Delete the project's .git folder (destructive: local Git history is lost).
PROJECT_DIR="/content/m18-model"   # adjust if your project lives elsewhere

# Stop if the project folder is missing. Without this check a failed `cd`
# would let `rm -rf .git` run in whatever directory the cell started in.
if [ ! -d "$PROJECT_DIR" ]; then
    echo "❌ Project folder not found: $PROJECT_DIR" >&2
    exit 1
fi

# Target the folder by absolute path so the result never depends on the cwd.
rm -rf -- "$PROJECT_DIR/.git"
echo "✅ Removed any existing .git folder"


✅ Removed any existing .git folder


In [24]:
# Cell 1: clone and folder setup
import os
import subprocess
from pathlib import Path

# 1. Clone your GitHub repo (or pull latest if it already exists)
# NOTE(review): "YourUserName" looks like a placeholder; the Colab badge at the
# top of this notebook points at Milsy18/m18-model — confirm the intended URL.
REPO_URL   = "https://github.com/YourUserName/m18-model.git"
PROJECT_ROOT = Path("/content/m18-model")


def _run_git(*args, cwd=None):
    """Run ``git <args>`` without a shell and report a failure instead of hiding it.

    Parameters
    ----------
    *args : str
        Arguments passed to the ``git`` executable, one list element each
        (so no shell quoting or interpolation is involved).
    cwd : path-like, optional
        Directory in which to run the command.

    Returns
    -------
    bool
        True when git exited with status 0, False otherwise. Failures are
        printed as a warning but never raised, so the rest of the cell
        (folder creation, chdir) still runs.
    """
    command = ["git", *args]
    command_text = " ".join(command)
    try:
        completed = subprocess.run(command, cwd=cwd)
    except OSError as exc:
        # e.g. git is not installed, or cwd does not exist
        print(f"⚠️ Could not run `{command_text}`: {exc}")
        return False
    if completed.returncode != 0:
        print(f"⚠️ `{command_text}` exited with status {completed.returncode}")
        return False
    return True


if not PROJECT_ROOT.exists():
    # first-time clone
    _run_git("clone", REPO_URL, str(PROJECT_ROOT))
else:
    # subsequent runs: update (run inside the project folder)
    _run_git("pull", cwd=PROJECT_ROOT)

# 2. Create subfolders if they aren't already present
for sub in ["data/raw", "data/processed", "modules", "outputs"]:
    (PROJECT_ROOT / sub).mkdir(parents=True, exist_ok=True)

# 3. Change working directory so all relative paths resolve
os.chdir(PROJECT_ROOT)
print("Working in", PROJECT_ROOT)


Working in /content/m18-model


In [25]:
# Cell 2: Data ingestion, epoch-to-datetime conversion & alignment
from functools import reduce
from pathlib import Path

import pandas as pd

# 1) Paths
RAW_DIR  = Path('/content')
PROC_DIR = Path('data/processed')
PROC_DIR.mkdir(parents=True, exist_ok=True)

# 2) Exact filenames as you uploaded them
file_map = {
    'BTC_D'  : 'CRYPTOCAP_BTC.D, 1D (1).csv',
    'TOTAL'  : 'CRYPTOCAP_TOTAL, 1D (1).csv',
    'TOTAL3' : 'CRYPTOCAP_TOTAL3, 1D (1).csv',
    'USDT_D' : 'CRYPTOCAP_USDT.D, 1D.csv',
}


def _load_market_csv(path):
    """Read one exported CSV and index it by a ``Date`` column built from ``time``.

    Parameters
    ----------
    path : path-like
        CSV file containing a ``time`` column that is parsed as UNIX seconds.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``Date`` with the ``time`` column removed.

    Raises
    ------
    KeyError
        If the file has no ``time`` column.
    """
    raw = pd.read_csv(path)
    if 'time' not in raw.columns:
        raise KeyError(f"{path} has no 'time' column; found {list(raw.columns)}")
    # Build a new frame rather than mutating in place, so re-running is safe.
    return (
        raw
        .assign(Date=pd.to_datetime(raw['time'], unit='s'))
        .set_index('Date')
        .drop(columns=['time'])
    )


market_dfs = {}
for name, fname in file_map.items():
    path = RAW_DIR / fname
    print(f"Loading {name} from {path}")
    frame = _load_market_csv(path)
    print(f"  ↳ {name}: dates {frame.index.min().date()} → {frame.index.max().date()}, shape {frame.shape}")
    market_dfs[name] = frame

# 3) Align all four on their intersection of dates
common_idx = reduce(
    lambda shared, idx: shared.intersection(idx),
    (frame.index for frame in market_dfs.values()),
)
print(f"\nAligned date‐index length: {len(common_idx)}")
if len(common_idx) == 0:
    # Fail loudly instead of silently writing an empty aligned file.
    raise ValueError("No dates are common to all market series; nothing to align.")

market_dfs = {name: frame.loc[common_idx] for name, frame in market_dfs.items()}

# 4) Concatenate and save
# Passing the dict to concat makes its keys the outer column level.
out_path = PROC_DIR / 'market_regimes_aligned.csv'
aligned = pd.concat(market_dfs, axis=1)
aligned.to_csv(out_path, index=True)
print(f"Saved aligned regimes to → {out_path}")


Loading BTC_D from /content/CRYPTOCAP_BTC.D, 1D (1).csv
  ↳ BTC_D: dates 2020-01-01 → 2025-06-07, shape (1985, 5)
Loading TOTAL from /content/CRYPTOCAP_TOTAL, 1D (1).csv
  ↳ TOTAL: dates 2020-01-01 → 2025-06-07, shape (1985, 5)
Loading TOTAL3 from /content/CRYPTOCAP_TOTAL3, 1D (1).csv
  ↳ TOTAL3: dates 2020-01-01 → 2025-06-07, shape (1985, 5)
Loading USDT_D from /content/CRYPTOCAP_USDT.D, 1D.csv
  ↳ USDT_D: dates 2020-01-01 → 2025-06-07, shape (1985, 5)

Aligned date‐index length: 1985
Saved aligned regimes to → data/processed/market_regimes_aligned.csv


In [26]:
%%bash
# Write the author name & email into the global Git configuration.
# NOTE(review): the Colab badge URL in this notebook uses the handle "Milsy18";
# confirm that "Misly18" below is the intended spelling.
author_name="Misly18"
author_email="millarmike1@gmail.com"

git config --global user.name  "$author_name"
git config --global user.email "$author_email"


In [27]:
# Reload the aligned CSV, rebuilding its two-row header as MultiIndex columns
import pandas as pd

aligned_csv = 'data/processed/market_regimes_aligned.csv'
read_options = {
    'header': [0, 1],      # rows 0 & 1 together form the two-level header
    'index_col': 0,        # the first column holds the date index
    'parse_dates': True,   # parse that index as datetime
}

df = pd.read_csv(aligned_csv, **read_options)
df.head()


Unnamed: 0_level_0,BTC_D,BTC_D,BTC_D,BTC_D,BTC_D,TOTAL,TOTAL,TOTAL,TOTAL,TOTAL,TOTAL3,TOTAL3,TOTAL3,TOTAL3,TOTAL3,USDT_D,USDT_D,USDT_D,USDT_D,USDT_D
Unnamed: 0_level_1,open,high,low,close,Volume,open,high,low,close,Volume,open,high,low,close,Volume,open,high,low,close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
2020-01-01,70.065894,70.144563,69.741007,69.921538,18580420000.0,185455200000.0,188054900000.0,184774100000.0,186105200000.0,59771140000.0,41487590000.0,42409170000.0,41185800000.0,41762080000.0,33205080000.0,2.208425,2.212275,2.182023,2.201859,21562310000.0
2020-01-02,69.956957,70.129144,69.812534,69.920361,20813800000.0,186030000000.0,186711500000.0,178481300000.0,180152500000.0,65245420000.0,41678340000.0,42001540000.0,39788050000.0,40343570000.0,36349740000.0,2.203964,2.285624,2.196909,2.271307,24317600000.0
2020-01-03,69.933572,70.185528,69.737601,69.95551,28024470000.0,180139600000.0,192431000000.0,177298200000.0,190146100000.0,86378430000.0,40323080000.0,43350320000.0,39547490000.0,42474510000.0,47916000000.0,2.273203,2.301038,2.134814,2.15593,32335310000.0
2020-01-04,69.93253,70.091216,69.715515,69.875262,18418750000.0,190267600000.0,192421000000.0,187872800000.0,190901500000.0,60383070000.0,42564720000.0,43452860000.0,41824550000.0,42880960000.0,34523230000.0,2.155914,2.180049,2.135203,2.148195,21543380000.0
2020-01-05,69.832141,69.99776,69.569826,69.704416,19708560000.0,191040700000.0,195343100000.0,189570300000.0,191363400000.0,65228880000.0,42981220000.0,44254660000.0,42391190000.0,43198840000.0,37999690000.0,2.145383,2.162024,2.104587,2.142817,24122230000.0
