<a href="https://colab.research.google.com/github/Rishirajbal/MacroHFT/blob/main/data_split.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import zipfile
import os

# Unpack the uploaded archive so its contents are available under /content/data.
archive_path = '/content/archive.zip'
extract_dir = '/content/data'

with zipfile.ZipFile(archive_path) as archive:  # ZipFile opens in read mode by default
    archive.extractall(path=extract_dir)

print("✅ Unzipped successfully!")


✅ Unzipped successfully!


In [None]:
import os

# Root of the extracted archive
base_path = '/content/data'

# Summarize the extracted tree: one line per directory with the number of
# files that sit directly inside it (sub-directory contents are reported on
# their own lines).
directory_tree = os.walk(base_path)
for folder, _subfolders, files in directory_tree:
    print(f"📁 Folder: {folder} - {len(files)} files")


📁 Folder: /content/data - 1 files
📁 Folder: /content/data/stocks - 5884 files
📁 Folder: /content/data/etfs - 2165 files


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Pipeline knobs, gathered in one place so they are easy to find and tune.
RANDOM_SEED = 42                # shared by the down-sampling step and both splits
MAX_ROWS_IN_MEMORY = 5_000_000  # above this many rows the data is down-sampled
SAMPLE_ROWS = 2_000_000         # rows kept when down-sampling (keep the message below in sync)
HOLDOUT_FRACTION = 0.3          # share of all rows held out of the training set
VALIDATE_FRACTION = 0.33        # share of the held-out rows that become the validation set

# Set base path to extracted directory
base_path = '/content/data'

etfs_path = os.path.join(base_path, 'etfs')
stocks_path = os.path.join(base_path, 'stocks')


def _load_csv_folder(folder_path, label):
    """Read every ``.csv`` file directly inside ``folder_path`` into one DataFrame.

    Args:
        folder_path: Directory to scan (not recursive).
        label: Human-readable name used in the progress messages, e.g. ``'ETF'``.

    Returns:
        A ``(csv_paths, frame)`` tuple: the sorted list of CSV paths that were
        read and the row-wise concatenation of their contents. If the folder
        does not exist or contains no ``.csv`` files, a warning is printed and
        ``([], empty DataFrame)`` is returned.
    """
    try:
        # os.listdir returns entries in arbitrary order. Sorting fixes the row
        # order of the concatenated frame, which the seeded sample/split steps
        # need in order to be reproducible from one run or machine to the next.
        csv_paths = sorted(
            os.path.join(folder_path, name)
            for name in os.listdir(folder_path)
            if name.endswith('.csv')
        )
    except FileNotFoundError:
        print(f"⚠️ {label} folder not found at {folder_path}")
        return [], pd.DataFrame()

    if not csv_paths:
        # pd.concat raises ValueError on an empty list, so handle this case
        # the same way as a missing folder.
        print(f"⚠️ No CSV files found in {label} folder at {folder_path}")
        return [], pd.DataFrame()

    frame = pd.concat(
        [pd.read_csv(path, low_memory=False) for path in csv_paths],
        ignore_index=True,
    )
    print(f"✅ Loaded {len(csv_paths)} {label} files with shape: {frame.shape}")
    return csv_paths, frame


# Load ETF and Stock files
etf_files, etf_data = _load_csv_folder(etfs_path, 'ETF')
stock_files, stock_data = _load_csv_folder(stocks_path, 'Stock')

# Merge ETF and Stock data
all_data = pd.concat([etf_data, stock_data], ignore_index=True)
print(f"✅ Combined data shape: {all_data.shape}")

# Fail early with a clear message instead of an error from inside the splitter.
if all_data.empty:
    raise ValueError(
        f"No rows were loaded from {etfs_path} or {stocks_path}; nothing to split."
    )

# Optional: Reduce size for memory if dataset is too large
if len(all_data) > MAX_ROWS_IN_MEMORY:
    all_data = all_data.sample(SAMPLE_ROWS, random_state=RANDOM_SEED)
    print("⚠️ Sampled down to 2 million rows due to memory limits")

# Split into train / test / validate.
# HOLDOUT_FRACTION (30%) of the rows leave the training set; VALIDATE_FRACTION
# (33%) of those held-out rows become validation and the rest become test,
# i.e. roughly 70% train / 20% test / 10% validate overall.
# NOTE(review): rows from all files are pooled and train_test_split shuffles by
# default, so the splits are random rather than chronological or per-file —
# confirm this is what the downstream consumer expects.
df_train, df_temp = train_test_split(
    all_data, test_size=HOLDOUT_FRACTION, random_state=RANDOM_SEED
)
df_test, df_validate = train_test_split(
    df_temp, test_size=VALIDATE_FRACTION, random_state=RANDOM_SEED
)

# Save output CSVs
output_path = os.path.join(base_path, 'output_offline')
os.makedirs(output_path, exist_ok=True)

df_train.to_csv(os.path.join(output_path, 'df_train.csv'), index=False)
df_test.to_csv(os.path.join(output_path, 'df_test.csv'), index=False)
df_validate.to_csv(os.path.join(output_path, 'df_validate.csv'), index=False)

print("✅ Train, Test, and Validate CSV files created successfully!")


✅ Loaded 2165 ETF files with shape: (3954316, 7)
✅ Loaded 5884 Stock files with shape: (24197442, 7)
✅ Combined data shape: (28151758, 7)
⚠️ Sampled down to 2 million rows due to memory limits
✅ Train, Test, and Validate CSV files created successfully!


In [None]:
from google.colab import files

# Trigger a browser download for each split file, in train, test, validate order.
for split_file in ('df_train.csv', 'df_test.csv', 'df_validate.csv'):
    files.download(os.path.join(output_path, split_file))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at this path.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import shutil

# Destination folder inside the mounted Google Drive (edit to match your own Drive layout)
drive_output_path = '/content/drive/MyDrive/Rishi/'

os.makedirs(drive_output_path, exist_ok=True)

# Copy the split CSVs to Drive.
# shutil.copy is used instead of shelling out to `cp`: it needs no shell
# quoting (paths with spaces or special characters are safe) and it raises on
# failure, so the success message is only printed for files actually copied.
for file_name in ['df_train.csv', 'df_test.csv', 'df_validate.csv']:
    src = os.path.join(output_path, file_name)
    dst = os.path.join(drive_output_path, file_name)
    shutil.copy(src, dst)
    print(f"✅ Uploaded {file_name} to Google Drive.")


✅ Uploaded df_train.csv to Google Drive.
✅ Uploaded df_test.csv to Google Drive.
✅ Uploaded df_validate.csv to Google Drive.
