In [2]:
import pandas as pd
import glob
import os

from pathlib import Path

try:
    from utils.paths import RAW_DIR, PROCESSED_DIR
    print(f"Paths imported successfully.")
    print(f"Raw data source: {RAW_DIR}")
    print(f"Processed data target: {PROCESSED_DIR}")
except ImportError:
    print("Error: Could not import paths from utils.")

Paths imported successfully.
Raw data source: /home/narodom.y@FUSION.LAB/research/data/raw
Processed data target: /home/narodom.y@FUSION.LAB/research/data/processed


In [7]:
def combine_price_data(asset_class="equity", interval="1d"):
    print(f"\n--- üöÄ Starting processing for: {asset_class} / {interval} ---")

    # 1. Define source path.
    source_dir = RAW_DIR / asset_class / interval
    if not source_dir.exists():
        print(f"‚ùå Error: Directory not found: {source_dir}")
        return
    # 2. search .csv in the directory
    #    such as .../data/raw/equity/1d/AAPL.csv
    csv_files = list(source_dir.glob("*.csv"))
    
    if not csv_files:
        print(f"‚ùå No .csv files found in: {source_dir}")
        return

    # 3. read files loop and allocate them to list
    all_dataframes = []

    for file_path in csv_files:
        # 4. Ticker file
        # file_path.stem will be obtained non .csv (‡πÄ‡∏ä‡πà‡∏ô 'AAPL' ‡∏´‡∏£‡∏∑‡∏≠ 'ES_F')
        ticker_name = file_path.stem

        try:
            df = pd.read_csv(file_path)

            # 5. The tickers will be Features
            df["ticker"] = ticker_name

            all_dataframes.append(df)
        except pd.errors.EmptyDataError:
            print(f"Warning: Skipping empty file: {file_path.name}")
        except Exception as e:
            print(f"Error reading {file_path.name}: {e}")

    # 6. If preprocess data be non-existent
    if not all_dataframes:
        print("No data was processed.")
        return
        
    combined_df = pd.concat(all_dataframes, ignore_index=True)

    # 7. Combined Date/Time
    if 'Date' in combined_df.columns:
        combined_df['Date'] = pd.to_datetime(combined_df['Date'])
    elif 'Datetime' in combined_df.columns:
        combined_df['Datetime'] = pd.to_datetime(combined_df['Datetime'])

    print("Pivoting data to 'Wide Format'...")
    
    if 'Close' in combined_df.columns:
        combined_df.rename(columns={'Close': 'Price'}, inplace=True)
        print("Renamed 'Close' column to 'Price'.")

    
    # 8. Save preprocess file
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    # .parquet is recommended because it's column-oriented data file format -> faster and high performance for the big data
    save_path = PROCESSED_DIR / f"{asset_class}_{interval}_combined.parquet"
    
    combined_df.to_parquet(save_path, index=False, engine='fastparquet')
    
    print(f"‚úÖ Successfully combined and saved to: {save_path}")
    print(f"Final DataFrame shape: {combined_df.shape}")
    print("--- Process finished ---")
    
    return combined_df

In [8]:
df_equity_daily = combine_price_data(asset_class="equity", interval="1d")
df_equity_daily


--- üöÄ Starting processing for: equity / 1d ---
‚úÖ Successfully combined and saved to: /home/narodom.y@FUSION.LAB/research/data/processed/equity_1d_combined.parquet
Final DataFrame shape: (5856, 7)
--- Process finished ---


Unnamed: 0,Price,Close,High,Low,Open,Volume,ticker
0,Ticker,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
1,Date,,,,,,AAPL
2,2020-01-02,72.53849792480469,72.59887623049032,71.29228881100529,71.54587474672552,135480400,AAPL
3,2020-01-03,71.83332061767578,72.59408626701142,71.60871495013537,71.76569716818824,146322800,AAPL
4,2020-01-06,72.40567779541016,72.44432080433776,70.7030121336898,70.95418800651902,118387200,AAPL
...,...,...,...,...,...,...,...
5851,2025-10-20,182.63999938964844,185.1999969482422,181.72999572753906,183.1300048828125,128544700,NVDA
5852,2025-10-21,181.16000366210938,182.7899932861328,179.8000030517578,182.7899932861328,124240200,NVDA
5853,2025-10-22,180.27999877929688,183.44000244140625,176.75999450683594,181.13999938964844,162249600,NVDA
5854,2025-10-23,182.16000366210938,183.02999877929688,179.7899932861328,180.4199981689453,111363700,NVDA


In [12]:
asset_class="equity"
interval="1d"

source_dir = RAW_DIR / asset_class / interval
if not source_dir.exists():
    print(f"‚ùå Error: Directory not found: {source_dir}")

# 2. search .csv in the directory
#    such as .../data/raw/equity/1d/AAPL.csv
csv_files = list(source_dir.glob("*.csv"))
csv_files

[PosixPath('/home/narodom.y@FUSION.LAB/research/data/raw/equity/1d/AAPL.csv'),
 PosixPath('/home/narodom.y@FUSION.LAB/research/data/raw/equity/1d/GOOG.csv'),
 PosixPath('/home/narodom.y@FUSION.LAB/research/data/raw/equity/1d/MSFT.csv'),
 PosixPath('/home/narodom.y@FUSION.LAB/research/data/raw/equity/1d/NVDA.csv')]

In [28]:
import yfinance as yf

from datetime import datetime

tickers = ["AAPL", "GOOG"]
asset_class="equity"  # equity, futures, options
interval="1d"         # Timeframe
start_date="2025-01-01" 
end_date=datetime.now().strftime('%Y-%m-%d')

ticker = tickers[0]
data = yf.download(tickers, start=start_date, end=end_date, interval=interval)
data

  data = yf.download(tickers, start=start_date, end=end_date, interval=interval)
[*********************100%***********************]  2 of 2 completed


Price,Close,Close,High,High,Low,Low,Open,Open,Volume,Volume
Ticker,AAPL,GOOG,AAPL,GOOG,AAPL,GOOG,AAPL,GOOG,AAPL,GOOG
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2025-01-02,242.987427,190.014633,248.218856,192.576329,240.964609,188.100833,248.049444,190.866869,55740700,17545200
2025-01-03,242.499161,192.506561,243.316252,193.872134,241.034359,190.732309,242.499161,192.102870,40244100,12875000
2025-01-06,244.133347,197.320984,246.455106,198.915810,242.339711,194.430336,243.445785,194.520042,45045600,19483300
2025-01-07,241.353226,196.075012,244.681407,201.487476,240.496267,195.307494,242.120491,197.629974,40856000,16966800
2025-01-08,241.841461,194.759262,242.847898,197.001999,239.200841,193.124557,241.064222,193.323908,37628900,14335300
...,...,...,...,...,...,...,...,...,...,...
2025-10-22,258.450012,252.529999,262.850006,257.179993,255.429993,250.451004,262.649994,255.009995,45015300,19650200
2025-10-23,259.579987,253.729996,260.619995,255.860001,258.010010,252.768005,259.940002,253.699997,32754900,13241300
2025-10-24,262.820007,260.510010,264.130005,262.510010,259.179993,256.100006,261.190002,257.299988,38253700,18406000
2025-10-27,268.809998,269.929993,269.119995,270.799988,264.649994,264.940002,264.880005,265.359985,44888200,22987700


In [27]:
data.iloc[0]

Price   Ticker
Close   AAPL      2.429874e+02
High    AAPL      2.482189e+02
Low     AAPL      2.409646e+02
Open    AAPL      2.480494e+02
Volume  AAPL      5.574070e+07
Name: 2025-01-02 00:00:00, dtype: float64