In [1]:
import pandas as pd
from pathlib import Path
import sys

# Project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the kernel is started from a direct subfolder of
# the project (e.g. a notebooks/ directory) — confirm; started from anywhere
# else, PROJECT_ROOT and DATA_DIR will point at the wrong location.
PROJECT_ROOT = Path.cwd().parent

# Put the project root on the import path so the `src` package used below can
# be imported. Guarded so that re-running this cell does not keep appending
# duplicate entries to sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Every CSV written by this notebook goes to <PROJECT_ROOT>/data/raw; create
# the folder (and any missing parents) up front. exist_ok keeps re-runs safe.
DATA_DIR = PROJECT_ROOT / "data" / "raw"
DATA_DIR.mkdir(parents=True, exist_ok=True)

from src.data_utils import(
    get_last_one_year_dates,
    generate_trading_timestamps,
    generate_raw_spot_data,
    generate_raw_futures_data,
    generate_raw_options_data
)

#### Date range

In [2]:
# Unpack the (start, end) pair returned by the project helper and echo it so
# the window used for data generation is visible in the cell output.
(start_date, end_date) = get_last_one_year_dates()
print(f"Date range:{start_date} to {end_date}")

Date range:16-01-2025 to 16-01-2026


#### Trading timestamps generation

In [3]:
# Build the sequence of candle timestamps spanning the selected date range.
timestamps = generate_trading_timestamps(start_date, end_date)

# Report how many candles were produced, then preview the first five entries
# as the cell's displayed value.
print("Total 5 min candles: ", len(timestamps))
timestamps[0:5]

Total 5 min candles:  19912


DatetimeIndex(['2025-01-16 09:15:00', '2025-01-16 09:20:00',
               '2025-01-16 09:25:00', '2025-01-16 09:30:00',
               '2025-01-16 09:35:00'],
              dtype='datetime64[ns]', freq=None)

#### Spot raw data

In [4]:
# Generate the raw spot frame from the candle timestamps.
spot_df = generate_raw_spot_data(timestamps)

# Persist it under DATA_DIR; the frame's index is not written to the file.
spot_df.to_csv(DATA_DIR.joinpath("nifty_spot_raw.csv"), index=False)

# Preview the first five rows as the cell's displayed value.
spot_df.head(5)

Unnamed: 0,timestamp,spot_open,spot_high,spot_low,spot_close,spot_volume
0,2025-01-16 09:15:00,18016.461061,18035.198148,17986.333085,18016.461061,420333.0
1,2025-01-16 09:20:00,18012.338002,18038.86125,17995.635816,18012.338002,282041.0
2,2025-01-16 09:25:00,18033.710411,18055.444272,18016.808443,18033.710411,338254.0
3,2025-01-16 09:30:00,18083.578491,18118.75445,18053.506466,18083.578491,116076.0
4,2025-01-16 09:35:00,18076.319824,18094.912659,18063.79529,18076.319824,163125.0


#### Futures raw data

In [5]:
# Generate the raw futures frame; the helper takes the spot frame as input.
futures_df = generate_raw_futures_data(spot_df)

# Persist it under DATA_DIR; the frame's index is not written to the file.
# NOTE(review): the spot and options files in this notebook end in "_raw.csv"
# but this one does not — confirm which name downstream readers expect before
# renaming.
futures_df.to_csv(DATA_DIR.joinpath("nifty_futures.csv"), index=False)

# Preview the first five rows as the cell's displayed value.
futures_df.head(5)

Unnamed: 0,timestamp,contract,fut_open,fut_high,fut_low,fut_close,fut_volume,fut_open_interest
0,2025-01-16 09:15:00,NIFTY_CURR_FUT,18030.567622,18049.31938,18000.416056,18030.567622,301662,6131720.0
1,2025-01-16 09:20:00,NIFTY_CURR_FUT,18064.344535,18090.944363,18047.594125,18064.344535,988249,6995576.0
2,2025-01-16 09:25:00,NIFTY_CURR_FUT,18081.221769,18103.012889,18064.275271,18081.221769,283978,4862033.0
3,2025-01-16 09:30:00,NIFTY_CURR_FUT,18106.746245,18141.96727,18076.635694,18106.746245,827518,6049018.0
4,2025-01-16 09:35:00,NIFTY_CURR_FUT,18112.842409,18131.472811,18100.29257,18112.842409,270097,3522345.0


#### Options raw data

In [7]:
# Generate the raw options frame; the helper takes the spot frame as input.
options_df = generate_raw_options_data(spot_df)

# Persist it under DATA_DIR; the frame's index is not written to the file.
options_df.to_csv(DATA_DIR.joinpath("nifty_options_raw.csv"), index=False)

# Preview the first five rows as the cell's displayed value.
options_df.head(5)

Unnamed: 0,timestamp,opt_expiry,opt_strike,opt_type,opt_ltp,opt_iv,opt_open_interest,opt_volume
0,2025-01-16 09:15:00,2025-01-23 09:15:00,17900,CE,58.23053,0.175797,181180,10464
1,2025-01-16 09:15:00,2025-01-23 09:15:00,17900,PE,58.23053,0.195797,266711,2801
2,2025-01-16 09:15:00,2025-01-23 09:15:00,17950,CE,33.23053,0.175797,260184,12167
3,2025-01-16 09:15:00,2025-01-23 09:15:00,17950,PE,33.23053,0.195797,70391,4702
4,2025-01-16 09:15:00,2025-01-23 09:15:00,18000,CE,8.23053,0.175797,120617,11806


In [9]:
# Quick data-quality check on the three raw frames: print the per-column NaN
# count for each one, under its own heading.
missing_value_reports = (
    ("SPOT missing values:\n", spot_df),
    ("FUTURES missing values:\n", futures_df),
    ("OPTIONS missing values:\n", options_df),
)
for report_heading, report_frame in missing_value_reports:
    print(report_heading, report_frame.isna().sum(), "\n")

# Summarise how many option rows exist for each timestamp; the summary
# statistics are the cell's displayed value.
print("Options rows per timestamp (should be ~10):")
(
    options_df
    .groupby("timestamp")
    .size()
    .describe()
)


SPOT missing values:
 timestamp       0
spot_open      99
spot_high      99
spot_low       99
spot_close     99
spot_volume    99
dtype: int64 

FUTURES missing values:
 timestamp              0
contract               0
fut_open              99
fut_high              99
fut_low               99
fut_close             99
fut_volume             0
fut_open_interest    995
dtype: int64 

OPTIONS missing values:
 timestamp               0
opt_expiry              0
opt_strike              0
opt_type                0
opt_ltp                 0
opt_iv               9802
opt_open_interest       0
opt_volume              0
dtype: int64 

Options rows per timestamp (should be ~10):


count    19813.0
mean        10.0
std          0.0
min         10.0
25%         10.0
50%         10.0
75%         10.0
max         10.0
dtype: float64