In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('Binance_BTCUSDT_1h.csv')
data

Unnamed: 0,Unix,Date,Symbol,Open,High,Low,Close,Volume BTC,Volume USDT,tradecount
0,1.758580e+12,22/09/25 23:00,BTCUSDT,112643.25,112739.14,112592.20,112650.99,135.310950,1.524494e+07,34083
1,1.758580e+12,22/09/25 22:00,BTCUSDT,112969.99,112970.00,112594.33,112643.25,289.607150,3.264691e+07,42836
2,1.758570e+12,22/09/25 21:00,BTCUSDT,112781.87,112970.00,112602.79,112969.99,293.311560,3.307493e+07,42931
3,1.758570e+12,22/09/25 20:00,BTCUSDT,112122.90,112977.41,111975.28,112781.88,596.840050,6.707508e+07,93553
4,1.758570e+12,22/09/25 19:00,BTCUSDT,112429.12,112600.87,111936.40,112122.90,1307.373650,1.467768e+08,126232
...,...,...,...,...,...,...,...,...,...,...
70822,1.502960e+12,17/08/17 8:00,BTCUSDT,4333.32,4377.85,4333.32,4360.69,0.972807,4.239504e+03,28
70823,1.502950e+12,17/08/17 7:00,BTCUSDT,4316.62,4349.99,4287.41,4349.99,4.443249,1.924106e+04,25
70824,1.502950e+12,17/08/17 6:00,BTCUSDT,4330.29,4345.45,4309.37,4324.35,7.229691,3.128231e+04,36
70825,1.502950e+12,17/08/17 5:00,BTCUSDT,4308.83,4328.69,4291.37,4315.32,23.234916,1.003048e+05,102


In [3]:
def process_data(data: pd.DataFrame) -> pd.DataFrame:
    """
    Process the input DataFrame to ensure it has a 'Datetime' column in datetime format.

    Parameters:
        data (pd.DataFrame): Input DataFrame which may contain date columns.

    Returns:
        pd.DataFrame: Processed DataFrame with a 'Datetime' column in datetime format.
    """

    data = data.copy()

    # Find columns containing "Date"
    date_cols = [col for col in data.columns if "Date" in col]

    # Rename them to "Datetime"
    for col in date_cols:
        data = data.rename(columns={col: "Datetime"})

    # Order by Datetime descending
    data = data.iloc[::-1]
    data = data.reset_index(drop=True)

    # Remove duplicates based on all columns except Datetime
    data = data.drop_duplicates(subset=["Open","High","Low","Close","Volume BTC","Volume USDT","tradecount"]).reset_index(drop=True)

    # Convert Datetime column to datetime format, coercing errors
    data["Datetime"] = pd.to_datetime(data["Datetime"], errors="coerce", dayfirst=True)

    return data

def split_data(data: pd.DataFrame, train_size: float = 0.6) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split the DataFrame into training and testing sets based on the given train size ratio.

    Parameters:
        data (pd.DataFrame): The DataFrame to be split.
        train_size (float): The proportion of the data to be used for training (default is 0.6).

    Returns:
        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing the training, testing, and validation DataFrames.
    """

    data = data.copy()

    # Calculate split indices
    train_size = int(len(data) * train_size)
    test_size = int(len(data) * 0.2)

    # Split the data
    train = data[:train_size]
    test = data[train_size:train_size + test_size]
    validation = data[train_size + test_size:]

    return train, test, validation

def normalized_dates(data: pd.DataFrame) -> pd.DataFrame:
    """
    Rebuilds a clean Datetime column assuming hourly frequency,
    starting from the first known correct timestamp.
    """

    data = data.copy()
    
    # Convert to datetime, coercing errors
    data["Datetime"] = pd.to_datetime(data["Datetime"], errors="coerce", dayfirst=True)

    # First known correct timestamp
    start = data["Datetime"].iloc[0]

    # Build a new datetime range: one row per hour
    data["Datetime"] = pd.date_range(start=start, periods=len(data), freq="h")

    return data

In [4]:
data['Date'].value_counts()

Date
00:00.0           597
28:14.8            43
02/03/25 16:00      2
05/03/25 23:00      2
05/03/25 11:00      2
                 ... 
22/12/22 7:00       1
22/12/22 6:00       1
22/12/22 5:00       1
22/12/22 4:00       1
17/08/17 4:00       1
Name: count, Length: 69683, dtype: int64

In [5]:
data[data['Date'] == '02/03/25 16:00']

Unnamed: 0,Unix,Date,Symbol,Open,High,Low,Close,Volume BTC,Volume USDT,tradecount
4553,1740931000000000.0,02/03/25 16:00,BTCUSDT,87428.0,91959.99,87278.51,91200.0,11741.72701,1056043000.0,1273816
4554,1740930000000.0,02/03/25 16:00,BTCUSDT,87428.0,91959.99,87278.51,91200.0,11741.72701,1056043000.0,1273816


In [6]:
data = process_data(data)
data

  data["Datetime"] = pd.to_datetime(data["Datetime"], errors="coerce", dayfirst=True)


Unnamed: 0,Unix,Datetime,Symbol,Open,High,Low,Close,Volume BTC,Volume USDT,tradecount
0,1.502940e+12,2017-08-17 04:00:00,BTCUSDT,4261.48,4313.62,4261.32,4308.83,47.181009,2.023661e+05,171
1,1.502950e+12,2017-08-17 05:00:00,BTCUSDT,4308.83,4328.69,4291.37,4315.32,23.234916,1.003048e+05,102
2,1.502950e+12,2017-08-17 06:00:00,BTCUSDT,4330.29,4345.45,4309.37,4324.35,7.229691,3.128231e+04,36
3,1.502950e+12,2017-08-17 07:00:00,BTCUSDT,4316.62,4349.99,4287.41,4349.99,4.443249,1.924106e+04,25
4,1.502960e+12,2017-08-17 08:00:00,BTCUSDT,4333.32,4377.85,4333.32,4360.69,0.972807,4.239504e+03,28
...,...,...,...,...,...,...,...,...,...,...
70316,1.758570e+12,2025-09-22 19:00:00,BTCUSDT,112429.12,112600.87,111936.40,112122.90,1307.373650,1.467768e+08,126232
70317,1.758570e+12,2025-09-22 20:00:00,BTCUSDT,112122.90,112977.41,111975.28,112781.88,596.840050,6.707508e+07,93553
70318,1.758570e+12,2025-09-22 21:00:00,BTCUSDT,112781.87,112970.00,112602.79,112969.99,293.311560,3.307493e+07,42931
70319,1.758580e+12,2025-09-22 22:00:00,BTCUSDT,112969.99,112970.00,112594.33,112643.25,289.607150,3.264691e+07,42836


In [7]:
data['Datetime'].value_counts()

Datetime
2025-10-03 00:00:00    597
2017-08-17 04:00:00      1
2023-01-05 13:00:00      1
2023-01-05 19:00:00      1
2023-01-05 18:00:00      1
                      ... 
2020-05-12 06:00:00      1
2020-05-12 07:00:00      1
2020-05-12 08:00:00      1
2020-05-12 09:00:00      1
2025-09-22 23:00:00      1
Name: count, Length: 69682, dtype: int64

In [8]:
train, test, validation = split_data(data, train_size=0.6)

In [9]:
train

Unnamed: 0,Unix,Datetime,Symbol,Open,High,Low,Close,Volume BTC,Volume USDT,tradecount
0,1.502940e+12,2017-08-17 04:00:00,BTCUSDT,4261.48,4313.62,4261.32,4308.83,47.181009,2.023661e+05,171
1,1.502950e+12,2017-08-17 05:00:00,BTCUSDT,4308.83,4328.69,4291.37,4315.32,23.234916,1.003048e+05,102
2,1.502950e+12,2017-08-17 06:00:00,BTCUSDT,4330.29,4345.45,4309.37,4324.35,7.229691,3.128231e+04,36
3,1.502950e+12,2017-08-17 07:00:00,BTCUSDT,4316.62,4349.99,4287.41,4349.99,4.443249,1.924106e+04,25
4,1.502960e+12,2017-08-17 08:00:00,BTCUSDT,4333.32,4377.85,4333.32,4360.69,0.972807,4.239504e+03,28
...,...,...,...,...,...,...,...,...,...,...
42187,1.655270e+12,2022-06-15 05:00:00,BTCUSDT,21148.80,21350.00,20750.00,21226.31,8756.798933,1.844812e+08,116623
42188,1.655270e+12,2022-06-15 06:00:00,BTCUSDT,21226.31,21412.29,20965.03,21336.00,6425.583380,1.362367e+08,89340
42189,1.655280e+12,2022-06-15 07:00:00,BTCUSDT,21336.01,21445.71,20652.18,20722.92,10181.920160,2.139800e+08,109543
42190,1.655280e+12,2022-06-15 08:00:00,BTCUSDT,20722.92,20755.80,20185.00,20265.06,18440.164890,3.767810e+08,197438


In [10]:
test

Unnamed: 0,Unix,Datetime,Symbol,Open,High,Low,Close,Volume BTC,Volume USDT,tradecount
42192,1.655290e+12,2022-06-15 10:00:00,BTCUSDT,20238.03,20819.37,20200.58,20573.58,13664.50954,2.804686e+08,157019
42193,1.655290e+12,2022-06-15 11:00:00,BTCUSDT,20573.59,21290.00,20394.80,21177.03,10360.69930,2.166186e+08,137961
42194,1.655290e+12,2022-06-15 12:00:00,BTCUSDT,21177.04,21580.00,21045.13,21197.61,8343.09996,1.774477e+08,121085
42195,1.655300e+12,2022-06-15 13:00:00,BTCUSDT,21197.62,21798.95,21149.82,21561.50,7815.11646,1.679680e+08,115701
42196,1.655300e+12,2022-06-15 14:00:00,BTCUSDT,21561.49,21773.34,21080.26,21215.09,6073.13627,1.298477e+08,101373
...,...,...,...,...,...,...,...,...,...,...
56251,1.705900e+12,2024-01-22 06:00:00,BTCUSDT,41138.47,41321.17,41009.16,41205.99,1603.68724,6.602762e+07,91110
56252,1.705910e+12,2024-01-22 07:00:00,BTCUSDT,41205.99,41214.00,40978.63,41043.70,1048.82053,4.310608e+07,43528
56253,1.705910e+12,2024-01-22 08:00:00,BTCUSDT,41043.70,41043.71,40761.87,40959.00,1986.47106,8.124297e+07,72690
56254,1.705910e+12,2024-01-22 09:00:00,BTCUSDT,40958.99,41047.51,40610.00,40744.85,1725.51154,7.049876e+07,69055


In [11]:
test_mod = normalized_dates(test)
test_mod

Unnamed: 0,Unix,Datetime,Symbol,Open,High,Low,Close,Volume BTC,Volume USDT,tradecount
42192,1.655290e+12,2022-06-15 10:00:00,BTCUSDT,20238.03,20819.37,20200.58,20573.58,13664.50954,2.804686e+08,157019
42193,1.655290e+12,2022-06-15 11:00:00,BTCUSDT,20573.59,21290.00,20394.80,21177.03,10360.69930,2.166186e+08,137961
42194,1.655290e+12,2022-06-15 12:00:00,BTCUSDT,21177.04,21580.00,21045.13,21197.61,8343.09996,1.774477e+08,121085
42195,1.655300e+12,2022-06-15 13:00:00,BTCUSDT,21197.62,21798.95,21149.82,21561.50,7815.11646,1.679680e+08,115701
42196,1.655300e+12,2022-06-15 14:00:00,BTCUSDT,21561.49,21773.34,21080.26,21215.09,6073.13627,1.298477e+08,101373
...,...,...,...,...,...,...,...,...,...,...
56251,1.705900e+12,2024-01-22 05:00:00,BTCUSDT,41138.47,41321.17,41009.16,41205.99,1603.68724,6.602762e+07,91110
56252,1.705910e+12,2024-01-22 06:00:00,BTCUSDT,41205.99,41214.00,40978.63,41043.70,1048.82053,4.310608e+07,43528
56253,1.705910e+12,2024-01-22 07:00:00,BTCUSDT,41043.70,41043.71,40761.87,40959.00,1986.47106,8.124297e+07,72690
56254,1.705910e+12,2024-01-22 08:00:00,BTCUSDT,40958.99,41047.51,40610.00,40744.85,1725.51154,7.049876e+07,69055


In [12]:
validation

Unnamed: 0,Unix,Datetime,Symbol,Open,High,Low,Close,Volume BTC,Volume USDT,tradecount
56256,1.705920e+12,2024-01-22 11:00:00,BTCUSDT,40703.90,40941.09,40670.98,40815.99,1820.31952,7.430798e+07,93816
56257,1.705920e+12,2024-01-22 12:00:00,BTCUSDT,40815.92,40922.14,40685.96,40884.97,1281.48051,5.229682e+07,51227
56258,1.705930e+12,2024-01-22 13:00:00,BTCUSDT,40884.97,41244.06,40865.60,40943.17,2170.12828,8.910386e+07,126237
56259,1.705930e+12,2024-01-22 14:00:00,BTCUSDT,40943.17,40978.69,40515.90,40694.75,2673.37652,1.087713e+08,136300
56260,1.705940e+12,2024-01-22 15:00:00,BTCUSDT,40694.76,40850.00,40350.00,40771.99,3208.29236,1.301707e+08,144464
...,...,...,...,...,...,...,...,...,...,...
70316,1.758570e+12,2025-09-22 19:00:00,BTCUSDT,112429.12,112600.87,111936.40,112122.90,1307.37365,1.467768e+08,126232
70317,1.758570e+12,2025-09-22 20:00:00,BTCUSDT,112122.90,112977.41,111975.28,112781.88,596.84005,6.707508e+07,93553
70318,1.758570e+12,2025-09-22 21:00:00,BTCUSDT,112781.87,112970.00,112602.79,112969.99,293.31156,3.307493e+07,42931
70319,1.758580e+12,2025-09-22 22:00:00,BTCUSDT,112969.99,112970.00,112594.33,112643.25,289.60715,3.264691e+07,42836


In [13]:
validation_mod = normalized_dates(validation)
validation_mod

Unnamed: 0,Unix,Datetime,Symbol,Open,High,Low,Close,Volume BTC,Volume USDT,tradecount
56256,1.705920e+12,2024-01-22 11:00:00,BTCUSDT,40703.90,40941.09,40670.98,40815.99,1820.31952,7.430798e+07,93816
56257,1.705920e+12,2024-01-22 12:00:00,BTCUSDT,40815.92,40922.14,40685.96,40884.97,1281.48051,5.229682e+07,51227
56258,1.705930e+12,2024-01-22 13:00:00,BTCUSDT,40884.97,41244.06,40865.60,40943.17,2170.12828,8.910386e+07,126237
56259,1.705930e+12,2024-01-22 14:00:00,BTCUSDT,40943.17,40978.69,40515.90,40694.75,2673.37652,1.087713e+08,136300
56260,1.705940e+12,2024-01-22 15:00:00,BTCUSDT,40694.76,40850.00,40350.00,40771.99,3208.29236,1.301707e+08,144464
...,...,...,...,...,...,...,...,...,...,...
70316,1.758570e+12,2025-08-30 07:00:00,BTCUSDT,112429.12,112600.87,111936.40,112122.90,1307.37365,1.467768e+08,126232
70317,1.758570e+12,2025-08-30 08:00:00,BTCUSDT,112122.90,112977.41,111975.28,112781.88,596.84005,6.707508e+07,93553
70318,1.758570e+12,2025-08-30 09:00:00,BTCUSDT,112781.87,112970.00,112602.79,112969.99,293.31156,3.307493e+07,42931
70319,1.758580e+12,2025-08-30 10:00:00,BTCUSDT,112969.99,112970.00,112594.33,112643.25,289.60715,3.264691e+07,42836
