In [1]:
import pandas as pd
import os

# Convert parquet files to CSV to prevent compatibility issues

In [199]:
# Output folders for the converted CSV files; exist_ok=True makes the cell safe to re-run.
for output_dir in ("data/stock_prices/1h", "data/stock_prices/1d_adj"):
    os.makedirs(output_dir, exist_ok=True)

## Main stock price data

In [2]:
# Load the hourly NVDA bars produced in phase 1 (timestamps are not yet timezone-aware).
nvda_1h = pd.read_parquet("../phase1/data/alphavantage/stock_prices/1h/NVDA.parquet")

In [3]:
# Preview the raw hourly frame: OHLCV columns indexed by a naive `timestamp`.
nvda_1h

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02 04:00:00,5.9003,5.9481,5.9003,5.9389,60000
2020-01-02 05:00:00,5.9389,5.9486,5.9372,5.9449,29920
2020-01-02 06:00:00,5.9486,5.9501,5.9436,5.9436,37800
2020-01-02 07:00:00,5.9329,5.9481,5.9242,5.9464,614480
2020-01-02 08:00:00,5.9456,5.9625,5.9247,5.9556,1660520
...,...,...,...,...,...
2025-06-30 15:00:00,157.7411,158.6510,157.6611,157.8611,26770205
2025-06-30 16:00:00,157.8611,168.1440,147.8677,157.7811,72869000
2025-06-30 17:00:00,157.7811,168.1440,147.0664,157.7811,223935
2025-06-30 18:00:00,157.7611,157.7911,157.4911,157.5113,219009


# Check the daylight saving time change period
- Alpha Vantage data always starts the trading day at 4:00 AM and ends at 8:00 PM (i.e. the last hourly candle is the 7:00 PM one), regardless of DST.
- In the New York timezone, DST starts on the second Sunday of March and ends on the first Sunday of November.
- We need to make sure that everything is aligned properly around these dates.

## First, check how pandas handles the timezone conversion during DST change periods.
According to the 2022 calendar, DST started on March 13 (the second Sunday of March).

In [4]:
# Rows around the 2022 spring-forward weekend, still in naive local timestamps.
nvda_1h.loc["2022-03-11 17:00":"2022-03-14 06:00"]

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-03-11 17:00:00,22.0148,22.0578,21.9859,22.0578,182620
2022-03-11 18:00:00,22.0508,22.0568,21.9989,22.0059,155050
2022-03-11 19:00:00,22.0078,22.0977,21.9979,22.0628,326660
2022-03-14 04:00:00,22.0628,22.1626,21.912,22.0128,60150
2022-03-14 05:00:00,21.9679,22.3423,21.9679,22.3423,106900
2022-03-14 06:00:00,22.3423,22.3423,22.1646,22.1935,93160


In pandas, `.tz_localize()` handles the conversion correctly: it assigns the appropriate UTC offset to each timestamp based on its date, using the DST rules of the given timezone. As a result, the timestamps before March 13 have an offset of -5 hours (EST), while those on and after March 13 have an offset of -4 hours (EDT). This ensures that the local times are accurately represented in the New York timezone, taking the DST change into account.

In [5]:
# Same window as above, now labelled as New York wall-clock time so the UTC offset is visible.
(
    nvda_1h
    .loc["2022-03-11 17:00":"2022-03-14 06:00"]
    .tz_localize("America/New_York")
)

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-03-11 17:00:00-05:00,22.0148,22.0578,21.9859,22.0578,182620
2022-03-11 18:00:00-05:00,22.0508,22.0568,21.9989,22.0059,155050
2022-03-11 19:00:00-05:00,22.0078,22.0977,21.9979,22.0628,326660
2022-03-14 04:00:00-04:00,22.0628,22.1626,21.912,22.0128,60150
2022-03-14 05:00:00-04:00,21.9679,22.3423,21.9679,22.3423,106900
2022-03-14 06:00:00-04:00,22.3423,22.3423,22.1646,22.1935,93160


Below is how it looks after conversion to UTC (for universal consistency across datasets):
- On March 11, 2022, 4:00 AM EST becomes March 11, 2022, 9:00 AM UTC.
- On March 14, 2022, 4:00 AM EDT becomes March 14, 2022, 8:00 AM UTC.

In [6]:
# Localize to New York time first, then express the same instants in UTC.
(
    nvda_1h
    .loc["2022-03-11 0:00":"2022-03-15 10:00"]
    .tz_localize("America/New_York")
    .tz_convert("UTC")
)

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-03-11 09:00:00+00:00,22.5619,22.8604,22.5619,22.8424,85110
2022-03-11 10:00:00+00:00,22.8584,23.0211,22.8584,23.0112,179110
2022-03-11 11:00:00+00:00,22.9892,23.3506,22.8614,23.2577,300900
2022-03-11 12:00:00+00:00,23.2038,23.3106,23.091,23.1539,671420
2022-03-11 13:00:00+00:00,23.1438,23.3236,22.6747,23.1267,1609540
2022-03-11 14:00:00+00:00,23.1319,23.1908,22.6538,22.7157,36707820
2022-03-11 15:00:00+00:00,22.7196,22.8075,22.503,22.6697,44332250
2022-03-11 16:00:00+00:00,22.6773,22.6777,22.1366,22.3712,39164120
2022-03-11 17:00:00+00:00,22.3608,22.497,22.2255,22.4391,25340660
2022-03-11 18:00:00+00:00,22.4361,22.6198,22.3922,22.4062,27002530


Now we can finalize the data by converting to UTC timezone and saving to CSV files.

In [7]:
# Interpret the naive timestamps as New York wall-clock time, then convert to UTC.
# The localize step is guarded because this cell rebinds `nvda_1h`: calling
# tz_localize on an index that is already tz-aware raises a TypeError, so without
# the guard the cell could not be re-run. tz_convert is safe to repeat.
if nvda_1h.index.tz is None:
    nvda_1h = nvda_1h.tz_localize("America/New_York")
nvda_1h = nvda_1h.tz_convert("UTC")
nvda_1h

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02 09:00:00+00:00,5.9003,5.9481,5.9003,5.9389,60000
2020-01-02 10:00:00+00:00,5.9389,5.9486,5.9372,5.9449,29920
2020-01-02 11:00:00+00:00,5.9486,5.9501,5.9436,5.9436,37800
2020-01-02 12:00:00+00:00,5.9329,5.9481,5.9242,5.9464,614480
2020-01-02 13:00:00+00:00,5.9456,5.9625,5.9247,5.9556,1660520
...,...,...,...,...,...
2025-06-30 19:00:00+00:00,157.7411,158.6510,157.6611,157.8611,26770205
2025-06-30 20:00:00+00:00,157.8611,168.1440,147.8677,157.7811,72869000
2025-06-30 21:00:00+00:00,157.7811,168.1440,147.0664,157.7811,223935
2025-06-30 22:00:00+00:00,157.7611,157.7911,157.4911,157.5113,219009


In [8]:
# Hourly AMD bars: drop the stray "{}" column, store volume as int64,
# then localize to New York time and convert to UTC.
amd_1h = (
    pd.read_parquet("../phase1/data/alphavantage/stock_prices/1h/AMD.parquet")
    .drop(columns=["{}"])
    .astype({"volume": "int64"})
    .tz_localize("America/New_York")
    .tz_convert("UTC")
)
amd_1h

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02 09:00:00+00:00,46.30,46.7800,46.27,46.50,19897
2020-01-02 10:00:00+00:00,46.64,47.0000,46.60,46.80,18221
2020-01-02 11:00:00+00:00,46.85,46.9200,46.71,46.79,25050
2020-01-02 12:00:00+00:00,46.76,46.8800,46.49,46.86,235402
2020-01-02 13:00:00+00:00,46.86,46.9500,46.58,46.90,477175
...,...,...,...,...,...
2025-06-30 19:00:00+00:00,141.72,142.2400,141.22,141.89,6328950
2025-06-30 20:00:00+00:00,141.89,144.7858,133.49,141.70,7108561
2025-06-30 21:00:00+00:00,141.65,163.7370,96.73,141.52,72308
2025-06-30 22:00:00+00:00,141.52,141.7500,141.33,141.44,87966


In [9]:
# Hourly INTC bars, localized to New York time and converted to UTC in one chain.
intc_1h = (
    pd.read_parquet("../phase1/data/alphavantage/stock_prices/1h/INTC.parquet")
    .tz_localize("America/New_York")
    .tz_convert("UTC")
)
intc_1h

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02 09:00:00+00:00,53.0935,53.1817,53.0935,53.1376,1511
2020-01-02 10:00:00+00:00,53.1376,53.1906,53.1376,53.1376,1460
2020-01-02 11:00:00+00:00,53.1200,53.1288,53.1023,53.1288,1160
2020-01-02 12:00:00+00:00,53.1023,53.2082,53.0141,53.2082,22260
2020-01-02 13:00:00+00:00,53.1729,53.3493,53.1553,53.3317,15756
...,...,...,...,...,...
2025-06-30 19:00:00+00:00,22.4050,22.5200,22.3500,22.3900,10484601
2025-06-30 20:00:00+00:00,22.3900,22.4300,22.3500,22.3899,23150430
2025-06-30 21:00:00+00:00,22.3803,22.4100,22.3600,22.4000,49357
2025-06-30 22:00:00+00:00,22.4000,22.4100,22.2900,22.2900,174280


# For ETF data
On some days the ETF after-hours data has an extra candle at 8:00 PM, which is probably an adjustment made by the data provider. It will be discarded anyway during timestamp alignment with the NVDA stock data, because we use the NVDA data shape as the reference. This might look like data loss, but after the after-hours session there is normally an overnight trading session before the market opens, which we cannot access either. Any adjustment outside the time window we have will therefore show up in the next market-open price, so it should be fine.

In [208]:
# Load the hourly SPY bars and show the last 52 rows (still naive timestamps).
spy_1h = pd.read_parquet("../phase1/data/alphavantage/stock_prices/1h/SPY.parquet")
spy_1h.tail(52)

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-06-25 20:00:00,605.4496,605.4496,605.4496,605.4496,858269
2025-06-26 04:00:00,606.0479,607.7931,606.0479,607.3843,43645
2025-06-26 05:00:00,607.3843,607.4541,606.9754,607.0851,23633
2025-06-26 06:00:00,607.105,607.8131,606.9455,607.2147,53484
2025-06-26 07:00:00,607.2048,607.7333,607.1549,607.6535,219643
2025-06-26 08:00:00,606.817,607.8036,602.5626,607.3942,574241
2025-06-26 09:00:00,607.3843,607.7134,606.6962,607.4441,6739507
2025-06-26 10:00:00,607.4541,608.9001,607.3942,608.69,12692968
2025-06-26 11:00:00,608.6907,609.1843,608.1521,608.8502,7342907
2025-06-26 12:00:00,608.8402,609.4469,608.7804,609.1643,5322717


In [209]:
# Interpret the naive SPY timestamps as New York wall-clock time, then convert to UTC.
# The localize step is guarded because this cell rebinds `spy_1h`: calling
# tz_localize on an already tz-aware index raises a TypeError, so without the
# guard the cell could not be re-run. tz_convert is safe to repeat.
if spy_1h.index.tz is None:
    spy_1h = spy_1h.tz_localize("America/New_York")
spy_1h = spy_1h.tz_convert("UTC")
spy_1h[-52:]

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-06-26 00:00:00+00:00,605.4496,605.4496,605.4496,605.4496,858269
2025-06-26 08:00:00+00:00,606.0479,607.7931,606.0479,607.3843,43645
2025-06-26 09:00:00+00:00,607.3843,607.4541,606.9754,607.0851,23633
2025-06-26 10:00:00+00:00,607.105,607.8131,606.9455,607.2147,53484
2025-06-26 11:00:00+00:00,607.2048,607.7333,607.1549,607.6535,219643
2025-06-26 12:00:00+00:00,606.817,607.8036,602.5626,607.3942,574241
2025-06-26 13:00:00+00:00,607.3843,607.7134,606.6962,607.4441,6739507
2025-06-26 14:00:00+00:00,607.4541,608.9001,607.3942,608.69,12692968
2025-06-26 15:00:00+00:00,608.6907,609.1843,608.1521,608.8502,7342907
2025-06-26 16:00:00+00:00,608.8402,609.4469,608.7804,609.1643,5322717


In [210]:
# Hourly IWM bars, localized to New York time and converted to UTC in one chain.
iwm_1h = (
    pd.read_parquet("../phase1/data/alphavantage/stock_prices/1h/IWM.parquet")
    .tz_localize("America/New_York")
    .tz_convert("UTC")
)
iwm_1h

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02 09:00:00+00:00,155.1258,155.1258,155.1258,155.1258,130
2020-01-02 10:00:00+00:00,155.2467,155.3677,155.2467,155.3491,8800
2020-01-02 11:00:00+00:00,155.3398,155.3398,155.1630,155.1816,7321
2020-01-02 12:00:00+00:00,155.1444,155.2840,155.1444,155.2654,25378
2020-01-02 13:00:00+00:00,155.2840,155.4514,155.1258,155.4142,57833
...,...,...,...,...,...
2025-06-30 20:00:00+00:00,215.1788,221.9563,214.8099,214.9894,3732264
2025-06-30 21:00:00+00:00,214.9594,215.4481,214.6603,214.8797,334607
2025-06-30 22:00:00+00:00,214.8797,215.1788,214.7800,214.8397,1073467
2025-06-30 23:00:00+00:00,214.7909,215.1988,214.7899,215.0492,64592


In [211]:
# Hourly DIA bars, localized to New York time and converted to UTC in one chain.
dia_1h = (
    pd.read_parquet("../phase1/data/alphavantage/stock_prices/1h/DIA.parquet")
    .tz_localize("America/New_York")
    .tz_convert("UTC")
)
dia_1h

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02 09:00:00+00:00,257.7406,257.7766,257.7137,257.7676,1900
2020-01-02 10:00:00+00:00,257.8575,257.9384,257.8575,257.9384,2898
2020-01-02 11:00:00+00:00,257.8575,257.8575,257.8485,257.8485,1000
2020-01-02 12:00:00+00:00,257.6508,257.8395,257.6508,257.8036,4300
2020-01-02 13:00:00+00:00,257.7586,257.8844,257.7227,257.8395,16034
...,...,...,...,...,...
2025-06-30 20:00:00+00:00,438.7541,438.9334,433.3816,438.4653,134571
2025-06-30 21:00:00+00:00,438.6844,439.0827,438.1675,438.5649,110530
2025-06-30 22:00:00+00:00,438.2562,438.8139,438.2562,438.3847,50355
2025-06-30 23:00:00+00:00,438.4056,438.8139,438.3375,438.8139,1041


In [212]:
# Hourly frames to export, keyed by destination file (written in this order).
hourly_exports = {
    "data/stock_prices/1h/NVDA_UTC.csv": nvda_1h,
    "data/stock_prices/1h/AMD_UTC.csv": amd_1h,
    "data/stock_prices/1h/INTC_UTC.csv": intc_1h,
    "data/stock_prices/1h/SPY_UTC.csv": spy_1h,
    "data/stock_prices/1h/DIA_UTC.csv": dia_1h,
    "data/stock_prices/1h/IWM_UTC.csv": iwm_1h,
}

# Every file is named *_UTC.csv, so refuse to write a frame whose index is not UTC.
# `nvda_1h` is re-read from the raw parquet in a later cell of this notebook, so
# with out-of-order execution it can be timezone-naive when this cell runs.
# All frames are checked before anything is written, so a failure leaves no partial export.
not_utc = [
    path
    for path, frame in hourly_exports.items()
    if str(getattr(frame.index, "tz", None)) != "UTC"
]
if not_utc:
    raise ValueError(f"Refusing to export frames whose index is not UTC: {not_utc}")

for path, frame in hourly_exports.items():
    frame.to_csv(path)

## 1d

In [149]:
# Load the daily adjusted bars for all six tickers; the unpacking order matches the path order.
nvda_1d_adj, amd_1d_adj, intc_1d_adj, spy_1d_adj, dia_1d_adj, iwm_1d_adj = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "../phase1/data/alphavantage/stock_prices/1d_adj/NVDA.parquet",
        "../phase1/data/alphavantage/stock_prices/1d_adj/AMD.parquet",
        "../phase1/data/alphavantage/stock_prices/1d_adj/INTC.parquet",
        "../phase1/data/alphavantage/stock_prices/1d_adj/SPY.parquet",
        "../phase1/data/alphavantage/stock_prices/1d_adj/DIA.parquet",
        "../phase1/data/alphavantage/stock_prices/1d_adj/IWM.parquet",
    )
)

In [165]:
# Hourly NVDA rows from 2024-06-07 up to 2024-06-10 00:00.
nvda_1h.loc["2024-06-07 0:00":"2024-06-10 0:00"]

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-06-07 08:00:00+00:00,121.0457,121.8454,120.446,121.4945,1403190
2024-06-07 09:00:00+00:00,121.4905,121.6594,120.9188,121.2606,577450
2024-06-07 10:00:00+00:00,121.2606,121.2956,120.7658,120.8108,510440
2024-06-07 11:00:00+00:00,120.8138,120.9458,120.1591,120.6959,1712830
2024-06-07 12:00:00+00:00,120.6059,125.4192,117.6572,118.8527,13434480
2024-06-07 13:00:00+00:00,118.8467,119.7863,104.6499,119.0706,71528320
2024-06-07 14:00:00+00:00,119.0736,119.698,118.5962,119.1596,53627400
2024-06-07 15:00:00+00:00,119.1556,119.3165,118.6568,119.2395,33764850
2024-06-07 16:00:00+00:00,119.2615,121.2376,119.0726,121.2196,56593270
2024-06-07 17:00:00+00:00,121.2276,121.6372,120.7808,121.0079,42641080


In [191]:
# Re-read the raw hourly NVDA parquet for the volume checks below.
# NOTE(review): this rebinds `nvda_1h`, replacing the UTC-converted frame built earlier
# in the notebook with the naive-timestamp original; any cell run after this one sees
# the raw frame. Consider a separate name (e.g. a `_raw` suffix) — confirm intent.
nvda_1h = pd.read_parquet("../phase1/data/alphavantage/stock_prices/1h/NVDA.parquet")

In [192]:
# Collapse the hourly bars into calendar-day OHLCV bars
# (days without any rows come out as NaN prices and 0 volume).
nvda_1h.resample("D").agg(
    dict(open="first", high="max", low="min", close="last", volume="sum")
)

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,5.9003,5.9837,5.8921,5.9750,180502680
2020-01-03,5.8866,5.9197,5.7786,5.8759,157318880
2020-01-04,,,,,0
2020-01-05,,,,,0
2020-01-06,5.7995,5.9090,5.7440,5.9090,209043400
...,...,...,...,...,...
2025-06-26,155.8612,163.7897,142.7319,155.3412,207150109
2025-06-27,155.6812,190.0828,100.1890,157.5512,295668597
2025-06-28,,,,,0
2025-06-29,,,,,0


In [183]:
# Total daily-bar volume over 2024-04-01..2024-05-01 divided by a hard-coded 23.
# NOTE(review): 23 is presumably the number of daily rows in the slice — confirm.
nvda_1d_adj.loc["2024-04-01 0:00":"2024-05-01 0:00", "volume"].sum() / 23

np.float64(46128702.782608695)

In [188]:
# Total hourly-bar volume over 2024-06-10..2024-07-11 divided by a hard-coded 16 * 23.
# NOTE(review): presumably 16 hourly candles per day times 23 days; the day count is
# not derived from the data — confirm it matches the number of trading days in the slice.
nvda_1h.loc["2024-06-10 0:00":"2024-07-11 0:00", "volume"].sum() / (16 * 23)

np.float64(19382528.258152176)

In [189]:
# Mean daily volume from the adjusted daily bars over the same date window.
nvda_1d_adj.loc["2024-06-10 0:00":"2024-07-11 0:00", "volume"].mean()

np.float64(321407078.1363636)

In [143]:
# Daily NVDA bars from the `1d` folder (as opposed to `1d_adj`), loaded for comparison.
nvda_1d = pd.read_parquet("../phase1/data/alphavantage/stock_prices/1d/NVDA.parquet")
nvda_1d

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,238.750,239.910,236.720,239.91,5941969
2020-01-03,235.100,237.830,234.100,236.07,5144308
2020-01-06,232.320,237.270,231.270,237.06,6572812
2020-01-07,238.200,241.770,236.390,239.93,7980144
2020-01-08,239.760,242.040,238.150,240.38,6931017
...,...,...,...,...,...
2025-06-24,145.560,147.960,145.500,147.90,187566121
2025-06-25,149.270,154.450,149.260,154.31,269146471
2025-06-26,155.975,156.715,154.000,155.02,198145746
2025-06-27,156.040,158.710,155.255,157.75,263234539


In [194]:
# The 2020-01-02 daily volume scaled by 40 equals the volume shown on TradingView.
# NOTE(review): 40 is presumably the cumulative stock-split factor — confirm.
5_941_969 * 40

237678760

The above shows that the NVDA 1h price and volume data are already adjusted for stock splits, so no further adjustment is needed.

In [None]:
# Preview the adjusted daily frame (includes adjusted_close, dividend_amount, split_coefficient).
nvda_1d_adj

Unnamed: 0_level_0,open,high,low,close,adjusted_close,volume,dividend_amount,split_coefficient
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-02,238.750,239.910,236.720,239.91,5.971508,5941969,0.0,1.0
2020-01-03,235.100,237.830,234.100,236.07,5.875928,5144308,0.0,1.0
2020-01-06,232.320,237.270,231.270,237.06,5.900569,6572812,0.0,1.0
2020-01-07,238.200,241.770,236.390,239.93,5.972005,7980144,0.0,1.0
2020-01-08,239.760,242.040,238.150,240.38,5.983206,6931017,0.0,1.0
...,...,...,...,...,...,...,...,...
2025-06-24,145.560,147.960,145.500,147.90,147.891653,187566121,0.0,1.0
2025-06-25,149.270,154.450,149.260,154.31,154.301291,269146471,0.0,1.0
2025-06-26,155.975,156.715,154.000,155.02,155.011251,198145746,0.0,1.0
2025-06-27,156.040,158.710,155.255,157.75,157.741097,263234539,0.0,1.0


In [None]:
# Days whose split coefficient is above 1, i.e. rows where a stock split is recorded.
nvda_1d_adj.loc[nvda_1d_adj["split_coefficient"].gt(1.0)]

In [None]:
# Write each adjusted daily frame to its CSV file, in the same order as before.
for daily_frame, csv_path in (
    (nvda_1d_adj, "data/stock_prices/1d_adj/NVDA.csv"),
    (amd_1d_adj, "data/stock_prices/1d_adj/AMD.csv"),
    (intc_1d_adj, "data/stock_prices/1d_adj/INTC.csv"),
    (spy_1d_adj, "data/stock_prices/1d_adj/SPY.csv"),
    (dia_1d_adj, "data/stock_prices/1d_adj/DIA.csv"),
    (iwm_1d_adj, "data/stock_prices/1d_adj/IWM.csv"),
):
    daily_frame.to_csv(csv_path)

## News sentiment data and insider trading data

In [22]:
# News sentiment data
nvda_news = pd.read_parquet("../phase1/data/alphavantage/news/nvidia_news.parquet")
nvda_news.to_csv("data/NVDA_news.csv")

# Insider trading data
nvda_insider = pd.read_parquet("../phase1/data/alphavantage/insiders/NVDA_insider_transactions_2020_2025.parquet")
nvda_insider.to_csv("data/NVDA_insiders.csv")

## Bitcoin and Gold price data

In [97]:
# The MT5 exports are tab-separated; load the BTCUSD and XAUUSD hourly files.
btc, gold = (
    pd.read_csv(export_path, sep="\t")
    for export_path in (
        "../phase1/data/MT5/BTCUSD_H1_202112310000_202506302300.csv",
        "../phase1/data/MT5/XAUUSD_H1_202112310100_202506302300.csv",
    )
)

In [98]:
# Preview the raw MT5 export: separate <DATE>/<TIME> text columns plus OHLC, tick volume, volume and spread.
btc

Unnamed: 0,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<TICKVOL>,<VOL>,<SPREAD>
0,2021.12.31,00:00:00,47295.50,47312.50,46727.50,47078.50,3637,0,3000
1,2021.12.31,01:00:00,47076.50,47242.00,46934.50,47119.00,3686,0,3000
2,2021.12.31,02:00:00,47119.00,47400.00,46834.50,47071.00,4129,0,3000
3,2021.12.31,03:00:00,47071.00,47355.50,46842.50,47123.50,3538,0,3000
4,2021.12.31,04:00:00,47123.50,47126.50,46830.50,46994.50,2906,0,3000
...,...,...,...,...,...,...,...,...,...
29591,2025.06.30,19:00:00,107606.65,107865.45,107288.00,107625.89,12028,0,3305
29592,2025.06.30,20:00:00,107623.09,107844.93,107447.52,107491.58,11249,0,3305
29593,2025.06.30,21:00:00,107489.16,107664.54,107156.94,107264.56,11087,0,3305
29594,2025.06.30,22:00:00,107264.65,107820.69,107219.29,107760.75,10943,0,3305


In [99]:
# Rows where the real-volume column is populated at all.
btc.loc[btc["<VOL>"].gt(0)]

Unnamed: 0,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<TICKVOL>,<VOL>,<SPREAD>
15741,2023.11.13,11:00:00,37030.0,37041.0,36968.24,36998.6,93,5005,0
15742,2023.11.13,12:00:00,36997.04,37008.57,36920.26,36983.0,219,3861,0
23599,2024.10.19,06:00:00,68435.0,69480.0,68413.0,68427.0,732,23,0


In [100]:
# Same check for gold: rows with a non-zero real-volume value.
gold.loc[gold["<VOL>"].gt(0)]

Unnamed: 0,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<TICKVOL>,<VOL>,<SPREAD>


In [101]:
# <VOL> is zero for nearly every row (see the checks above), so drop it together with <SPREAD>.
btc, gold = (
    frame.drop(columns=["<VOL>", "<SPREAD>"])
    for frame in (btc, gold)
)

In [102]:
# Confirm how the date and time columns were parsed (both are plain object/text columns).
for column_name in ("<DATE>", "<TIME>"):
    print(btc[column_name].dtype)

object
object


In [103]:
def _index_by_timestamp(frame):
    """Return a copy of an MT5 export indexed by a parsed `timestamp`.

    The text columns `<DATE>` (e.g. "2021.12.31") and `<TIME>` (e.g. "00:00:00")
    are joined and parsed with an explicit format, so no format inference is
    involved and a malformed row raises instead of being guessed at. The two
    text columns are dropped from the result; the input frame is not modified.
    """
    timestamp = pd.to_datetime(
        frame["<DATE>"] + " " + frame["<TIME>"],
        format="%Y.%m.%d %H:%M:%S",
    )
    return (
        frame
        .assign(timestamp=timestamp)
        .set_index("timestamp")
        .drop(columns=["<DATE>", "<TIME>"])
    )


btc = _index_by_timestamp(btc)
gold = _index_by_timestamp(gold)

In [104]:
# Preview BTC after indexing by `timestamp` (naive datetimes, no timezone attached yet).
btc

Unnamed: 0_level_0,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<TICKVOL>
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-12-31 00:00:00,47295.50,47312.50,46727.50,47078.50,3637
2021-12-31 01:00:00,47076.50,47242.00,46934.50,47119.00,3686
2021-12-31 02:00:00,47119.00,47400.00,46834.50,47071.00,4129
2021-12-31 03:00:00,47071.00,47355.50,46842.50,47123.50,3538
2021-12-31 04:00:00,47123.50,47126.50,46830.50,46994.50,2906
...,...,...,...,...,...
2025-06-30 19:00:00,107606.65,107865.45,107288.00,107625.89,12028
2025-06-30 20:00:00,107623.09,107844.93,107447.52,107491.58,11249
2025-06-30 21:00:00,107489.16,107664.54,107156.94,107264.56,11087
2025-06-30 22:00:00,107264.65,107820.69,107219.29,107760.75,10943


In [105]:
# Preview gold after indexing by `timestamp` (naive datetimes, no timezone attached yet).
gold

Unnamed: 0_level_0,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<TICKVOL>
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-12-31 01:00:00,1815.57,1816.17,1814.15,1815.77,972
2021-12-31 02:00:00,1815.77,1816.94,1815.20,1816.85,1323
2021-12-31 03:00:00,1816.84,1819.00,1815.11,1816.58,4107
2021-12-31 04:00:00,1816.58,1818.03,1814.69,1817.91,2517
2021-12-31 05:00:00,1817.90,1818.91,1816.63,1817.06,1982
...,...,...,...,...,...
2025-06-30 19:00:00,3289.61,3297.37,3289.05,3297.21,10942
2025-06-30 20:00:00,3297.18,3298.50,3291.85,3294.05,10825
2025-06-30 21:00:00,3294.06,3299.52,3292.54,3298.39,8029
2025-06-30 22:00:00,3298.40,3309.47,3297.98,3308.65,10356


In [106]:
# NOTE(review): self-assignment, effectively a no-op — looks like a leftover
# from an experiment and can probably be deleted; confirm.
btc.index = btc.index

In [118]:
# Positional window near the end of the series, used to compare against the TradingView chart.
btc.iloc[-28:-13]

Unnamed: 0_level_0,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<TICKVOL>
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-06-29 20:00:00,107589.01,107648.81,107350.18,107574.45,7735
2025-06-29 21:00:00,107574.66,107761.39,107468.7,107658.29,6998
2025-06-29 22:00:00,107658.72,107679.8,107372.73,107426.77,5740
2025-06-29 23:00:00,107426.35,107534.67,107396.2,107422.09,5838
2025-06-30 00:00:00,107338.9,107756.14,107265.41,107597.56,6639
2025-06-30 01:00:00,107598.65,108316.63,107598.65,108113.71,12578
2025-06-30 02:00:00,108113.42,108471.85,108018.19,108385.64,11379
2025-06-30 03:00:00,108389.68,108812.02,108264.08,108739.82,11214
2025-06-30 04:00:00,108740.39,108809.95,108376.02,108520.06,10518
2025-06-30 05:00:00,108521.72,108696.09,108454.75,108517.49,8959


# TradingView reference for Thai timezone
The peaks around $108,800 fall on the 7:00 AM and 8:00 AM (UTC+7) candles on TradingView, which correspond to the 03:00 AM and 04:00 AM candles in our data.
This means the data we have is in the UTC+3 timezone.
![tradingview_ref_thai_tz](<assets/tradingview_ref_thai_tz.png>)
![mt5](<assets/mt5_tz.png>)

In [122]:
# "Etc/GMT-3" is the fixed UTC+3 zone (the Etc/GMT names use an inverted sign),
# as the +03:00 offsets in the output confirm.
btc.tz_localize("Etc/GMT-3").iloc[-28:-13]

Unnamed: 0_level_0,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<TICKVOL>
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-06-29 20:00:00+03:00,107589.01,107648.81,107350.18,107574.45,7735
2025-06-29 21:00:00+03:00,107574.66,107761.39,107468.7,107658.29,6998
2025-06-29 22:00:00+03:00,107658.72,107679.8,107372.73,107426.77,5740
2025-06-29 23:00:00+03:00,107426.35,107534.67,107396.2,107422.09,5838
2025-06-30 00:00:00+03:00,107338.9,107756.14,107265.41,107597.56,6639
2025-06-30 01:00:00+03:00,107598.65,108316.63,107598.65,108113.71,12578
2025-06-30 02:00:00+03:00,108113.42,108471.85,108018.19,108385.64,11379
2025-06-30 03:00:00+03:00,108389.68,108812.02,108264.08,108739.82,11214
2025-06-30 04:00:00+03:00,108740.39,108809.95,108376.02,108520.06,10518
2025-06-30 05:00:00+03:00,108521.72,108696.09,108454.75,108517.49,8959


In [137]:
# View BTC in New York time; rows 1640-1649 straddle the 2022-03-13 switch from -05:00 to -04:00.
(
    btc
    .tz_localize("Etc/GMT-3")
    .tz_convert("America/New_York")
    .iloc[1640:1650]
)

Unnamed: 0_level_0,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<TICKVOL>
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-03-12 22:00:00-05:00,39186.5,39303.0,39107.5,39117.5,1924
2022-03-12 23:00:00-05:00,39117.5,39162.5,39088.5,39113.5,1345
2022-03-13 00:00:00-05:00,39113.5,39167.5,39057.5,39134.5,1908
2022-03-13 01:00:00-05:00,39134.5,39160.0,39119.5,39136.5,264
2022-03-13 03:00:00-04:00,39127.5,39187.5,39051.5,39071.5,1302
2022-03-13 06:00:00-04:00,38963.5,39001.5,38850.0,38983.5,3041
2022-03-13 07:00:00-04:00,38983.5,39013.0,38612.0,38738.0,3776
2022-03-13 08:00:00-04:00,38740.5,38767.0,38366.0,38498.0,5256
2022-03-13 09:00:00-04:00,38498.0,38939.0,38457.5,38739.5,5272
2022-03-13 10:00:00-04:00,38739.5,38929.0,38715.0,38848.0,3385


In [1]:
# Positional window of hourly NVDA rows; needs the cells that define `nvda_1h` to have run first.
nvda_1h.iloc[720:750]

NameError: name 'nvda_1h' is not defined

In [167]:
# De-duplicated (url, title) pairs, rows 60-69, with publish times treated as UTC and shown in New York time.
(
    nvda_news[["url", "title"]]
    .drop_duplicates()
    .iloc[60:70]
    .tz_localize("UTC")
    .tz_convert("America/New_York")
)

Unnamed: 0_level_0,url,title
time_published,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-03-09 11:38:00-05:00,https://www.zacks.com/stock/news/1879754/the-z...,The Zacks Analyst Blog Highlights Baker Hughes...
2022-03-09 12:22:53-05:00,https://www.fool.com/investing/2022/03/09/why-...,Why Nvidia Stock Is on Fire Today
2022-03-09 15:01:10-05:00,https://stocknews.com/news/nvda-avgo-better-ch...,NVDA: Better Chip Stock: NVIDIA vs. Broadcom
2022-03-09 16:42:22-05:00,https://www.kiplinger.com/investing/stocks/604...,Stock Market Today: Tech Stocks Lead Relief Rally
2022-03-09 16:53:00-05:00,https://www.zacks.com/stock/news/1879859/techn...,Technically Constructive: Higher Lows on Ukrai...
2022-03-09 18:30:00-05:00,https://www.zacks.com/commentary/1879893/these...,These 3 Household Tech Stocks Can Give Your Po...
2022-03-09 19:22:05-05:00,https://www.cnbc.com/2022/03/09/amazon-split-c...,Amazon split could set it up for being include...
2022-03-10 03:05:42-05:00,https://www.cnbc.com/2022/03/10/vodafone-inves...,Vodafone investigating threat from hackers beh...
2022-03-10 04:00:00-05:00,https://www.barrons.com/articles/chip-shortage...,Commentary: Chip Shortages Are Still Wreaking ...
2022-03-10 04:00:00-05:00,https://www.benzinga.com/pressreleases/22/03/g...,SPEC Establishes Machine Learning Committee to...


In [174]:
# First 300 de-duplicated (url, title) pairs, with publish times treated as UTC and shown in New York time.
(
    nvda_news[["url", "title"]]
    .drop_duplicates()
    .iloc[:300]
    .tz_localize("UTC")
    .tz_convert("America/New_York")
)

Unnamed: 0_level_0,url,title
time_published,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-03-03 01:56:19-05:00,https://www.scmp.com/presented/business/topics...,MarketingPulse and eTailingPulse to shed light...
2022-03-03 16:03:36-05:00,https://www.kiplinger.com/investing/stocks/604...,20 Stocks Billionaires Are Selling
2022-03-04 12:22:00-05:00,https://www.zacks.com/stock/news/1877770/the-t...,The Top 5 Investment Plays for Blockchain
2022-03-04 14:36:00-05:00,https://www.thestreet.com/investing/nvidia-fac...,Nvidia Faces an Unusual Demand That Threatens ...
2022-03-05 19:00:00-05:00,https://www.economist.com/business/2022/03/06/...,"Amid Russia's war, America Inc reckons with th..."
...,...,...
2022-03-23 14:51:01-04:00,https://www.benzinga.com/pressreleases/22/03/g...,"Dedicated Computing Announces M1000, Offering ..."
2022-03-23 15:19:01-04:00,https://stockmarket.com/featured/3-top-electri...,3 Top Electric Vehicle Stocks To Watch Ahead O...
2022-03-23 16:15:00-04:00,https://www.benzinga.com/pressreleases/22/03/g...,Mercury's new rugged distributed processing so...
2022-03-23 16:15:00-04:00,https://www.globenewswire.com/news-release/202...,Mercury's new rugged distributed processing so...


In [None]:
# Count rows whose `authors` value repeats an earlier row's value.
nvda_news.duplicated(subset=["authors"]).sum()