# Import libraries

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
import pandas as pd
import requests
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Fetch ETH/USDT OHLCV price data from Binance

In [None]:
def fetch_ohlcv_binance(symbol="ETHUSDT", interval="1d",
                        start="2022-01-01", end="2025-12-10",
                        session=None, timeout=30):
    """Download OHLCV candles from the Binance klines endpoint.

    The range ``[start, end)`` is fetched page by page (at most 1000 candles
    per request); each following request starts one millisecond after the
    open time of the last candle received.

    Parameters
    ----------
    symbol : str
        Trading pair sent as the ``symbol`` query parameter.
    interval : str
        Candle interval sent as the ``interval`` query parameter.
    start, end : str or datetime-like
        Range bounds, parsed with ``pd.Timestamp``. Candles whose open time
        is greater than or equal to ``end`` are dropped from the result.
    session : object, optional
        Anything exposing a ``get(url, params=..., timeout=...)`` method that
        returns a response with ``status_code``, ``text`` and ``json()``
        (for example a ``requests.Session``). Defaults to the module-level
        ``requests`` API.
    timeout : float, optional
        Per-request timeout in seconds, forwarded to ``get``.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp, open, high, low, close, volume``, sorted by
        ``timestamp`` with a fresh integer index. ``timestamp`` is the candle
        open time converted from epoch milliseconds (tz-naive).

    Raises
    ------
    RuntimeError
        If a request returns a non-200 status or the decoded payload is not
        a list of candles (the API reports errors as a JSON object).
    """
    url = "https://api.binance.com/api/v3/klines"
    start_ms = int(pd.Timestamp(start).timestamp() * 1000)
    end_ms = int(pd.Timestamp(end).timestamp() * 1000)

    # Use the caller's session when given, otherwise plain ``requests``.
    http = requests if session is None else session

    all_rows = []
    current = start_ms

    while current < end_ms:
        params = {
            "symbol": symbol,
            "interval": interval,
            "startTime": current,
            "endTime": end_ms,
            "limit": 1000,
        }
        # A timeout keeps a stalled connection from hanging the loop forever.
        r = http.get(url, params=params, timeout=timeout)
        if r.status_code != 200:
            # Include the body: it carries the server's own error message.
            raise RuntimeError(
                f"Binance klines request failed (HTTP {r.status_code}): {r.text}"
            )

        data = r.json()
        if not isinstance(data, list):
            # An error object would otherwise be silently mixed into the rows.
            raise RuntimeError(f"Unexpected Binance klines payload: {data!r}")

        if not data:
            break

        all_rows.extend(data)

        # Element 0 of each row is the candle open time in epoch milliseconds.
        last_time = data[-1][0]
        if last_time >= end_ms:
            break

        # Resume just after the last candle so no row is fetched twice.
        current = last_time + 1

    cols = ["open_time", "open", "high", "low", "close", "volume",
            "close_time", "qav", "num_trades", "taker_base", "taker_quote",
            "ignore"]

    df = pd.DataFrame(all_rows, columns=cols)

    df["timestamp"] = pd.to_datetime(df["open_time"], unit="ms")

    # Prices and volume arrive as strings; unparsable values become NaN.
    for c in ["open", "high", "low", "close", "volume"]:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    df = df[["timestamp", "open", "high", "low", "close", "volume"]]

    # Trim any candles at or beyond the requested end time.
    df = df[df["timestamp"] < pd.to_datetime(end)]

    df = df.sort_values("timestamp").reset_index(drop=True)
    return df

# Pull the candles using the function's default arguments, keep them in `df`,
# and snapshot the frame to disk without the index column.
df = fetch_ohlcv_binance()
df.to_csv(path_or_buf="ETH_dataset.csv", index=False)