In [3]:
# Install the Binance bulk-data downloader. `%pip` (instead of `!pip`) installs
# into the environment of the running kernel rather than whatever `pip` the
# subshell resolves to.
%pip install -q binance-historical-data



In [4]:
# Paso 1: Librerías necesarias
import os
import pandas as pd
import datetime
from binance_historical_data import BinanceDataDumper
from google.colab import drive

# Step 2: mount Google Drive at /content/drive
drive.mount("/content/drive")

# Step 3: project paths (the data folder is created if it does not exist yet)
base_dir = "/content/drive/MyDrive/btc_prediction_project"
data_dir = os.path.join(base_dir, "data")
os.makedirs(data_dir, exist_ok=True)

btc_parquet_path = os.path.join(data_dir, "btc_5s.parquet")
binance_raw_dir = os.path.join(data_dir, "binance_raw_1s")  # raw Binance CSV dumps are written here

# Step 4: only download when the resampled parquet is not on Drive yet
if not os.path.exists(btc_parquet_path):
    print("⬇️ Descargando datos de Binance (1s)...")

    # Step 5: dump BTCUSDT spot aggTrades starting 2024-01-01.
    # NOTE(review): `date_end=None` presumably means "up to the latest available
    # date" — confirm against the binance_historical_data documentation.
    # This branch only downloads the raw files; it does not assign `df_5s`.
    dumper = BinanceDataDumper(
        path_dir_where_to_dump=binance_raw_dir,
        asset_class="spot",
        data_type="aggTrades",
        data_frequency="1s",
    )
    dumper.dump_data(
        tickers=["BTCUSDT"],
        date_start=datetime.date(2024, 1, 1),
        date_end=None,
        is_to_update_existing=False,
    )

else:
    # Cached result available: load the 5-second price series directly.
    print("✅ Cargando datos 5s desde Drive (ya existen)...")
    df_5s = pd.read_parquet(btc_parquet_path)



Mounted at /content/drive
✅ Cargando datos 5s desde Drive (ya existen)...


In [None]:
import os
import pandas as pd
import glob
from google.colab import drive

# Step 1: mount Google Drive
drive.mount("/content/drive")

# Step 2: project paths
base_dir = "/content/drive/MyDrive/btc_prediction_project"
data_dir = os.path.join(base_dir, "data")
btc_parquet_path = os.path.join(data_dir, "btc_5s.parquet")
binance_raw_dir = os.path.join(data_dir, "binance_raw_1s")  # the raw CSVs must already be here

# Step 3: locate the BTCUSDT CSV files anywhere below the raw-data folder
csv_search_path = os.path.join(binance_raw_dir, "**", "BTCUSDT", "*.csv")
found_files = glob.glob(csv_search_path, recursive=True)

if not found_files:
    raise FileNotFoundError(f"⚠️ No se encontraron archivos CSV en: {csv_search_path}")

# Step 4: folder that holds the CSVs
# NOTE(review): only the folder of the first match is read below; CSVs that the
# recursive search found in other folders are ignored.
csv_folder = os.path.dirname(found_files[0])
print(f"✅ Carpeta CSV detectada: {csv_folder}")

# Step 5: read and concatenate the CSVs; the files are read without a header
# row, so the column names are supplied manually.
column_names = [
    "agg_trade_id", "price", "quantity",
    "first_trade_id", "last_trade_id",
    "timestamp", "is_buyer_maker", "is_best_match"
]

all_files = sorted([os.path.join(csv_folder, f) for f in os.listdir(csv_folder) if f.endswith(".csv")])
df_list = [pd.read_csv(f, header=None, names=column_names) for f in all_files]
df_all = pd.concat(df_list, ignore_index=True)

# Step 6: parse timestamps and index by them.
# Binance spot dumps switched from millisecond to microsecond epoch timestamps
# on 2025-01-01, so a single `unit="ms"` overflows / mis-dates the newer rows.
# Any value >= 1e14 cannot be a millisecond timestamp for a plausible date
# (1e14 ms is beyond year 5000), so it is treated as microseconds and scaled down.
raw_timestamps = df_all["timestamp"]
us_threshold = 10**14
timestamps_ms = raw_timestamps.where(raw_timestamps < us_threshold, raw_timestamps // 1000)
df_all["timestamp"] = pd.to_datetime(timestamps_ms, unit="ms")
df_all = df_all.set_index("timestamp")

# Step 7: resample to 5-second bars (last trade price per bin; empty bins are
# dropped). Lowercase "5s" is used because the uppercase "S" alias is deprecated.
df_5s = df_all["price"].astype(float).resample("5s").last().dropna().to_frame(name="price")

# Step 8: persist as parquet
df_5s.to_parquet(btc_parquet_path)
print(f"✅ Datos re-muestreados guardados en: {btc_parquet_path}")

# Step 9: show a sample
print(df_5s.head())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Carpeta CSV detectada: /content/drive/MyDrive/btc_prediction_project/data/binance_raw_1s/aggTrades/BTCUSDT


In [None]:
# Rebuild the 5-second price series from the trades held in `df_all`:
# last price in each 5-second bin, bins without a value dropped.
df_5s = (
    df_all["price"]
    .astype(float)
    .resample("5s")
    .last()
    .dropna()
    .to_frame(name="price")
)

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# ---------- FUNCIONES ------------

def sample_entropy(time_series, m=2, r=0.2):
    """Entropy-style regularity score of a 1-D series.

    Every run of ``k`` consecutive samples is a template. Two templates match
    when their largest element-wise absolute difference is at most
    ``r * std(time_series)``. For ``k = m`` and ``k = m + 1`` the number of
    matches (self-matches removed) is normalised by
    ``(N - k + 1) * (N - k)``; the result is ``-log`` of the ratio of the
    ``m + 1`` value to the ``m`` value.

    Args:
        time_series: sequence of numbers (list or 1-D array).
        m: base template length.
        r: tolerance as a fraction of the series' standard deviation.

    Returns:
        The score as a float (may be ``inf``/``nan`` when no templates match).
    """
    n_points = len(time_series)
    tolerance = r * np.std(time_series)

    def _match_fraction(length):
        n_templates = n_points - length + 1
        templates = np.array([time_series[s:s + length] for s in range(n_templates)])
        # Pairwise Chebyshev distance between all templates.
        distances = np.max(np.abs(templates[:, None] - templates[None, :]), axis=2)
        # Matches per template, minus 1 to discard the self-match.
        matches = np.sum(distances <= tolerance, axis=0) - 1
        return np.sum(matches) / n_templates / (n_points - length)

    longer = _match_fraction(m + 1)
    shorter = _match_fraction(m)
    return -np.log(longer / shorter)


def extract_windows(data, window_size):
    """Return z-score-normalised sliding windows of ``data["price"]``.

    One window is produced for each start index in
    ``range(len(prices) - window_size)``, so the final possible window is not
    included. Each window is centred on its own mean and divided by its own
    standard deviation.

    A constant window has zero standard deviation; dividing would produce a
    row of NaNs (which downstream clustering rejects), so such a window is
    returned as all zeros instead.

    Args:
        data: mapping/DataFrame with a ``"price"`` column exposing ``.values``.
        window_size: number of consecutive prices per window.

    Returns:
        2-D float array of shape ``(n_windows, window_size)``; an empty
        ``(0, window_size)`` array when the series is too short.
    """
    prices = data["price"].values
    windows = []
    for start in range(len(prices) - window_size):
        w = prices[start:start + window_size]
        centered = w - w.mean()
        # Exact flatness test: a tiny non-zero std from rounding must not be
        # mistaken for real variation, and a true zero must not be divided by.
        is_flat = w.size > 0 and w.max() == w.min()
        windows.append(np.zeros(w.shape) if is_flat else centered / w.std())
    if not windows:
        # Keep the result 2-D so callers can rely on the column count.
        return np.empty((0, window_size))
    return np.array(windows)


def fit_kmeans(windows, n_clusters=100):
    """Cluster the windows with KMeans (fixed ``random_state=42``).

    Args:
        windows: 2-D array-like of samples to cluster.
        n_clusters: number of clusters to fit.

    Returns:
        Tuple ``(cluster_centers, fitted_model)``.
    """
    model = KMeans(n_clusters=n_clusters, random_state=42)
    model.fit(windows)
    centers = model.cluster_centers_
    return centers, model


def select_top_entropy(centers, top_k=20):
    """Pick the ``top_k`` cluster centres with the highest sample entropy.

    The entropy of each centre is computed with ``sample_entropy``. A centre
    whose entropy cannot be computed (the call raises, or the result is NaN)
    is scored 0 so that it ranks lowest instead of being selected: NaN sorts
    last in ``np.argsort`` and would otherwise land in the "top" slice.

    Args:
        centers: 2-D NumPy array, one centre per row.
        top_k: how many centres to keep; values <= 0 select nothing.

    Returns:
        Tuple ``(selected_centers, selected_entropies)``, both ordered by
        ascending entropy.
    """
    entropias = []
    for c in centers:
        try:
            ent = sample_entropy(c)
        except Exception:
            # Best-effort fallback, but no longer a bare `except:` so that
            # KeyboardInterrupt / SystemExit still propagate.
            ent = 0
        if np.isnan(ent):
            ent = 0
        entropias.append(ent)
    entropias = np.array(entropias)
    order = np.argsort(entropias)
    # Explicit start index: `order[-top_k:]` would return everything for top_k == 0.
    keep = max(top_k, 0)
    idx = order[max(len(order) - keep, 0):]
    return centers[idx], entropias[idx]


def dp_vector(window, patterns):
    """Cosine similarity between ``window`` and every pattern.

    Args:
        window: 1-D numeric array.
        patterns: iterable of 1-D numeric arrays with the same length as
            ``window``.

    Returns:
        1-D array with one similarity per pattern. When either vector has
        zero norm the similarity is defined as 0.0 instead of the NaN that
        0/0 would produce.
    """
    window_norm = np.linalg.norm(window)  # loop-invariant, computed once
    similarities = []
    for p in patterns:
        denom = window_norm * np.linalg.norm(p)
        # `== 0` (not `<= 0`) so NaN inputs still propagate as NaN.
        similarities.append(0.0 if denom == 0 else np.dot(window, p) / denom)
    return np.array(similarities)


def build_design_matrix_classification(data, patterns, window_size):
    """Build features and up/down labels for direction classification.

    For each start index ``i`` in ``range(len(closes) - window_size - 1)``:

    * features: the z-score-normalised window ``closes[i:i+window_size]``
      compared against every pattern via ``dp_vector``;
    * label: ``1`` if ``closes[i+window_size+1] > closes[i+window_size]``,
      else ``0``.

    NOTE(review): the label compares the two prices *after* the first
    out-of-window price, i.e. there is a one-step gap between the end of the
    window and the move being predicted — confirm this is intended.

    A constant window has zero standard deviation, so normalising it would
    yield NaN features (which the classifier rejects). Such a window carries
    no shape information and gets an all-zero feature row instead.

    Args:
        data: mapping/DataFrame with a ``"price"`` column exposing ``.values``.
        patterns: sequence of 1-D pattern arrays of length ``window_size``.
        window_size: number of prices per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: feature rows and 0/1 labels.
    """
    X, y = [], []
    closes = data["price"].values
    n_patterns = len(patterns)
    for i in range(len(closes) - window_size - 1):
        w = closes[i:i+window_size]
        if w.size > 0 and w.max() == w.min():
            # Flat window: skip the 0/0 normalisation entirely.
            dp = np.zeros(n_patterns)
        else:
            w = (w - w.mean()) / w.std()
            dp = dp_vector(w, patterns)
        X.append(dp)
        y.append(int(closes[i+window_size+1] > closes[i+window_size]))  # 1 if the price rises
    return np.array(X), np.array(y)


def predict_and_eval_classification(data, patterns, model, window_size):
    """Predict price direction on ``data`` and score it against the truth.

    Features and true labels are produced by
    ``build_design_matrix_classification`` — the same routine used to build
    the training matrix — so evaluation features cannot drift from training
    features. All rows are predicted with a single ``model.predict`` call
    instead of one call per window.

    Args:
        data: mapping/DataFrame with a ``"price"`` column exposing ``.values``.
        patterns: sequence of 1-D pattern arrays of length ``window_size``.
        model: fitted estimator exposing ``predict(X)``.
        window_size: number of prices per window.

    Returns:
        Tuple ``(accuracy, predictions, actual_labels)``; the last two are
        NumPy arrays aligned row by row.
    """
    X, actual = build_design_matrix_classification(data, patterns, window_size)
    if len(X) > 0:
        preds = np.asarray(model.predict(X))
    else:
        # No complete window: nothing to predict, so the model is not called.
        preds = np.array([])
    acc = accuracy_score(actual, preds)
    return acc, preds, actual

# ---------- PIPELINE ------------

# Experiment settings used throughout this cell
WINDOW_SIZE = 180   # prices per window
N_CLUSTERS = 100    # KMeans clusters fitted on part A
TOP_K = 20          # cluster centres kept as patterns

# Preprocessing: keep only the price column, drop NaNs, order by index and
# move the index into a regular column.
df_5s = (
    df_5s[["price"]]
    .dropna()
    .sort_index()
    .reset_index(drop=False)
)

# Split into three consecutive thirds: A (pattern discovery), B (training),
# C (evaluation). Part C also receives any remainder rows.
total_len = len(df_5s)
tercio = total_len // 3

parte_a = df_5s.iloc[:tercio].copy()
parte_b = df_5s.iloc[tercio:2 * tercio].copy()
parte_c = df_5s.iloc[2 * tercio:].copy()

print(f"Total: {total_len} — Parte A: {len(parte_a)}, B: {len(parte_b)}, C: {len(parte_c)}")

# 1. Windows and clustering on part A, then keep the highest-entropy centres
windows_180 = extract_windows(parte_a, WINDOW_SIZE)
centers_180, _ = fit_kmeans(windows_180, n_clusters=N_CLUSTERS)
top20_180, entrop_180 = select_top_entropy(centers_180, top_k=TOP_K)
print("Entropías seleccionadas:", entrop_180)

# 2. Design matrix for classification, built from part B
Xb, yb = build_design_matrix_classification(parte_b, top20_180, WINDOW_SIZE)

# 3. Training
clf = LogisticRegression(max_iter=1000)
clf.fit(Xb, yb)

# 4. Evaluation on part C
acc, preds, actuals = predict_and_eval_classification(parte_c, top20_180, clf, WINDOW_SIZE)
print(f"Precisión de dirección (clasificación): {acc:.2%}")


In [None]:
import matplotlib.pyplot as plt

# Overlay the first 500 true direction labels and the corresponding predictions.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(actuals[:500], label="Real", alpha=0.7)
ax.plot(preds[:500], label="Predicción", alpha=0.7)
ax.set_title("Predicción vs Real (primeros 500 casos)")
ax.legend()
ax.grid()
plt.show()
