In [None]:
# Install the required libraries (only needed when they are not already available).
# NOTE(review): ccxt is installed here but is not imported by any cell visible in this notebook.
!pip install yfinance pyarrow ccxt
!pip install antropy

Collecting antropy
  Downloading antropy-0.1.9-py3-none-any.whl.metadata (6.6 kB)
Downloading antropy-0.1.9-py3-none-any.whl (18 kB)
Installing collected packages: antropy
Successfully installed antropy-0.1.9


In [None]:
# Mount Google Drive so the downloaded data can be stored there
import yfinance as yf
import pandas as pd
import os
from google.colab import drive

drive.mount("/content/drive")
base_dir = "/content/drive/MyDrive/btc_prediction_project"
data_dir = os.path.join(base_dir, "data")
os.makedirs(data_dir, exist_ok=True)

# Download daily BTC-USD bars and persist them as Parquet inside the project data folder
btc_price = yf.download("BTC-USD", start="2010-01-01", end="2025-06-21", interval="1d")
btc_price.to_parquet(f"{data_dir}/btc_yfinance.parquet")

# Show the first rows as a sanity check (head() returns the first rows, not the last ones)
btc_price.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  btc_price = yf.download("BTC-USD", start="2010-01-01", end="2025-06-21", interval="1d")
[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,BTC-USD,BTC-USD,BTC-USD,BTC-USD,BTC-USD
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2014-09-17,457.334015,468.174011,452.421997,465.864014,21056800
2014-09-18,424.440002,456.859985,413.104004,456.859985,34483200
2014-09-19,394.79599,427.834991,384.532013,424.102997,37919700
2014-09-20,408.903992,423.29599,389.882996,394.673004,36863600
2014-09-21,398.821014,412.425995,393.181,408.084991,26580100


In [None]:
import pandas as pd
import os
from google.colab import drive
import numpy as np
from sklearn.cluster import KMeans
from antropy import sample_entropy
from scipy.optimize import differential_evolution

In [None]:
# Locations of the Parquet file written by the download cell and of the CSV copy to create
base_dir = "/content/drive/MyDrive/btc_prediction_project"
data_dir = os.path.join(base_dir, "data")
parquet_path = os.path.join(data_dir, "btc_yfinance.parquet")
csv_path = os.path.join(data_dir, "btc_yfinance.csv")

# Read the Parquet file
btc_df = pd.read_parquet(parquet_path)

# Save it as CSV
btc_df.to_csv(csv_path)

# Show a sample of the dataframe as a sanity check
btc_df.head()

Price,Close,High,Low,Open,Volume
Ticker,BTC-USD,BTC-USD,BTC-USD,BTC-USD,BTC-USD
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2014-09-17,457.334015,468.174011,452.421997,465.864014,21056800
2014-09-18,424.440002,456.859985,413.104004,456.859985,34483200
2014-09-19,394.79599,427.834991,384.532013,424.102997,37919700
2014-09-20,408.903992,423.29599,389.882996,394.673004,36863600
2014-09-21,398.821014,412.425995,393.181,408.084991,26580100


Las columnas de un DataFrame pueden tener múltiples niveles cuando los datos provienen de fuentes que organizan la información jerárquicamente. Es el caso de yfinance al descargar datos financieros: cada columna incluye tanto la categoría del dato (nivel "Price"; por ejemplo, "Close") como el ticker (nivel "Ticker"; por ejemplo, "BTC-USD"). Este formato se representa en pandas como un MultiIndex en las columnas, lo que en muchos casos dificulta trabajar directamente con ellas. Aplanar las columnas significa convertir ese índice jerárquico en un índice simple, combinando los niveles en un solo nombre de columna; por ejemplo, uniendo la categoría y el ticker con un guion bajo ("Close_BTC-USD"). Esto facilita el manejo del DataFrame, ya que permite acceder a cada columna de manera directa y evita confusiones por nombres repetidos. Si el DataFrame ya está aplanado, volver a aplicar la misma transformación produciría nombres incorrectos (sobre un nombre de texto plano, `col[0]` y `col[1]` son sus dos primeros caracteres); por eso se verifica si el índice de columnas es un MultiIndex antes de hacer esta transformación.

In [None]:
# Flatten the columns by joining the first two MultiIndex levels with an underscore.
# Nothing is changed when the columns are already a plain (single-level) index.
if not isinstance(btc_df.columns, pd.MultiIndex):
    print("Las columnas ya están aplanadas.")
else:
    flat_names = []
    for level_values in btc_df.columns:
        flat_names.append(f"{level_values[0]}_{level_values[1]}")
    btc_df.columns = flat_names
    print("Columnas aplanadas...")

Columnas aplanadas...


In [None]:
# Keep only the 'Close_BTC-USD' column, drop missing values and replace the index
# with a fresh 0..n-1 range (the previous index is discarded by drop=True).
# NOTE: this rebinds btc_df, so the other columns are no longer available afterwards.
btc_df = btc_df[["Close_BTC-USD"]].dropna().reset_index(drop=True)

# Split into 3 consecutive parts; with integer division any remainder rows end up in part C
total_len = len(btc_df)
tercio = total_len // 3

parte_a = btc_df.iloc[:tercio].copy()
parte_b = btc_df.iloc[tercio:2*tercio].copy()
parte_c = btc_df.iloc[2*tercio:].copy()

print(f"Total: {total_len} — Parte A: {len(parte_a)}, B: {len(parte_b)}, C: {len(parte_c)}")

# Extract z-normalised sliding windows from the 'Close_BTC-USD' column
def extract_windows(data, window_size):
    """Return the z-normalised sliding windows of ``data["Close_BTC-USD"]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a ``"Close_BTC-USD"`` column.
    window_size : int
        Number of consecutive rows per window.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max(len(data) - window_size, 0), window_size)``.
        Each row is ``(w - w.mean()) / w.std()`` using NumPy's default
        (population) standard deviation. A window with zero standard
        deviation is returned as all zeros instead of NaN.

    Notes
    -----
    Start positions run over ``range(len(data) - window_size)``, so the
    window that ends on the very last row is not emitted.
    """
    # Pull the column out once instead of on every iteration.
    closes = data["Close_BTC-USD"].to_numpy()
    windows = []
    for start in range(len(closes) - window_size):
        w = closes[start:start + window_size]
        sigma = w.std()
        if sigma == 0:
            # Flat window: the z-score would be 0/0, so emit zeros rather than NaN.
            windows.append(np.zeros(window_size))
        else:
            windows.append((w - w.mean()) / sigma)
    if not windows:
        # Keep the result 2-D even when the data is too short for a single window.
        return np.empty((0, window_size))
    return np.array(windows)

# Build the window sets for three window lengths (180, 360 and 720 rows), using part A only
windows_180 = extract_windows(parte_a, 180)
windows_360 = extract_windows(parte_a, 360)
windows_720 = extract_windows(parte_a, 720)

# Report the (number of windows, window length) shape of each set
print("Ventanas 180:", windows_180.shape)
print("Ventanas 360:", windows_360.shape)
print("Ventanas 720:", windows_720.shape)


Total: 3930 — Parte A: 1310, B: 1310, C: 1310
Ventanas 180: (1130, 180)
Ventanas 360: (950, 360)
Ventanas 720: (590, 720)


In [None]:
def fit_kmeans(windows, n_clusters=100):
    """Cluster ``windows`` with K-Means (fixed ``random_state=42``).

    Returns a ``(cluster_centers, fitted_model)`` tuple.
    """
    model = KMeans(n_clusters=n_clusters, random_state=42).fit(windows)
    return model.cluster_centers_, model

# Cluster each window set separately (default of 100 clusters per window length)
centers_180, kmeans_180 = fit_kmeans(windows_180)
centers_360, kmeans_360 = fit_kmeans(windows_360)
centers_720, kmeans_720 = fit_kmeans(windows_720)

# Each centres array is (n_clusters, window length)
print("Centros obtenidos:", centers_180.shape, centers_360.shape, centers_720.shape)


Centros obtenidos: (100, 180) (100, 360) (100, 720)


In [None]:
def select_top_entropy(centers, top_k=20):
    """Select the ``top_k`` centres with the highest sample entropy.

    Parameters
    ----------
    centers : numpy.ndarray
        Candidate patterns; iterated along the first axis and indexed with
        an integer array, so it must support NumPy fancy indexing.
    top_k : int, default 20
        Number of centres to keep. Values ``<= 0`` select nothing; values
        larger than ``len(centers)`` select every centre.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The selected centres and their entropies, ordered by ascending
        entropy (the highest-entropy centre is last).
    """
    entropias = np.array([sample_entropy(c) for c in centers])
    if top_k <= 0:
        # ``[-0:]`` would slice the *whole* array, so an empty selection
        # has to be produced explicitly.
        idx = np.array([], dtype=int)
    else:
        idx = np.argsort(entropias)[-top_k:]
    return centers[idx], entropias[idx]

# Keep the 20 highest-entropy centres for each window length.
# NOTE(review): only the 180 set is consumed by the cells visible further down.
top20_180, entrop_180 = select_top_entropy(centers_180)
top20_360, entrop_360 = select_top_entropy(centers_360)
top20_720, entrop_720 = select_top_entropy(centers_720)

# Entropies of the selected 180-row centres, in ascending order
print("Top entropías 180:", entrop_180)


Top entropías 180: [0.28228118 0.29128964 0.29425384 0.2954217  0.30596878 0.30924785
 0.32034076 0.33234284 0.33498174 0.34866751 0.3530472  0.35321442
 0.36092725 0.37301794 0.38802483 0.39639475 0.43894623 0.44468582
 0.50411713 0.65370003]


In [None]:
def dp_vector(window, patterns):
    """Cosine similarity between ``window`` and every entry of ``patterns``.

    Parameters
    ----------
    window : numpy.ndarray
        1-D vector to compare.
    patterns : iterable of numpy.ndarray
        Vectors of the same length as ``window``.

    Returns
    -------
    numpy.ndarray
        One similarity per pattern, in iteration order. When either vector
        has zero norm the similarity is reported as ``0.0`` instead of
        dividing by zero.
    """
    # The window norm does not depend on the pattern, so compute it once.
    window_norm = np.linalg.norm(window)
    sims = []
    for p in patterns:
        denom = window_norm * np.linalg.norm(p)
        if denom == 0:
            # Undefined cosine (zero vector): report "no similarity".
            sims.append(0.0)
        else:
            sims.append(np.dot(window, p) / denom)
    return np.array(sims)

def bayesian_error(weights, X, y):
    """Objective value for a linear model: MSE of ``X @ weights`` against ``y``
    plus ``1e-6`` times the sum of squared weights."""
    residuals = np.dot(X, weights) - y
    mse = np.square(residuals).mean()
    penalty = 1e-6 * np.square(weights).sum()
    return mse + penalty


In [None]:
def build_design_matrix(data, patterns, window_size):
    """Build the regression inputs ``(X, y)`` from the close-price series.

    For each start position ``i`` in ``range(len(data) - window_size - 1)``:

    * ``X[i]`` is ``dp_vector`` of the z-normalised window
      ``closes[i:i+window_size]`` against ``patterns``;
    * ``y[i]`` is ``closes[i+window_size+1] - closes[i+window_size]``.

    NOTE(review): the target is the change between the first and second rows
    *after* the window; the step from the window's last row to the next row
    is not used. Confirm this horizon is intended.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a ``"Close_BTC-USD"`` column.
    patterns : sequence of numpy.ndarray
        Reference patterns, each of length ``window_size``.
    window_size : int
        Number of rows per window.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``X`` with shape ``(n_samples, len(patterns))`` and ``y`` with shape
        ``(n_samples,)``. Both are empty (but correctly shaped) when the data
        is too short. A window with zero standard deviation yields an
        all-zero feature row instead of NaN.
    """
    X, y = [], []
    closes = data["Close_BTC-USD"].values
    for i in range(len(data) - window_size - 1):
        w = closes[i:i + window_size]
        sigma = w.std()
        if sigma == 0:
            # Flat window: z-normalisation is 0/0, so use neutral features.
            X.append(np.zeros(len(patterns)))
        else:
            X.append(dp_vector((w - w.mean()) / sigma, patterns))
        y.append(closes[i + window_size + 1] - closes[i + window_size])
    if not X:
        # Keep X two-dimensional so callers can still read X.shape[1].
        return np.empty((0, len(patterns))), np.empty(0)
    return np.array(X), np.array(y)

# Fit data: features/targets from part B, using the patterns selected from part A (180-row windows)
Xb_180, yb_180 = build_design_matrix(parte_b, top20_180, 180)


In [None]:
# Search for the weight vector (one weight per pattern, each bounded to [-1, 1])
# that minimises bayesian_error on the part-B design matrix.
res = differential_evolution(
    bayesian_error,
    bounds=[(-1, 1) for _ in range(Xb_180.shape[1])],
    args=(Xb_180, yb_180),
    maxiter=200,
    seed=42,
)
w_opt_180 = res.x

In [None]:
def predict_and_eval(data, patterns, weights, window_size):
    """Predict the price change for every window and score the sign accuracy.

    For each start position ``i`` in ``range(len(data) - window_size - 1)`` the
    z-normalised window ``closes[i:i+window_size]`` is turned into a feature
    vector with ``dp_vector`` and the prediction is its dot product with
    ``weights``. The reference value is
    ``closes[i+window_size+1] - closes[i+window_size]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a ``"Close_BTC-USD"`` column.
    patterns : sequence of numpy.ndarray
        Reference patterns, each of length ``window_size``.
    weights : numpy.ndarray
        One weight per pattern.
    window_size : int
        Number of rows per window.

    Returns
    -------
    tuple of (float, numpy.ndarray, numpy.ndarray)
        ``(accuracy, predictions, actual_changes)`` where ``accuracy`` is the
        fraction of samples with ``sign(prediction) == sign(actual)``.
        ``accuracy`` is NaN when there are no samples. A window with zero
        standard deviation gets a prediction of ``0.0`` instead of NaN.
    """
    closes = data["Close_BTC-USD"].values
    preds, actual = [], []
    for i in range(len(data) - window_size - 1):
        w = closes[i:i + window_size]
        sigma = w.std()
        if sigma == 0:
            # Flat window: z-normalisation is 0/0; predict "no change".
            pred = 0.0
        else:
            pred = dp_vector((w - w.mean()) / sigma, patterns).dot(weights)
        preds.append(pred)
        actual.append(closes[i + window_size + 1] - closes[i + window_size])
    preds, actual = np.array(preds), np.array(actual)
    if preds.size == 0:
        # Nothing to score; avoid taking the mean of an empty array.
        return float("nan"), preds, actual
    acc = np.mean(np.sign(preds) == np.sign(actual))
    return acc, preds, actual

# Evaluate on part C (not used for clustering or weight fitting) and report the sign accuracy
acc_180, pred_180, act_180 = predict_and_eval(parte_c, top20_180, w_opt_180, 180)
print(f"Accuracy 180 días: {acc_180:.2%}")

Accuracy 180 días: 50.84%


In [None]:
# NOTE(review): this cell repeats the pipeline of the preceding cells in one place;
# the functions defined below shadow the earlier definitions of the same name.

# Keep only the 'Close_BTC-USD' column, drop missing values and replace the index
# with a fresh 0..n-1 range (the previous index is discarded by drop=True).
btc_df = btc_df[["Close_BTC-USD"]].dropna().reset_index(drop=True)

# Split into 3 consecutive parts; with integer division any remainder rows end up in part C
total_len = len(btc_df)
tercio = total_len // 3

parte_a = btc_df.iloc[:tercio].copy()
parte_b = btc_df.iloc[tercio:2*tercio].copy()
parte_c = btc_df.iloc[2*tercio:].copy()

print(f"Total: {total_len} — Parte A: {len(parte_a)}, B: {len(parte_b)}, C: {len(parte_c)}")

# Extract z-normalised sliding windows from the 'Close_BTC-USD' column
def extract_windows(data, window_size):
    """Return the z-normalised sliding windows of ``data["Close_BTC-USD"]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a ``"Close_BTC-USD"`` column.
    window_size : int
        Number of consecutive rows per window.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max(len(data) - window_size, 0), window_size)``.
        Each row is ``(w - w.mean()) / w.std()`` using NumPy's default
        (population) standard deviation. A window with zero standard
        deviation is returned as all zeros instead of NaN.

    Notes
    -----
    Start positions run over ``range(len(data) - window_size)``, so the
    window that ends on the very last row is not emitted.
    """
    # Pull the column out once instead of on every iteration.
    closes = data["Close_BTC-USD"].to_numpy()
    windows = []
    for start in range(len(closes) - window_size):
        w = closes[start:start + window_size]
        sigma = w.std()
        if sigma == 0:
            # Flat window: the z-score would be 0/0, so emit zeros rather than NaN.
            windows.append(np.zeros(window_size))
        else:
            windows.append((w - w.mean()) / sigma)
    if not windows:
        # Keep the result 2-D even when the data is too short for a single window.
        return np.empty((0, window_size))
    return np.array(windows)

# Build the window sets for three window lengths (180, 360 and 720 rows), using part A only
windows_180 = extract_windows(parte_a, 180)
windows_360 = extract_windows(parte_a, 360)
windows_720 = extract_windows(parte_a, 720)

# Report the (number of windows, window length) shape of each set
print("Ventanas 180:", windows_180.shape)
print("Ventanas 360:", windows_360.shape)
print("Ventanas 720:", windows_720.shape)
#-------------------------------------------------------

def fit_kmeans(windows, n_clusters=100):
    """Cluster ``windows`` with K-Means (fixed ``random_state=42``).

    Returns a ``(cluster_centers, fitted_model)`` tuple.
    """
    model = KMeans(n_clusters=n_clusters, random_state=42).fit(windows)
    return model.cluster_centers_, model

# Cluster each window set separately (default of 100 clusters per window length)
centers_180, kmeans_180 = fit_kmeans(windows_180)
centers_360, kmeans_360 = fit_kmeans(windows_360)
centers_720, kmeans_720 = fit_kmeans(windows_720)

# Each centres array is (n_clusters, window length)
print("Centros obtenidos:", centers_180.shape, centers_360.shape, centers_720.shape)

#-------------------------------------------------------

def select_top_entropy(centers, top_k=20):
    """Select the ``top_k`` centres with the highest sample entropy.

    Parameters
    ----------
    centers : numpy.ndarray
        Candidate patterns; iterated along the first axis and indexed with
        an integer array, so it must support NumPy fancy indexing.
    top_k : int, default 20
        Number of centres to keep. Values ``<= 0`` select nothing; values
        larger than ``len(centers)`` select every centre.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The selected centres and their entropies, ordered by ascending
        entropy (the highest-entropy centre is last).
    """
    entropias = np.array([sample_entropy(c) for c in centers])
    if top_k <= 0:
        # ``[-0:]`` would slice the *whole* array, so an empty selection
        # has to be produced explicitly.
        idx = np.array([], dtype=int)
    else:
        idx = np.argsort(entropias)[-top_k:]
    return centers[idx], entropias[idx]

# Keep the 20 highest-entropy centres for each window length.
# NOTE(review): only the 180 set is consumed by the rest of this cell.
top20_180, entrop_180 = select_top_entropy(centers_180)
top20_360, entrop_360 = select_top_entropy(centers_360)
top20_720, entrop_720 = select_top_entropy(centers_720)

# Entropies of the selected 180-row centres, in ascending order
print("Top entropías 180:", entrop_180)

#-------------------------------------------------------

def dp_vector(window, patterns):
    """Cosine similarity between ``window`` and every entry of ``patterns``.

    Parameters
    ----------
    window : numpy.ndarray
        1-D vector to compare.
    patterns : iterable of numpy.ndarray
        Vectors of the same length as ``window``.

    Returns
    -------
    numpy.ndarray
        One similarity per pattern, in iteration order. When either vector
        has zero norm the similarity is reported as ``0.0`` instead of
        dividing by zero.
    """
    # The window norm does not depend on the pattern, so compute it once.
    window_norm = np.linalg.norm(window)
    sims = []
    for p in patterns:
        denom = window_norm * np.linalg.norm(p)
        if denom == 0:
            # Undefined cosine (zero vector): report "no similarity".
            sims.append(0.0)
        else:
            sims.append(np.dot(window, p) / denom)
    return np.array(sims)

def bayesian_error(weights, X, y):
    """Objective value for a linear model: MSE of ``X @ weights`` against ``y``
    plus ``1e-6`` times the sum of squared weights."""
    residuals = np.dot(X, weights) - y
    mse = np.square(residuals).mean()
    penalty = 1e-6 * np.square(weights).sum()
    return mse + penalty

#-------------------------------------------------------

def build_design_matrix(data, patterns, window_size):
    """Build the regression inputs ``(X, y)`` from the close-price series.

    For each start position ``i`` in ``range(len(data) - window_size - 1)``:

    * ``X[i]`` is ``dp_vector`` of the z-normalised window
      ``closes[i:i+window_size]`` against ``patterns``;
    * ``y[i]`` is ``closes[i+window_size+1] - closes[i+window_size]``.

    NOTE(review): the target is the change between the first and second rows
    *after* the window; the step from the window's last row to the next row
    is not used. Confirm this horizon is intended.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a ``"Close_BTC-USD"`` column.
    patterns : sequence of numpy.ndarray
        Reference patterns, each of length ``window_size``.
    window_size : int
        Number of rows per window.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``X`` with shape ``(n_samples, len(patterns))`` and ``y`` with shape
        ``(n_samples,)``. Both are empty (but correctly shaped) when the data
        is too short. A window with zero standard deviation yields an
        all-zero feature row instead of NaN.
    """
    X, y = [], []
    closes = data["Close_BTC-USD"].values
    for i in range(len(data) - window_size - 1):
        w = closes[i:i + window_size]
        sigma = w.std()
        if sigma == 0:
            # Flat window: z-normalisation is 0/0, so use neutral features.
            X.append(np.zeros(len(patterns)))
        else:
            X.append(dp_vector((w - w.mean()) / sigma, patterns))
        y.append(closes[i + window_size + 1] - closes[i + window_size])
    if not X:
        # Keep X two-dimensional so callers can still read X.shape[1].
        return np.empty((0, len(patterns))), np.empty(0)
    return np.array(X), np.array(y)

# Fit data: features/targets from part B, using the patterns selected from part A (180-row windows)
Xb_180, yb_180 = build_design_matrix(parte_b, top20_180, 180)

#-------------------------------------------------------

# Search for the weight vector (one weight per pattern, each bounded to [-1, 1])
# that minimises bayesian_error on the part-B design matrix.
res = differential_evolution(
    bayesian_error,
    bounds=[(-1, 1) for _ in range(Xb_180.shape[1])],
    args=(Xb_180, yb_180),
    maxiter=200,
    seed=42,
)
w_opt_180 = res.x


#-------------------------------------------------------

def predict_and_eval(data, patterns, weights, window_size):
    """Predict the price change for every window and score the sign accuracy.

    For each start position ``i`` in ``range(len(data) - window_size - 1)`` the
    z-normalised window ``closes[i:i+window_size]`` is turned into a feature
    vector with ``dp_vector`` and the prediction is its dot product with
    ``weights``. The reference value is
    ``closes[i+window_size+1] - closes[i+window_size]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a ``"Close_BTC-USD"`` column.
    patterns : sequence of numpy.ndarray
        Reference patterns, each of length ``window_size``.
    weights : numpy.ndarray
        One weight per pattern.
    window_size : int
        Number of rows per window.

    Returns
    -------
    tuple of (float, numpy.ndarray, numpy.ndarray)
        ``(accuracy, predictions, actual_changes)`` where ``accuracy`` is the
        fraction of samples with ``sign(prediction) == sign(actual)``.
        ``accuracy`` is NaN when there are no samples. A window with zero
        standard deviation gets a prediction of ``0.0`` instead of NaN.
    """
    closes = data["Close_BTC-USD"].values
    preds, actual = [], []
    for i in range(len(data) - window_size - 1):
        w = closes[i:i + window_size]
        sigma = w.std()
        if sigma == 0:
            # Flat window: z-normalisation is 0/0; predict "no change".
            pred = 0.0
        else:
            pred = dp_vector((w - w.mean()) / sigma, patterns).dot(weights)
        preds.append(pred)
        actual.append(closes[i + window_size + 1] - closes[i + window_size])
    preds, actual = np.array(preds), np.array(actual)
    if preds.size == 0:
        # Nothing to score; avoid taking the mean of an empty array.
        return float("nan"), preds, actual
    acc = np.mean(np.sign(preds) == np.sign(actual))
    return acc, preds, actual

# Evaluate on part C (not used for clustering or weight fitting) and report the sign accuracy
acc_180, pred_180, act_180 = predict_and_eval(parte_c, top20_180, w_opt_180, 180)
print(f"Accuracy 180 días: {acc_180:.2%}")

Total: 3930 — Parte A: 1310, B: 1310, C: 1310
Ventanas 180: (1130, 180)
Ventanas 360: (950, 360)
Ventanas 720: (590, 720)
Centros obtenidos: (100, 180) (100, 360) (100, 720)
Top entropías 180: [0.28228118 0.29128964 0.29425384 0.2954217  0.30596878 0.30924785
 0.32034076 0.33234284 0.33498174 0.34866751 0.3530472  0.35321442
 0.36092725 0.37301794 0.38802483 0.39639475 0.43894623 0.44468582
 0.50411713 0.65370003]
Accuracy 180 días: 50.84%
