In [1]:
import pandas as pd
import numpy as np

# 1. Load the raw CSV export.
df = pd.read_csv("../data/bbva_data.csv")

# 2-3. When the first column is labelled "Price", treat it as the date column:
#      drop rows whose value is a repeated header token ("Ticker" / "Date")
#      and rename the column to "Date".
if "Price" in df.columns:
    df = df[~df["Price"].isin(["Ticker", "Date"])].copy()
    df = df.rename(columns={"Price": "Date"})

# 4. Coerce to proper dtypes; values that cannot be parsed become NaT / NaN.
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
for col in ["Close", "High", "Low", "Open", "Volume"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# 5. Standard "Price" column (same values as Close).
df["Price"] = df["Close"]

# 6. Drop unusable rows and sort chronologically BEFORE deriving anything that
#    depends on row order. pct_change() compares each row with the previous
#    one, so computing it on unsorted data (or across null prices) yields
#    returns between the wrong pair of days.
df = (
    df.dropna(subset=["Date", "Price"])
      .sort_values("Date")
      .reset_index(drop=True)
)

# 7. Derived financial variables.
df["Return"] = df["Price"].pct_change()                   # row-over-row percentage change
df["LogReturn"] = np.log1p(df["Return"])                  # log(1 + Return)
df["Volatility"] = (df["High"] - df["Low"]) / df["Low"]   # high-low range relative to the low
df["Weekday"] = df["Date"].dt.dayofweek                   # day of week (0 = Monday)
df["Month"] = df["Date"].dt.month                         # calendar month

print(df.head())
print(df.dtypes)

df_limpio = df.copy()

# 8. Persist the cleaned dataset.
df_limpio.to_csv("../data/bbva_data_limpio.csv", index=False)

        Date     Close      High       Low      Open    Volume     Price  \
0 2000-01-03  4.115112  4.155741  4.106406  4.135427   8244257  4.115112   
1 2000-01-04  4.007734  4.088991  3.993224  4.062873   8522096  4.007734   
2 2000-01-05  3.917770  3.990321  3.900358  3.970007  12159826  3.917770   
3 2000-01-06  3.917770  3.917770  3.917770  3.917770         0  3.917770   
4 2000-01-07  3.967106  4.001930  3.926477  4.001930  62261944  3.967106   

     Return  LogReturn  Volatility  Weekday  Month  
0       NaN        NaN    0.012014        0      1  
1 -0.026094  -0.026440    0.023983        1      1  
2 -0.022448  -0.022703    0.023065        2      1  
3  0.000000   0.000000    0.000000        3      1  
4  0.012593   0.012514    0.019217        4      1  
Date          datetime64[ns]
Close                float64
High                 float64
Low                  float64
Open                 float64
Volume                 int64
Price                float64
Return               f

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# 0) Load the cleaned dataset.
df_limpio = pd.read_csv("../data/bbva_data_limpio.csv")

# 0bis) Recreate the base columns in case the CSV does not already contain them.
if "Price" not in df_limpio.columns:
    df_limpio["Price"] = df_limpio["Close"]

if "Return" not in df_limpio.columns:
    df_limpio["Return"] = df_limpio["Price"].pct_change()

if "LogReturn" not in df_limpio.columns:
    df_limpio["LogReturn"] = np.log1p(df_limpio["Return"])

if "Volatility" not in df_limpio.columns:
    df_limpio["Volatility"] = (df_limpio["High"] - df_limpio["Low"]) / df_limpio["Low"]

if "Date" in df_limpio.columns:
    # read_csv returns dates as strings; parse them again.
    df_limpio["Date"] = pd.to_datetime(df_limpio["Date"], errors="coerce")
    if "Weekday" not in df_limpio.columns:
        df_limpio["Weekday"] = df_limpio["Date"].dt.dayofweek
    if "Month" not in df_limpio.columns:
        df_limpio["Month"] = df_limpio["Date"].dt.month

# 1) Essential derived features.
df_limpio["HL_PCT"] = (df_limpio["High"] - df_limpio["Low"]) / df_limpio["Close"]
df_limpio["CO_PCT"] = (df_limpio["Close"] - df_limpio["Open"]) / df_limpio["Open"]

# Lagged returns: RET_Lk holds the return observed k rows earlier.
for k in [1, 2, 3, 5]:
    df_limpio[f"RET_L{k}"] = df_limpio["Return"].shift(k)

# Targets: the next row's return and whether it is positive (1) or not (0).
df_limpio["Return_T+1"] = df_limpio["Return"].shift(-1)
df_limpio["Direction_T+1"] = (df_limpio["Return_T+1"] > 0).astype("int8")

# The shifts leave NaNs at both ends of the frame; drop every incomplete row.
df_limpio = df_limpio.dropna().reset_index(drop=True)

# 2) Feature / target selection.
feature_cols = [
    "Price", "HL_PCT", "CO_PCT",
    "Return", "LogReturn", "Volatility",
    "RET_L1", "RET_L2", "RET_L3", "RET_L5",
    "Weekday", "Month"
]
target_col = "Return_T+1"

# 3) Scaling (use df_limpio, NOT df).
#    The scaler is fitted only on rows that end up inside training windows, so
#    the mean/std it learns carry no information from the test period. Every
#    row is then transformed with those training statistics.
LOOKBACK = 60      # must match the `lookback` passed to make_windows
TRAIN_FRAC = 0.8   # must match the chronological train/test split fraction

n_rows = len(df_limpio)
n_train_windows = int(max(n_rows - LOOKBACK, 0) * TRAIN_FRAC)
# Training window j covers rows [j, j + LOOKBACK), so the training windows
# together span rows [0, n_train_windows + LOOKBACK - 1).
# With no training windows at all, fall back to fitting on every row.
n_fit_rows = (n_train_windows + LOOKBACK - 1) if n_train_windows > 0 else n_rows

X_raw = df_limpio[feature_cols].values
scaler_X = StandardScaler()
scaler_X.fit(X_raw[:n_fit_rows])
X_scaled = scaler_X.transform(X_raw)
y = df_limpio[[target_col]].values

# 4) Windowing
def make_windows(X, y, lookback=60):
    """Slice a feature matrix and a target column into fixed-length windows.

    One sample is produced for every row index ``i`` in
    ``range(lookback, len(X))``: its features are ``X[i - lookback:i]`` (the
    ``lookback`` rows strictly before ``i``) and its target is ``y[i, 0]``.

    Parameters
    ----------
    X : array-like, shape (n_rows, n_features)
        Row-aligned feature matrix.
    y : array-like, shape (n_rows, 1)
        Target column aligned row by row with ``X``.
    lookback : int, default 60
        Number of consecutive rows in each window.

    Returns
    -------
    Xw : numpy.ndarray, shape (n_windows, lookback, n_features)
    yw : numpy.ndarray, shape (n_windows,)
        ``n_windows`` is ``max(len(X) - lookback, 0)``. When it is 0 the
        arrays are empty but keep the shapes above, so downstream code that
        inspects ``Xw.shape[1:]`` still works.

    Raises
    ------
    IndexError
        If ``y`` has fewer rows than ``X`` while windows are being built.
    """
    # NOTE(review): the window ends at row i - 1, so row i itself is never
    # part of the features for target y[i]. If y is already a "next step"
    # target this leaves a one-row gap -- confirm that this is intended.
    X = np.asarray(X)
    y = np.asarray(y)

    n_windows = max(len(X) - lookback, 0)
    if n_windows == 0:
        # Not enough rows for a single window: return correctly shaped empties.
        return (
            np.empty((0, lookback) + X.shape[1:], dtype=X.dtype),
            np.empty((0,), dtype=y.dtype),
        )

    if len(y) < len(X):
        raise IndexError(
            f"y has {len(y)} rows but X has {len(X)}; they must be aligned row by row"
        )

    Xw = np.stack([X[i - lookback:i] for i in range(lookback, len(X))])
    # Targets for rows lookback .. len(X) - 1; copy so the result does not alias y.
    yw = y[lookback:len(X), 0].copy()
    return Xw, yw

# Build the windowed feature tensor and its targets.
# `y` is rebound here from the raw target column to the per-window targets.
X, y = make_windows(X=X_scaled, y=y, lookback=60)
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (6568, 60, 12)
y shape: (6568,)


In [3]:
# Chronological 80/20 split (no shuffling): the first 80% of the windows are
# used for training and the remaining, most recent 20% for testing.
n = len(X)
n_train = int(n * 0.8)

X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

print("Train:", X_train.shape, y_train.shape)
print("Test: ", X_test.shape,  y_test.shape)

Train: (5254, 60, 12) (5254,)
Test:  (1314, 60, 12) (1314,)
