In [22]:
import numpy as np
import pandas as pd
import datetime
import ta
from sklearn.preprocessing import StandardScaler
import dateparser

In [9]:
# Write the `data` frame to CSV, keeping its index as the first column.
# NOTE(review): `data` is not defined in any cell visible here, and this cell's
# execution count (9) is lower than the import cell's (22), so the notebook will
# not survive Restart & Run All as-is. TODO: restore the cell that builds `data`
# (presumably the price download) above this one.
data.to_csv('amzn_data.csv', index=True)

In [11]:
# Reload the saved prices. No index_col is passed, so the index written by
# to_csv(index=True) comes back as an ordinary column; in the output below that
# column is named 'Price' and holds the dates (preprocess() renames it later).
df = pd.read_csv('amzn_data.csv')
df.head()

Unnamed: 0,Price,Close,High,Low,Open,Volume
0,2010-10-22,8.4565,8.5085,8.1135,8.1225,326368000
1,2010-10-25,8.45,8.5995,8.416,8.5785,130618000
2,2010-10-26,8.4975,8.55,8.375,8.3785,93792000
3,2010-10-27,8.3755,8.4875,8.327,8.4455,114436000
4,2010-10-28,8.342,8.4245,8.2525,8.4155,93688000


In [16]:
def technical_indicators(df):
    """
    Return a copy of ``df`` with trend, momentum, volatility and volume
    indicator columns appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Price history containing at least the columns 'High', 'Low', 'Close'
        and 'Volume'. Any other columns (e.g. 'Open', a date column) are
        carried through untouched. Rows are assumed to be in chronological
        order, since every indicator is computed over preceding rows.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the original columns
        followed, in this order, by: MA_20, EMA_20, MACD, MACD_signal, ADX,
        Parabolic_SAR, Ichimoku_base, Ichimoku_conversion, RSI, ROC, Stoch,
        CCI, Momentum, Williams_%R, AO, ATR, BB_high, BB_low, KC_high, KC_low,
        Donchian_high, Donchian_low, Chaikin_Volatility, OBV, VROC, MFI, CMF,
        AD, EOM, Pivot_Point, VWAP, ATR_Bands_high, ATR_Bands_low, SuperTrend,
        Ulcer_Index.

    Raises
    ------
    KeyError
        If any of the required columns is missing.

    Notes
    -----
    Unless a window is passed explicitly, the ``ta`` indicators run with the
    library's default parameters. Windowed indicators are NaN on their
    warm-up rows, so callers normally drop incomplete rows afterwards.
    """
    required = ('High', 'Low', 'Close', 'Volume')
    missing = [col for col in required if col not in df.columns]
    if missing:
        # Fail once, up front, naming everything that is missing instead of
        # stopping at whichever column lookup happens to be reached first.
        raise KeyError(f"technical_indicators: missing required column(s): {missing}")

    data = df.copy()
    high, low, close, volume = (data[col] for col in required)

    # ========== TREND ==========
    data['MA_20'] = ta.trend.SMAIndicator(close, window=20).sma_indicator()
    data['EMA_20'] = ta.trend.EMAIndicator(close, window=20).ema_indicator()

    macd = ta.trend.MACD(close)
    data['MACD'] = macd.macd()
    data['MACD_signal'] = macd.macd_signal()

    adx = ta.trend.ADXIndicator(high, low, close, window=14)
    data['ADX'] = adx.adx()

    psar = ta.trend.PSARIndicator(high, low, close)
    data['Parabolic_SAR'] = psar.psar()

    ichimoku = ta.trend.IchimokuIndicator(high, low)
    data['Ichimoku_base'] = ichimoku.ichimoku_base_line()
    data['Ichimoku_conversion'] = ichimoku.ichimoku_conversion_line()

    # ========== MOMENTUM ==========
    data['RSI'] = ta.momentum.RSIIndicator(close).rsi()
    data['ROC'] = ta.momentum.ROCIndicator(close).roc()
    data['Stoch'] = ta.momentum.StochasticOscillator(high, low, close).stoch()
    data['CCI'] = ta.trend.CCIIndicator(high, low, close).cci()
    # Absolute price change over 10 rows (not a percentage).
    data['Momentum'] = close.diff(10)
    data['Williams_%R'] = ta.momentum.WilliamsRIndicator(high, low, close).williams_r()
    data['AO'] = ta.momentum.AwesomeOscillatorIndicator(high, low).awesome_oscillator()

    # ========== VOLATILITY ==========
    # NOTE(review): in the notebook output the first rows have
    # ATR_Bands_high == ATR_Bands_low == Close, i.e. ATR is 0 rather than NaN
    # on those rows, so ATR-derived columns are not flagged by dropna() there.
    data['ATR'] = ta.volatility.AverageTrueRange(high, low, close).average_true_range()

    bb = ta.volatility.BollingerBands(close)
    data['BB_high'] = bb.bollinger_hband()
    data['BB_low'] = bb.bollinger_lband()

    kc = ta.volatility.KeltnerChannel(high, low, close)
    data['KC_high'] = kc.keltner_channel_hband()
    data['KC_low'] = kc.keltner_channel_lband()

    dc = ta.volatility.DonchianChannel(high, low, close)
    data['Donchian_high'] = dc.donchian_channel_hband()
    data['Donchian_low'] = dc.donchian_channel_lband()

    # Chaikin volatility: percentage change of the EMA of the High-Low range
    # versus its value `chaikin_period` rows earlier. The EMA and its lag are
    # computed once and reused instead of being recomputed for every term.
    chaikin_period = 10
    range_ema = (high - low).ewm(span=chaikin_period).mean()
    range_ema_lagged = range_ema.shift(chaikin_period)
    data['Chaikin_Volatility'] = (range_ema - range_ema_lagged) / range_ema_lagged * 100

    # ========== VOLUME ==========
    data['OBV'] = ta.volume.OnBalanceVolumeIndicator(close, volume).on_balance_volume()
    # Volume rate of change over 10 rows, expressed as a fraction (not x100).
    data['VROC'] = volume.pct_change(10)
    data['MFI'] = ta.volume.MFIIndicator(high, low, close, volume).money_flow_index()
    data['CMF'] = ta.volume.ChaikinMoneyFlowIndicator(high, low, close, volume).chaikin_money_flow()
    data['AD'] = ta.volume.AccDistIndexIndicator(high, low, close, volume).acc_dist_index()
    data['EOM'] = ta.volume.EaseOfMovementIndicator(high, low, volume).ease_of_movement()

    # ========== OTHER ==========
    data['Pivot_Point'] = (high + low + close) / 3
    data['VWAP'] = ta.volume.VolumeWeightedAveragePrice(high, low, close, volume).volume_weighted_average_price()
    data['ATR_Bands_high'] = close + 2 * data['ATR']
    data['ATR_Bands_low'] = close - 2 * data['ATR']

    # "SuperTrend" here is only the lower band: mid-price minus 3 x ATR.
    # NOTE(review): the usual SuperTrend also switches between an upper and a
    # lower band as the trend flips; confirm the simplified form is intended.
    multiplier = 3
    mid_price = (high + low) / 2
    data['SuperTrend'] = mid_price - (multiplier * data['ATR'])

    # Ulcer index: root-mean-square, over 14 rows, of the percentage drawdown
    # measured from the running all-time high of Close.
    # NOTE(review): the conventional definition measures drawdown from the
    # rolling-window high instead of cummax(); confirm which one is wanted.
    drawdown = (close / close.cummax() - 1) * 100
    data['Ulcer_Index'] = (drawdown ** 2).rolling(14).mean() ** 0.5

    return data


In [17]:
# Build the indicator-augmented frame from the reloaded CSV data.
data_ind = technical_indicators(df)

In [18]:
# Inspect the result: the earliest rows carry NaN for the windowed indicators
# (see MA_20 / MACD / MFI in the output below) until their look-back fills.
data_ind

Unnamed: 0,Price,Close,High,Low,Open,Volume,MA_20,EMA_20,MACD,MACD_signal,...,MFI,CMF,AD,EOM,Pivot_Point,VWAP,ATR_Bands_high,ATR_Bands_low,SuperTrend,Ulcer_Index
0,2010-10-22,8.456500,8.508500,8.113500,8.122500,326368000,,,,,...,,,2.404382e+08,,8.359500,,8.456500,8.456500,8.311000,
1,2010-10-25,8.450000,8.599500,8.416000,8.578500,130618000,,,,,...,,,1.582230e+08,0.027641,8.488500,,8.450000,8.450000,8.507750,
2,2010-10-26,8.497500,8.550000,8.375000,8.378500,93792000,,,,,...,,,1.957401e+08,-0.008443,8.474167,,8.497500,8.497500,8.462500,
3,2010-10-27,8.375500,8.487500,8.327000,8.445500,114436000,,,,,...,,,1.504647e+08,-0.007749,8.396667,,8.375500,8.375500,8.407250,
4,2010-10-28,8.342000,8.424500,8.252500,8.415500,93688000,,,,,...,,,1.542775e+08,-0.012622,8.339667,,8.342000,8.342000,8.338500,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,2025-10-15,215.570007,217.710007,212.660004,216.619995,45909500,221.876001,221.925020,-2.393011,-1.874987,...,40.117974,-0.057822,1.385612e+10,-8.524946,215.313339,220.371555,226.408541,204.731474,198.927205,8.972261
3768,2025-10-16,214.470001,218.589996,212.809998,215.669998,42414600,221.038001,221.215018,-2.684216,-2.036833,...,33.563162,-0.077310,1.383807e+10,7.017988,215.289998,220.081025,225.360068,203.579935,199.364897,9.150391
3769,2025-10-17,213.039993,214.800003,211.029999,214.559998,45986900,220.116001,220.436444,-2.995853,-2.228637,...,26.538469,-0.057311,1.384112e+10,-22.831387,212.956665,219.476512,223.690770,202.389217,196.938836,9.443165
3770,2025-10-20,216.479996,216.690002,213.589996,213.880005,38882800,219.558501,220.059640,-2.931457,-2.369201,...,33.085921,0.023651,1.387473e+10,17.739229,215.586665,219.202096,226.891433,206.068559,199.522844,9.538559


In [27]:
# List the final column set: the six columns loaded from the CSV followed by
# the indicator columns added by technical_indicators().
data_ind.columns

Index(['Price', 'Close', 'High', 'Low', 'Open', 'Volume', 'MA_20', 'EMA_20',
       'MACD', 'MACD_signal', 'ADX', 'Parabolic_SAR', 'Ichimoku_base',
       'Ichimoku_conversion', 'RSI', 'ROC', 'Stoch', 'CCI', 'Momentum',
       'Williams_%R', 'AO', 'ATR', 'BB_high', 'BB_low', 'KC_high', 'KC_low',
       'Donchian_high', 'Donchian_low', 'Chaikin_Volatility', 'OBV', 'VROC',
       'MFI', 'CMF', 'AD', 'EOM', 'Pivot_Point', 'VWAP', 'ATR_Bands_high',
       'ATR_Bands_low', 'SuperTrend', 'Ulcer_Index'],
      dtype='object')

In [30]:
def preprocess(df, train_frac=0.6, val_frac=0.2):
    """
    Clean a dated feature frame, split it chronologically and standardise it
    with statistics taken from the training split only.

    Steps: rename a 'Price' column to 'date' if present, parse 'date' to
    datetimes, drop rows with an unparseable date or any missing value, sort
    oldest-to-newest, split into train / validation / test by row position,
    then z-score every column using the training mean and (population, ddof=0)
    standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a date column named 'Price' or 'date'. The input is never
        modified. Assumes every other column is numeric — TODO confirm for
        callers other than the indicator frame.
    train_frac : float, default 0.6
        Fraction of the cleaned rows assigned to the training split.
    val_frac : float, default 0.2
        Fraction assigned to the validation split; the remainder is the test
        split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_scaled, val_scaled, test_scaled)``, each with 'date' restored
        as the first column followed by the standardised features.

    Raises
    ------
    ValueError
        If the fractions are negative, ``train_frac`` is zero, or they sum to
        more than 1.
    KeyError
        If the frame has neither a 'Price' nor a 'date' column.
    """
    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected.
    if not (0 < train_frac <= 1 and 0 <= val_frac <= 1 and train_frac + val_frac <= 1 + 1e-9):
        raise ValueError(
            f"train_frac and val_frac must be non-negative, train_frac > 0, "
            f"and sum to at most 1 (got {train_frac!r} and {val_frac!r})"
        )

    # Always work on a private frame: rename() already returns a new object,
    # and the explicit copy() covers inputs that arrive with 'date' in place,
    # which were previously modified in the caller's DataFrame.
    if 'Price' in df.columns:
        frame = df.rename(columns={'Price': 'date'})
    else:
        frame = df.copy()

    # Parse dates with dateparser unless the column is already datetime-typed,
    # in which case the string round trip is skipped.
    if not pd.api.types.is_datetime64_any_dtype(frame['date']):
        frame['date'] = frame['date'].apply(lambda value: dateparser.parse(str(value)))

    # Drop rows whose date could not be parsed.
    frame = frame.dropna(subset=['date'])

    # Sort from oldest to most recent.
    frame = frame.sort_values(by='date', ascending=True).reset_index(drop=True)

    # Drop rows with missing values in any remaining column
    # (e.g. indicator warm-up rows).
    frame = frame.dropna().reset_index(drop=True)

    # Use the date as the index so it is excluded from the standardisation.
    frame = frame.set_index('date')

    # Positional split boundaries. With the defaults these are int(n * 0.6)
    # and int(n * 0.8), exactly as before.
    n = len(frame)
    train_end = int(n * train_frac)
    val_end = int(n * (train_frac + val_frac))

    train_df = frame.iloc[:train_end]
    val_df = frame.iloc[train_end:val_end]
    test_df = frame.iloc[val_end:]

    # Statistics come from the training split only, so nothing from the
    # validation/test periods leaks into the scaling.
    mean = train_df.mean()
    # A column that is constant over the training split has std == 0 and would
    # turn into NaN/inf after division; scale it by 1 instead, which maps it
    # to 0 in train.
    std = train_df.std(ddof=0).replace(0, 1)

    # Standardise each split and put 'date' back as a regular column.
    train_scaled = ((train_df - mean) / std).reset_index()
    val_scaled = ((val_df - mean) / std).reset_index()
    test_scaled = ((test_df - mean) / std).reset_index()

    return train_scaled, val_scaled, test_scaled


In [31]:
# Clean, split chronologically and standardise using training-set statistics.
train_scaled, val_scaled, test_scaled = preprocess(data_ind)

In [36]:
# Inspect the scaled training split. It starts at 2010-12-09 rather than the
# first date in the CSV because rows with NaN indicator values were dropped.
train_scaled

Unnamed: 0,date,Close,High,Low,Open,Volume,MA_20,EMA_20,MACD,MACD_signal,...,MFI,CMF,AD,EOM,Pivot_Point,VWAP,ATR_Bands_high,ATR_Bands_low,SuperTrend,Ulcer_Index
0,2010-12-09,-0.946410,-0.944064,-0.944386,-0.940876,0.130612,-0.947459,-0.944331,-0.157097,-0.171650,...,0.478611,-0.663146,-2.175254,-0.001683,-0.945004,-0.943006,-0.946221,-0.945785,-0.942821,-0.881913
1,2010-12-10,-0.945060,-0.947814,-0.945165,-0.945937,-0.279167,-0.946996,-0.943923,-0.163736,-0.172148,...,0.176836,-0.419511,-2.156919,-0.004109,-0.946073,-0.941978,-0.945346,-0.943919,-0.944560,-0.928174
2,2010-12-13,-0.947461,-0.944359,-0.944510,-0.943398,-0.015501,-0.946240,-0.943785,-0.177119,-0.175394,...,0.117364,-0.266887,-2.178756,0.015851,-0.945495,-0.941076,-0.947644,-0.946431,-0.942350,-0.913851
3,2010-12-14,-0.948005,-0.948144,-0.945643,-0.946988,-0.229580,-0.944912,-0.943712,-0.190521,-0.180842,...,0.235111,0.205329,-2.187996,-0.004833,-0.947324,-0.939932,-0.948532,-0.946597,-0.944377,-0.911885
4,2010-12-15,-0.945148,-0.942518,-0.944758,-0.947969,0.650080,-0.943343,-0.943371,-0.194424,-0.186031,...,0.077982,0.324608,-2.198743,0.021098,-0.944186,-0.939759,-0.945480,-0.943955,-0.941400,-0.900493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2238,2019-10-31,1.861064,1.858098,1.884986,1.857940,-0.603804,1.850331,1.867640,-0.199434,-0.472877,...,-0.075786,0.762524,1.701927,0.765848,1.868062,1.859414,1.833020,1.890039,1.916028,0.562834
2239,2019-11-01,1.886969,1.867561,1.909301,1.878989,-0.599996,1.854901,1.871951,-0.096035,-0.398723,...,0.251462,0.500030,1.702273,0.399604,1.887909,1.864547,1.855312,1.919865,1.938225,0.537051
2240,2019-11-04,1.910139,1.898137,1.937281,1.901755,-0.607653,1.861254,1.878074,0.046413,-0.309089,...,0.235610,0.401388,1.693083,0.799528,1.915161,1.869443,1.875843,1.945896,1.972323,0.520778
2241,2019-11-05,1.904969,1.889785,1.924867,1.916028,-0.974551,1.869742,1.883118,0.139818,-0.217507,...,-0.094378,0.680867,1.692419,-0.473340,1.906529,1.870912,1.867584,1.944102,1.966785,0.510245
