# OBJETIVO #1

Construir un DataFrame con las columnas:

*Ticker

*Fecha

*Retorno

*Label (1 si retorno > +5% o < -5%, 0 si no)

In [None]:
import yfinance as yf
import pandas as pd


# PASO 1: ARMAR UNA FUNCIÓN QUE PROCESE UN SOLO TICKER



In [None]:
def procesar_ticker(ticker, start="2010-01-01", end="2024-08-30"):
    data = yf.download(ticker, start=start, end=end, progress=False)
    data['Ticker'] = ticker
    data['Retorno'] = data['Close'].pct_change() * 100
    data['Label'] = ((data['Retorno'] > 5) | (data['Retorno'] < -5)).astype(int)
    return data[['Ticker', 'Retorno', 'Label']].dropna()


# PASO 2: APLICAR ESTA FUNCIÓN A MULTIPLES TICKERS

In [None]:
tickers = ['ggal.ba', 'ypfd.ba', 'come.ba', 'pamp.ba', 'bma.ba']
df_total = pd.DataFrame()

for t in tickers:
    df_ticker = procesar_ticker(t)
    df_total = pd.concat([df_total, df_ticker])


In [None]:
df_total.head()

Price,Ticker,Retorno,Label
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2010-01-05,ggal.ba,2.75228,0
2010-01-06,ggal.ba,-4.017854,0
2010-01-07,ggal.ba,1.860475,0
2010-01-08,ggal.ba,-2.283115,0
2010-01-11,ggal.ba,-4.205599,0


In [None]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17875 entries, 2010-01-05 to 2024-08-29
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   (Ticker, )   17875 non-null  object 
 1   (Retorno, )  17875 non-null  float64
 2   (Label, )    17875 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 558.6+ KB


In [None]:
# Asegurarse de que las columnas no tengan MultiIndex
df_total.columns = [col[0] if isinstance(col, tuple) else col for col in df_total.columns]


In [None]:
df_total.head()

Unnamed: 0_level_0,Ticker,Retorno,Label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-05,ggal.ba,2.75228,0
2010-01-06,ggal.ba,-4.017854,0
2010-01-07,ggal.ba,1.860475,0
2010-01-08,ggal.ba,-2.283115,0
2010-01-11,ggal.ba,-4.205599,0


# DISTRIBUCION DE LAS CLASES

In [None]:
print(df_total['Label'].value_counts(normalize=True))


Label
0    0.897455
1    0.102545
Name: proportion, dtype: float64


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X = df_total[['Retorno']]  # O incluir más features
y = df_total['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
modelo = RandomForestClassifier()
modelo.fit(X_train, y_train)


# AHORA AGREGO VARIABLES

In [None]:
df_total['SMA_5'] = df_total['Retorno'].rolling(window=5).mean()
df_total['SMA_10'] = df_total['Retorno'].rolling(window=10).mean()


df_total['Volatilidad_5d'] = df_total['Retorno'].rolling(window=5).std()
df_total['Volatilidad_10d'] = df_total['Retorno'].rolling(window=10).std()


df_total['Momentum_3d'] = df_total['Retorno'].rolling(window=3).sum()

df_total['Dia_semana'] = df_total.index.dayofweek  # Lunes = 0, Domingo = 6
df_total['Mes'] = df_total.index.month

df_total.head()

Price,Ticker,Retorno,Label,SMA_5,SMA_10,Volatilidad_5d,Volatilidad_10d,Momentum_3d,Dia_semana,Mes
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2010-01-05,ggal.ba,2.75228,0,,,,,,1,1
2010-01-06,ggal.ba,-4.017854,0,,,,,,2,1
2010-01-07,ggal.ba,1.860475,0,,,,,0.594901,3,1
2010-01-08,ggal.ba,-2.283115,0,,,,,-4.440494,4,1
2010-01-11,ggal.ba,-4.205599,0,-1.178762,,3.283742,,-4.628238,0,1


In [None]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17875 entries, 2010-01-05 to 2024-08-29
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   (Ticker, )           17875 non-null  object 
 1   (Retorno, )          17875 non-null  float64
 2   (Label, )            17875 non-null  int64  
 3   (SMA_5, )            17871 non-null  float64
 4   (SMA_10, )           17866 non-null  float64
 5   (Volatilidad_5d, )   17871 non-null  float64
 6   (Volatilidad_10d, )  17866 non-null  float64
 7   (Momentum_3d, )      17873 non-null  float64
 8   (Dia_semana, )       17875 non-null  int32  
 9   (Mes, )              17875 non-null  int32  
dtypes: float64(6), int32(2), int64(1), object(1)
memory usage: 1.4+ MB


In [None]:
df_total = df_total.dropna()
df_total.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17866 entries, 2010-01-18 to 2024-08-29
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   (Ticker, )           17866 non-null  object 
 1   (Retorno, )          17866 non-null  float64
 2   (Label, )            17866 non-null  int64  
 3   (SMA_5, )            17866 non-null  float64
 4   (SMA_10, )           17866 non-null  float64
 5   (Volatilidad_5d, )   17866 non-null  float64
 6   (Volatilidad_10d, )  17866 non-null  float64
 7   (Momentum_3d, )      17866 non-null  float64
 8   (Dia_semana, )       17866 non-null  int32  
 9   (Mes, )              17866 non-null  int32  
dtypes: float64(6), int32(2), int64(1), object(1)
memory usage: 1.4+ MB


In [None]:
X = df_total[['Retorno', 'SMA_5', 'SMA_10', 'Volatilidad_5d', 'Volatilidad_10d', 'Momentum_3d', 'Dia_semana', 'Mes']]
y = df_total['Label']


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

modelo = RandomForestClassifier()
modelo.fit(X_train, y_train)

y_pred = modelo.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3216
           1       1.00      0.99      1.00       358

    accuracy                           1.00      3574
   macro avg       1.00      1.00      1.00      3574
weighted avg       1.00      1.00      1.00      3574



# GRIDSEARCH PARA TUNING DE HIPERPARAMETROS

Vamos a usar GridSearchCV para encontrar los mejores hiperparámetros de un RandomForestClassifier, con validación cruzada. Esto ayuda a prevenir el sobreajuste y a mejorar la generalización del modelo.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report




In [None]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2'],
    'class_weight': [None, 'balanced']
}


In [None]:
modelo = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=modelo,
    param_grid=param_grid,
    cv=5,
    scoring='f1_macro',  # O 'roc_auc' si te interesa la curva ROC
    n_jobs=-1,            # Usa todos los núcleos disponibles
    verbose=2
)


grid_search.fit(X, y)

print("Mejores hiperparámetros:")
print(grid_search.best_params_)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
