# Random Forest

### Preparamos las features

In [32]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score 
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import json

# Seed shared by the train/validation split, the K-fold encoder and the models.
RANDOM_SEED = 27

# Input locations, relative to the notebook's working directory.
DATA_DIR = Path('../data/nlp-getting-started')
TRAIN_PATH, TEST_PATH = DATA_DIR / 'train.csv', DATA_DIR / 'test.csv'
LOCATION_TO_COUNTRY_PATH = Path('../data/location_to_country.json')

train_df, test_df = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)

# Lookup table used below to turn raw `location` values into a country.
location_to_country = json.loads(
    LOCATION_TO_COUNTRY_PATH.read_text(encoding='utf-8')
)

# Categorical features: 'country' and 'keyword'. They get mean (target)
# encoded later on.
categorical_features = ['country', 'keyword']

for frame in (train_df, test_df):
    # Locations that are missing or absent from the lookup become 'unknown'.
    frame['country'] = frame['location'].map(location_to_country).fillna('unknown')
    # Missing keywords get their own explicit category.
    frame['keyword'] = frame['keyword'].fillna('missing')

# Numeric features: 'text_length', 'num_hashtags', 'num_mentions',
# 'num_uppercase_per_word', 'sentiment_score', 'has_url'.
for frame in (train_df, test_df):
    # Treat a missing tweet as an empty string for *every* text-derived
    # feature. Previously only 'has_url' and 'text_length' did this, so a
    # missing tweet produced NaN hashtag/mention counts.
    filled_text = frame['text'].fillna('')

    # Hand-made binary flag: 1 when the tweet contains an http(s) URL.
    frame['has_url'] = filled_text.str.contains(r'http[s]?://', regex=True).astype(int)
    frame['text_length'] = filled_text.str.len()
    frame['num_hashtags'] = filled_text.str.count('#')
    frame['num_mentions'] = filled_text.str.count('@')

def uppercase_per_word(text):
    """Return the number of uppercase letters per word in *text*.

    A "word" is a whitespace-separated token containing at least one
    alphabetic character. The numerator counts every uppercase alphabetic
    character in the whole text.

    Args:
        text: The tweet text. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as missing.

    Returns:
        ``uppercase_letters / words`` as a float, or ``0.0`` when the value is
        missing or contains no word.
    """
    # Missing values must not be stringified: str(None) == 'None' would count
    # as one word with one capital letter. `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return 0.0

    text = str(text)

    # Count tokens that contain at least one alphabetic character.
    word_count = sum(
        1 for token in text.split() if any(ch.isalpha() for ch in token)
    )
    if word_count == 0:
        return 0.0

    # Only alphabetic characters are considered, so odd symbols never count.
    uppercase_letters = sum(1 for ch in text if ch.isalpha() and ch.isupper())
    return uppercase_letters / word_count


for frame in (train_df, test_df):
    frame['num_uppercase_per_word'] = frame['text'].apply(uppercase_per_word)

# Single analyzer instance, reused by `get_sentiment` for every tweet.
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Return the analyzer's compound sentiment for *text*, shifted and scaled.

    The compound score is mapped with ``(compound + 1) / 2`` (so a compound
    of 0 becomes 0.5). Missing or whitespace-only text short-circuits to 0.5
    without calling the analyzer.
    """
    has_content = not pd.isna(text) and text.strip() != ''
    if has_content:
        raw_compound = analyzer.polarity_scores(text)['compound']
        return (raw_compound + 1) / 2
    return 0.5

for frame in (train_df, test_df):
    frame['sentiment_score'] = frame['text'].apply(get_sentiment)

numeric_features = [
    'text_length',
    'num_hashtags',
    'num_mentions',
    'num_uppercase_per_word',
    'sentiment_score',
    'has_url',
]

# Raw text column, turned into an embedding further down.
embedding_feature = 'text'

# 1. Separate features from target.
feature_columns = [*numeric_features, *categorical_features, embedding_feature]
X = train_df[feature_columns].copy()
y = train_df['target'].copy()

# 2. Stratified train/validation split (80/20).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_SEED,
)

X_train.head()

Unnamed: 0,text_length,num_hashtags,num_mentions,num_uppercase_per_word,sentiment_score,has_url,country,keyword,text
2721,87,1,0,1.0,0.18755,1,unknown,devastated,Obama declares disaster for typhoon-devastated...
2259,132,0,0,0.083333,0.55135,0,unknown,deluged,Businesses are deluged with invzices. Make you...
1815,136,0,0,1.333333,0.27205,1,United Kingdom,crashed,Neil_Eastwood77: I AM A KNOBHEAD!! Bin Laden f...
682,139,0,0,1.666667,0.5,1,unknown,blazing,Morgan Silver Dollar 1880 S Gem BU DMPL Cameo ...
7216,121,0,2,0.210526,0.78595,0,United States of America,weapons,@danagould @WaynesterAtl I agree with backgrou...


### Oki vamos a hacer encoding de las features categóricas.

In [33]:
from sklearn.model_selection import KFold

def kfold_target_encoding(train_series, target_series, n_splits=5, random_state=RANDOM_SEED):
    """Out-of-fold mean (target) encoding of a categorical series.

    Each row is encoded with the mean target of its category computed on the
    other folds only, so a row's own label never feeds its own encoding.
    Rows whose category does not appear in the other folds fall back to the
    global target mean.

    Args:
        train_series: Categorical values to encode.
        target_series: Target values for the same rows.
        n_splits: Number of shuffled K-fold splits.
        random_state: Seed for the K-fold shuffle.

    Returns:
        A tuple ``(encoded, mapping, global_mean)``: the out-of-fold encoding
        indexed like ``train_series``, the per-category target mean computed
        on all rows (for encoding unseen data), and the overall target mean.
    """
    def _category_means(features, targets):
        # Mean target per category for the given rows.
        pairs = pd.DataFrame({'feature': features, 'target': targets})
        return pairs.groupby('feature')['target'].mean()

    global_mean = target_series.mean()
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    encoded = pd.Series(np.nan, index=train_series.index, dtype=float)
    for fit_pos, holdout_pos in splitter.split(train_series):
        fold_means = _category_means(
            train_series.iloc[fit_pos], target_series.iloc[fit_pos]
        )
        encoded.iloc[holdout_pos] = train_series.iloc[holdout_pos].map(fold_means)

    # Categories unseen in the fitting folds are still NaN at this point.
    encoded = encoded.fillna(global_mean)

    return encoded, _category_means(train_series, target_series), global_mean

# Target-mean encode every categorical column. The training split gets
# out-of-fold encodings; validation and test reuse the mapping learned on the
# whole training split, with the global target mean for unseen categories.
mean_encoded_features = []

for column in categorical_features:
    train_encoded, mapping, global_mean = kfold_target_encoding(
        X_train[column], y_train
    )
    encoded_col = f'{column}_target_mean'
    # Add the new column to each split.
    X_train[encoded_col] = train_encoded
    X_val[encoded_col] = X_val[column].map(mapping).fillna(global_mean)
    test_df[encoded_col] = test_df[column].map(mapping).fillna(global_mean)

    mean_encoded_features.append(encoded_col)

# Append the encoded columns exactly once. Filtering first keeps this cell
# idempotent: re-running it no longer duplicates the `*_target_mean` columns
# in `numeric_features` (and therefore in the matrices built below).
numeric_features = [
    name for name in numeric_features if name not in mean_encoded_features
] + mean_encoded_features


X_train[numeric_features].head()

X_train_numeric = X_train[numeric_features].to_numpy()
X_val_numeric = X_val[numeric_features].to_numpy()
X_test_numeric = test_df[numeric_features].to_numpy()

Genial, ya tenemos todas las features listas para el random forest. De todas formas me gustaría hacer un embedding del texto y reducirlo a pocas features; quizás le aporta algo al modelo...

In [None]:
from sklearn.decomposition import TruncatedSVD
N_COMPONENTS = 15  # Number of SVD components; a hyper-parameter still to be tuned

# Text transformer (TF-IDF over word uni- and bi-grams).
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    min_df=2,
    max_df=0.95,
    max_features=5000,
)

# Fit on the training split only; validation and test are just transformed.
X_train_text = tfidf_vectorizer.fit_transform(X_train[embedding_feature].fillna(''))
X_val_text, X_test_text = (
    tfidf_vectorizer.transform(frame[embedding_feature].fillna(''))
    for frame in (X_val, test_df)
)

feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"\nTF-IDF vectorizer:")
print(f"  Vocabulary size: {len(tfidf_vectorizer.vocabulary_)}")
print(f"  Feature names (first 10): {feature_names[:10].tolist()}")


# Reduce the TF-IDF matrix to N_COMPONENTS columns (fitted on train only).
svd = TruncatedSVD(random_state=RANDOM_SEED, n_components=N_COMPONENTS)

X_train_text_15 = svd.fit_transform(X_train_text)
X_val_text_15, X_test_text_15 = (
    svd.transform(matrix) for matrix in (X_val_text, X_test_text)
)

# Final matrices: engineered numeric columns followed by the SVD columns.
X_train_combined, X_val_combined, X_test_combined = (
    np.hstack(parts)
    for parts in (
        [X_train_numeric, X_train_text_15],
        [X_val_numeric, X_val_text_15],
        [X_test_numeric, X_test_text_15],
    )
)

print(f"  Test shape: {X_test_combined.shape}")

pd.DataFrame(X_train_combined).head()


TF-IDF vectorizer:
  Vocabulary size: 5000
  Feature names (first 10): ['0', '00', '00 http', '00 pm', '000', '01', '01 04', '02', '03', '04']
  Test shape: (3263, 23)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,87.0,1.0,0.0,1.0,0.18755,1.0,0.415596,0.5,0.130693,-0.023246,...,0.047581,0.032348,-0.023974,-0.005268,-0.085837,-0.130308,0.297432,-0.048629,0.426707,-0.115991
1,132.0,0.0,0.0,0.083333,0.55135,0.0,0.415596,0.357143,0.006938,0.004381,...,-0.016803,0.001901,-0.001933,-0.004619,0.003743,-0.011423,0.003899,-0.009453,0.006504,-0.004439
2,136.0,0.0,0.0,1.333333,0.27205,1.0,0.367647,0.666667,0.145825,-0.009459,...,-0.002862,-0.005137,-0.006146,-0.034278,0.018311,0.003717,-0.028955,-0.003576,-0.002582,-0.053259
3,139.0,0.0,0.0,1.666667,0.5,1.0,0.420105,0.0,0.226158,-0.021648,...,-0.042221,0.005437,-0.020891,-0.042579,0.035766,-0.020977,-0.044987,0.008722,-0.005189,-0.056172
4,121.0,0.0,2.0,0.210526,0.78595,0.0,0.428266,0.44,0.017488,0.026917,...,0.038122,-0.058757,0.02106,0.003295,-0.077684,0.156654,-0.025329,0.006817,0.016986,-0.033469


Bueno me cierra. A ver cómo sale el modelo con estas features. 

# RandomForest

## Hiper-parámetros

- n_estimators: cantidad de árboles a construir (100)

- max_depth: máxima profundidad de cada árbol

- min_samples_split: cantidad mínima de datos requeridos para splitear un nodo interno (2)  

- min_samples_leaf: cantidad mínima de datos requeridos para ser una hoja (1)

- max_features: cantidad de features a considerar cuando se busca el mejor split (por defecto `'sqrt'`, es decir √n, en el clasificador)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# Base model.
rf_base = RandomForestClassifier(
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=RANDOM_SEED,
)

# Hyper-parameter grid: 3 * 3 * 3 * 3 * 2 = 162 combinations.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}

# Exhaustive search scored with F1 over 3 cross-validation folds.
grid_search = GridSearchCV(
    rf_base,
    param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_combined, y_train)
best_rf = grid_search.best_estimator_

print("Mejores hiper-parámetros encontrados:")
print(grid_search.best_params_)

# Score the best estimator on the held-out validation split.
y_val_pred = best_rf.predict(X_val_combined)
f1_val = f1_score(y_val, y_val_pred)
print(f"F1 en validation: {f1_val:.4f}")


Fitting 3 folds for each of 162 candidates, totalling 486 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.4s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   1.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=

Jajajaja, bastante peor que mi regresión lineal. Vamos a probar cambiando las features del embedding de texto: quizás ayude reducirlo a más dimensiones (conservar más componentes).

In [39]:
from sklearn.decomposition import TruncatedSVD
N_COMPONENTS = 100  # Number of SVD components; a hyper-parameter still to be tuned

# Text transformer (TF-IDF over word uni- and bi-grams).
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    min_df=2,
    max_df=0.95,
    max_features=5000,
)

# Fit on the training split only; validation and test are just transformed.
X_train_text = tfidf_vectorizer.fit_transform(X_train[embedding_feature].fillna(''))
X_val_text, X_test_text = (
    tfidf_vectorizer.transform(frame[embedding_feature].fillna(''))
    for frame in (X_val, test_df)
)

feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"\nTF-IDF vectorizer:")
print(f"  Vocabulary size: {len(tfidf_vectorizer.vocabulary_)}")
print(f"  Feature names (first 10): {feature_names[:10].tolist()}")


# Reduce the TF-IDF matrix to N_COMPONENTS columns (fitted on train only).
svd = TruncatedSVD(random_state=RANDOM_SEED, n_components=N_COMPONENTS)

# NOTE: the `_15` suffix is a leftover name; these arrays hold N_COMPONENTS
# columns, not 15.
X_train_text_15 = svd.fit_transform(X_train_text)
X_val_text_15, X_test_text_15 = (
    svd.transform(matrix) for matrix in (X_val_text, X_test_text)
)

# Final matrices: engineered numeric columns followed by the SVD columns.
X_train_combined, X_val_combined, X_test_combined = (
    np.hstack(parts)
    for parts in (
        [X_train_numeric, X_train_text_15],
        [X_val_numeric, X_val_text_15],
        [X_test_numeric, X_test_text_15],
    )
)

print(f"  Test shape: {X_test_combined.shape}")

pd.DataFrame(X_train_combined).head()


TF-IDF vectorizer:
  Vocabulary size: 5000
  Feature names (first 10): ['0', '00', '00 http', '00 pm', '000', '01', '01 04', '02', '03', '04']
  Test shape: (3263, 108)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
0,87.0,1.0,0.0,1.0,0.18755,1.0,0.415596,0.5,0.130693,-0.023248,...,0.005304,-0.013483,0.002326,0.002984,0.003938,-0.002332,0.005199,-0.013287,-0.010963,-0.001167
1,132.0,0.0,0.0,0.083333,0.55135,0.0,0.415596,0.357143,0.006938,0.004401,...,0.139461,0.024502,-0.080079,0.080185,-0.055549,0.19057,-0.029887,-0.03514,-0.001506,0.00681
2,136.0,0.0,0.0,1.333333,0.27205,1.0,0.367647,0.666667,0.145825,-0.009426,...,-0.035078,0.039258,0.034771,0.021395,0.048912,0.009592,0.013188,-0.009784,0.104209,-0.011875
3,139.0,0.0,0.0,1.666667,0.5,1.0,0.420105,0.0,0.226158,-0.021627,...,0.032511,0.001595,0.013292,0.003066,-0.022589,-0.022632,9e-05,0.016952,-0.027946,0.004261
4,121.0,0.0,2.0,0.210526,0.78595,0.0,0.428266,0.44,0.017488,0.026941,...,0.022296,0.003623,-0.012866,-0.047921,0.095409,0.052279,-0.00026,0.049052,-0.054937,0.017083


In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# Base model.
rf_base = RandomForestClassifier(
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=RANDOM_SEED,
)

# Hyper-parameter grid (start with something small):
# 3 * 3 * 3 * 3 * 2 = 162 combinations.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}

# Exhaustive search scored with F1 over 3 cross-validation folds.
grid_search = GridSearchCV(
    rf_base,
    param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_combined, y_train)
best_rf = grid_search.best_estimator_

print("Mejores hiper-parámetros encontrados:")
print(grid_search.best_params_)

# Score the best estimator on the held-out validation split.
y_val_pred = best_rf.predict(X_val_combined)
f1_val = f1_score(y_val, y_val_pred)
print(f"F1 en validation: {f1_val:.4f}")


Fitting 3 folds for each of 162 candidates, totalling 486 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   3.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   3.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=

Mucho peor. Overfitting a full. Voy a probar con otra lib para el embedding de texto. Entiendo que TF-IDF no capta contexto, así que ¿capaz con BERT anda mejor?

In [None]:
# ! pip install sentence-transformers
# ! pip install tf_keras

from sentence_transformers import SentenceTransformer

bert_model = SentenceTransformer('all-MiniLM-L6-v2')  # 384 dims


def encode_texts(text_series):
    """Embed every text in *text_series* with the module-level ``bert_model``.

    Missing values are encoded as the empty string. The embeddings are
    returned as a NumPy array.
    """
    return np.array(
        bert_model.encode(
            text_series.fillna('').tolist(),
            batch_size=32,
            show_progress_bar=True,
        )
    )

# BERT embeddings for train / validation / test (encoded in that order).
X_train_text_bert, X_val_text_bert, X_test_text_bert = (
    encode_texts(frame[embedding_feature]) for frame in (X_train, X_val, test_df)
)

print("Shapes BERT:")
for label, matrix in (
    ("  Train:", X_train_text_bert),
    ("  Val:  ", X_val_text_bert),
    ("  Test: ", X_test_text_bert),
):
    print(label, matrix.shape)


# Final matrices: engineered numeric columns followed by the embedding columns.
X_train_combined, X_val_combined, X_test_combined = (
    np.hstack(parts)
    for parts in (
        [X_train_numeric, X_train_text_bert],
        [X_val_numeric, X_val_text_bert],
        [X_test_numeric, X_test_text_bert],
    )
)

Collecting tf_keras
  Downloading tf_keras-2.20.1-py3-none-any.whl.metadata (1.8 kB)
Downloading tf_keras-2.20.1-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m30.7 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: tf_keras
Successfully installed tf_keras-2.20.1


Batches:   0%|          | 0/191 [00:00<?, ?it/s]

Batches:   0%|          | 0/48 [00:00<?, ?it/s]

Batches:   0%|          | 0/102 [00:00<?, ?it/s]

Shapes BERT:
  Train: (6090, 384)
  Val:   (1523, 384)
  Test:  (3263, 384)


A ver cómo nos va ahora con RF.

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# Base model.
rf_base = RandomForestClassifier(
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=RANDOM_SEED,
)

# Hyper-parameter grid: 3 * 3 * 3 * 3 * 2 = 162 combinations.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}

# Exhaustive search scored with F1 over 3 cross-validation folds.
grid_search = GridSearchCV(
    rf_base,
    param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_combined, y_train)
best_rf = grid_search.best_estimator_

print("Mejores hiper-parámetros encontrados:")
print(grid_search.best_params_)

# Score the best estimator on the held-out validation split.
y_val_pred = best_rf.predict(X_val_combined)
f1_val = f1_score(y_val, y_val_pred)
print(f"F1 en validation: {f1_val:.4f}")


Fitting 3 folds for each of 162 candidates, totalling 486 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   3.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   2.4s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=

¡Cada vez peor! Probemos sin el embedding de texto: capaz que ahí mejora, simplemente porque al aumentar las dimensiones está empeorando fuerte...

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# Base model.
rf_base = RandomForestClassifier(
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=RANDOM_SEED,
)

# Hyper-parameter grid: 3 * 3 * 3 * 3 * 2 = 162 combinations.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}

# Exhaustive search scored with F1 over 3 cross-validation folds.
grid_search = GridSearchCV(
    rf_base,
    param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
# This run uses only the engineered numeric columns (no text embedding).
grid_search.fit(X_train_numeric, y_train)
best_rf = grid_search.best_estimator_

print("Mejores hiper-parámetros encontrados:")
print(grid_search.best_params_)

# Score the best estimator on the held-out validation split.
y_val_pred = best_rf.predict(X_val_numeric)
f1_val = f1_score(y_val, y_val_pred)
print(f"F1 en validation: {f1_val:.4f}")


Fitting 3 folds for each of 162 candidates, totalling 486 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=

¡Cada vez peor! Quizás Random Forest no es el mejor modelo para este problema. Voy a probar con XGBoost en el notebook 4.