In [35]:
# --- Importaciones ---
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import joblib

# --- Cargar dataset procesado ---
df = pd.read_csv(r"C:\Users\Luis Arbio\Documents\Hoteles\data\Processed\reservas_hoteles_fe.csv")

In [36]:
# --- Separar features y target ---
X = df.drop(['Booking_ID','booking_status'], axis=1)
y = df['booking_status']

In [37]:
X

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,...,room_type_reserved_Room_Type 2,room_type_reserved_Room_Type 3,room_type_reserved_Room_Type 4,room_type_reserved_Room_Type 5,room_type_reserved_Room_Type 6,room_type_reserved_Room_Type 7,market_segment_type_Complementary,market_segment_type_Corporate,market_segment_type_Offline,market_segment_type_Online
0,2,0,1,2,0,224,2017,10,2,0,...,False,False,False,False,False,False,False,False,True,False
1,2,0,2,3,0,5,2018,11,6,0,...,False,False,False,False,False,False,False,False,False,True
2,1,0,2,1,0,1,2018,2,28,0,...,False,False,False,False,False,False,False,False,False,True
3,2,0,0,2,0,211,2018,5,20,0,...,False,False,False,False,False,False,False,False,False,True
4,2,0,1,1,0,48,2018,4,11,0,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,3,0,2,6,0,85,2018,8,3,0,...,False,False,True,False,False,False,False,False,False,True
36271,2,0,1,3,0,228,2018,10,17,0,...,False,False,False,False,False,False,False,False,False,True
36272,2,0,2,6,0,148,2018,7,1,0,...,False,False,False,False,False,False,False,False,False,True
36273,2,0,0,3,0,63,2018,4,21,0,...,False,False,False,False,False,False,False,False,False,True


In [38]:
y

0        0
1        0
2        1
3        1
4        1
        ..
36270    0
36271    1
36272    0
36273    1
36274    0
Name: booking_status, Length: 36275, dtype: int64

In [39]:
# --- Dividir en train/test ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [40]:
# --- Previsualización de train/test ---
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train distribution:\n", y_train.value_counts(normalize=True))
print("y_test distribution:\n", y_test.value_counts(normalize=True))

X_train shape: (29020, 31)
X_test shape: (7255, 31)
y_train distribution:
 booking_status
0    0.672364
1    0.327636
Name: proportion, dtype: float64
y_test distribution:
 booking_status
0    0.672364
1    0.327636
Name: proportion, dtype: float64


In [41]:
# --- Columnas numéricas a escalar ---
numeric_cols = [
    'no_of_adults', 'no_of_children', 'no_of_weekend_nights',
    'no_of_week_nights', 'required_car_parking_space', 'lead_time',
    'arrival_year', 'arrival_month', 'arrival_date',
    'repeated_guest', 'no_of_previous_cancellations',
    'no_of_previous_bookings_not_canceled', 'avg_price_per_room',
    'total_guests', 'stay_duration'
]


In [42]:
# --- Escalado
scaler = StandardScaler()

X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])


In [43]:
# --- Definir modelos ---
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

results = {}

In [44]:
# --- Entrenar y evaluar métricas ---
for name, model in models.items():
    model.fit(X_train, y_train)
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    y_train_proba = model.predict_proba(X_train)[:,1]
    y_test_proba = model.predict_proba(X_test)[:,1]
    
    results[name] = {
        'train_accuracy': accuracy_score(y_train, y_train_pred),
        'test_accuracy': accuracy_score(y_test, y_test_pred),
        'train_f1': f1_score(y_train, y_train_pred),
        'test_f1': f1_score(y_test, y_test_pred),
        'train_roc': roc_auc_score(y_train, y_train_proba),
        'test_roc': roc_auc_score(y_test, y_test_proba)
    }

results_df = pd.DataFrame(results).T
print(results_df)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


                    train_accuracy  test_accuracy  train_f1   test_f1  \
LogisticRegression        0.807340       0.816678  0.682672  0.698686   
RandomForest              0.994142       0.902688  0.991032  0.844835   
XGBoost                   0.919056       0.895107  0.872261  0.833807   

                    train_roc  test_roc  
LogisticRegression   0.862625  0.871665  
RandomForest         0.999390  0.956783  
XGBoost              0.976616  0.954542  


In [45]:
# --- Optimización RandomForest con GridSearchCV ---
param_grid = {
    'n_estimators':[100,200],
    'max_depth':[None,10,20],
    'min_samples_split':[2,5]
}

grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='roc_auc')
grid.fit(X_train, y_train)
best_rf = grid.best_estimator_
print("Mejores parámetros RandomForest:", grid.best_params_)

Mejores parámetros RandomForest: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}


In [46]:
# --- Guardar splits, scaler y modelo ---
X_train.to_csv(r"C:\Users\Luis Arbio\Documents\Hoteles\data\Processed\X_train.csv", index=False)
X_test.to_csv(r"C:\Users\Luis Arbio\Documents\Hoteles\data\Processed\X_test.csv", index=False)
y_train.to_csv(r"C:\Users\Luis Arbio\Documents\Hoteles\data\Processed\y_train.csv", index=False)
y_test.to_csv(r"C:\Users\Luis Arbio\Documents\Hoteles\data\Processed\y_test.csv", index=False)

joblib.dump(best_rf, r"C:\Users\Luis Arbio\Documents\Hoteles\data\Processed\best_rf_model.pkl")
joblib.dump(scaler, r"C:\Users\Luis Arbio\Documents\Hoteles\data\Processed\scaler.pkl")

['C:\\Users\\Luis Arbio\\Documents\\Hoteles\\data\\Processed\\scaler.pkl']