In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

# Load the raw bookings data
df = pd.read_csv('/content/hotel_bookings.csv')

# Preprocessing
# Replace missing values with neutral placeholders
df['children'] = df['children'].fillna(0)
df['country'] = df['country'].fillna('Unknown')
df['agent'] = df['agent'].fillna(0)
df['company'] = df['company'].fillna(0)

# Encode categorical variables
categorical_cols = ['hotel', 'arrival_date_month', 'meal', 'country', 'market_segment',
                   'distribution_channel', 'reserved_room_type', 'assigned_room_type',
                   'deposit_type', 'customer_type']

# Fit a separate LabelEncoder per column and keep each fitted instance.
# Re-fitting one shared encoder would discard every mapping but the last,
# leaving no way to reproduce the category -> integer codes at inference time.
# LabelEncoder assigns codes by sorted order of the labels, which is generally
# NOT the order a hand-written mapping would use.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Persist the encoders next to the other artifacts so any consumer that builds
# model inputs from raw category values can apply the exact training mapping.
with open("hotel_label_encoders.pkl", "wb") as f:
    pickle.dump(label_encoders, f)

# Model input columns; this list also fixes the column order the scaler is fitted on
features = [
    'hotel',
    'lead_time',
    'arrival_date_month',
    'arrival_date_week_number',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'reserved_room_type',
    'assigned_room_type',
    'booking_changes',
    'deposit_type',
    'days_in_waiting_list',
    'adr',
    'required_car_parking_spaces',
    'total_of_special_requests',
]

# Design matrix and target
X = df[features]
y = df['is_canceled']

# Show the class balance of the target
print("Distribución de cancelaciones:")
print(y.value_counts(normalize=True))

# 70/30 train/test split, stratified on the target so both parts keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Standardise the features; statistics are learned from the training part only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler for later inference
with open("hotel_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Model 1: Random Forest
print("\nEntrenando Random Forest...")
rf_params = dict(n_estimators=200, max_depth=20, min_samples_split=5, random_state=42)
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train_scaled, y_train)

# Model 2: Gradient Boosting
print("\nEntrenando Gradient Boosting...")
gb_params = dict(n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_train_scaled, y_train)

# Evaluate both models on the held-out test set
print("\nEvaluando modelos...")


def _evaluate(heading, estimator):
    """Print the test-set report and accuracy for a fitted estimator; return its predictions."""
    y_pred = estimator.predict(X_test_scaled)
    print(heading)
    print(classification_report(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    return y_pred


rf_predictions = _evaluate("\nRandom Forest:", rf)
gb_predictions = _evaluate("\nGradient Boosting:", gb)

# Rank the input columns by the Random Forest's feature importances
importance_table = pd.DataFrame({
    'feature': features,
    'importance': rf.feature_importances_
})
feature_importance_rf = importance_table.sort_values('importance', ascending=False)

print("\nTop 10 características más importantes (Random Forest):")
print(feature_importance_rf.head(10))

# Persist the trained models and the feature list
print("\nGuardando modelos...")

artifacts = (
    ("hotel_rf_model.pkl", rf),          # Random Forest
    ("hotel_gb_model.pkl", gb),          # Gradient Boosting
    ("hotel_features.pkl", features),    # feature list, for future reference
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("\nModelos guardados como 'hotel_rf_model.pkl' y 'hotel_gb_model.pkl'")
print("Scaler guardado como 'hotel_scaler.pkl'")
print("Lista de features guardada como 'hotel_features.pkl'")

# Crear función de predicción para uso futuro
def predict_cancellation(data, model_path="hotel_rf_model.pkl", scaler_path="hotel_scaler.pkl"):
    """
    Run a pickled model on new data after applying the pickled scaler.

    Args:
        data: Input rows passed as-is to the scaler's ``transform``.
        model_path: Path of the pickled model (must offer ``predict`` and
            ``predict_proba``).
        scaler_path: Path of the pickled scaler (must offer ``transform``).

    Returns:
        A ``(prediction, probability)`` tuple holding the results of
        ``model.predict`` and ``model.predict_proba`` on the scaled data.
    """
    def _unpickle(path):
        # Read one pickled object from disk, closing the file afterwards.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    # Same load order as before: model first, then scaler
    fitted_model = _unpickle(model_path)
    fitted_scaler = _unpickle(scaler_path)

    scaled_rows = fitted_scaler.transform(data)

    return fitted_model.predict(scaled_rows), fitted_model.predict_proba(scaled_rows)

# Usage example: score the first test row with the saved Random Forest
print("\nEjemplo de predicción con una muestra:")
sample_data = X_test.iloc[[0]]  # double brackets keep a one-row DataFrame
prediction, probability = predict_cancellation(sample_data)

outcome_label = 'Cancelado' if prediction[0] == 1 else 'No Cancelado'
cancel_probability = probability[0][1]  # second column = probability of class 1
print(f"Predicción: {outcome_label}")
print(f"Probabilidad de cancelación: {cancel_probability:.2%}")

Distribución de cancelaciones:
is_canceled
0    0.629584
1    0.370416
Name: proportion, dtype: float64

Entrenando Random Forest...

Entrenando Gradient Boosting...

Evaluando modelos...

Random Forest:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90     22550
           1       0.87      0.76      0.81     13267

    accuracy                           0.87     35817
   macro avg       0.87      0.85      0.86     35817
weighted avg       0.87      0.87      0.87     35817

Accuracy: 0.8693078705642572

Gradient Boosting:
              precision    recall  f1-score   support

           0       0.87      0.91      0.89     22550
           1       0.84      0.76      0.80     13267

    accuracy                           0.86     35817
   macro avg       0.85      0.84      0.84     35817
weighted avg       0.86      0.86      0.86     35817

Accuracy: 0.8574140771142195

Top 10 características más importantes (Random Forest):
       

In [None]:
!pip install streamlit

In [None]:
import streamlit as st
import pickle
import pandas as pd
import numpy as np

# Cargar los modelos y el scaler
@st.cache_resource
def load_models():
    """Load the pickled artifacts used by the app.

    Returns:
        A tuple ``(rf_model, gb_model, scaler, features)`` unpickled from
        'hotel_rf_model.pkl', 'hotel_gb_model.pkl', 'hotel_scaler.pkl' and
        'hotel_features.pkl' respectively.
    """
    artifact_files = (
        'hotel_rf_model.pkl',
        'hotel_gb_model.pkl',
        'hotel_scaler.pkl',
        'hotel_features.pkl',
    )
    loaded = []
    for artifact_file in artifact_files:
        # Each file is opened, read and closed before the next one is touched
        with open(artifact_file, 'rb') as handle:
            loaded.append(pickle.load(handle))
    rf_model, gb_model, scaler, features = loaded
    return rf_model, gb_model, scaler, features

# Función para hacer predicciones
def predict_cancellation(data, model, scaler):
    """Scale ``data`` and return the model's output for its first row.

    Args:
        data: Input rows passed to ``scaler.transform``.
        model: Object exposing ``predict`` and ``predict_proba``.
        scaler: Object exposing ``transform``.

    Returns:
        ``(label, probabilities)``: the first element of ``model.predict``
        and the first element of ``model.predict_proba`` on the scaled data.
    """
    scaled_rows = scaler.transform(data)
    labels = model.predict(scaled_rows)
    probabilities = model.predict_proba(scaled_rows)
    first_label, first_probabilities = labels[0], probabilities[0]
    return first_label, first_probabilities

# Raw category values each encoded column is presumed to have had in training.
# The models were trained on LabelEncoder output, and LabelEncoder numbers the
# labels in sorted order, so the codes sent to the model must be derived the
# same way rather than from UI order.
# NOTE(review): these vocabularies assume the training CSV is the public
# "hotel booking demand" dataset with all twelve months present; confirm them
# against the fitted encoders' classes_ before relying on the predictions.
_ASSUMED_TRAINING_CATEGORIES = {
    'hotel': ['City Hotel', 'Resort Hotel'],
    'arrival_date_month': ['January', 'February', 'March', 'April', 'May', 'June',
                           'July', 'August', 'September', 'October', 'November', 'December'],
    'meal': ['BB', 'FB', 'HB', 'SC', 'Undefined'],
    'market_segment': ['Aviation', 'Complementary', 'Corporate', 'Direct', 'Groups',
                       'Offline TA/TO', 'Online TA', 'Undefined'],
    'distribution_channel': ['Corporate', 'Direct', 'GDS', 'TA/TO', 'Undefined'],
}

# Sidebar label -> raw dataset value for the meal plan.
# NOTE(review): 'No Meal' is mapped to 'SC' (no meal package) — confirm.
_MEAL_LABEL_TO_VALUE = {
    'BB (Bed & Breakfast)': 'BB',
    'HB (Half Board)': 'HB',
    'FB (Full Board)': 'FB',
    'No Meal': 'SC',
}

# The form has no distribution-channel input, so a channel is inferred from
# the market segment. This is still a simplification, but unlike reusing the
# market-segment code it always yields a code from the channel vocabulary.
# NOTE(review): the segment -> channel pairing is a heuristic; confirm.
_SEGMENT_TO_CHANNEL = {
    'Direct': 'Direct',
    'Corporate': 'Corporate',
    'Online TA': 'TA/TO',
    'Offline TA/TO': 'TA/TO',
    'Groups': 'TA/TO',
    'Aviation': 'Corporate',
}


def _label_codes(column):
    """Return {label: code} for ``column``, numbering labels in sorted order like LabelEncoder."""
    return {label: code
            for code, label in enumerate(sorted(_ASSUMED_TRAINING_CATEGORIES[column]))}


def main():
    """Render the Streamlit app: collect booking parameters, predict, and display the result.

    The sidebar collects a subset of the model features; the remaining ones
    are filled with fixed defaults. Categorical inputs are converted to the
    sorted-order integer codes the models were trained on, and the input row
    is reordered to the saved feature list before scaling.
    """
    # App title
    st.title('Predicción de Cancelaciones de Reservas Hoteleras')

    # Load models, scaler and the ordered feature list
    rf_model, gb_model, scaler, features = load_models()

    # Sidebar
    st.sidebar.header('Parámetros de la Reserva')

    def user_input_parameters():
        """Read the sidebar widgets and return a one-row DataFrame of model inputs."""
        # Basic parameters
        hotel = st.sidebar.selectbox('Tipo de Hotel', ['Resort Hotel', 'City Hotel'], index=0)
        hotel_encoded = _label_codes('hotel')[hotel]

        lead_time = st.sidebar.slider('Tiempo de anticipación (días)', 0, 365, 30)

        arrival_month = st.sidebar.selectbox('Mes de llegada',
            ['January', 'February', 'March', 'April', 'May', 'June',
             'July', 'August', 'September', 'October', 'November', 'December'])
        # Alphabetical codes (April=0, August=1, ...), not calendar order
        month_encoded = _label_codes('arrival_date_month')[arrival_month]

        # Stay parameters
        st.sidebar.subheader('Detalles de la Estancia')
        weekend_nights = st.sidebar.number_input('Noches de fin de semana', 0, 7, 1)
        week_nights = st.sidebar.number_input('Noches entre semana', 0, 14, 2)

        # Guest information
        st.sidebar.subheader('Información del Huésped')
        adults = st.sidebar.number_input('Número de adultos', 1, 4, 2)
        children = st.sidebar.number_input('Número de niños', 0, 3, 0)

        # Booking details
        st.sidebar.subheader('Detalles de la Reserva')
        meal = st.sidebar.selectbox('Tipo de comida',
                                  ['BB (Bed & Breakfast)', 'HB (Half Board)', 'FB (Full Board)', 'No Meal'])
        meal_encoded = _label_codes('meal')[_MEAL_LABEL_TO_VALUE[meal]]

        market_segment = st.sidebar.selectbox('Segmento de mercado',
                                            ['Direct', 'Corporate', 'Online TA', 'Offline TA/TO', 'Groups', 'Aviation'])
        market_encoded = _label_codes('market_segment')[market_segment]
        channel_encoded = _label_codes('distribution_channel')[_SEGMENT_TO_CHANNEL[market_segment]]

        repeated_guest = st.sidebar.checkbox('Cliente repetido')
        parking_spaces = st.sidebar.number_input('Espacios de parking requeridos', 0, 3, 0)
        special_requests = st.sidebar.number_input('Número de pedidos especiales', 0, 5, 0)

        # Assemble every model input; features without a widget get fixed defaults
        data = {
            'hotel': hotel_encoded,
            'lead_time': lead_time,
            'arrival_date_month': month_encoded,
            'arrival_date_week_number': 1,  # default value (not derived from the month)
            'stays_in_weekend_nights': weekend_nights,
            'stays_in_week_nights': week_nights,
            'adults': adults,
            'children': children,
            'meal': meal_encoded,
            'country': 0,  # default value: whichever country label sorts first
            'market_segment': market_encoded,
            'distribution_channel': channel_encoded,  # inferred from the market segment
            'is_repeated_guest': int(repeated_guest),
            'previous_cancellations': 0,
            'previous_bookings_not_canceled': 0,
            'reserved_room_type': 0,
            'assigned_room_type': 0,
            'booking_changes': 0,
            'deposit_type': 0,
            'days_in_waiting_list': 0,
            'adr': 100,  # default value
            'required_car_parking_spaces': parking_spaces,
            'total_of_special_requests': special_requests
        }

        features_df = pd.DataFrame(data, index=[0])
        # Enforce the exact training column order; a KeyError here means the
        # form and the saved feature list have drifted apart.
        return features_df[features]

    # Capture the user's parameters
    df = user_input_parameters()

    # Model selection
    model_option = st.sidebar.selectbox(
        'Selecciona el modelo:',
        ['Random Forest', 'Gradient Boosting']
    )

    # Show the input row
    st.subheader('Detalles de la Reserva')
    st.write(df)

    # Prediction
    if st.button('Predecir Cancelación'):
        model = rf_model if model_option == 'Random Forest' else gb_model
        prediction, probability = predict_cancellation(df, model, scaler)

        # Show the result
        st.subheader('Resultado de la Predicción')

        # Two columns: verdict on the left, probability on the right
        col1, col2 = st.columns(2)

        with col1:
            if prediction == 1:
                st.error('⚠️ Alta probabilidad de cancelación')
            else:
                st.success('✅ Baja probabilidad de cancelación')

        with col2:
            st.write(f'Probabilidad de cancelación: {probability[1]:.2%}')

        # Probability gauge (plotly is imported lazily, only when a prediction is shown)
        import plotly.graph_objects as go

        fig = go.Figure(go.Indicator(
            mode = "gauge+number",
            value = probability[1] * 100,
            domain = {'x': [0, 1], 'y': [0, 1]},
            title = {'text': "Probabilidad de Cancelación"},
            gauge = {
                'axis': {'range': [None, 100]},
                'steps': [
                    {'range': [0, 30], 'color': "lightgreen"},
                    {'range': [30, 70], 'color': "yellow"},
                    {'range': [70, 100], 'color': "red"}],
                'threshold': {
                    'line': {'color': "red", 'width': 4},
                    'thickness': 0.75,
                    'value': 70}}))

        st.plotly_chart(fig)

        # Rule-based risk factors (fixed thresholds, independent of the model)
        st.subheader('Factores de Riesgo')
        risk_factors = []
        if df['lead_time'].values[0] > 100:
            risk_factors.append("- Reserva realizada con mucha anticipación")
        if df['stays_in_weekend_nights'].values[0] + df['stays_in_week_nights'].values[0] > 7:
            risk_factors.append("- Estancia prolongada")
        if not df['is_repeated_guest'].values[0]:
            risk_factors.append("- Cliente nuevo")

        if risk_factors:
            st.write("Factores que pueden aumentar el riesgo de cancelación:")
            for factor in risk_factors:
                st.write(factor)

# Entry point. Streamlit apps are meant to be started with `streamlit run <script>.py`.
# Running this cell directly in a notebook kernel still calls main(), but Streamlit
# then only emits warnings such as "Session state does not function when running a
# script without `streamlit run`" instead of serving the app.
if __name__ == '__main__':
    main()

2024-11-21 00:14:43.186 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2024-11-21 00:14:43.715 Session state does not function when running a script without `streamlit run`
