In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import randint
import numpy as np
 
# Read the hotel bookings data from a CSV file into a DataFrame.
file_path = 'hotel_bookings.csv'  # Replace with your actual CSV file path
df = pd.read_csv(file_path)

# Remove the columns that should not be used as features. Names that are
# not present in the file are skipped instead of raising.
cols_to_drop = ['reservation_status', 'reservation_status_date', 'company']
df = df.drop(columns=cols_to_drop, errors='ignore')

# Replace every missing value, in every column, with 0.
df = df.fillna(0)
 
# Convert object columns to a numeric dtype, but only when *every* value in
# the column parses as a number. The conversion is strict on purpose: with
# errors='coerce' a genuinely textual column would be turned
# into all-NaN and then all-0 by the fillna below, silently destroying every
# categorical feature before it reaches the one-hot encoder.
for col in df.select_dtypes(include=['object']).columns:
    try:
        df[col] = pd.to_numeric(df[col])
    except (ValueError, TypeError):
        # At least one value is not numeric: leave the column untouched so it
        # is handled as a categorical feature.
        continue

# Fill any missing values that are still present after the conversion.
df.fillna(0, inplace=True)

# Convert all remaining non-numeric columns to strings (for categorical handling),
# so each of them holds a single, uniform value type.
df = df.apply(lambda x: x.astype(str) if x.dtype == 'object' else x)
 
# Turn the continuous 'adr' column into a binary target: 1 when the value is
# strictly above the median of the column, 0 otherwise.
threshold = df['adr'].median()
y = df['adr'].gt(threshold).astype(int)

# Every column other than 'adr' is used as a feature.
X = df.drop(columns=['adr'])

# Group the feature columns by dtype so each group can be sent through its
# own preprocessing step.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
 
# Preprocessing: standardise the numerical columns and one-hot encode the
# categorical ones (handle_unknown='ignore' is set on the encoder).
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Random forest classifier with balanced class weights and a fixed seed.
model = RandomForestClassifier(class_weight='balanced', random_state=42)

# Chain preprocessing and classification into a single estimator.
pipeline = Pipeline([('preprocessor', preprocessor), ('classifier', model)])
 
# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sampling distributions for the hyperparameters of the pipeline's
# 'classifier' step (hence the "classifier__" prefix on every key).
param_dist = dict(
    classifier__n_estimators=randint(50, 300),
    classifier__max_depth=randint(5, 50),
    classifier__min_samples_split=randint(2, 20),
    classifier__min_samples_leaf=randint(1, 10),
    classifier__max_features=['sqrt', 'log2', None],
)

# Randomized search: 50 sampled candidates, 5-fold cross-validation,
# scored by accuracy, using all available cores.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Run the search on the training split.
random_search.fit(X_train, y_train)
 
# Take the best pipeline found by the search and predict the held-out rows.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Score the predictions against the true test labels.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the selected hyperparameters followed by the test-set metrics.
print("\nBest Model Parameters:", random_search.best_params_, sep="\n")
print("\nClassification Model Performance:")
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)
 
# Get feature importances from the Random Forest model.
# Note: the importances refer to the columns produced by the preprocessing
# step (numerical columns plus one column per one-hot encoded category),
# not to the raw input columns.
rf_model = best_model.named_steps['classifier']
importances = rf_model.feature_importances_

# Build one "<column>_<category>" name per one-hot output column.
# The encoder is looked up by its step name rather than by position, and the
# lookup is skipped when there are no categorical columns: in that case the
# ColumnTransformer does not fit the encoder, so reading `categories_` from it
# would fail after the (expensive) search has already completed.
if len(categorical_features) > 0:
    onehot_categories = (
        best_model.named_steps['preprocessor'].named_transformers_['cat'].categories_
    )
else:
    onehot_categories = []
categorical_names = np.array(
    [
        f'{column}_{cat}'
        for column, cats in zip(categorical_features, onehot_categories)
        for cat in cats
    ],
    dtype=str,
)

# Combine numerical and categorical feature names, in the same order in which
# the preprocessor lists its transformers ('num' first, then 'cat').
feature_names = np.hstack([numerical_features, categorical_names])

# Print the most important features, highest importance first.
print("\nTop 10 Important Features:")
important_features = sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True)
for feature, importance in important_features[:10]:
    print(f"{feature}: {importance:.4f}")