In [30]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [31]:
data_train = pd.read_csv("./Hotel-Property-Value-Dataset/train_cleaned_diff.csv")
Y = data_train["HotelValue"]
X = data_train.drop("HotelValue", axis=1)

In [32]:
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

In [33]:
# --- Step 3: Handle Missing Values and Encode Categorical Variables ---
def preprocess_data(X, categorical_features):
    """
    Preprocess the data by handling missing values and encoding categorical variables
    Following course preprocessing concepts
    """
    X_processed = X.copy()

    for col in numeric_features:
        if col in X_processed.columns:
            median_val = X_processed[col].median()
            X_processed[col] = X_processed[col].fillna(median_val)

    # Handle categorical features - Label encoding
    label_encoders = {}
    for col in categorical_features:

        if col in X_processed.columns:
            # Fill missing values with mode (most frequent value)
            mode_val = X_processed[col].mode()[0] if not X_processed[col].mode().empty else 'Unknown'
            X_processed[col] = X_processed[col].fillna(mode_val)

            # Label encode categorical variables
            le = LabelEncoder()
            le.fit(X_processed[col])
            X_processed[col] = le.transform(X_processed[col])
            label_encoders[col] = le

    return X_processed, label_encoders

# Preprocess the data
X_processed, label_encoders = preprocess_data(
    X, categorical_features
)

print(f"\nAfter preprocessing:")
print(f"✅ Training data shape: {X_processed.shape}")
print(f"✅ Missing values in training data: {X_processed.isnull().sum().sum()}")



After preprocessing:
✅ Training data shape: (1113, 76)
✅ Missing values in training data: 0


In [34]:
X_processed.shape

(1113, 76)

In [35]:
# X_processed_train, X_processed_val, Y_train, Y_val = train_test_split(
#     X_processed, Y, test_size=0.2, random_state=42
# )

In [36]:
# #gridSearch for hyperparameter tuning
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': range(5, 21),
#     'min_samples_split': range(2, 11),
#     'min_samples_leaf': range(1, 11)
# }
# grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
#                            param_grid=param_grid,
#                            cv=3,
#                            n_jobs=-1,
#                            verbose=2)
# grid_search.fit(X_processed_train, Y_train)
# print(f"Best parameters found: {grid_search.best_params_}")

In [37]:
RFR = RandomForestRegressor(random_state=42, max_depth=14, min_samples_leaf=1, n_estimators=100)
RFR.fit(X_processed, Y)

# Different error metrics can be used to evaluate regression models
from sklearn.metrics import mean_squared_error
Y_pred = RFR.predict(X_processed)
mse = mean_squared_error(Y, Y_pred)
rmse = np.sqrt(mse)
print(f"\nModel Evaluation on Validation Set:")
print(f"✅ Mean Squared Error (MSE): {rmse}")


Model Evaluation on Validation Set:
✅ Mean Squared Error (MSE): 11053.516512117276


In [38]:
data_test = pd.read_csv("./Hotel-Property-Value-Dataset/test.csv")
X_test = data_test.copy()
X_test.drop(columns=["ServiceLaneType", "PoolQuality", "ExtraFacility", "BoundaryFence"], inplace=True)    
X_test_processed, _ = preprocess_data(
    X_test, categorical_features
)

In [39]:
X_test_processed.shape

(260, 76)

In [40]:
test_predictions = RFR.predict(X_test_processed)

In [41]:
test_predictions

array([141507.56190405, 331849.18666667, 112245.95200684, 158980.65964232,
       318478.51      ,  97848.97789377, 206126.30282051, 147015.17508111,
        99212.80479853, 132477.79123378, 152784.47485317, 120776.50686186,
       115663.94495644, 209367.17905021, 177952.31687054, 130366.56639825,
       195311.37401836, 137575.23714757, 110104.06425398, 202558.76087265,
       164431.19954545, 228285.53982353, 178815.58564227, 122667.02238445,
       190992.55077368, 168664.26367417, 180144.22914793, 107536.61638029,
       177154.5471696 , 189545.31454877, 118893.02948771, 252319.2790786 ,
       196261.96357143, 114984.34491266, 256184.93978022, 150816.18022932,
       138761.48983937, 208000.63272058, 310522.135     , 107086.57594092,
       118849.8148789 , 235129.06750877, 116526.51351781, 376962.955     ,
       133033.20037467, 145250.50863636, 113717.64179501, 127195.82127724,
       391701.98      , 144960.14166792, 122431.7836169 , 198678.11809302,
       119752.89611111, 3

In [42]:
# Create submission file using actual test IDs
submission_df = pd.DataFrame({
    'Id': data_test['Id'].values,  # Use actual IDs from test dataset
    'HotelValue': test_predictions  # Use HotelValue as per sample submission format
})

In [43]:
submission_df.to_csv("submission.csv", index=False)