In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


Read the datasets

In [3]:
trainData = pd.read_csv('../Hotel-Property-Value-Dataset/train.csv')
testData = pd.read_csv('../Hotel-Property-Value-Dataset/test.csv')
testId = testData['Id']
testData.drop(['Id'], axis=1, inplace=True)

trainData.drop(['Id'], axis=1, inplace = True)

Preprocessing null values

In [4]:
numeric_features = []
catogoricat_features = []
for col in trainData.columns[1:-1]:
    if (trainData[col].dropna().nunique() <= 25):
      catogoricat_features.append(col)
    else:
      numeric_features.append(col)

def Preprocess_null_values(df):
  # Numeric vs Categorical columns
  numeric_features = []
  catogoricat_features = []
  for col in df.columns:
    if (df[col].dropna().nunique() <= 25):
      catogoricat_features.append(col)
    else:
      numeric_features.append(col)

  # null values
  for col in numeric_features:
    df[col].fillna(df[col].median(), inplace=True)
  for col in catogoricat_features:
    if (df[col].isna().sum() > 200):
      if (df[col].dtype == 'object'):
        df[col].fillna('default', inplace=True)
      else:
        df[col].fillna(df[col].dropna().min() - 1, inplace=True)
    else:
      df[col].fillna(df[col].dropna().mode()[0], inplace=True)
  return df

trainData = trainData[~trainData['HotelValue'].isna()]
X = trainData.drop(['HotelValue'], axis=1)
y = trainData[['HotelValue']]

X = Preprocess_null_values(X)
testData = Preprocess_null_values(testData)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

EDA

In [5]:
# for col in numeric_features:
#   if (X[col].corr(y['HotelValue']) >= 0.5):
#     sns.scatterplot(x=X[col], y=y['HotelValue'])
#     plt.show()


# Removing outliers
# Combine X and y back
temp = pd.concat([X, y], axis=1)

# Remove outliers
condition = ~(
    (temp['HotelValue'] < 300000) &
    (
        (temp['BasementTotalSF'] > 3000) |
        (temp['GroundFloorArea'] > 3000) |
        (temp['UsableArea'] > 4000) |
        (temp['ParkingArea'] > 1200)
    )
)

temp = temp[condition]  # Keep only non-outlier rows

# Reassign X and y
X = temp.drop('HotelValue', axis=1)
y = temp[['HotelValue']]

Merging highly correlated featurs

Finding out highly correlated features

In [6]:
def filter_high_correlations(corr_matrix: pd.DataFrame, threshold: float = 0.8) -> list:
    """
    Identifies pairs of columns in a correlation matrix where the absolute
    correlation coefficient is greater than a specified threshold.

    Args:
        corr_matrix (pd.DataFrame): The input correlation matrix.
        threshold (float): The absolute correlation value to filter by (default is 0.8).

    Returns:
        list: A list of tuples, where each tuple contains (Column A, Column B, Correlation Value).
    """
    # Create an empty list to store the highly correlated pairs
    highly_correlated_pairs = []

    # Get the column names for easy access
    columns = corr_matrix.columns

    # Iterate over the upper triangle of the matrix to avoid redundant checks (A vs B is same as B vs A)
    # and to exclude the diagonal (A vs A, which is always 1)
    for i in range(len(columns)):
        for j in range(i + 1, len(columns)):
            col_a = columns[i]
            col_b = columns[j]
            correlation_value = corr_matrix.iloc[i, j]

            # Check if the absolute correlation is above the threshold
            if abs(correlation_value) > threshold:
                highly_correlated_pairs.append((col_a, col_b))

    return highly_correlated_pairs

correlation_matrix = X.select_dtypes(exclude=['object']).corr()
high_correlations = filter_high_correlations(correlation_matrix)
for i in high_correlations:
  print(i)

('BasementTotalSF', 'GroundFloorArea')
('UsableArea', 'TotalRooms')
('ParkingCapacity', 'ParkingArea')


Merging the highly correlated features into one

In [7]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def merge_high_correlations(df, high_correlations):
    df = df.copy()  # avoid modifying original
    scaler = StandardScaler()
    pca = PCA(n_components=1)
    
    for pair in high_correlations:
        cols = list(pair)
        
        # Scale the two columns
        scaled_data = scaler.fit_transform(df[cols])
        
        # Apply PCA to get a single combined feature
        df[pair[0] + '_' + pair[1]] = pca.fit_transform(scaled_data)
        
        # Drop original columns
        df = df.drop(cols, axis=1)
        
    return df

X = merge_high_correlations(X, high_correlations)
testData = merge_high_correlations(testData, high_correlations)


Log Transformations

In [8]:
numeric_features = []
catogoricat_features = []
for col in X.columns:
    if (X[col].dropna().nunique() <= 25):
      catogoricat_features.append(col)
    else:
      numeric_features.append(col)

for col in numeric_features:
  if (X[col].skew() > 1):
    X[col] = np.log1p(X[col])
    testData[col] = np.log1p(testData[col])

y['HotelValue'] = np.log1p(y['HotelValue'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['HotelValue'] = np.log1p(y['HotelValue'])


Encoding Categorical Data using ohe

In [9]:
# Step 1: Merge train X and test data for consistent encoding
merged_df = pd.concat([X, testData], axis=0)

# Step 2: Apply One-Hot Encoding only on object-type columns
merged_df = pd.get_dummies(merged_df, columns=merged_df.select_dtypes(include=['object']).columns,
                           drop_first=True)

# Step 3: Split back into train and test
X = merged_df.iloc[:X.shape[0], :]
testData = merged_df.iloc[X.shape[0]:, :]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X = scaler.fit_transform(X)
testData = scaler.fit_transform(testData)

Train test Split

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Testing Models

Voting BayesianRidge and normal Ridge

In [11]:
from sklearn.metrics import root_mean_squared_error # Changed from accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Ridge
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import VotingRegressor

bayesian = BaggingRegressor(
    estimator=BayesianRidge(),
    max_features=0.9,
    max_samples=0.8,
    n_estimators=100,
    random_state=42
)
ridge = BaggingRegressor(
    estimator=Ridge(alpha=2),
    max_features=0.9,
    max_samples=0.6,
    n_estimators=250,
    random_state=42
)

voting_reg = VotingRegressor(
    estimators=[
        ('ridge', ridge),
        ('bayesian', bayesian)
    ],
)

param_grid = {
    'weights': [[1, 0], [0.6, 0.4], [0.55, 0.45], [0.5, 0.5], [0.45, 0.55], [0.4, 0.6], [0, 1]],
}

# RMSE scorer with inverse log transform
rmse_scorer = make_scorer(
    lambda y_true, y_pred: root_mean_squared_error(np.expm1(y_pred), np.expm1(y_true)),
    greater_is_better=False
)

grid_search = GridSearchCV(estimator=voting_reg, param_grid=param_grid, scoring=rmse_scorer, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X, y['HotelValue'])

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score (RMSE):", -score)

y_pred = best_model.predict(testData)
y_pred = np.expm1(y_pred)
output = pd.DataFrame({'Id': testId, 'HotelValue': y_pred})
output.to_csv('Voting.csv', index=False)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] END .................................weights=[0.6, 0.4]; total time=   4.4s
[CV] END .....................................weights=[1, 0]; total time=   4.4s
[CV] END .....................................weights=[1, 0]; total time=   4.5s
[CV] END .....................................weights=[1, 0]; total time=   4.5s
[CV] END .....................................weights=[1, 0]; total time=   4.5s
[CV] END .....................................weights=[1, 0]; total time=   4.5s
[CV] END .................................weights=[0.6, 0.4]; total time=   4.5s
[CV] END .................................weights=[0.6, 0.4]; total time=   4.6s
[CV] END .................................weights=[0.6, 0.4]; total time=   4.5s
[CV] END .................................weights=[0.6, 0.4]; total time=   4.6s
[CV] END ...............................weights=[0.55, 0.45]; total time=   4.5s
[CV] END ...............................weights=[