In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.impute import SimpleImputer

## Load Datasets

In [36]:
train_data = pd.read_csv(r'D:\Downloads\dataset\train.csv')
test_data = pd.read_csv(r'D:\Downloads\dataset\test.csv')

# Explore data
print(train_data.info())
print(test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14000 entries, 0 to 13999
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Timestamp                 14000 non-null  object 
 1   Residents                 14000 non-null  int64  
 2   Apartment_Type            13574 non-null  object 
 3   Temperature               13559 non-null  float64
 4   Humidity                  14000 non-null  object 
 5   Water_Price               14000 non-null  float64
 6   Period_Consumption_Index  14000 non-null  float64
 7   Income_Level              13574 non-null  object 
 8   Guests                    14000 non-null  int64  
 9   Amenities                 8003 non-null   object 
 10  Appliance_Usage           13585 non-null  float64
 11  Water_Consumption         14000 non-null  float64
dtypes: float64(5), int64(2), object(5)
memory usage: 1.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000

## Data Pre-processing

In [37]:
# Handle non-numeric issues and exclude 'Timestamp'
for column in train_data.select_dtypes(include='object').columns:
    if column != 'Timestamp':  # Exclude Timestamp column
        train_data[column] = pd.to_numeric(train_data[column], errors='coerce')

for column in test_data.select_dtypes(include='object').columns:
    if column != 'Timestamp':  # Exclude Timestamp column
        test_data[column] = pd.to_numeric(test_data[column], errors='coerce')

# Drop columns with all missing values
train_data.dropna(axis=1, how='all', inplace=True)
test_data.dropna(axis=1, how='all', inplace=True)

# Ensure 'Water_Consumption' is dropped from test dataset
if 'Water_Consumption' in test_data.columns:
    test_data.drop(columns=['Water_Consumption'], inplace=True)

# Align columns between train and test datasets
train_data, test_data = train_data.align(test_data, join='left', axis=1)

# Impute missing values for numeric columns
numeric_columns_train = train_data.select_dtypes(include=np.number).columns
numeric_columns_test = test_data.select_dtypes(include=np.number).columns
imputer = SimpleImputer(strategy='mean')
train_data[numeric_columns_train] = imputer.fit_transform(train_data[numeric_columns_train])
test_data[numeric_columns_test] = imputer.transform(test_data[numeric_columns_test])

# One-hot encode 'Apartment_Type' if it exists
if 'Apartment_Type' in train_data.columns:
    train_data = pd.get_dummies(train_data, columns=['Apartment_Type'], drop_first=True)
if 'Apartment_Type' in test_data.columns:
    test_data = pd.get_dummies(test_data, columns=['Apartment_Type'], drop_first=True)

# Align columns again after one-hot encoding
train_data, test_data = train_data.align(test_data, join='left', axis=1)

# Fill any missing values in test data with zero
test_data.fillna(0, inplace=True)

# Create feature sets
X = train_data.drop(columns=['Timestamp', 'Water_Consumption'])
y = train_data['Water_Consumption']
X_test = test_data.drop(columns=['Timestamp'])

# Ensure columns in X_test match X_train
X_test = X_test[X.columns]


## Train-validation split

In [38]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Validate the model
y_val_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
score = max(0, 100 - rmse)
print(f"Validation Score: {score}")

Validation Score: 67.92834964858972


## Hyperparameter Tuning

In [39]:
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Validate the tuned model
y_val_pred_tuned = best_model.predict(X_val)
rmse_tuned = np.sqrt(mean_squared_error(y_val, y_val_pred_tuned))
score_tuned = max(0, 100 - rmse_tuned)
print(f"Tuned Validation Score: {score_tuned}")


Tuned Validation Score: 68.34882070671566


## Feature Importance Analysis

In [40]:
feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': best_model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print("Feature Importances:")
print(feature_importances)

Feature Importances:
                    Feature  Importance
0                 Residents    0.548047
4  Period_Consumption_Index    0.205745
3               Water_Price    0.106510
5                    Guests    0.049436
2                  Humidity    0.041510
1               Temperature    0.028438
6           Appliance_Usage    0.020314


## Generate predictions for submission

In [41]:
predictions = best_model.predict(X_test)
submission = pd.DataFrame({
    'Timestamp': test_data['Timestamp'],
    'Water_Consumption': predictions
})
submission.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created successfully!")

Submission file 'submission.csv' created successfully!
