In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s3e1/sample_submission.csv
/kaggle/input/playground-series-s3e1/train.csv
/kaggle/input/playground-series-s3e1/test.csv


In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb

In [5]:
traindata=pd.read_csv("/kaggle/input/playground-series-s3e1/train.csv")
testdata=pd.read_csv('/kaggle/input/playground-series-s3e1/test.csv')
traindata.head(5)

Unnamed: 0,id,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,0,2.3859,15.0,3.82716,1.1121,1280.0,2.486989,34.6,-120.12,0.98
1,1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,0.946
2,2,4.775,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,1.576
3,3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,1.336
4,4,3.75,52.0,4.284404,1.069246,1793.0,1.60479,37.8,-122.41,4.5


In [6]:
traindata.isna().sum()

id             0
MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

In [7]:
X=traindata.drop(['id','MedHouseVal'],axis=1)
y=traindata['MedHouseVal']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Function to evaluate and print results
def evaluate_model(y_true, y_pred, model_name):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"\n{model_name}:")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Mean Absolute Error: {mae:.4f}")
    print(f"R2 Score: {r2:.4f}")

    return {'MSE': mse, 'MAE': mae, 'R2': r2}

In [11]:
# Random Forest Regressor
RandomForestModel = RandomForestRegressor(random_state=42)
RandomForestModel.fit(X_train_scaled, y_train)
RF_y_pred = RandomForestModel.predict(X_test_scaled)
rf_results = evaluate_model(y_test, RF_y_pred, "Random Forest")


Random Forest:
Mean Squared Error: 0.3782
Mean Absolute Error: 0.4276
R2 Score: 0.7218


In [12]:
# Gradient Boosting Regressor
GradientBoostingModel = GradientBoostingRegressor(random_state=42)
GradientBoostingModel.fit(X_train_scaled, y_train)
GB_y_pred = GradientBoostingModel.predict(X_test_scaled)
gb_results = evaluate_model(y_test, GB_y_pred, "Gradient Boosting")


Gradient Boosting:
Mean Squared Error: 0.3870
Mean Absolute Error: 0.4404
R2 Score: 0.7153


In [13]:
# Decision Tree Regressor
DecisionTreeModel = DecisionTreeRegressor(random_state=42)
DecisionTreeModel.fit(X_train_scaled, y_train)
DT_y_pred = DecisionTreeModel.predict(X_test_scaled)
dt_results = evaluate_model(y_test, DT_y_pred, "Decision Tree")



Decision Tree:
Mean Squared Error: 0.7101
Mean Absolute Error: 0.5819
R2 Score: 0.4777


In [14]:
# K-Nearest Neighbors Regressor
KNNModel = KNeighborsRegressor()
KNNModel.fit(X_train_scaled, y_train)
KNN_y_pred = KNNModel.predict(X_test_scaled)
knn_results = evaluate_model(y_test, KNN_y_pred, "KNN")


KNN:
Mean Squared Error: 0.5212
Mean Absolute Error: 0.5106
R2 Score: 0.6166


In [23]:
best_params = {
    'colsample_bytree': 0.7367518666865607,
    'gamma': 0.04589953290672094,
    'learning_rate': 0.03824709648056803,
    'max_depth': 8,
    'min_child_weight': 4,
    'n_estimators': 303,
    'reg_alpha': 0.0017161101831750236,
    'reg_lambda': 1.6923737499050842,
    'subsample': 0.9227651908203118
}

In [26]:
# XGBoost Regressor
XGBoostModel = XGBRegressor(random_state=42,**best_params)
XGBoostModel.fit(X_train, y_train)
XGB_y_pred = XGBoostModel.predict(X_test)
xgb_results = evaluate_model(y_test, XGB_y_pred, "XGBoost")


XGBoost:
Mean Squared Error: 0.3461
Mean Absolute Error: 0.4094
R2 Score: 0.7455


In [28]:
# CatBoost Regressor
CatBoostModel = CatBoostRegressor(random_state=42, verbose=0)
CatBoostModel.fit(X_train, y_train)
CatBoost_y_pred = CatBoostModel.predict(X_test)
catboost_results = evaluate_model(y_test, CatBoost_y_pred, "CatBoost")


CatBoost:
Mean Squared Error: 0.3472
Mean Absolute Error: 0.4097
R2 Score: 0.7446


In [17]:
# LightGBM model
lgbm_model = lgb.LGBMRegressor()
lgbm_model.fit(X_train_scaled, y_train)
lgbm_y_pred = lgbm_model.predict(X_test_scaled)
lgbm_results = evaluate_model(y_test, lgbm_y_pred, "lgbm_model")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003878 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 29709, number of used features: 8
[LightGBM] [Info] Start training from score 2.077812

lgbm_model:
Mean Squared Error: 0.3525
Mean Absolute Error: 0.4138
R2 Score: 0.7407


In [18]:
# Voting Regressor
VotingModel = VotingRegressor([
    ('rf', RandomForestModel),
    ('gb', GradientBoostingModel),
    ('dt', DecisionTreeModel),
    ('knn', KNNModel),
    ('xgb', XGBoostModel),
    ('catboost', CatBoostModel),
    ('lgbm', lgbm_model)
])
VotingModel.fit(X_train_scaled, y_train)
Voting_y_pred = VotingModel.predict(X_test_scaled)
voting_results = evaluate_model(y_test, Voting_y_pred, "Voting Regressor")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004832 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 29709, number of used features: 8
[LightGBM] [Info] Start training from score 2.077812

Voting Regressor:
Mean Squared Error: 0.3629
Mean Absolute Error: 0.4201
R2 Score: 0.7331


In [29]:
test_predictions = CatBoostModel.predict(testdata.drop(['id'], axis=1))
# Create submission file
submission = pd.DataFrame({'id': testdata['id'], 'MedHouseVal': test_predictions})
submission.to_csv('submission.csv', index=False)
print("\nSubmission file created successfully!")


Submission file created successfully!
