In [20]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [21]:
# Directory that holds the two input CSVs. Kept as a single constant so the
# notebook can be pointed at another location without editing every path.
# NOTE(review): '/content' is presumably the Colab working directory - confirm.
DATA_DIR = '/content'

# Labeled rows used for fitting/validation, and unlabeled rows to be scored.
train = pd.read_csv(f'{DATA_DIR}/Train Dataset 2.csv')
test = pd.read_csv(f'{DATA_DIR}/Test Dataset 2.csv')

In [22]:
# Regression target and the identifier column that is carried through to the
# submission file.
y = train['target_price']
test_ids = test['house_id']

# Model inputs: every remaining column once the identifier (and, for the
# training frame, the target) has been removed.
X = train.drop(columns=['house_id', 'target_price'])
X_test = test.drop(columns=['house_id'])

In [23]:
# Partition the feature columns by dtype: object-typed columns are routed to
# the categorical branch of the preprocessor, everything else to the numeric
# branch.
num_cols = X.select_dtypes(exclude='object').columns
cat_cols = X.select_dtypes(include='object').columns

In [24]:
# Numeric branch: fill missing values with the column median, then
# standardize.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode. handle_unknown='ignore' keeps transform() from raising on
# levels that were not present when the encoder was fitted.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Apply each branch to its own column subset and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ]
)

In [28]:
# End-to-end estimator: preprocessing followed by a random forest. The step
# names ('preprocessor', 'regressor') are the prefixes used for the
# 'regressor__*' keys of the hyper-parameter search space.
reg = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'regressor',
            RandomForestRegressor(
                n_estimators=150,
                max_depth=20,
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ]
)

In [37]:
# Hold out 20% of the labeled rows for validation; the fixed seed makes the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
# Candidate hyper-parameter values for the forest. Each key addresses the
# 'regressor' step of the pipeline via the '<step>__<param>' convention.
param_grid = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[10, 20, None],
    regressor__min_samples_split=[2, 5, 10],
)

In [41]:
# Randomized hyper-parameter search over the full pipeline.
#   * n_iter=5 draws 5 of the 2 * 3 * 3 = 18 combinations in param_grid.
#   * cv=3 scores every draw with 3-fold cross-validation.
#   * scoring is negated RMSE, so a larger (less negative) score is better.
#   * random_state fixes which combinations are drawn.
# RandomizedSearchCV lives in sklearn.model_selection and must be imported
# before this cell runs.
search = RandomizedSearchCV(
    reg,
    param_distributions=param_grid,
    n_iter=5,
    cv=3,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    random_state=42,
)

In [42]:
# Run the search on the training portion only; the validation rows stay
# unseen so they can serve as an independent check afterwards.
search.fit(X_train, y_train)
print("Best parameters:", search.best_params_)

# Pipeline configured with the winning parameters, exposed by the search.
best_model = search.best_estimator_

Best parameters: {'regressor__n_estimators': 100, 'regressor__min_samples_split': 5, 'regressor__max_depth': 20}


In [43]:
# Evaluate the tuned pipeline on the held-out validation rows.
y_pred = best_model.predict(X_val)

r2 = r2_score(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

# NOTE(review): the recorded run shows RMSE far above MAE together with an
# R² near zero. That pattern would be consistent with a few extreme
# target_price values dominating the squared error - confirm by inspecting the
# target distribution before relying on this model.
print(
    f"RMSE: {rmse:.2f}",
    f"MAE : {mae:.2f}",
    f"R²  : {r2:.4f}",
    sep="\n",
)

RMSE: 1662584.01
MAE : 100105.62
R²  : 0.0241


In [44]:
# One-row summary table of the validation metrics. pandas is already imported
# in the notebook's import cell, so it is not re-imported here.
metrics = {'RMSE': rmse, 'MAE': mae, 'R²': r2}
pd.DataFrame(metrics, index=['Score'])

Unnamed: 0,RMSE,MAE,R²
Score,1662584.0,100105.622257,0.024108


In [45]:
# Score the unlabeled test rows with the tuned pipeline.
test_pred = best_model.predict(X_test)

# Pair each prediction with its house_id (row order is preserved by predict)
# and write the submission file without the DataFrame index.
submission = pd.DataFrame({'house_id': test_ids}).assign(predicted_price=test_pred)
submission.to_csv('EM02_cloud9_Task2_Predictions.csv', index=False)