In [2]:
from sklearn import svm, datasets
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostClassifier
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint

from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn import metrics

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [3]:
#Loaded training and test data.
train_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

In [4]:
#Separated categorical and numerical.
y = train_data['SalePrice']
X = train_data.drop(columns=['SalePrice'])

num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include=['object']).columns

In [5]:
#Preprocessed features


cat_vals = Pipeline([("imputer",SimpleImputer(strategy='most_frequent')), ("ohe",OneHotEncoder(sparse_output = False, drop='first', handle_unknown='ignore'))])
num_vals = Pipeline([("imputer",SimpleImputer(strategy='mean')), ("scale",StandardScaler())])

preprocess = ColumnTransformer(
    transformers=[
        ("cat_preprocess", cat_vals, cat_cols),
        ("num_preprocess", num_vals, num_cols)
    ]
)


In [6]:
#Split data
x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.25)

In [7]:
#Define ML model
regressor = RandomForestRegressor(random_state=42)

In [8]:
#Define pipeline
pipeline = Pipeline([('preprocess', preprocess), ("regr", regressor)])

In [9]:
pipeline.fit(x_train, y_train)

In [10]:
y_pred = pipeline.predict(x_test)



In [11]:
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')

RMSE: 32340.938101280728


In [12]:
pipeline.fit(X, y)
test_preds = pipeline.predict(test_data)

In [13]:
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")

Predictions saved to submission.csv
