<a href="https://colab.research.google.com/github/Kalana-Lakshan/Kaggle_Competitions_House-Prices---Advanced-Regression-Techniques/blob/main/House_Prices_Advanced_Regression_Techniques_Advanced_techniques_used.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
!pip install scikit-learn
import sklearn



In [2]:
# Load the competition's training and test tables from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Target: log(1 + SalePrice). Predictions are mapped back with np.expm1 later on.
y = np.log1p(train_data['SalePrice'])

In [4]:
# Feature matrix: every training column except the target.
X = train_data.drop(columns='SalePrice')

In [5]:
# Partition the feature columns by dtype family rather than by exact dtype
# string. Comparing against 'object' / 'float64' / 'int64' silently skips
# columns stored as e.g. int32, float32, category or pandas' string dtype,
# and a column that appears in neither list is never handed to the
# transformers that are built from these lists.
# With select_dtypes every column of X ends up in exactly one of the two lists,
# and the original column order is preserved within each list.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

In [8]:
# Numeric features: fill missing entries with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Categorical features: fill missing entries with the most frequent value,
# then one-hot encode. handle_unknown='ignore' is set so that categories not
# seen while fitting do not raise when transforming new data.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols),
    ]
)


In [11]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Random-forest pipeline: shared preprocessing followed by the regressor.
my_pipeline5 = Pipeline([
    ('preprocessor', preprocessor),
    ("model", RandomForestRegressor(random_state=42))
])

# Search space sampled by RandomizedSearchCV (keys address the "model" step).
param_dist = {
    "model__n_estimators": [200, 400, 600, 800],
    "model__max_depth": [None, 10, 20, 30, 50],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["sqrt", 0.3, 0.5],
    "model__bootstrap": [True]
}

# random_state is fixed so the same 20 candidates are drawn on every run;
# without it the selected model (and the submission) changes between runs.
random_search = RandomizedSearchCV(
    my_pipeline5,
    param_dist,
    n_iter=20,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    random_state=42
)

# Seeded hold-out split: the search only sees X_train, and X_test is used
# below as an independent check of the selected configuration.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
random_search.fit(X_train, y_train)

best_rf = random_search.best_estimator_

# Report both scores on the log1p(SalePrice) scale. The hold-out score must be
# computed *before* the refit on all rows below, otherwise X_test would be
# part of the training data.
holdout_rmse = float(np.sqrt(np.mean((y_test - best_rf.predict(X_test)) ** 2)))
print(f"Best params: {random_search.best_params_}")
print(f"CV RMSE (log scale): {-random_search.best_score_:.4f}")
print(f"Hold-out RMSE (log scale): {holdout_rmse:.4f}")

# Refit the chosen configuration on all labelled rows, then predict the test
# set and undo the log1p transform applied to the target.
best_rf.fit(X, y)
predictions5 = np.expm1(best_rf.predict(test_data))


In [12]:
# Submission file for the random-forest model: one row per test Id with the
# predicted SalePrice.
output5 = test_data[['Id']].assign(SalePrice=predictions5)
output5.to_csv('submission5.csv', index=False)

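Next, train a CatBoostRegressor on the same features, passing the categorical columns directly via `cat_features` instead of one-hot encoding them.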

In [14]:
!pip install catboost
from catboost import CatBoostRegressor



In [16]:
def _stringify_columns(frame, columns):
    """Return a copy of `frame` in which `columns` hold plain strings.

    Missing entries become the literal string 'nan', so no NaN is left in the
    columns that are declared as categorical features to CatBoost.
    The input frame is not modified.
    """
    out = frame.copy()
    for col in columns:
        as_object = out[col].astype(object)
        out[col] = as_object.where(as_object.notna(), 'nan').astype(str)
    return out


# Every non-numeric column is treated as categorical. Selecting by dtype
# family instead of `dtype == 'object'` also catches string/category/bool
# columns.
catboost_cat_cols = X.select_dtypes(exclude='number').columns.tolist()

# Work on copies: overwriting X / test_data in place would change what the
# earlier scikit-learn cells see if they are re-run (NaN would already be the
# string 'nan', so their imputers would no longer fill anything).
X_catboost = _stringify_columns(X, catboost_cat_cols)
test_catboost = _stringify_columns(test_data, catboost_cat_cols)

# Positional indices of the categorical columns.
cat_features = [
    i for i, col in enumerate(X_catboost.columns) if col in set(catboost_cat_cols)
]

model = CatBoostRegressor(
    iterations = 2000,
    learning_rate = 0.03,
    depth = 6,
    eval_metric = "RMSE",
    verbose = 100
)

# Fit on the log1p target and map predictions back to prices with expm1.
model.fit(X_catboost, y, cat_features = cat_features)
predictions6 = np.expm1(model.predict(test_catboost))

0:	learn: 0.3912722	total: 82.1ms	remaining: 2m 44s
100:	learn: 0.1381466	total: 3.07s	remaining: 57.8s
200:	learn: 0.1086537	total: 5.96s	remaining: 53.4s
300:	learn: 0.0990203	total: 10.3s	remaining: 58.2s
400:	learn: 0.0918852	total: 13.4s	remaining: 53.3s
500:	learn: 0.0862093	total: 16.5s	remaining: 49.2s
600:	learn: 0.0811513	total: 20.1s	remaining: 46.9s
700:	learn: 0.0766222	total: 24.1s	remaining: 44.6s
800:	learn: 0.0726677	total: 27.1s	remaining: 40.6s
900:	learn: 0.0689937	total: 30.3s	remaining: 36.9s
1000:	learn: 0.0655394	total: 34.7s	remaining: 34.6s
1100:	learn: 0.0627707	total: 37.9s	remaining: 30.9s
1200:	learn: 0.0600593	total: 41s	remaining: 27.3s
1300:	learn: 0.0573621	total: 44.2s	remaining: 23.7s
1400:	learn: 0.0547644	total: 48.6s	remaining: 20.8s
1500:	learn: 0.0524678	total: 51.7s	remaining: 17.2s
1600:	learn: 0.0499936	total: 54.8s	remaining: 13.7s
1700:	learn: 0.0481477	total: 59.1s	remaining: 10.4s
1800:	learn: 0.0459878	total: 1m 2s	remaining: 6.91s
1900:

In [17]:
# Submission file for the CatBoost model: test Ids paired with the predicted
# SalePrice.
output6 = pd.DataFrame(
    data={
        'Id': test_data['Id'],
        'SalePrice': predictions6,
    }
)
output6.to_csv('submission6.csv', index=False)