In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [5]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
df_test = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/test.csv")


In [6]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

categorical_cols = df.select_dtypes(include=['object']).columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

categorical_cols_test = df_test.select_dtypes(include=['object']).columns
numerical_cols_test = df_test.select_dtypes(
    include=['float64', 'int64']).columns

label_encoder = LabelEncoder()
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

label_encoder = LabelEncoder()
for col in categorical_cols_test:
    df_test[col] = label_encoder.fit_transform(df_test[col])

df = df.drop('Id', axis=1)
df_test = df_test.drop('Id', axis=1)

X = df.iloc[0:1460, 0:79]
y = df.iloc[0:1460, 79]

X_test = df_test.iloc[0:1460, 0:79]

scaler_X = StandardScaler()
scaler_y = StandardScaler()

X = scaler_X.fit_transform(X)
y = scaler_y.fit_transform(y.values.reshape(-1, 1))
X_test = scaler_X.transform(X_test)


In [7]:
# to treat the NaN values
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

imputer = SimpleImputer(strategy='mean')
X_test_imputed = imputer.fit_transform(X_test)


In [8]:
# reducing the dimensionality of the problem
from sklearn.decomposition import PCA

pca = PCA(n_components=58)
X_pca = pca.fit_transform(X_imputed)

pca = PCA(n_components=58)
X_test_pca = pca.fit_transform(X_test_imputed)


In [9]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

# defining the regression models
models = {
    'Linear Regression': LinearRegression(),
    'K Nearest Neighbors': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'Support Vector Regressor': SVR(),
    'AdaBoost Regressor': AdaBoostRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Gradient Boosting Regressor': GradientBoostingRegressor(),
    'XGBoost Regressor': XGBRegressor(),
}

# defining the hyperparameter grids for the models
param_grids = {
    'Linear Regression': {},
    'K Nearest Neighbors': {'n_neighbors': [3, 5, 7]},
    'Decision Tree': {'max_depth': [None, 5, 10]},
    'Gaussian Naive Bayes': {},
    'Support Vector Regressor': {'C': [1, 10, 100]},
    'AdaBoost Regressor': {'n_estimators': [50, 100, 200]},
    'Random Forest Regressor': {'n_estimators': [50, 100, 200]},
    'Gradient Boosting Regressor': {'n_estimators': [50, 100, 200]},
    'XGBoost Regressor': {'n_estimators': [50, 100, 200]}
}


In [10]:
# iterating over the models and performing cross validation for parameter tuning
for model_name, model in models.items():
    param_grid = param_grids[model_name]

    # performing grid search with cross validation for parameter tuning
    grid_search = GridSearchCV(
        model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_pca, y)

    # getting the best model and its parameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # making predictions on the testing data using the best model
    y_pred = best_model.predict(X_test_pca)
    y_pred_actual = scaler_y.inverse_transform(y_pred.reshape(-1, 1))

    print(f'{model_name} - Best Parameters: {best_params}')
    print(f'{model_name} - Predicted Values: {y_pred_actual}')
    print('\n')


Linear Regression - Best Parameters: {}
Linear Regression - Predicted Values: [[119864.97486864]
 [333635.99822852]
 [195756.33729013]
 ...
 [179853.35688215]
 [140284.03922523]
 [241789.90283956]]


K Nearest Neighbors - Best Parameters: {'n_neighbors': 7}
K Nearest Neighbors - Predicted Values: [[130564.28571429]
 [152757.14285714]
 [195433.14285714]
 ...
 [168642.85714286]
 [136700.        ]
 [223014.28571429]]


Decision Tree - Best Parameters: {'max_depth': 5}
Decision Tree - Predicted Values: [[141835.87692308]
 [174583.33333333]
 [191788.50515464]
 ...
 [159094.14814815]
 [141835.87692308]
 [222453.        ]]


Support Vector Regressor - Best Parameters: {'C': 10}
Support Vector Regressor - Predicted Values: [[119662.02389575]
 [193102.83120205]
 [162993.84205887]
 ...
 [176321.52722311]
 [138840.93129587]
 [245656.16398606]]


AdaBoost Regressor - Best Parameters: {'n_estimators': 50}
AdaBoost Regressor - Predicted Values: [[124822.28333333]
 [208624.3125    ]
 [187870.60406091