In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line in os.walk order.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Load the data: read the train and test tables from the read-only Kaggle
# input directory, in that order.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [3]:
# Data Cleaning and Preprocessing
# Remove the 'Id' column from both frames so it is not used as a feature;
# the test ids are kept aside because the submission file needs them.
del train_data['Id']
test_ids = test_data.pop('Id')

In [4]:
# Fill missing values in the numeric training columns with each column's median.
# The filled column is assigned back to the frame instead of calling
# `train_data[col].fillna(..., inplace=True)`: that form mutates an
# intermediate Series, which pandas 2.x warns about and which leaves the frame
# unchanged when Copy-on-Write is active.
for col in train_data.select_dtypes(include=[np.number]).columns:
    train_data[col] = train_data[col].fillna(train_data[col].median())

In [5]:
# Fill missing values in the numeric test columns with each column's median
# (computed on the test set itself).
# The filled column is assigned back to the frame instead of calling
# `test_data[col].fillna(..., inplace=True)`: that form mutates an
# intermediate Series, which pandas 2.x warns about and which leaves the frame
# unchanged when Copy-on-Write is active.
for col in test_data.select_dtypes(include=[np.number]).columns:
    test_data[col] = test_data[col].fillna(test_data[col].median())

In [6]:
# Fill missing values in the categorical (object-dtype) columns with the most
# frequent value; each frame uses its own mode.
# The column list comes from train_data; test_data is indexed with the same
# names, so it is expected to carry the same categorical columns.
# Results are assigned back rather than using a chained
# `df[col].fillna(..., inplace=True)`, which does not reliably update the frame.
for col in train_data.select_dtypes(include='object').columns:
    train_data[col] = train_data[col].fillna(train_data[col].mode()[0])
    test_data[col] = test_data[col].fillna(test_data[col].mode()[0])

In [7]:
# Feature Engineering

# Filling missing values with mode for categorical features and median for numerical features.
# NOTE(review): if the earlier global median/mode cells have already filled every
# gap, nothing is left to impute here and the per-neighborhood medians never take
# effect; run this pass first if neighborhood-level imputation is intended.
cols_with_missing = ['MSZoning', 'LotFrontage', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
                     'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
                     'TotalBsmtSF', 'Electrical', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 'FireplaceQu',
                     'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
                     'PoolQC', 'Fence', 'MiscFeature', 'SaleType']

# Columns imputed with the median of the house's neighborhood instead of a
# global statistic. 'MasVnrArea' was named here but absent from the list above,
# so its branch could never run; it is now part of cols_with_missing.
neighborhood_median_cols = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

for col in cols_with_missing:
    if col in neighborhood_median_cols:
        # Each frame is imputed from its own neighborhood groups.
        train_data[col] = train_data.groupby('Neighborhood')[col].transform(lambda x: x.fillna(x.median()))
        test_data[col] = test_data.groupby('Neighborhood')[col].transform(lambda x: x.fillna(x.median()))
        # A neighborhood with no observed value has a NaN median and would stay
        # unfilled, so fall back to the overall training median.
        fallback = train_data[col].median()
        train_data[col] = train_data[col].fillna(fallback)
        test_data[col] = test_data[col].fillna(fallback)
    elif train_data[col].dtype == 'object':
        # Categorical column: both frames receive the training mode.
        fill_value = train_data[col].mode()[0]
        train_data[col] = train_data[col].fillna(fill_value)
        test_data[col] = test_data[col].fillna(fill_value)
    else:
        # Numeric column: both frames receive the training median.
        fill_value = train_data[col].median()
        train_data[col] = train_data[col].fillna(fill_value)
        test_data[col] = test_data[col].fillna(fill_value)

# Adding new features
# The same three aggregate columns are derived on the training frame first and
# then on the test frame, so both end up with identical extra columns in the
# same order.
for frame in (train_data, test_data):
    # Basement plus first and second floor area.
    floor_area = frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF']
    frame['TotalSF'] = floor_area

    # Full baths count as 1, half baths as 0.5, above ground and in the basement.
    half_baths = 0.5 * frame['HalfBath']
    basement_half_baths = 0.5 * frame['BsmtHalfBath']
    frame['TotalBathrooms'] = frame['FullBath'] + half_baths + frame['BsmtFullBath'] + basement_half_baths

    # Sum of the four porch area columns.
    porch_area = frame['OpenPorchSF'] + frame['EnclosedPorch'] + frame['3SsnPorch'] + frame['ScreenPorch']
    frame['TotalPorchSF'] = porch_area

# Encoding categorical features
# Every object-dtype column is replaced by integer codes, using one encoder per
# column so train and test share the same mapping.
categorical_cols = [col for col in train_data.columns if train_data[col].dtype == 'object']
for col in categorical_cols:
    le = LabelEncoder()
    # Fit on the categories of both frames: an encoder fitted on train alone
    # raises ValueError in transform() for any label that only occurs in test.
    # When test holds no extra categories the learned classes, and therefore
    # the codes, are the same as fitting on train only.
    le.fit(pd.concat([train_data[col], test_data[col]], ignore_index=True))
    train_data[col] = le.transform(train_data[col])
    test_data[col] = le.transform(test_data[col])


In [8]:
# Hyperparameter Tuning
# Exhaustive 5-fold grid search over forest size, depth and split/leaf limits.
# The feature matrix and target are built once and reused for the fit.
X_train = train_data.drop('SalePrice', axis=1)
y_train = train_data['SalePrice']

# A fixed random_state makes the forests, and therefore the selected
# parameters, reproducible across runs; without it every run may pick a
# different "best" combination.
rf = RandomForestRegressor(random_state=42)
params = {'n_estimators': [100, 200, 300],
          'max_depth': [10, 20, 30],
          'min_samples_split': [2, 5, 10],
          'min_samples_leaf': [1, 2, 4]}
# n_jobs=-1 runs the candidate fits in parallel on all available cores.
grid_search = GridSearchCV(estimator=rf, param_grid=params, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

In [9]:
# Model training
# Refit a forest on the full training set with the parameters chosen by the
# grid search. best_params holds exactly the four tuned keyword arguments, so
# it is unpacked directly. random_state is fixed so the fitted model, and the
# submission derived from it, are reproducible.
rf = RandomForestRegressor(**best_params, random_state=42)
rf.fit(train_data.drop('SalePrice', axis=1), train_data['SalePrice'])

RandomForestRegressor(max_depth=30, n_estimators=200)

In [10]:
# Model Prediction
# Score every row of the preprocessed test frame with the fitted forest.
predictions = rf.predict(X=test_data)

In [11]:
# Create a submission file
# Start from the saved test ids, attach the predicted prices as a second
# column, and write the two-column table without the index.
submission_df = test_ids.to_frame(name='Id').assign(SalePrice=predictions)
submission_df.to_csv('submission.csv', index=False)