<a href="https://www.kaggle.com/code/laplacecherub/house-price-prediction-top-23?scriptVersionId=93045580" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# House Price EDA and Prediction

![](https://i.imgur.com/Euc4ZyX.png)

## Initial Code and Data Import

In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
house_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

## EDA

In [3]:
house_data.describe

<bound method NDFrame.describe of         Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0        1          60       RL         65.0     8450   Pave   NaN      Reg   
1        2          20       RL         80.0     9600   Pave   NaN      Reg   
2        3          60       RL         68.0    11250   Pave   NaN      IR1   
3        4          70       RL         60.0     9550   Pave   NaN      IR1   
4        5          60       RL         84.0    14260   Pave   NaN      IR1   
...    ...         ...      ...          ...      ...    ...   ...      ...   
1455  1456          60       RL         62.0     7917   Pave   NaN      Reg   
1456  1457          20       RL         85.0    13175   Pave   NaN      Reg   
1457  1458          70       RL         66.0     9042   Pave   NaN      Reg   
1458  1459          20       RL         68.0     9717   Pave   NaN      Reg   
1459  1460          20       RL         75.0     9937   Pave   NaN      Reg   

     LandContour 

In [4]:
house_data.shape

(1460, 81)

## Predictions

In [5]:
X = house_data.drop('SalePrice', axis=1)
y = house_data.SalePrice

In [6]:
X.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [7]:
X['Topography'] = X['LotConfig'] + X['LandContour']
X['Geometry'] = X['LotArea'] / X['LotFrontage']
X['TotalIndoorSqFt'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF'] + X['GarageArea']
X['HouseToYardRatio'] = X['TotalIndoorSqFt'] / X['LotArea']
X['HouseToPoolRatio'] = X['TotalIndoorSqFt'] / (X['PoolArea'] + 1)
X['Value'] = X['OverallCond'] * X['OverallQual']
X['Condition'] = X['Condition1'] + X['ExterCond']
X['YardToSeatingAreaRatio'] =  (X['WoodDeckSF'] + X['OpenPorchSF'] + 1) / X['LotArea']
X['PriceFeatures'] = X['Fireplaces'] * X['TotRmsAbvGrd']
X['Recency'] = X['YearRemodAdd'] - X['YearBuilt']

In [8]:
X.YearBuilt.unique()

array([2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 1939, 1965,
       2005, 1962, 2006, 1960, 1929, 1970, 1967, 1958, 1930, 2002, 1968,
       2007, 1951, 1957, 1927, 1920, 1966, 1959, 1994, 1954, 1953, 1955,
       1983, 1975, 1997, 1934, 1963, 1981, 1964, 1999, 1972, 1921, 1945,
       1982, 1998, 1956, 1948, 1910, 1995, 1991, 2009, 1950, 1961, 1977,
       1985, 1979, 1885, 1919, 1990, 1969, 1935, 1988, 1971, 1952, 1936,
       1923, 1924, 1984, 1926, 1940, 1941, 1987, 1986, 2008, 1908, 1892,
       1916, 1932, 1918, 1912, 1947, 1925, 1900, 1980, 1989, 1992, 1949,
       1880, 1928, 1978, 1922, 1996, 2010, 1946, 1913, 1937, 1942, 1938,
       1974, 1893, 1914, 1906, 1890, 1898, 1904, 1882, 1875, 1911, 1917,
       1872, 1905])

In [9]:
categorical_cols = [cname for cname in X.columns if X[cname].nunique() < 50 and 
                        X[cname].dtype == "object"]

numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

model = XGBRegressor(random_state=42, n_estimators=350, max_depth=3, learning_rate=0.1, booster='dart')

In [12]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

pipeline.fit(X_train, y_train)

preds = pipeline.predict(X_test)

# Evaluate the model
score = mean_squared_error(y_test, preds, squared=False)
print('RMSE:', score)

RMSE: 27536.610822238952


## Model Optimization

In [13]:
# parameters = {
#     'max_depth': [3, 5, 10, None],
#     'n_estimators': [100, 200, 300, 400, 500],
#     'learning_rate': [0.01, 0.1, 0.5],
#     'booster' : ['gbtree', 'gblinear', 'dart']
# }

# rv = RandomizedSearchCV(model,
#                         param_distributions=parameters,
#                         n_iter=25,
#                         cv=5,
#                         n_jobs=-1,
#                         random_state=42)

# param_X = preprocessor.fit_transform(X)
# rv.fit(param_X, y)

# rv.best_params_, rv.best_score_

## Submission

In [14]:
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
test['Topography'] = test['LotConfig'] + test['LandContour']
test['Geometry'] = test['LotArea'] / test['LotFrontage']
test['TotalIndoorSqFt'] = test['TotalBsmtSF'] + test['1stFlrSF'] + test['2ndFlrSF'] + test['GarageArea']
test['HouseToYardRatio'] = test['TotalIndoorSqFt'] / test['LotArea']
test['HouseToPoolRatio'] = test['TotalIndoorSqFt'] / (test['PoolArea'] + 1)
test['Value'] = test['OverallCond'] * test['OverallQual']
test['Condition'] = test['Condition1'] + test['ExterCond']
test['YardToSeatingAreaRatio'] =  (test['WoodDeckSF'] + test['OpenPorchSF'] + 1) / test['LotArea']
test['PriceFeatures'] = test['Fireplaces'] * test['TotRmsAbvGrd']
test['Recency'] = test['YearRemodAdd'] - test['YearBuilt']
pipeline.fit(X, y)
preds = pipeline.predict(test)
submission = pd.DataFrame({'Id': test.Id, 'SalePrice': preds})
submission.to_csv('submission.csv', index=False)