<h1 style="text-align:center">House Price Prediction Technique</h1>

# Step 1: Import Libraries

In [198]:
# Data Processing
import numpy as np 
import pandas as pd
# Show every column and every row when a DataFrame is displayed.
# Fully-qualified option names are used on purpose: the bare "max_rows" pattern
# matches more than one registered option on recent pandas releases
# (e.g. "styler.render.max_rows") and makes set_option raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from math import sqrt

from sklearn.model_selection import RandomizedSearchCV

# Exploring the Data

In [199]:
# Read the three competition CSV files in one pass; the unpacking order matches
# the order of the paths below.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/house-prices-advanced-regression-techniques/train.csv",
        "/kaggle/input/house-prices-advanced-regression-techniques/test.csv",
        "/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv",
    )
)

In [200]:
# Peek at the first rows of the training frame.
train.head()

In [201]:
# (rows, columns) of the training frame as loaded.
train.shape

In [202]:
# Peek at the first rows of the test frame.
test.head()

In [203]:
# (rows, columns) of the test frame as loaded.
test.shape

## Target Value: SalePrice

In [204]:
# Distribution plot of the target on an explicitly created 20x10 axes.
fig, b = plt.subplots(figsize=(20, 10))
sns.distplot(train['SalePrice'], ax=b)
b.set_title("SalePrice Distribution");

In [205]:
# Box plot of the target on an explicitly created 20x10 axes.
fig, b = plt.subplots(figsize=(20, 10))
sns.boxplot(data=train, y='SalePrice', ax=b)
b.set_title("SalePrice Distribution");

In [206]:
# Number of training rows whose SalePrice exceeds 700000.
int((train['SalePrice'] > 700000).sum())

In [207]:
# Shape before the high-price rows are removed.
train.shape

In [208]:
# Keep only the rows priced at or below 700000.
within_price_cap = train['SalePrice'] <= 700000
train = train.loc[within_price_cap]

In [209]:
# Shape after the high-price rows were removed.
train.shape

# Handling missing values

In [210]:
# Names of the training columns that contain at least one missing value.
train_has_na = train.isna().any()
train_has_na[train_has_na].index.tolist()

In [211]:
# Names of the test columns that contain at least one missing value.
test_has_na = test.isna().any()
test_has_na[test_has_na].index.tolist()

In [212]:
# Missing-value summary for the training frame: absolute count and share per
# column, both ordered from most to least missing.
null_mask = train.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head(20)

In [213]:
# Columns removed from both frames before imputation.
dropped_columns = [
    'PoolQC',
    'MiscFeature',
    'Alley',
    'Fence',
    'FireplaceQu',
    'LotFrontage',
]
train = train.drop(columns=dropped_columns)
test = test.drop(columns=dropped_columns)

In [214]:
# Fill numeric gaps with each frame's own column medians.
# numeric_only=True is required because both frames still contain string-valued
# columns at this point; recent pandas raises TypeError on them instead of
# silently skipping them as older releases did.
train = train.fillna(train.median(numeric_only=True))
test = test.fillna(test.median(numeric_only=True))

In [215]:
# Missing MSZoning values in the test frame become the literal label 'None'.
test = test.fillna({'MSZoning': 'None'})

In [216]:
# Remove the Utilities column from both frames.
train = train.drop(columns=['Utilities'])
test = test.drop(columns=['Utilities'])

In [217]:
# Missing Exterior1st values in the test frame get a placeholder label.
test['Exterior1st'] = test['Exterior1st'].fillna('None')

# Fold infrequent labels into 'Rare'. Frequencies are taken from the training
# frame only and the same rule is applied to both frames, so a label can never be
# kept in one frame while being folded into 'Rare' in the other. Labels that the
# training frame never contains count as 0 occurrences and are folded as well.
exterior1st_counts = train['Exterior1st'].value_counts()
for frame in (train, test):
    occurrences = frame['Exterior1st'].map(exterior1st_counts).fillna(0)
    frame.loc[occurrences < 18, 'Exterior1st'] = 'Rare'

In [218]:
# Missing Exterior2nd values in the test frame get a placeholder label.
test['Exterior2nd'] = test['Exterior2nd'].fillna('None')

# Fold infrequent labels into 'Rare'. Frequencies are taken from the training
# frame only and the same rule is applied to both frames, so a label can never be
# kept in one frame while being folded into 'Rare' in the other. Labels that the
# training frame never contains count as 0 occurrences and are folded as well.
exterior2nd_counts = train['Exterior2nd'].value_counts()
for frame in (train, test):
    occurrences = frame['Exterior2nd'].map(exterior2nd_counts).fillna(0)
    frame.loc[occurrences < 10, 'Exterior2nd'] = 'Rare'

In [219]:
# Placeholder labels for categorical columns, applied to both frames.
categorical_fill = {
    'MasVnrType': 'Missing',
    'BsmtQual': 'None',
    'BsmtCond': 'None',
    'BsmtExposure': 'None',
    'BsmtFinType1': 'None',
    'BsmtFinType2': 'None',
    'Electrical': 'None',
    'GarageType': 'None',
    'GarageFinish': 'None',
    'GarageQual': 'None',
    'GarageCond': 'None',
    'SaleType': 'None',
}
# These two columns are only filled in the test frame.
test_only_fill = {
    'KitchenQual': 'None',
    'Functional': 'None',
}

train = train.fillna(value=categorical_fill)
test = test.fillna(value={**categorical_fill, **test_only_fill})

In [233]:
# Number of training columns that still contain at least one missing value
# (expected: 0). `.any()` is used rather than `.all()`, which would only count
# columns that are missing in every row and so could hide leftover gaps.
train.isna().any().sum()

In [234]:
# Number of test columns that still contain at least one missing value
# (expected: 0). `.any()` is used rather than `.all()`, which would only count
# columns that are missing in every row and so could hide leftover gaps.
test.isna().any().sum()

# Feature Engineering

In [235]:
# Set the target aside before the two frames are stacked.
y_train = train['SalePrice'].to_numpy()
# Stack train on top of test with a fresh 0..n-1 index and remove the target column.
df = pd.concat([train, test], ignore_index=True).drop(columns=['SalePrice'])

In [236]:
from sklearn.preprocessing import LabelEncoder
cols = (
    'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual',
    'BsmtFinType1', 'BsmtFinType2', 'Functional', 'BsmtExposure',
    'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive',
    'Street', 'CentralAir', 'MSSubClass', 'OverallCond',
    'YrSold', 'MoSold',
)
# Replace each listed column with integer codes from a LabelEncoder fitted on
# that column's values in the combined train+test frame.
for column_name in cols:
    column_values = list(df[column_name].values)
    df[column_name] = LabelEncoder().fit(column_values).transform(column_values)



In [237]:
# Correlate numeric columns only: `train` still holds string-valued columns here,
# and DataFrame.corr() raises on those with recent pandas releases instead of
# silently skipping them.
numeric_corr = train.select_dtypes(include='number').corr()

plt.figure(figsize=(25,25))
ax = sns.heatmap(numeric_corr, cmap = "coolwarm", annot=True, linewidth=2)

# NOTE(review): widens the y-limits by half a cell on each side — presumably a
# workaround for clipped first/last heatmap rows on some matplotlib releases;
# confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5);

In [238]:
# One-hot encode the combined frame using pandas' default column selection.
df = pd.get_dummies(df)
# Report the (rows, columns) shape after encoding.
print(df.shape)

# Modeling

In [239]:
# Split the encoded frame back apart by Id: rows below 1461 form the training
# set, rows from 1461 upwards form the test set.
first_test_id = 1461
train = df.loc[df['Id'] < first_test_id]
test = df.loc[df['Id'] >= first_test_id]

In [240]:
# Feature matrix: every column of the encoded training frame (SalePrice was
# removed from the combined frame before encoding).
# NOTE(review): the 'Id' column is still present in X and is therefore used as a feature.
X = train

# Target variable: the SalePrice values captured before train and test were concatenated
y = y_train

In [241]:
# Hold out 20% of the labelled rows for validation.
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
# Note: this rebinds y_train to the training part of the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [242]:
# Candidate regressors keyed by display name.
# The randomized estimators are seeded so repeated runs give the same scores.
models = {"Lasso": Lasso(tol=0.1),
          "RandomForestRegressor" : RandomForestRegressor(random_state=42),
          "GradientBoostingRegressor" : GradientBoostingRegressor(random_state=42),
          "XGBRegressor": XGBRegressor(random_state=42),
          "LGBMRegressor": LGBMRegressor(random_state=42)
}

# Create function to fit and score models
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Fit every model and report its root-mean-squared error on held-out data.

    models : dict mapping a display name to an estimator exposing fit/predict
    X_train : features used for fitting
    X_test : features used for evaluation
    y_train : targets associated with X_train
    y_test : targets associated with X_test

    Returns a dict mapping each model name to its RMSE on (X_test, y_test).
    The estimators in `models` are fitted in place.
    """
    rmse_by_model = {}
    for label, estimator in models.items():
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        # RMSE is the square root of the mean squared error
        rmse_by_model[label] = np.sqrt(mse)
    return rmse_by_model

In [243]:
# Fit every candidate on the training split and collect its validation RMSE.
model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)
model_scores

In [244]:
# Gradient boosting with 5000 trees, fitted on the training split only.
# Seeded so the validation RMSE and the final predictions are reproducible.
gbr = GradientBoostingRegressor(n_estimators=5000, random_state=42)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
# Validation RMSE
np.sqrt(mean_squared_error(y_test, y_pred))

# Predicting values

In [245]:
# Predict SalePrice for the competition test rows with the fitted gradient-boosting model.
# Note: `gbr` was fitted on the 80% training split, not on all labelled rows.
y_pred = gbr.predict(test)


In [246]:
# Put the predictions into the SalePrice column of the sample submission frame,
# write it to disk and show the first rows.
sample_submission = sample_submission.assign(SalePrice=y_pred)
sample_submission.to_csv("submission.csv", index=False)
sample_submission.head()