<a href="https://colab.research.google.com/github/JoelWekesa/Machine-and-Deep-Learning/blob/main/housingpricesprediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor, BaggingRegressor, VotingRegressor
from sklearn.svm import LinearSVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats.stats import pearsonr
from google.colab import drive

In [None]:
# Notebook-wide display and plotting configuration.
pd.options.display.max_rows = 100
sns.set_style("darkgrid")
sns.set(rc={"figure.figsize": (12, 10)})

In [None]:
# Mount Google Drive under /content/datasets; the CSV paths used by later cells
# all live below this mount point.
drive.mount("/content/datasets")

Drive already mounted at /content/datasets; to attempt to forcibly remount, call drive.mount("/content/datasets", force_remount=True).


In [None]:
path = "/content/datasets/MyDrive/datasets/house-prices-advanced-regression-techniques/train.csv"

In [None]:
df = pd.read_csv(path)

In [None]:
df = df.drop(columns = ["Id", "MiscFeature", "Fence", "PoolQC", "Alley"])

In [None]:
# Display the column labels that remain after the drop above.
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodDeckSF'

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1379.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,567.240411,1057.429452,1162.626712,346.992466,5.844521,1515.463699,0.425342,0.057534,1.565068,0.382877,2.866438,1.046575,6.517808,0.613014,1978.506164,1.767123,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,441.866955,438.705324,386.587738,436.528436,48.623081,525.480383,0.518911,0.238753,0.550916,0.502885,0.815778,0.220338,1.625393,0.644666,24.689725,0.747315,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1900.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,223.0,795.75,882.0,0.0,0.0,1129.5,0.0,0.0,1.0,0.0,2.0,1.0,5.0,0.0,1961.0,1.0,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,477.5,991.5,1087.0,0.0,0.0,1464.0,0.0,0.0,2.0,0.0,3.0,1.0,6.0,1.0,1980.0,2.0,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,808.0,1298.25,1391.25,728.0,0.0,1776.75,1.0,0.0,2.0,1.0,3.0,1.0,7.0,1.0,2002.0,2.0,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,2336.0,6110.0,4692.0,2065.0,572.0,5642.0,3.0,2.0,3.0,2.0,8.0,3.0,14.0,3.0,2010.0,4.0,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [None]:
min_price, max_price = df["SalePrice"].quantile([0.001, 0.999])

In [None]:
filt_min = (df["SalePrice"] < min_price)

In [None]:
min_df = df.loc[filt_min]

In [None]:
filt_max = (df["SalePrice"] > max_price)

In [None]:
max_df = df.loc[filt_max]

In [None]:
price_outliers = pd.concat([min_df, max_df])

In [None]:
df = df.drop(price_outliers.index)

In [None]:
# Shared preprocessing objects, all created with library defaults.
# encoder -> integer codes for categorical columns
# imputer -> fills remaining missing values
# hot     -> created here but not referenced by the other visible cells
# scaler  -> standardises the feature matrix
encoder, imputer, hot, scaler = (
    LabelEncoder(),
    SimpleImputer(),
    OneHotEncoder(),
    StandardScaler(),
)

In [None]:
# Label-encode every non-numeric column of df in place.
# The numeric check uses pandas' dtype helper instead of comparing against the
# strings "int"/"float": those strings resolve to platform-dependent NumPy widths,
# so a 64-bit integer column can fail the comparison and be encoded by mistake.
for i in df.columns:
  if pd.api.types.is_numeric_dtype(df[i]):
    continue
  # Missing categories are replaced by the column's most frequent value first,
  # because LabelEncoder cannot handle NaN mixed with strings.
  most_frequent = df[i].value_counts().idxmax()
  # The single shared encoder is refitted for each column, so only the last
  # column's mapping is retained on the encoder object afterwards.
  df[i] = encoder.fit_transform(df[i].fillna(most_frequent))

In [None]:
X = df.drop(columns = ["SalePrice"])
X = imputer.fit_transform(X)

In [None]:
y = df["SalePrice"]

In [None]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
train_X = scaler.fit_transform(train_X)

In [None]:
test_X = scaler.transform(test_X)

In [None]:
regressor = LinearRegression()

In [None]:
regressor.fit(train_X, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
predictions = regressor.predict(test_X)

In [None]:
r2_score(test_y, predictions)

0.8647015537788565

In [None]:
# Mean squared error of the linear baseline on the hold-out split.
mean_squared_error(y_true=test_y, y_pred=predictions)

968710363.4874146

In [None]:
rfc = RandomForestRegressor()

In [None]:
rfc.fit(train_X, train_y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [None]:
rfc_pred = rfc.predict(test_X)

In [None]:
r2_score(test_y, rfc_pred)

0.8855504270140075

In [None]:
ridge = RidgeCV()

In [None]:
ridge.fit(train_X, train_y)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=None, fit_intercept=True,
        gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [None]:
rid_pred = ridge.predict(test_X)

In [None]:
r2_score(test_y, rid_pred)

0.8650200405393471

In [None]:
lasso = LassoCV()

In [None]:
lasso.fit(train_X, train_y)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=None,
        selection='cyclic', tol=0.0001, verbose=False)

In [None]:
lasso_pred = lasso.predict(test_X)

In [None]:
r2_score(test_y, lasso_pred)

0.8610023491718363

In [None]:
boost = GradientBoostingRegressor()

In [None]:
boost.fit(train_X, train_y)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
boost_pred = boost.predict(test_X)

In [None]:
r2_score(test_y, boost_pred) 

0.9059350148323178

In [None]:
xboost = XGBRegressor()

In [None]:
xboost.fit(train_X, train_y)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [None]:
xboost_pred = xboost.predict(test_X)

In [None]:
r2_score(test_y, xboost_pred)

0.9012047474964119

In [None]:
svr = LinearSVR()

In [None]:
svr.fit(train_X, train_y)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [None]:
svr_pred = svr.predict(test_X)

In [None]:
r2_score(test_y, svr_pred)

-4.461201767790967

In [None]:
bagging = BaggingRegressor()

In [None]:
bagging.fit(train_X, train_y)

BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=False,
                 max_features=1.0, max_samples=1.0, n_estimators=10,
                 n_jobs=None, oob_score=False, random_state=None, verbose=0,
                 warm_start=False)

In [None]:
bagging_pred = bagging.predict(test_X)

In [None]:
r2_score(test_y, bagging_pred)

0.8603520528455453

In [None]:
estimators = [("xboost", xboost), ("boost", boost)]

In [None]:
stacked = StackingRegressor(estimators=estimators, final_estimator=boost)
estims = [("stack", stacked), ("xboost", xboost), ("boost", boost)]
voted = VotingRegressor(estimators=estims)

In [None]:
stacked.fit(train_X, train_y)
voted.fit(train_X, train_y)



VotingRegressor(estimators=[('stack',
                             StackingRegressor(cv=None,
                                               estimators=[('xboost',
                                                            XGBRegressor(base_score=0.5,
                                                                         booster='gbtree',
                                                                         colsample_bylevel=1,
                                                                         colsample_bynode=1,
                                                                         colsample_bytree=1,
                                                                         gamma=0,
                                                                         importance_type='gain',
                                                                         learning_rate=0.1,
                                                                         max_delta_step=0,
              

In [None]:
stacked_pred = stacked.predict(test_X)
voted_pred = voted.predict(test_X)

In [None]:
r2_score(test_y, stacked_pred)

0.8917408965641279

In [None]:
# Hold-out R² of the voting ensemble (the model used for the final submission).
r2_score(y_true=test_y, y_pred=voted_pred)

0.9051275287748691

In [None]:
path2 = "/content/datasets/MyDrive/datasets/house-prices-advanced-regression-techniques/test.csv"

In [None]:
tdf = pd.read_csv(path2)

In [None]:
tedf = tdf.drop(columns = ["Id", "MiscFeature", "Fence", "PoolQC", "Alley"])

In [None]:
for i in tedf.columns:
  if tedf[i].dtype != "int" and tedf[i].dtype != "float":
    tedf[i] = tedf[i].fillna(tedf[i].value_counts().idxmax())
    tedf[i] = encoder.fit_transform(tedf[i])

In [None]:
tedf = imputer.fit_transform(tedf)

In [None]:
tedf = scaler.transform(tedf)

In [None]:
f_predictions = voted.predict(tedf)

In [None]:
tdf = tdf[["Id"]]

In [None]:
tdf.insert(1, "SalePrice", f_predictions)

In [None]:
tdf.to_csv("/content/datasets/MyDrive/datasets/house-prices-advanced-regression-techniques/predictions.csv", index=False)