# House Price Prediction

In [1]:
# Set up code checking
from learntools.core import binder
binder.bind(globals())
from learntools.machine_learning.ex7 import *

# Set up filepaths
import os
if not os.path.exists("../input/train.csv"):
    os.symlink("../input/home-data-for-ml-course/train.csv", "../input/train.csv")  
    os.symlink("../input/home-data-for-ml-course/test.csv", "../input/test.csv") 

In [2]:
# Import helpful libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set()

# Load the data, and separate the target
iowa_file_path = '../input/train.csv'
# path to file you will use for predictions
test_data_path = '../input/test.csv'
# read test data file using pandas
test_data = pd.read_csv(test_data_path)
home_data = pd.read_csv(iowa_file_path)
y = home_data.SalePrice

## Trying to make it more efficient

In [3]:
# select new features
new_features = [
    'MSSubClass',
    'LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'MiscVal',
    'MoSold',
    'YrSold'

]
Xn = home_data[new_features]
Xn.columns

Index(['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold'],
      dtype='object')

In [4]:
Xn.shape

(1460, 25)

In [5]:
# Let's check the data for the missing values
zeros = (Xn == 0).astype(int).sum()
print(zeros)

MSSubClass          0
LotArea             0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
1stFlrSF            0
2ndFlrSF          829
LowQualFinSF     1434
GrLivArea           0
FullBath            9
HalfBath          913
BedroomAbvGr        6
KitchenAbvGr        1
TotRmsAbvGrd        0
Fireplaces        690
WoodDeckSF        761
OpenPorchSF       656
EnclosedPorch    1252
3SsnPorch        1436
ScreenPorch      1344
PoolArea         1453
MiscVal          1408
MoSold              0
YrSold              0
dtype: int64


In [6]:
# Let's convert this Series to be in term of percentage of the total DataFrame
100* (Xn == 0).astype(int).sum()/len(Xn)

MSSubClass        0.000000
LotArea           0.000000
OverallQual       0.000000
OverallCond       0.000000
YearBuilt         0.000000
YearRemodAdd      0.000000
1stFlrSF          0.000000
2ndFlrSF         56.780822
LowQualFinSF     98.219178
GrLivArea         0.000000
FullBath          0.616438
HalfBath         62.534247
BedroomAbvGr      0.410959
KitchenAbvGr      0.068493
TotRmsAbvGrd      0.000000
Fireplaces       47.260274
WoodDeckSF       52.123288
OpenPorchSF      44.931507
EnclosedPorch    85.753425
3SsnPorch        98.356164
ScreenPorch      92.054795
PoolArea         99.520548
MiscVal          96.438356
MoSold            0.000000
YrSold            0.000000
dtype: float64

In [7]:
100* (test_data == 0).astype(int).sum()/len(Xn)

Id                0.000000
MSSubClass        0.000000
MSZoning          0.000000
LotFrontage       0.000000
LotArea           0.000000
                   ...    
MiscVal          96.438356
MoSold            0.000000
YrSold            0.000000
SaleType          0.000000
SaleCondition     0.000000
Length: 80, dtype: float64

In [8]:
#drop list
drop_list = ['LowQualFinSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal']

In [9]:
Xn = Xn.drop(drop_list, axis=1)

In [10]:
Xn.describe()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,1stFlrSF,2ndFlrSF,GrLivArea,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,WoodDeckSF,OpenPorchSF,MoSold,YrSold
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,10516.828082,6.099315,5.575342,1971.267808,1984.865753,1162.626712,346.992466,1515.463699,1.565068,0.382877,2.866438,1.046575,6.517808,0.613014,94.244521,46.660274,6.321918,2007.815753
std,42.300571,9981.264932,1.382997,1.112799,30.202904,20.645407,386.587738,436.528436,525.480383,0.550916,0.502885,0.815778,0.220338,1.625393,0.644666,125.338794,66.256028,2.703626,1.328095
min,20.0,1300.0,1.0,1.0,1872.0,1950.0,334.0,0.0,334.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,2006.0
25%,20.0,7553.5,5.0,5.0,1954.0,1967.0,882.0,0.0,1129.5,1.0,0.0,2.0,1.0,5.0,0.0,0.0,0.0,5.0,2007.0
50%,50.0,9478.5,6.0,5.0,1973.0,1994.0,1087.0,0.0,1464.0,2.0,0.0,3.0,1.0,6.0,1.0,0.0,25.0,6.0,2008.0
75%,70.0,11601.5,7.0,6.0,2000.0,2004.0,1391.25,728.0,1776.75,2.0,1.0,3.0,1.0,7.0,1.0,168.0,68.0,8.0,2009.0
max,190.0,215245.0,10.0,9.0,2010.0,2010.0,4692.0,2065.0,5642.0,3.0,2.0,8.0,3.0,14.0,3.0,857.0,547.0,12.0,2010.0


In [11]:
Xn.shape

(1460, 19)

In [12]:
# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(Xn, y, random_state=42)

In [13]:
from xgboost import XGBRegressor

xgb=XGBRegressor(n_estimato=1000,learning_rate=0.05)


In [14]:
#fit the model
xgb.fit(train_X,train_y)

Parameters: { "n_estimato" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimato=1000,
             n_estimators=100, n_jobs=0, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=0, ...)

In [15]:
#get predictions
y_predi=xgb.predict(val_X)

In [16]:
#to calculate mean absolute error MAE

from sklearn.metrics import mean_absolute_error

mae=mean_absolute_error(y_predi,val_y)
print('MAE : ',mae)

MAE :  19207.65571489726


In [17]:
# create test_X which comes from test_data but includes only the columns you used for prediction.
# The list of columns is stored in a variable called features
test_X = test_data[new_features]
test_X = test_X.drop(drop_list, axis=1)

In [18]:
#prediction
prediction=xgb.predict(test_X)

Before submitting, run a check to make sure your `test_preds` have the right format.

# Generate a submission

Run the code cell below to generate a CSV file with your predictions that you can use to submit to the competition.

In [19]:

# Run the code to save predictions in the format used for competition scoring

output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': prediction})
output.to_csv('submission.csv', index=False)
output.head()

Unnamed: 0,Id,SalePrice
0,1461,127155.34375
1,1462,156434.140625
2,1463,177259.953125
3,1464,185335.890625
4,1465,208750.078125
