In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline 
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error

## notebook-wide display settings
sns.set_style('darkgrid')  # seaborn grid theme for subsequent plots
pd.options.display.max_columns = None  # no column limit when a frame is displayed

In [2]:
## read the raw dataset (melb_data.csv, resolved relative to the working directory)
df = pd.read_csv(filepath_or_buffer='melb_data.csv')

In [3]:
## peek at the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,5,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,6,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [4]:
## dimensions of the loaded frame as (rows, columns)
df.shape

(18396, 22)

In [5]:
## narrow the frame to five predictor columns plus the target (Price, kept last)
df = df.loc[:, ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt', 'Price']]
df.head(n=5)

Unnamed: 0,Rooms,Distance,Landsize,BuildingArea,YearBuilt,Price
0,2,2.5,202.0,,,1480000.0
1,2,2.5,156.0,79.0,1900.0,1035000.0
2,3,2.5,134.0,150.0,1900.0,1465000.0
3,3,2.5,94.0,,,850000.0
4,4,2.5,120.0,142.0,2014.0,1600000.0


In [6]:
## dimensions of the frame after the column selection, as (rows, columns)
df.shape

(18396, 6)

In [7]:
## separate predictors and target by column name rather than by position, so that
## reordering or extending the selected columns cannot silently change the target
X = df.drop(columns=['Price']).to_numpy()
y = df['Price'].to_numpy()

## hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [8]:
## sanity-check the split: one shape per line, in the order X_train, X_test, y_train, y_test
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(13797, 5)
(4599, 5)
(13797,)
(4599,)


In [9]:
## gradient-boosted tree regressor: 1000 boosting rounds at a 0.05 learning rate, 4 parallel jobs
xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=4,
)

In [10]:
## training: fit the regressor on the training split only
xgb.fit(X_train, y_train)

In [11]:
## prediction: score the held-out test rows with the fitted model
pred = xgb.predict(X_test)

In [12]:
## evaluate on the held-out split
## MAE is an error expressed in the target's own units (lower is better),
## so it is labelled as an error rather than a score
print("Mean Absolute Error is: ", mean_absolute_error(y_test, pred))
print("R2 Score is: ", r2_score(y_test, pred))

Mean Absolute Score is:  244166.85807376602
R2 Score is:  0.6271775651051935
