In [53]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [33]:
df = pd.read_csv('melb_data.csv')

In [34]:
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [35]:
df.isnull().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [36]:
df.drop(['YearBuilt', 'BuildingArea', 'CouncilArea'], axis=1, inplace=True)

In [37]:
df['Car'].isnull().sum()

62

In [38]:
df['Car'].fillna(df['Car'].mode()[0], inplace=True)

In [39]:
df['Car'].isnull().sum()

0

In [40]:
df.isnull().sum()

Suburb           0
Address          0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Date             0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
Lattitude        0
Longtitude       0
Regionname       0
Propertycount    0
dtype: int64

### Now, I will try to extract day, month, and year from "Date" feature.

In [41]:
df['Day'] = pd.to_datetime(df['Date'], format="%d/%m/%Y").dt.day

In [42]:
df['Month'] = pd.to_datetime(df['Date'], format="%d/%m/%Y").dt.month
df['Year'] = pd.to_datetime(df['Date'], format="%d/%m/%Y").dt.year

### After extracting required information from "Date" feature, we can now simply drop that feature

In [43]:
df.drop('Date', axis=1, inplace=True)

In [44]:
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Regionname,Propertycount,Day,Month,Year
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,2.5,3067.0,2.0,1.0,1.0,202.0,-37.7996,144.9984,Northern Metropolitan,4019.0,3,12,2016
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,2.5,3067.0,2.0,1.0,0.0,156.0,-37.8079,144.9934,Northern Metropolitan,4019.0,4,2,2016
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,2.5,3067.0,3.0,2.0,0.0,134.0,-37.8093,144.9944,Northern Metropolitan,4019.0,4,3,2017
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,2.5,3067.0,3.0,2.0,1.0,94.0,-37.7969,144.9969,Northern Metropolitan,4019.0,4,3,2017
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,2.5,3067.0,3.0,1.0,2.0,120.0,-37.8072,144.9941,Northern Metropolitan,4019.0,4,6,2016


In [45]:
df['Suburb'].nunique()

314

In [46]:
df.drop(['Suburb', 'Address', 'SellerG', 'Regionname'], axis=1, inplace=True)

In [47]:
df.head()

Unnamed: 0,Rooms,Type,Price,Method,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Propertycount,Day,Month,Year
0,2,h,1480000.0,S,2.5,3067.0,2.0,1.0,1.0,202.0,-37.7996,144.9984,4019.0,3,12,2016
1,2,h,1035000.0,S,2.5,3067.0,2.0,1.0,0.0,156.0,-37.8079,144.9934,4019.0,4,2,2016
2,3,h,1465000.0,SP,2.5,3067.0,3.0,2.0,0.0,134.0,-37.8093,144.9944,4019.0,4,3,2017
3,3,h,850000.0,PI,2.5,3067.0,3.0,2.0,1.0,94.0,-37.7969,144.9969,4019.0,4,3,2017
4,4,h,1600000.0,VB,2.5,3067.0,3.0,1.0,2.0,120.0,-37.8072,144.9941,4019.0,4,6,2016


In [48]:
df_trial = pd.get_dummies(df, drop_first=True)

In [50]:
df_trial.head()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Propertycount,Day,Month,Year,Type_t,Type_u,Method_S,Method_SA,Method_SP,Method_VB
0,2,1480000.0,2.5,3067.0,2.0,1.0,1.0,202.0,-37.7996,144.9984,4019.0,3,12,2016,0,0,1,0,0,0
1,2,1035000.0,2.5,3067.0,2.0,1.0,0.0,156.0,-37.8079,144.9934,4019.0,4,2,2016,0,0,1,0,0,0
2,3,1465000.0,2.5,3067.0,3.0,2.0,0.0,134.0,-37.8093,144.9944,4019.0,4,3,2017,0,0,0,0,1,0
3,3,850000.0,2.5,3067.0,3.0,2.0,1.0,94.0,-37.7969,144.9969,4019.0,4,3,2017,0,0,0,0,0,0
4,4,1600000.0,2.5,3067.0,3.0,1.0,2.0,120.0,-37.8072,144.9941,4019.0,4,6,2016,0,0,0,0,0,1


In [55]:
X = df_trial.drop('Price', axis=1)
y = df_trial['Price']

### Now we will create our model

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [58]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()

In [59]:
model.fit(X_train, y_train)

DecisionTreeRegressor()

In [60]:
predictions = model.predict(X_test)

In [61]:
print(r2_score(y_test, predictions))

0.5614781359397443


### Testing using dataset

In [62]:
test_data = X.iloc[0]

In [63]:
model.predict([test_data]) # Exactly the price mentioned in the dataset.

array([1480000.])

## Here, we get the exact price mentioned in the dataset, which is 1480000.