In [1]:
import pandas as pd
import numpy as np

In [2]:
melb= pd.read_csv("/kaggle/input/melb-data/melb_data.csv")

In [3]:
melb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18396 entries, 0 to 18395
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     18396 non-null  int64  
 1   Suburb         18396 non-null  object 
 2   Address        18396 non-null  object 
 3   Rooms          18396 non-null  int64  
 4   Type           18396 non-null  object 
 5   Price          18396 non-null  float64
 6   Method         18396 non-null  object 
 7   SellerG        18396 non-null  object 
 8   Date           18396 non-null  object 
 9   Distance       18395 non-null  float64
 10  Postcode       18395 non-null  float64
 11  Bedroom2       14927 non-null  float64
 12  Bathroom       14925 non-null  float64
 13  Car            14820 non-null  float64
 14  Landsize       13603 non-null  float64
 15  BuildingArea   7762 non-null   float64
 16  YearBuilt      8958 non-null   float64
 17  CouncilArea    12233 non-null  object 
 18  Lattit

In [4]:
melb.drop(["Suburb","Address","SellerG","Date","CouncilArea","BuildingArea","YearBuilt"],axis=1,inplace=True)

In [5]:
melb=pd.get_dummies(melb,["Type","Method","Regionname"])

In [6]:
melb=melb.dropna()

In [7]:
melb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13518 entries, 0 to 18394
Data columns (total 28 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Unnamed: 0                             13518 non-null  int64  
 1   Rooms                                  13518 non-null  int64  
 2   Price                                  13518 non-null  float64
 3   Distance                               13518 non-null  float64
 4   Postcode                               13518 non-null  float64
 5   Bedroom2                               13518 non-null  float64
 6   Bathroom                               13518 non-null  float64
 7   Car                                    13518 non-null  float64
 8   Landsize                               13518 non-null  float64
 9   Lattitude                              13518 non-null  float64
 10  Longtitude                             13518 non-null  float64
 11  Pr

In [8]:
# Import Sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [9]:
# generate x and y
X=melb.drop("Price",axis=1).values
y=melb["Price"].values

## Train & test split

Train & test split is used to split the data into train and test to make sure that the model is not just save the result.

In [10]:
x_train , x_test , y_train , y_test = train_test_split(X,y,test_size=.3,random_state=1)

## LinearRegression

Using the linear equation to check if the data is linear or not

In [11]:
linear=LinearRegression()
linear.fit(x_train,y_train)

y_train_pred=linear.predict(x_train)
print("linear MAE for training : ",round(mean_absolute_error(y_train,y_train_pred),2))

y_test_pred=linear.predict(x_test)
print("linear MAE for testing : ",round(mean_absolute_error(y_test,y_test_pred),2))

linear MAE for training :  273909.15
linear MAE for testing :  272400.07


## DecisionTreeRegressor & GridSearchCV

Using the decision tree is learn from the data till get the leaf all most be overfit in traing so we use the parameters & GridSearch to choose the best parameters

In [12]:
GS=GridSearchCV(estimator=DecisionTreeRegressor(),param_grid={
    "max_depth":range(5,18),
    "max_features":[26],
    "max_leaf_nodes":[600,650,700,750,800]
})
GS.fit(x_train,y_train)

GridSearchCV(estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': range(5, 18), 'max_features': [26],
                         'max_leaf_nodes': [600, 650, 700, 750, 800]})

In [13]:
pd.DataFrame(GS.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_max_leaf_nodes,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.029399,0.006524,0.000897,0.000101,5,26,600,"{'max_depth': 5, 'max_features': 26, 'max_leaf...",0.628505,0.635772,0.543363,0.596280,0.548744,0.590533,0.038711,45
1,0.025964,0.000674,0.000870,0.000034,5,26,650,"{'max_depth': 5, 'max_features': 26, 'max_leaf...",0.632455,0.630398,0.537435,0.588748,0.562411,0.590289,0.037309,46
2,0.026454,0.000418,0.000856,0.000013,5,26,700,"{'max_depth': 5, 'max_features': 26, 'max_leaf...",0.632490,0.624235,0.551845,0.596661,0.554169,0.591880,0.033893,42
3,0.025713,0.000491,0.000853,0.000019,5,26,750,"{'max_depth': 5, 'max_features': 26, 'max_leaf...",0.632490,0.635948,0.549128,0.586945,0.562265,0.593355,0.035523,40
4,0.025869,0.000165,0.000836,0.000010,5,26,800,"{'max_depth': 5, 'max_features': 26, 'max_leaf...",0.632490,0.635948,0.546209,0.590009,0.562384,0.593408,0.036163,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,0.053225,0.000486,0.000984,0.000021,17,26,600,"{'max_depth': 17, 'max_features': 26, 'max_lea...",0.641276,0.678023,0.602536,0.613732,0.467915,0.600696,0.071317,32
61,0.053105,0.000728,0.000995,0.000026,17,26,650,"{'max_depth': 17, 'max_features': 26, 'max_lea...",0.631878,0.662713,0.622068,0.615357,0.481907,0.602785,0.062577,30
62,0.054869,0.000791,0.001003,0.000012,17,26,700,"{'max_depth': 17, 'max_features': 26, 'max_lea...",0.545834,0.686961,0.599426,0.443014,0.464615,0.547970,0.089370,64
63,0.054656,0.000519,0.001023,0.000052,17,26,750,"{'max_depth': 17, 'max_features': 26, 'max_lea...",0.603669,0.656701,0.607169,0.650931,0.474223,0.598539,0.065853,35


In [14]:
best=GS.best_params_
best

{'max_depth': 7, 'max_features': 26, 'max_leaf_nodes': 600}

In [15]:
best_DT=DecisionTreeRegressor(max_depth=7,max_features=26,max_leaf_nodes=650)
best_DT.fit(x_train,y_train)

y_train_pred=best_DT.predict(x_train)
print("DecisionTree MAE for training : ",round(mean_absolute_error(y_train,y_train_pred),2))

y_test_pred=best_DT.predict(x_test)
print("DecisionTree MAE for testing : ",round(mean_absolute_error(y_test,y_test_pred),2))

DecisionTree MAE for training :  208864.35
DecisionTree MAE for testing :  229973.75


As we see above the best estimator in DT with get 21K MAE in testing

In [16]:
KNN=KNeighborsRegressor()
KNN.fit(x_train,y_train)

y_train_pred=KNN.predict(x_train)
print("DecisionTree MAE for training : ",round(mean_absolute_error(y_train,y_train_pred),2))

y_test_pred=KNN.predict(x_test)
print("DecisionTree MAE for testing : ",round(mean_absolute_error(y_test,y_test_pred),2))

DecisionTree MAE for training :  267915.26
DecisionTree MAE for testing :  326456.66


In [17]:
for i,x in zip([100,150,200],[40,45,50]):
    RF=RandomForestRegressor(n_estimators=i,max_depth=x)
    RF.fit(x_train,y_train)
    y_train_pred=RF.predict(x_train)
    print("RandomForest MAE for training when n_estimators={} & max_depth={} : ".format({i},{x}),round(mean_absolute_error(y_train,y_train_pred),2))
    y_test_pred=RF.predict(x_test)
    print("RandomForest MAE for testing when n_estimators={} & max_depth={} : ".format({i},{x}),round(mean_absolute_error(y_test,y_test_pred),2))

RandomForest MAE for training when n_estimators={100} & max_depth={40} :  64636.06
RandomForest MAE for testing when n_estimators={100} & max_depth={40} :  167390.58
RandomForest MAE for training when n_estimators={150} & max_depth={45} :  64267.18
RandomForest MAE for testing when n_estimators={150} & max_depth={45} :  166984.26
RandomForest MAE for training when n_estimators={200} & max_depth={50} :  63810.52
RandomForest MAE for testing when n_estimators={200} & max_depth={50} :  167458.15


Conclusion:

    * The best model we can use is RandomForest with a mean_absolute_error in testing 166K