In [2]:
# Approach 1 - Drop the column
# When most of the values are missing in a column

# Approach 2 - Imputaion
# Fill the missing data with the mean value of that column

# Approach 3 - An extension to imputation
# we impute missing values but also keep track of which values were missing

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# pip install pandas
# pip install sklearn
# After, restart your VS Code

In [4]:
data = pd.read_csv('../Datasets/melb_data.csv')
data.sample(5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
5645,South Yarra,6/432 Punt Rd,2,u,450000.0,VB,hockingstuart,14/05/2016,3.3,3141.0,...,1.0,1.0,0.0,,,Stonnington,-37.8374,144.9871,Southern Metropolitan,14887.0
6604,Windsor,8/14 Newry St,2,u,474000.0,S,Beller,28/08/2016,5.1,3181.0,...,1.0,1.0,0.0,66.0,1985.0,Port Phillip,-37.8552,145.0005,Southern Metropolitan,4380.0
11984,Elwood,1/121 Brighton Rd,2,u,720000.0,S,Chisholm,29/07/2017,7.2,3184.0,...,1.0,2.0,797.0,90.0,1920.0,Port Phillip,-37.87449,144.99059,Southern Metropolitan,8989.0
11239,Vermont,11 Manhattan Sq,4,h,980000.0,S,hockingstuart,12/08/2017,17.2,3133.0,...,2.0,0.0,539.0,192.0,1978.0,Whitehorse,-37.83459,145.21264,Eastern Metropolitan,4181.0
9315,Seabrook,21 Catherine Rd,3,h,515000.0,SP,LJ,3/06/2017,15.5,3028.0,...,1.0,2.0,531.0,98.0,1985.0,Hobsons Bay,-37.87793,144.75626,Western Metropolitan,1793.0


In [5]:
data.shape

(13580, 21)

In [6]:
data.describe(include='all').isnull().sum()

Suburb           7
Address          7
Rooms            3
Type             7
Price            3
Method           7
SellerG          7
Date             7
Distance         3
Postcode         3
Bedroom2         3
Bathroom         3
Car              3
Landsize         3
BuildingArea     3
YearBuilt        3
CouncilArea      7
Lattitude        3
Longtitude       3
Regionname       7
Propertycount    3
dtype: int64

In [7]:
y = data.Price
y

0        1480000.0
1        1035000.0
2        1465000.0
3         850000.0
4        1600000.0
           ...    
13575    1245000.0
13576    1031000.0
13577    1170000.0
13578    2500000.0
13579    1285000.0
Name: Price, Length: 13580, dtype: float64

In [8]:
features = data.drop(['Price'], axis=1)
features.head(3)

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [10]:
X = features.select_dtypes(exclude='object')
X.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,2,2.5,3067.0,2.0,1.0,1.0,202.0,,,-37.7996,144.9984,4019.0
1,2,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,-37.8079,144.9934,4019.0
2,3,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,-37.8093,144.9944,4019.0
3,3,2.5,3067.0,3.0,2.0,1.0,94.0,,,-37.7969,144.9969,4019.0
4,4,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,-37.8072,144.9941,4019.0


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [12]:
def score(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    predicition = model.predict(X_test)
    return mean_absolute_error(y_test, predicition)

In [13]:
# Approach 1 - Drop the columns

cols = [col for col in X_train.columns if X_train[col].isnull().any()]

# cols = []
# for col in X_train.columns:
#     if X_train[col].isnull().any():
#         cols.append(col)

# print(cols)

new_X_train = X_train.drop(cols,axis=1)
new_X_test = X_test.drop(cols,axis=1)

print(f'Approach 1: {score(new_X_train, new_X_test, y_train, y_test)}')

Approach 1: 183550.22137772635


In [14]:
# Approach 2 - Imputation

myImputer = SimpleImputer()
imputed_X_train = pd.DataFrame(myImputer.fit_transform(X_train))
imputed_X_test = pd.DataFrame(myImputer.transform(X_test))

# imputed_X_test.head(3)

imputed_X_train.columns = X_train.columns
imputed_X_test.columns = X_test.columns

imputed_X_test.head(3)

print(f'Approach 2: {score(imputed_X_train, imputed_X_test, y_train, y_test)}')

Approach 2: 178166.46269899711


In [16]:
# Appraoch 3 - An extension to imputation

X_train_plus = X_train.copy()
X_test_plus = X_test.copy()

for col in cols:
    X_train_plus[col + "_was_missing"] = X_train_plus[col].isnull()
    X_test_plus[col + "_was_missing"] = X_test_plus[col].isnull()

myImputer = SimpleImputer()
imputed_X_train = pd.DataFrame(myImputer.fit_transform(X_train_plus))
imputed_X_test = pd.DataFrame(myImputer.transform(X_test_plus))

# imputed_X_test.head(3)

imputed_X_train.columns = X_train_plus.columns
imputed_X_test.columns = X_test_plus.columns

print(f'Approach 3: {score(imputed_X_train, imputed_X_test, y_train, y_test)}')
X_train_plus.head(8)


Approach 3: 178927.503183954


Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount,Car_was_missing,BuildingArea_was_missing,YearBuilt_was_missing
12167,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,144.9867,13240.0,False,True,False
6524,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.858,144.9005,6380.0,False,True,True
8413,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.7988,144.822,3755.0,False,True,True
2919,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.7083,144.9158,8870.0,False,True,False
6043,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0,False,False,False
547,5,9.7,3103.0,5.0,2.0,2.0,611.0,,,-37.8116,145.0789,5682.0,False,True,True
4655,4,9.9,3044.0,4.0,2.0,2.0,250.0,194.0,1983.0,-37.7319,144.9461,7485.0,False,False,False
6082,3,13.5,3020.0,3.0,1.0,4.0,700.0,,,-37.7845,144.8131,6763.0,False,True,True
