In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

---
# Introduction

In [2]:
# Read the Data
full_data = pd.read_csv('train.csv',index_col='Id')
full_test_data = pd.read_csv('test.csv',index_col='Id')

# Obtain Target and Predictors
features = ['LotArea','YearBuilt','1stFlrSF','2ndFlrSF','FullBath','BedroomAbvGr','TotRmsAbvGrd']

y = full_data.SalePrice

x = full_data[features].copy()
test_x = full_test_data[features].copy()

# Break off Validation set from training data
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.8,test_size=0.2,random_state=0)

In [3]:
# Import the model
from sklearn.ensemble import RandomForestRegressor

# Define the Models
model_1 = RandomForestRegressor(n_estimators=50,random_state=0)
model_2 = RandomForestRegressor(n_estimators=100,random_state=0)
model_3 = RandomForestRegressor(n_estimators=100,criterion='absolute_error',random_state=0)
model_4 = RandomForestRegressor(n_estimators=200,min_samples_split=10,random_state=0)
model_5 = RandomForestRegressor(n_estimators=100,max_depth=7,random_state=0)

models = [model_1,model_2,model_3,model_4,model_5]

In [4]:
# Import for Validation
from sklearn.metrics import mean_absolute_error

# Function for comparing different models
def score_model(model,x_tr=x_train,x_te=x_test,y_tr=y_train,y_te=y_test):
    model.fit(x_tr,y_tr)
    yhat = model.predict(x_te)
    return mean_absolute_error(y_te,yhat)

# Loop for Valid each Model
for i in range(0,len(models)):
    mae = score_model(models[i])
    print(f'Model {i+1} MAE: {mae}')

Model 1 MAE: 24015.492818003917
Model 2 MAE: 23740.979228636657
Model 3 MAE: 23528.78421232877
Model 4 MAE: 23874.524641816766
Model 5 MAE: 23706.672864217904


---
# Missing Values

In [5]:
# Load the data
data = pd.read_csv('melb_data.csv')

# Select Target
y = data.Price

# To keep things simple, we'll use only numerical predictors
melb_predictors = data.drop(['Price'],axis=1)
x = melb_predictors.select_dtypes(exclude=['object'])

# Divide data into training and validation subsets
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [6]:
# Define Function to Measure Quality of Each Approach
def score_dataset(x_train,x_test,y_train,y_test):
    model = RandomForestRegressor(n_estimators=10,random_state=0)
    model.fit(x_train,y_train)
    yhat = model.predict(x_test)
    return mean_absolute_error(y_test,yhat)

- Approach 1 | `Drop` Columns with `Missing Values`

In [7]:
# Get names of columns with missing values
cols_with_missing = [col for col in x_train.columns if x_train[col].isnull().any()]
print('The columns with missing values are:',cols_with_missing)

# Drop columns in training and validation data
reduced_x_train = x_train.drop(cols_with_missing,axis=1)
reduced_x_test = x_test.drop(cols_with_missing,axis=1)

print(f'Approach 1: (Drop columns with Missing Values)\n{score_dataset(reduced_x_train,reduced_x_test,y_train,y_test)})')

The columns with missing values are: ['Car', 'BuildingArea', 'YearBuilt']
Approach 1: (Drop columns with Missing Values)
183550.22137772635)


Approach 2 | `Imputation`

In [8]:
from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer()
imputed_x_train = pd.DataFrame(my_imputer.fit_transform(x_train))
imputed_x_test = pd.DataFrame(my_imputer.fit_transform(x_test))

# Imputation removed column names; put them back
imputed_x_train.columns = x_train.columns
imputed_x_test.columns = x_test.columns

print(f'Approach 2: (Imputation)\n{score_dataset(imputed_x_train,imputed_x_test,y_train,y_test)}')

Approach 2: (Imputation)
179816.89508731329


Approach 3 | `Extension to Imputation`

In [9]:
# Make copy to avoid changing original data
x_train_plus = x_train.copy()
x_test_plus = x_test.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    x_train_plus[col + '_was_missing'] = x_train_plus[col].isnull()
    x_test_plus[col + '_was_missing'] = x_test_plus[col].isnull()

# Imputation
my_imputer = SimpleImputer()
imputed_x_train_plus = pd.DataFrame(my_imputer.fit_transform(x_train_plus))
imputed_x_test_plus = pd.DataFrame(my_imputer.fit_transform(x_test_plus))

print(f'Approach 3: (Extension to Imputation)\n{score_dataset(imputed_x_train_plus,imputed_x_test_plus,y_train,y_test)}')

Approach 3: (Extension to Imputation)
179986.2708570026


---
# Categorical Variable

### Prepare data

In [10]:
# Read the data
data = pd.read_csv('melb_data.csv')

# Get Target and Features
y = data.Price
x = data.drop(['Price'],axis=1)

# Divide data into training and testing
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [11]:
# Geting the name of columns with missing values to drop
cols_with_missing = [col for col in x.columns if x[col].isnull().any()]

# Droping in Training data
x_train.drop(cols_with_missing,axis=1,inplace=True)

# Droping in Testing data
x_test.drop(cols_with_missing,axis=1,inplace=True)

In [12]:
# Select categorical columns with relatively low cardinality
low_cardinality_cols = [cname for cname in x_train.columns if x_train[cname].nunique() < 10 and x_train[cname].dtype == 'object']

# Select numerical columns
numerical_cols = [cname for cname in x_train.columns if x_train[cname].dtype in ['int64','float64']]

In [13]:
# Keep selected columns only
my_cols = low_cardinality_cols + numerical_cols
x_train = x_train[my_cols].copy()
x_test = x_test[my_cols].copy()

In [14]:
x_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [15]:
# Get list of categorical variable
s = (x_train.dtypes == 'object')
object_cols = list(s[s].index)

print('Categorical variables:')
print(object_cols)

Categorical variables:
['Type', 'Method', 'Regionname']


Define function to Measure Quality of Each Approach

In [16]:
# Function for comparing different approaches
def score_dataset(x_train,x_test,y_train,y_test):
    model = RandomForestRegressor(n_estimators=100,random_state=0)
    model.fit(x_train,y_train)
    yhat = model.predict(x_test)
    return mean_absolute_error(y_test,yhat)

### `Approach 1` (Drop Categorical Variables)

In [17]:
drop_x_train = x_train.select_dtypes(exclude='object')
drop_x_test = x_test.select_dtypes(exclude='object')

print('MAE from Approach 1 (Drop Categorical Variables):')
print(score_dataset(drop_x_train,drop_x_test,y_train,y_test))

MAE from Approach 1 (Drop Categorical Variables):
175703.48185157913


### `Aprroach 2` (Ordinal Encoding)

In [18]:
from sklearn.preprocessing import OrdinalEncoder

In [19]:
# Make copy to avoid changing original data
label_x_train = x_train.copy()
label_x_test = x_test.copy()

# Apply Ordinal Encoder to each column with categorical data
ordinal_encoder = OrdinalEncoder()
label_x_train[object_cols] = ordinal_encoder.fit_transform(x_train[object_cols])
label_x_test[object_cols] = ordinal_encoder.transform(x_test[object_cols])

# 
print('MAE from Approach 2 (Ordinal Encoding):')
print(score_dataset(label_x_train,label_x_test,y_train,y_test))

MAE from Approach 2 (Ordinal Encoding):
165936.40548390493


### `Approach 3` (One-Hot Encoding)

In [29]:
from sklearn.preprocessing import OneHotEncoder

Apply <b>One-Hot Encoder</b> to each column with categorical data

In [35]:
# handle_unknown = avoid : to avoid errors
# sparse=False ensures that the encoded columns are returnet as a numpy array

OH_encoder = OneHotEncoder(handle_unknown='ignore',sparse_output=False) 

OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(x_train[object_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(x_test[object_cols]))

# One-Hot Encoding removed index; put it back
OH_cols_train.index = x_train.index
OH_cols_test.index = x_test.index

# Remove Categorical data columns (will replace with one-hot encoding)
num_x_train = x_train.drop(object_cols,axis=1)
num_x_test = x_test.drop(object_cols,axis=1)

# Add one-hot encoded columns to numerical features
OH_x_train = pd.concat([num_x_train,OH_cols_train],axis=1)
OH_x_test = pd.concat([num_x_test,OH_cols_test],axis=1)

# Ensure all columns have string type
OH_x_train.columns = OH_x_train.columns.astype('str')
OH_x_test.columns = OH_x_test.columns.astype('str')

print('MAE from Approach 3 (One-Hot Encoding):')
print(score_dataset(OH_x_train,OH_x_test,y_train,y_test))

MAE from Approach 3 (One-Hot Encoding):
166089.4893009678
