# Required Libraries

In [150]:
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

# Loading Data

In [106]:
train_data = pd.read_csv("train.csv", index_col="Id")
test_data = pd.read_csv("test.csv", index_col="Id")

In [107]:
train_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [108]:
test_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal


# Selecting target and features

In [109]:
y_train = train_data.SalePrice


In [110]:
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
x_train  = train_data[features].copy()
x_test = test_data[features].copy()

# Selecting validation data

In [111]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.8, test_size=0.2, random_state=0)


# Model
we will use different versions of RandomForest model.

In [112]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [113]:
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion="absolute_error", random_state=0)
model_4 = RandomForestRegressor(n_estimators=100, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7,random_state=0)


In [114]:
models = [model_1, model_2, model_3, model_4, model_5]

We will define a function to estimate the best model based on the mean absolute erro. The less the error, the best the model.

In [115]:
def model_score(model, x_t, x_val, y_t, y_val):
    model.fit(x_t, y_t)
    preds = model.predict(x_val)
    return mean_absolute_error(y_val,preds)
for i in range(0, len(models)):
    mae = model_score(models[i], x_train, x_val,y_train, y_val)
    print("Model %d MAE: %d" % (i+1, mae))

Model 1 MAE: 24015
Model 2 MAE: 23740
Model 3 MAE: 23528
Model 4 MAE: 23859
Model 5 MAE: 23706


The results shows that model 3 has the best model mae

In [116]:
best_model = model_3

In [117]:
best_model.fit(x_train, y_train)

In [118]:
preds = best_model.predict(x_test)

In [119]:
output = pd.DataFrame({'Id': x_test.index,
                       'SalePrice': preds})
output.to_csv('submission.csv', index=False)

# Missing values

In [120]:
data = pd.read_csv("melb_data.csv")

In [121]:
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000,S,Biggin,3/12/2016,2.5,3067,...,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,1035000,S,Biggin,4/2/2016,2.5,3067,...,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,1465000,SP,Biggin,4/3/2017,2.5,3067,...,2,0.0,134,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
3,Abbotsford,40 Federation La,3,h,850000,PI,Biggin,4/3/2017,2.5,3067,...,2,1.0,94,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019
4,Abbotsford,55a Park St,4,h,1600000,VB,Nelson,4/6/2016,2.5,3067,...,1,2.0,120,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019


In [122]:
y = data.Price

In [123]:
predictors = data.drop(["Price"], axis=1)
x = predictors.select_dtypes(exclude=['object'])

In [124]:
x_train, x_val, y_train, y_val = train_test_split(x,y, train_size=0.8, test_size=0.2, random_state=0)

In [125]:
x_train.shape

(10864, 12)

In [126]:
x_val.shape

(2716, 12)

In [127]:
def model_score(x_t, x_val, y_t, y_val):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(x_t, y_t)
    preds = model.predict(x_val)
    return mean_absolute_error(y_val,preds)


### Approch-1: Drop the column with missing values
Since we are working with both training and validation data, we have to drop columns from both datasets.

In [128]:
#columns with missing values
cols_with_missing = [col for col in x_train.columns
                     if x_train[col].isnull().any()]

In [129]:
x_train_modified = x_train.drop(cols_with_missing, axis=1)
x_val_modified = x_val.drop(cols_with_missing, axis=1)

In [130]:
print("MAE from approach-1 (cols_with_missing_values)")
print(model_score(x_train_modified, x_val_modified, y_train, y_val))

MAE from approach-1 (cols_with_missing_values)
183550.22137772635


# Approach-2 imputation

In [131]:
from sklearn.impute import SimpleImputer

In [132]:
imputer = SimpleImputer()
x_train_imputed = pd.DataFrame(imputer.fit_transform(x_train))
x_val_imputed  = pd.DataFrame(imputer.fit_transform(x_val))

x_train_imputed.columns = x_train.columns
x_val_imputed.columns = x_val.columns

# Approach-3 Extension of imputation value
we will impute the missing value and also want to keep the track of missing values.

In [133]:
#make a copy of original data

x_train_plus = x_train.copy()
x_val_plus = x_val.copy()

#columns with missing values

for col in cols_with_missing:
    x_train_plus[col + "_was_missing"] = x_train_plus[col].isnull()
    x_val_plus[col + "_was_missing"] = x_val_plus[col].isnull()

#imputation

imputer = SimpleImputer()
x_train_plus_imputed = pd.DataFrame(imputer.fit_transform(x_train_plus))
x_val_plus_imputed = pd.DataFrame(imputer.fit_transform(x_val_plus))

print("MAE with extended imputation")
print(model_score(x_train_plus_imputed, x_val_plus_imputed, y_train, y_val))

MAE with extended imputation
179986.2708570026


# Categorical features

1. Drop columns with categorical featues

In [134]:
x_drop_train = x_train.select_dtypes(exclude=["object"])
x_drop_valid = x_val.select_dtypes(exclude=["object"])
x_drop_train = x_drop_train.drop(cols_with_missing, axis=1)
x_drop_valid = x_drop_valid.drop(cols_with_missing, axis=1)

In [135]:
x_drop_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10864 entries, 12167 to 2732
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rooms          10864 non-null  int64  
 1   Distance       10864 non-null  float64
 2   Postcode       10864 non-null  int64  
 3   Bedroom2       10864 non-null  int64  
 4   Bathroom       10864 non-null  int64  
 5   Landsize       10864 non-null  int64  
 6   Lattitude      10864 non-null  float64
 7   Longtitude     10864 non-null  float64
 8   Propertycount  10864 non-null  int64  
dtypes: float64(3), int64(6)
memory usage: 848.8 KB


As it is clear from the information, there is no object or text type variable.

In [136]:
print("MAE for this approach: ", model_score(x_drop_train, x_drop_valid, y_train, y_val))

MAE for this approach:  183550.22137772635


In [137]:
x_drop_train.isnull().sum()

Rooms            0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Landsize         0
Lattitude        0
Longtitude       0
Propertycount    0
dtype: int64

# Ordinal encoding
Let's find out what values we have for condition column (categorical column)

In [138]:
X = pd.read_csv('train.csv', index_col='Id') 
X_test = pd.read_csv('test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)

# To keep things simple, we'll drop columns with missing values
cols_with_missing = [col for col in X.columns if X[col].isnull().any()] 
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [139]:
print("Unique values in condition column for train data", X_train["Condition2"].unique())
print("Unique values in condition column for test data", X_test["Condition2"].unique())


Unique values in condition column for train data ['Norm' 'PosA' 'Feedr' 'PosN' 'Artery' 'RRAe']
Unique values in condition column for test data ['Norm' 'Feedr' 'PosA' 'PosN' 'Artery']


As you see ther are two values or "Artery" and "RRAe" in condition column for training data but not present in test data.

In [140]:
# Categorical columns in the training data
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely ordinal encoded
good_label_cols = [col for col in object_cols if 
                   set(X_valid[col]).issubset(set(X_train[col]))]
        
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))
        
print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

Categorical columns that will be ordinal encoded: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'PavedDrive', 'SaleType', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['Functional', 'RoofMatl', 'Condition2']


In [141]:
# Drop categorical columns that will not be encoded
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)

# Apply ordinal encoder
ordinal_encoder = OrdinalEncoder()
label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
label_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])

In [142]:
label_X_train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20,3.0,11694,1.0,3.0,3.0,0.0,4.0,0.0,16.0,...,108,0,0,260,0,0,7,2007,6.0,5.0
871,20,3.0,6600,1.0,3.0,3.0,0.0,4.0,0.0,12.0,...,0,0,0,0,0,0,8,2009,8.0,4.0
93,30,3.0,13360,1.0,0.0,1.0,0.0,4.0,0.0,6.0,...,0,44,0,0,0,0,8,2009,8.0,4.0
818,20,3.0,13265,1.0,0.0,3.0,0.0,1.0,0.0,11.0,...,59,0,0,0,0,0,7,2008,8.0,4.0
303,20,3.0,13704,1.0,0.0,3.0,0.0,0.0,0.0,5.0,...,81,0,0,0,0,0,1,2006,8.0,4.0


In [143]:
print("MAE from Approach 2 (Ordinal Encoding):") 
print(model_score(label_X_train, label_X_valid, y_train, y_valid))

MAE from Approach 2 (Ordinal Encoding):
18262.22853881279


In [144]:
# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))

# Print number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])

[('Street', 2),
 ('Utilities', 2),
 ('CentralAir', 2),
 ('LandSlope', 3),
 ('PavedDrive', 3),
 ('LotShape', 4),
 ('LandContour', 4),
 ('ExterQual', 4),
 ('KitchenQual', 4),
 ('MSZoning', 5),
 ('LotConfig', 5),
 ('BldgType', 5),
 ('ExterCond', 5),
 ('HeatingQC', 5),
 ('Condition2', 6),
 ('RoofStyle', 6),
 ('Foundation', 6),
 ('Heating', 6),
 ('Functional', 6),
 ('SaleCondition', 6),
 ('RoofMatl', 7),
 ('HouseStyle', 8),
 ('Condition1', 9),
 ('SaleType', 9),
 ('Exterior1st', 15),
 ('Exterior2nd', 16),
 ('Neighborhood', 25)]

# one-hot encoding

For large datasets with many rows, one-hot encoding can greatly expand the size of the dataset. For this reason, we typically will only one-hot encode columns with relatively low cardinality. Then, high cardinality columns can either be dropped from the dataset, or we can use ordinal encoding.

In [145]:
# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]

# Columns that will be dropped from the dataset
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))

print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)

Categorical columns that will be one-hot encoded: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['Exterior1st', 'Exterior2nd', 'Neighborhood']


In [146]:
from sklearn.preprocessing import OneHotEncoder

In [147]:
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

In [151]:
print("MAE from Approach 3 (One-Hot Encoding):") 
print(model_score(OH_X_train, OH_X_valid, y_train, y_valid))

MAE from Approach 3 (One-Hot Encoding):
18221.26198630137


# Machine Learning Pipeline
1. Cleaner Code
2. Fewer bugs chances
3. More options for model validation

We can construct the full pipeline in three steps
1. Define preprocessing steps
2. Define the model
3. Create and evaluate the pipeline

In [152]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
data = pd.read_csv('melb_data.csv')

# Separate target from predictors
y = data.Price
X = data.drop(['Price'], axis=1)

# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

### Define Preprocessing Steps

In [153]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [155]:
#preprocessing for numerical data

numerical_encoder = SimpleImputer(strategy='constant')

#preprocessing for categorical data

categorical_encoder = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

#bundles both preprocessing

preprocessor = ColumnTransformer(
    transformers= [
        ("num", numerical_encoder, numerical_cols),
        ("cat", categorical_encoder, categorical_cols)
    ])



### Define the model

In [156]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=0)

### Create and evaluate the pipeline

In [157]:
from sklearn.metrics import mean_absolute_error

my_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

#preprocessing of training data
my_pipeline.fit(X_train, y_train)

#predictions
preds = my_pipeline.predict(X_valid)

score = mean_absolute_error(y_valid, preds)

print("MAE: ", score)


MAE:  160679.18917034855
