In [1]:
import pandas as pd

# Load the Melbourne housing dataset from the CSV file in the working directory.
data = pd.read_csv('melb_data.csv')

In [2]:
# Show the first two rows as a quick sanity check of the load.
data.head(n=2)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0


In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Separate the feature columns from the prediction target (the Price column).
X = data.drop(columns=['Price'])
y = data['Price']

In [5]:
# Divide the data into training (80%) and validation (20%) sets.
# random_state is fixed so the split is the same on every run.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

1. Approach - dropping missing columns

In [10]:
# Columns that contain at least one missing value in the training split.
# Only the training split is inspected here.
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]

# Drop those columns from both splits. The names are rebound to the frames
# returned by drop() rather than using inplace=True: modifying the objects
# returned by train_test_split in place can trigger pandas'
# SettingWithCopyWarning and makes the cell harder to reason about on re-run.
X_train_full = X_train_full.drop(columns=cols_with_missing)
X_valid_full = X_valid_full.drop(columns=cols_with_missing)

* "Cardinality" means the number of unique values in a column
* Select categorical columns with relatively low cardinality (convenient but arbitrary)

In [11]:
# Object-dtype columns with fewer than 10 distinct values in the training split.
low_cardinality_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].dtype == "object" and X_train_full[cname].nunique() < 10
]

We took all columns that have fewer than 10 distinct values (the nunique() function returns the number of distinct observations)
and whose dtype is object.

In [12]:
# now we want to find numerical columns

In [13]:
# Names of the columns whose dtype is int64 or float64.
numerical_cols = [
    name
    for name, dtype in X_train_full.dtypes.items()
    if dtype in ['int64', 'float64']
]

In [14]:
# now we want to keep only selected columns

In [15]:
# Selected columns: the low-cardinality categorical ones followed by the numerical ones.
cols = [*low_cardinality_cols, *numerical_cols]

# Copy the selection so later edits do not touch the *_full frames.
X_train = X_train_full.loc[:, cols].copy()
X_valid = X_valid_full.loc[:, cols].copy()

In [16]:
# now we want to see train set

In [17]:
# Show the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


Now we have the categorical variables in the training data. We can list the object-type columns with the following code:

In [19]:
# Boolean Series indexed by column name: True where the column dtype is object.
s = X_train.dtypes == 'object'
# Store the names of the object-type columns as a list.
object_cols = s[s].index.tolist()

print('Categorical variables: ', object_cols, sep='\n')

Categorical variables: 
['Type', 'Method', 'Regionname']


Now we want to define a score_dataset function that measures the mean absolute error (MAE).

We want this error to be as low as possible.

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training data and return its validation MAE.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_valid, y_valid
        Features and target used to evaluate the fitted model.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the model's predictions
    for ``X_valid``.
    """
    # random_state is fixed so repeated calls on the same data are comparable.
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

# 1. Approach - drop categorical var

In [21]:
# Keep every column except the object-dtype (categorical) ones.
drop_X_train = X_train.select_dtypes(exclude='object')
drop_X_valid = X_valid.select_dtypes(exclude='object')

In [22]:
# Report the validation MAE when the categorical columns are simply removed.
print(
    'MAE for Dropping categorical var: ',
    score_dataset(drop_X_train, drop_X_valid, y_train, y_valid),
    sep='\n',
)

MAE for Dropping categorical var: 
175703.48185157913


# 2. Approach - Label Encoding

In [23]:
from sklearn.preprocessing import LabelEncoder

In [24]:
# make copies of the data because we don't want to change the original frames

In [25]:
# Independent copies that will receive the label-encoded columns.
label_X_train = X_train.copy(deep=True)
label_X_valid = X_valid.copy(deep=True)

In [26]:
# A single encoder instance; it is refit for every column it is applied to.
label_encoder = LabelEncoder()

In [28]:
# For each object-type column, learn the label mapping from the training values
# and apply that same mapping to both the training and validation values.
for col in object_cols:
    label_encoder.fit(X_train[col])
    label_X_train[col] = label_encoder.transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])

In [29]:
# Report the validation MAE for the label-encoded features.
print(
    'MAE for Label Encoding: ',
    score_dataset(label_X_train, label_X_valid, y_train, y_valid),
    sep='\n',
)

MAE for Label Encoding: 
165936.40548390493


In [30]:
# if we look at label_X_train and X_train we will see what label encoding did

In [31]:
# First two rows after label encoding.
label_X_train.head(n=2)

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,2,1,5,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,0,2,6,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0


In [32]:
# The same two rows before encoding, for comparison.
X_train.head(n=2)

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0


# 3. Approach - One-Hot Encoding

We use the OneHotEncoder class from scikit-learn to get one-hot encodings. There are a number of parameters that can be used to customize its behavior.

* We set handle_unknown='ignore' to avoid errors when the validation data contains classes that aren't represented in the training data, and
* setting sparse_output=False (the parameter was named sparse in scikit-learn versions before 1.2) ensures that the encoded columns are returned as a numpy array (instead of a sparse matrix).

To use the encoder, we supply only the categorical columns that we want to be one-hot encoded. For instance, to encode the training data, we supply X_train[object_cols]. (object_cols in the code cell below is a list of the column names with categorical data, and so X_train[object_cols] contains all of the categorical data in the training set.)

In [33]:
from sklearn.preprocessing import OneHotEncoder


In [34]:
# Encoder that returns a dense array and ignores categories not seen during fit.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and later
# removed the old name, so try the new keyword first and fall back to the old
# one on installations that do not know it yet.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [35]:
# Fit the encoder on the training categories only, then reuse it for validation.
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# Give the encoded columns string names of the form "<column>_<category>".
# Without this they are labelled 0, 1, 2, ...; combining them with the
# string-named numerical columns yields mixed int/str column labels, which
# recent scikit-learn versions reject when fitting a model.
# categories_ lists the categories per input column in output-column order.
OH_feature_names = [
    str(name) + '_' + str(category)
    for name, categories in zip(object_cols, OH_encoder.categories_)
    for category in categories
]
OH_cols_train.columns = OH_feature_names
OH_cols_valid.columns = OH_feature_names

In [36]:
# the one-hot encoder returned plain arrays, so the original row index was lost; now we need to put it back

In [37]:
# Re-attach the original row labels so the encoded frames align with X_train / X_valid.
OH_cols_valid.index = X_valid.index
OH_cols_train.index = X_train.index

In [39]:
# Build frames without the categorical columns; the one-hot encoded columns
# are appended to these later.
num_X_train = X_train.drop(columns=object_cols)
num_X_valid = X_valid.drop(columns=object_cols)

In [40]:
# now we are adding encoded columns to data

In [41]:
# Place the encoded columns to the right of the numerical ones (rows align on the index).
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis='columns')
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis='columns')

In [42]:
# Report the validation MAE for the one-hot encoded features.
print(
    'MAE for OH Encoder: ',
    score_dataset(OH_X_train, OH_X_valid, y_train, y_valid),
    sep='\n',
)

MAE for OH Encoder: 
166089.4893009678


We can see that in this case the first approach (dropping the categorical columns) is the worst,
while the other two approaches give very similar results.