In [1]:
import pandas as pd

In [2]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Read the housing data from the CSV next to the notebook.
melbourne = pd.read_csv('melb_data.csv')

# Target column and the five predictor columns used for the first models.
y = melbourne['Price']
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
x = melbourne[melbourne_features]

# Seeded split (random_state=0) so the reported errors are reproducible.
train_x, test_x, train_y, test_y = train_test_split(x, y, random_state=0)

In [3]:
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_x, test_x, train_y, test_y):
    """Return the hold-out mean absolute error of a size-limited decision tree.

    A ``DecisionTreeRegressor`` capped at ``max_leaf_nodes`` leaves (seeded with
    ``random_state=0``) is fitted on ``train_x`` / ``train_y``; its predictions
    for ``test_x`` are then compared against ``test_y``.

    Parameters
    ----------
    max_leaf_nodes : value forwarded to ``DecisionTreeRegressor(max_leaf_nodes=...)``.
    train_x, train_y : features and targets used for fitting.
    test_x, test_y : features and targets used for scoring.

    Returns
    -------
    The value returned by ``mean_absolute_error(test_y, predictions)``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_x, train_y)
    return mean_absolute_error(test_y, tree.predict(test_x))

In [4]:
# Try tree sizes from very small to very large and print the hold-out MAE of each.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(
        max_leaf_nodes,
        train_x=train_x,
        test_x=test_x,
        train_y=train_y,
        test_y=test_y,
    )
    print(f"Max leaf nodes: {max_leaf_nodes} \t\t Mean Absolute Error: {my_mae}")

Max leaf nodes: 5 		 Mean Absolute Error: 354662.43726564094
Max leaf nodes: 50 		 Mean Absolute Error: 266447.02204624057
Max leaf nodes: 500 		 Mean Absolute Error: 231301.17567588817
Max leaf nodes: 5000 		 Mean Absolute Error: 248846.07236606552


In [5]:
from sklearn.ensemble import RandomForestRegressor
# Fit a seeded random forest on the training split (fit() returns the estimator),
# then print its mean absolute error on the held-out split.
forest_model = RandomForestRegressor(random_state=1).fit(train_x, train_y)
melb_preds = forest_model.predict(test_x)
print(mean_absolute_error(test_y, melb_preds))

180860.37877504269


In [6]:
# Re-read the CSV and print its column summary.
melbourne = pd.read_csv('melb_data.csv')
melbourne.info()

# Target is Price; predictors are every remaining column whose dtype is not object.
y = melbourne['Price']
melb_predictors = melbourne.drop(columns=['Price'])
x = melb_predictors.select_dtypes(exclude=['object'])

# Seeded 80/20 split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, test_size=0.2, random_state=0
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [7]:
def score_dataset(x_train, x_test, y_train, y_test, n_estimators=10, random_state=1):
    """Fit a random forest and return its mean absolute error on a hold-out set.

    Parameters
    ----------
    x_train, y_train : features and targets used to fit the model.
    x_test, y_test : features and targets used to score the model.
    n_estimators : forwarded to ``RandomForestRegressor``; defaults to 10, the
        value this function previously hard-coded.
    random_state : forwarded to ``RandomForestRegressor``; defaults to 1, the
        value this function previously hard-coded.

    Returns
    -------
    The value returned by ``mean_absolute_error(y_test, predictions)``.
    """
    # The hyperparameters are keyword arguments so a different forest size or
    # seed can be scored without redefining this function.
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    return mean_absolute_error(y_test, preds)

In [8]:
# Names of the training columns that contain at least one missing value.
cols_missing = x_train.columns[x_train.isna().any().to_numpy()].tolist()

In [9]:
# Approach 1: remove every column that has gaps in the training split,
# applying the same column list to the test split.
reduced_x_train = x_train.drop(columns=cols_missing)
reduced_x_test = x_test.drop(columns=cols_missing)

In [10]:
# MAE after dropping the columns that contain missing values.
score_dataset(
    x_train=reduced_x_train,
    x_test=reduced_x_test,
    y_train=y_train,
    y_test=y_test,
)

183207.24410372396

In [11]:
from sklearn.impute import SimpleImputer
# Approach 2: fill missing values with a default-configured SimpleImputer. The
# imputer is fitted on the training split only and re-applied to the test split.
my_imputer = SimpleImputer()

# The imputer returns bare arrays, so pass the original column labels and row
# index back in; otherwise the frames end up with positional 0..n-1 labels.
# NOTE(review): assumes no training column is entirely NaN (such a column would
# be dropped by the imputer and the column count would no longer match).
imputed_x_train = pd.DataFrame(
    my_imputer.fit_transform(x_train),
    columns=x_train.columns,
    index=x_train.index,
)
imputed_x_test = pd.DataFrame(
    my_imputer.transform(x_test),
    columns=x_test.columns,
    index=x_test.index,
)

score_dataset(imputed_x_train, imputed_x_test, y_train, y_test)

178224.83037029245

In [12]:
# Work on copies so the indicator columns are not added to x_train / x_test.
x_train_plus = x_train.copy()
x_test_plus = x_test.copy()

# For each column with gaps in the training split, add a boolean column
# marking the rows where the value was missing.
for col in cols_missing:
    flag = col + '_was_missing'
    x_train_plus[flag] = x_train_plus[col].isna()
    x_test_plus[flag] = x_test_plus[col].isna()

In [13]:
# Approach 3: impute the frames that also carry the "*_was_missing" indicators.
# Fit on the training split only, then re-apply to the test split.
my_imputer = SimpleImputer()

# The imputer returns bare arrays, so pass the original column labels and row
# index back in; otherwise the frames end up with positional 0..n-1 labels.
# NOTE(review): assumes no training column is entirely NaN (such a column would
# be dropped by the imputer and the column count would no longer match).
imputed_x_train_plus = pd.DataFrame(
    my_imputer.fit_transform(x_train_plus),
    columns=x_train_plus.columns,
    index=x_train_plus.index,
)
imputed_x_test_plus = pd.DataFrame(
    my_imputer.transform(x_test_plus),
    columns=x_test_plus.columns,
    index=x_test_plus.index,
)

In [14]:
# MAE for imputation combined with the missing-value indicator columns.
score_dataset(
    x_train=imputed_x_train_plus,
    x_test=imputed_x_test_plus,
    y_train=y_train,
    y_test=y_test,
)

178950.41952275755

In [15]:
# Re-read the CSV; this time every non-target column is kept as a predictor.
melbourne = pd.read_csv('melb_data.csv')
y = melbourne['Price']
x = melbourne.drop(columns=['Price'])

# Seeded 80/20 split.
x_train_full, x_test_full, y_train, y_test = train_test_split(
    x, y, train_size=0.8, test_size=0.2, random_state=0
)

In [16]:
# Names of the columns in the full training split that contain missing values.
cols_missing = x_train_full.columns[x_train_full.isna().any().to_numpy()].tolist()

In [17]:
# Remove the columns with missing values by rebinding rather than mutating in
# place. errors='ignore' keeps the cell re-runnable: the previous in-place drop
# raised KeyError on a second run because the columns were already gone.
x_train_full = x_train_full.drop(columns=cols_missing, errors='ignore')

In [18]:
# Remove the same columns from the test split by rebinding rather than mutating
# in place. errors='ignore' keeps the cell re-runnable: the previous in-place
# drop raised KeyError on a second run because the columns were already gone.
x_test_full = x_test_full.drop(columns=cols_missing, errors='ignore')

In [19]:
# Object-dtype columns with fewer than 10 distinct values in the training split.
low_cardinality_cols = [
    cname
    for cname, values in x_train_full.items()
    if values.nunique() < 10 and values.dtype == 'object'
]

In [20]:
# Columns of the training split whose dtype is int64 or float64.
numerical_cols = [
    cname
    for cname, values in x_train_full.items()
    if values.dtype in ('int64', 'float64')
]

In [21]:
# Keep the low-cardinality categorical columns followed by the numeric ones.
# Copies are taken so later edits do not touch the *_full frames.
my_cols = [*low_cardinality_cols, *numerical_cols]
x_train = x_train_full.loc[:, my_cols].copy()
x_test = x_test_full.loc[:, my_cols].copy()

In [22]:
# Print a summary of the selected training columns (dtypes and non-null counts).
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10864 entries, 12167 to 2732
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Type           10864 non-null  object 
 1   Method         10864 non-null  object 
 2   Regionname     10864 non-null  object 
 3   Rooms          10864 non-null  int64  
 4   Distance       10864 non-null  float64
 5   Postcode       10864 non-null  float64
 6   Bedroom2       10864 non-null  float64
 7   Bathroom       10864 non-null  float64
 8   Landsize       10864 non-null  float64
 9   Lattitude      10864 non-null  float64
 10  Longtitude     10864 non-null  float64
 11  Propertycount  10864 non-null  float64
dtypes: float64(8), int64(1), object(3)
memory usage: 1.1+ MB


In [23]:
# Boolean mask over x_train's columns: True where the dtype is object.
s = x_train.dtypes == 'object'
print(s)

# Collect the labels where the mask is True and display them.
object_cols = s[s].index.tolist()
object_cols

Type              True
Method            True
Regionname        True
Rooms            False
Distance         False
Postcode         False
Bedroom2         False
Bathroom         False
Landsize         False
Lattitude        False
Longtitude       False
Propertycount    False
dtype: bool


['Type', 'Method', 'Regionname']

In [24]:
def score_dataset(x_train, x_test, y_train, y_test, n_estimators=100, random_state=0):
    """Fit a random forest and return its mean absolute error on a hold-out set.

    Parameters
    ----------
    x_train, y_train : features and targets used to fit the model.
    x_test, y_test : features and targets used to score the model.
    n_estimators : forwarded to ``RandomForestRegressor``; defaults to 100, the
        value this function previously hard-coded.
    random_state : forwarded to ``RandomForestRegressor``; defaults to 0, the
        value this function previously hard-coded.

    Returns
    -------
    The value returned by ``mean_absolute_error(y_test, predictions)``.
    """
    # The hyperparameters are keyword arguments so a different forest size or
    # seed can be scored without redefining this function.
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    return mean_absolute_error(y_test, preds)

In [25]:
# Categorical approach 1: keep only the columns whose dtype is not object.
drop_x_train = x_train.select_dtypes(exclude='object')
drop_x_test = x_test.select_dtypes(exclude='object')

In [26]:
# MAE with the object-dtype columns removed.
score_dataset(
    x_train=drop_x_train,
    x_test=drop_x_test,
    y_train=y_train,
    y_test=y_test,
)

175703.48185157913

In [29]:
from sklearn.preprocessing import OrdinalEncoder
# Categorical approach 2: replace each object column with ordinal codes.
# The encoder is fitted on the training split and re-applied to the test split.
ordinal_encoder = OrdinalEncoder()

# Encode into copies so x_train / x_test keep their original values.
label_x_train = x_train.copy()
label_x_train[object_cols] = ordinal_encoder.fit_transform(x_train[object_cols])

label_x_test = x_test.copy()
label_x_test[object_cols] = ordinal_encoder.transform(x_test[object_cols])

In [30]:
# MAE with the object columns ordinal-encoded.
score_dataset(
    x_train=label_x_train,
    x_test=label_x_test,
    y_train=y_train,
    y_test=y_test,
)

165936.40548390493

In [37]:
from sklearn.preprocessing import OneHotEncoder
# Categorical approach 3: one-hot encode the object columns.
# Fit on the raw categorical values in x_train / x_test rather than on the
# ordinal-encoded label_x_* frames: this cell then no longer depends on the
# ordinal-encoding experiment, and the encoder's learned categories are the
# original labels instead of numeric codes.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(x_train[object_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(x_test[object_cols]))

In [38]:
# The one-hot frames were built from bare arrays, so give them the row index of
# the frames they were derived from.
OH_cols_train = OH_cols_train.set_axis(x_train.index, axis=0)
OH_cols_test = OH_cols_test.set_axis(x_test.index, axis=0)

In [39]:
# Everything except the object columns (these get replaced by the one-hot columns).
num_x_train = x_train.drop(columns=object_cols)
num_x_test = x_test.drop(columns=object_cols)

In [41]:
# Place the one-hot columns to the right of the remaining columns.
OH_x_train = pd.concat((num_x_train, OH_cols_train), axis='columns')
OH_x_test = pd.concat((num_x_test, OH_cols_test), axis='columns')

In [42]:
# Convert every column label to str (the one-hot columns carry integer labels).
OH_x_train = OH_x_train.rename(columns=str)
OH_x_test = OH_x_test.rename(columns=str)

In [43]:
# MAE with the object columns one-hot encoded.
score_dataset(
    x_train=OH_x_train,
    x_test=OH_x_test,
    y_train=y_train,
    y_test=y_test,
)

165564.96689599552