# Intermediate Machine Learning course from Kaggle

## House prices model

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the train and test CSVs, using the 'Id' column as the row index of each table.
house_data, house_test = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ('./train.csv', './test.csv')
)

In [3]:
# Columns used as predictors for the first round of models.
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

# Copies are taken so later edits do not write back into house_data / house_test.
y = house_data['SalePrice'].copy()
X = house_data.loc[:, features].copy()
X_test = house_test.loc[:, features].copy()

# Hold out 20% of the labelled rows for validation; random_state pins the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Preview the first five rows of the training predictors.
X_train.head(n=5)

Unnamed: 0_level_0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
619,11694,2007,1828,0,2,3,9
871,6600,1962,894,0,1,2,5
93,13360,1921,964,0,1,2,5
818,13265,2002,1689,0,2,3,7
303,13704,2001,1541,0,2,3,6


### Random forest models:

In [5]:
from sklearn.ensemble import RandomForestRegressor

# Hyperparameters of the five candidate forests; every forest shares random_state=0.
forest_settings = (
    {'n_estimators': 50},
    {'n_estimators': 100},
    {'n_estimators': 100, 'criterion': 'absolute_error'},
    {'n_estimators': 200, 'min_samples_split': 20},
    {'n_estimators': 100, 'max_depth': 7},
)

# Build the candidates in order and keep the individual model1..model5 names available.
models = [RandomForestRegressor(random_state=0, **settings) for settings in forest_settings]
model1, model2, model3, model4, model5 = models

In [6]:
from sklearn.metrics import mean_absolute_error

def score_model(model, Xt=None, Xv=None, yt=None, yv=None):
    """Fit `model` on a training split and return its MAE on a validation split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place, so the caller's object is trained afterwards.
    Xt, Xv, yt, yv : optional
        Training predictors, validation predictors, training target and
        validation target. Any argument left as ``None`` falls back to the
        notebook globals ``X_train``, ``X_val``, ``y_train`` and ``y_val``.

    Returns
    -------
    The mean absolute error between ``yv`` and the model's predictions for ``Xv``.
    """
    # Resolve the defaults at call time. Writing `Xt=X_train` in the signature
    # would freeze whatever frames existed when this cell ran, so later cells
    # that rebind X_train / X_val would silently be scored on stale data.
    # `is None` is used because the truth value of a DataFrame is ambiguous.
    Xt = X_train if Xt is None else Xt
    Xv = X_val if Xv is None else Xv
    yt = y_train if yt is None else yt
    yv = y_val if yv is None else yv

    model.fit(Xt, yt)
    predictions = model.predict(Xv)
    return mean_absolute_error(yv, predictions)

# Score each candidate forest; enumerate(start=1) numbers them to match
# model1..model5 without a hand-maintained counter.
for i, model in enumerate(models, start=1):
    mae = score_model(model)
    print(f'Model {i} MAE: {mae}')

Model 1 MAE: 24015.492818003917
Model 2 MAE: 23740.979228636657
Model 3 MAE: 23528.78421232877
Model 4 MAE: 23996.676789668687
Model 5 MAE: 23706.672864217904


In [13]:
# Per-column number of missing entries in the raw training table.
missing_val_count = house_data.isna().sum()
# Names of the columns that have at least one missing entry.
missing_val_count.loc[missing_val_count.gt(0)].index.tolist()

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [14]:
# Table dimensions first, then the missing-entry count of every affected column.
print(house_data.shape)
missing_val_count = house_data.isna().sum()
missing_val_count.loc[lambda counts: counts > 0]

(1460, 80)


LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [16]:
# All columns of the training table except the SalePrice target.
X_full = house_data.drop(columns=['SalePrice'])

In [20]:
# Drop every object-dtype column from both the labelled and the test predictors.
X = X_full.select_dtypes(exclude='object')
X_test = house_test.select_dtypes(exclude='object')

# Redo the 80/20 split on the wider predictor set; y is the SalePrice target
# created in an earlier cell, and the same seed is reused.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [27]:
# Dimensions of the training split, followed by its columns that contain missing entries.
print(X_train.shape)

missing_val_count = X_train.isna().sum()
missing_val_count.loc[missing_val_count.gt(0)]

(1168, 36)


LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64

In [51]:
# Columns of the numeric predictor set that contain missing values in the training split.
cols_to_impute = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

# Fill values are the rounded column means of the TRAINING split only. The
# validation rows reuse them so both splits receive identical preprocessing and
# no statistic computed from the validation data leaks into it.
fill_values = X_train[cols_to_impute].mean().round()

# Frame-level fillna returns new frames instead of mutating a column selection.
# `X_train[col].fillna(..., inplace=True)` is chained assignment on a frame
# derived from train_test_split: it raises chained-assignment warnings and has
# no effect at all when pandas Copy-on-Write is active.
X_train = X_train.fillna(fill_values)
X_val = X_val.fillna(fill_values)

In [42]:
# Re-count the missing entries of the training split; columns with none are filtered out.
missing_val_count = X_train.isna().sum()
missing_val_count.loc[lambda counts: counts > 0]

Series([], dtype: int64)

In [53]:
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Return the validation MAE of a 100-tree random forest.

    A fresh RandomForestRegressor (n_estimators=100, random_state=0) is fitted
    on (X_train, y_train); its predictions for X_valid are compared with
    y_valid using mean_absolute_error.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

# Validation MAE of the forest trained on the numeric predictor set.
score_dataset(X_train=X_train, X_valid=X_val, y_train=y_train, y_valid=y_val)

17886.26234018265

In [16]:
# Count the missing entries per column of the test predictors and show the non-zero ones.
missing_val_count = X_test.isna().sum()
missing_val_count.loc[missing_val_count.gt(0)]

Series([], dtype: int64)

In [19]:
# Only the X_train / X_val splits were imputed; the full labelled frame X and
# the test frame X_test may still hold NaNs. Fill both with the rounded column
# means of the labelled data so the test rows are preprocessed exactly like the
# rows the model is trained on. When a frame has no NaNs this is a no-op.
final_fill_values = X.mean(numeric_only=True).round()
final_X = X.fillna(final_fill_values)
final_X_test = X_test.fillna(final_fill_values)

# Refit on every labelled row, then predict the unlabelled test rows.
final_model = RandomForestRegressor(n_estimators=100, random_state=0)
final_model.fit(final_X, y)
preds_test = final_model.predict(final_X_test)

In [20]:
# `y` holds the targets of the training rows, while `preds_test` holds
# predictions for the separate, unlabelled test rows, so an MAE between the two
# is undefined (the lengths do not even match). Pair each prediction with its
# test-row Id instead, and preview the resulting table.
submission = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
submission.head()