In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training and test tables, using the 'Id' column as the row index of each.
df, Xtest = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in (
        '/kaggle/input/home-data-for-ml-course/train.csv',
        '/kaggle/input/home-data-for-ml-course/test.csv',
    )
)

In [None]:
# Preview only the first rows: enough to check the columns and value formats
# without rendering the whole training table in the notebook output.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features; work on a copy so `df` itself is not modified.
X = df.copy()
y = X.pop('SalePrice')

# Hold out 20% of the rows for validation. The fixed seed makes the split
# identical on every Restart & Run All, so the column selections and error
# figures computed from it are comparable between runs.
Xtrain, Xvalid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Names of the training columns that contain at least one missing value.
nul_col = Xtrain.columns[Xtrain.isnull().any()].tolist()

# Drop columns with more than half nulls

In [None]:
# Count missing values per column and flag the columns whose null count
# exceeds half the number of training rows.
null_counts = Xtrain.isnull().sum()
half_rows = Xtrain.shape[0] / 2
high_nul = null_counts.index[null_counts > half_rows].tolist()

# Remove the flagged columns from all three sets (each drop returns a new frame).
X_train, X_valid, X_test = (
    frame.drop(columns=high_nul) for frame in (Xtrain, Xvalid, Xtest)
)

# All three sets must still have the same number of columns.
assert X_train.shape[1] == X_valid.shape[1] == X_test.shape[1]

# Impute null values in numerical and categorical columns separately

In [None]:
from sklearn.impute import SimpleImputer

# Two imputers: column means for numeric features, the most frequent value
# for object (text) features.
si_num = SimpleImputer(strategy='mean')
si_obj = SimpleImputer(strategy='most_frequent')

num_cols = X_train.select_dtypes('number').columns.tolist()
obj_cols = X_train.select_dtypes('object').columns.tolist()

# Learn the fill values from the training set only ...
X_train[num_cols] = si_num.fit_transform(X_train[num_cols])
X_train[obj_cols] = si_obj.fit_transform(X_train[obj_cols])

# ... and reuse those same fill values on the validation and test sets.
for frame in (X_valid, X_test):
    frame[num_cols] = si_num.transform(frame[num_cols])
    frame[obj_cols] = si_obj.transform(frame[obj_cols])

# No missing values may remain in any of the three sets.
for frame in (X_train, X_valid, X_test):
    assert frame.isnull().sum().sum() == 0

# Drop categorical columns with more than 10 unique values

In [None]:
# Object columns with more than 10 distinct values in the training set.
unique_counts = X_train[obj_cols].nunique()
high_unq = unique_counts.index[unique_counts > 10].tolist()

# Drop them in place from every set so the three frames keep identical columns.
for frame in (X_train, X_valid, X_test):
    frame.drop(columns=high_unq, inplace=True)

# Convert the remaining categorical columns to one-hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Split the remaining features by type: object columns get one-hot encoded,
# numeric columns are carried over unchanged.
cat_col = X_train.select_dtypes('object').columns.tolist()
num_col = X_train.select_dtypes('number').columns.tolist()

# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name
# first and fall back for older installs.
# handle_unknown='ignore': categories not seen during fit encode as all zeros.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training set only; the same fitted encoder is applied to the other sets.
train_encoded = ohe.fit_transform(X_train[cat_col])

# Give the encoded columns string names ("<column>_<category>"). The default
# integer labels would be mixed with the string names of the numeric columns
# after concatenation, and scikit-learn >= 1.2 refuses to fit on a frame whose
# column names are of mixed types.
if hasattr(ohe, 'get_feature_names_out'):
    oh_names = ohe.get_feature_names_out(cat_col)
else:  # scikit-learn < 1.0
    oh_names = ohe.get_feature_names(cat_col)

# Reattach the original row index so the encoded frames align with the numeric ones.
X_train_oh = pd.DataFrame(train_encoded, index=X_train.index, columns=oh_names)
X_valid_oh = pd.DataFrame(ohe.transform(X_valid[cat_col]), index=X_valid.index, columns=oh_names)
X_test_oh = pd.DataFrame(ohe.transform(X_test[cat_col]), index=X_test.index, columns=oh_names)

# Final feature matrices: numeric columns followed by the one-hot columns.
X1_train = pd.concat([X_train[num_col], X_train_oh], axis=1)
X1_valid = pd.concat([X_valid[num_col], X_valid_oh], axis=1)
X1_test = pd.concat([X_test[num_col], X_test_oh], axis=1)

# Remove any leftover categorical columns
(a safeguard in case the one-hot encoding step was not applied)

In [None]:
# Any object-typed columns still present in the final training matrix.
cat_col = list(X1_train.select_dtypes('object').columns)
print(cat_col)

# Drop them in place from all three matrices (a no-op when the list is empty).
for frame in (X1_train, X1_valid, X1_test):
    frame.drop(columns=cat_col, inplace=True)

# Model 1: Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Baseline: a single decision tree with default hyperparameters. The seed
# fixes the tree's internal randomness so the reported error is the same on
# every run.
dt_model = DecisionTreeRegressor(random_state=0)

dt_model.fit(X1_train, y_train)
y_val_pred = dt_model.predict(X1_valid)
y_tst_pred = dt_model.predict(X1_test)

# Mean absolute error on the held-out validation rows.
val_error = abs(y_val_pred - y_valid).mean()
print(val_error)

# Model 2: Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters. The seed fixes the bootstrap
# samples and feature choices so the reported error is the same on every run.
rfr_model = RandomForestRegressor(random_state=0)

rfr_model.fit(X1_train, y_train)
y2_val_pred = rfr_model.predict(X1_valid)
y2_tst_pred = rfr_model.predict(X1_test)

# Mean absolute error on the held-out validation rows.
val2_error = abs(y2_val_pred - y_valid).mean()
print(val2_error)

# Model 3: Random Forest with explicitly set hyperparameters

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with explicit hyperparameters.
# - criterion: 'mae' was renamed to 'absolute_error' in scikit-learn 1.0 and
#   the old spelling was removed in 1.2, where it fails at fit time.
# - random_state: fixes the forest's randomness so the error is reproducible.
# - n_jobs=-1: builds trees on all cores; the absolute-error criterion is much
#   slower to train than the default, and parallelism does not change results.
# NOTE: this rebinds `rfr_model`, replacing the default-parameter forest above.
rfr_model = RandomForestRegressor(
    n_estimators=500,
    criterion='absolute_error',
    max_depth=500,
    min_samples_leaf=2,
    random_state=0,
    n_jobs=-1,
)

rfr_model.fit(X1_train, y_train)
y3_val_pred = rfr_model.predict(X1_valid)
y3_tst_pred = rfr_model.predict(X1_test)

# Mean absolute error on the held-out validation rows.
val3_error = abs(y3_val_pred - y_valid).mean()
print(val3_error)

In [None]:
# Mean absolute error of the current `rfr_model` on its own training rows,
# for comparison with the validation error (a large gap suggests overfitting).
y3_trn_pred = rfr_model.predict(X1_train)
train_residuals = y_train - y3_trn_pred
trn3_error = train_residuals.abs().mean()
print(trn3_error)