# Model building
### Get clean features

In [None]:
# Execute the EDA notebook inside this kernel so the objects it creates are available here.
# NOTE(review): presumably this is where the `cali` DataFrame used below is defined — confirm.
%run eda_lujain.ipynb

### Prerequisites

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score


## Data splitting
Split the data into training (70%) and testing (30%) sets.

In [14]:
# Feature matrix: every column of the cleaned data except the target.
X = cali.drop(columns=["median_house_value"])
X.head()

(17994, 14)

In [15]:
# Target vector: the column the models are trained to predict.
Y = cali.loc[:, "median_house_value"]
Y.head()

(17994,)

In [16]:
# Keep 70% of the rows for training; the remainder becomes the test set.
# The fixed seed makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    random_state=123,
)

## Model Training

### XGBRegressor
XGBRegressor is the scikit-learn-compatible regressor from the XGBoost library. XGBoost stands for "Extreme Gradient Boosting" and is an implementation of the gradient-boosted decision trees algorithm.

In [None]:
# Baseline gradient-boosted model with default hyperparameters.
xGBR = XGBRegressor()
xGBR.fit(X_train, y_train)

# Predictions on the held-out test features, scored in the next cell.
predicted = xGBR.predict(X_test)


### XGBRegressor prediction accuracy

In [None]:
# Root-mean-squared error of the current `predicted` values against the test targets
# (same units as the target; lower is better).
score = np.sqrt(mean_squared_error(y_test, predicted))

# A bare assignment displays nothing; end the cell with the value so the RMSE is shown.
score

### Random Forest Regressor
A random forest regressor is a meta-estimator that fits a number of decision tree regressors on various sub-samples of the dataset and averages their predictions to improve predictive accuracy and control over-fitting.

In [None]:
# Baseline random forest with default hyperparameters.
# random_state is pinned (same seed as the train/test split) because the forest's
# row/feature sampling is random; without a seed every run reports a different score.
RFR = RandomForestRegressor(random_state=123).fit(X_train, y_train)

# Predictions on the held-out test features, scored in the next cell.
predicted = RFR.predict(X_test)

### Random Forest Regressor prediction accuracy

In [None]:
# Root-mean-squared error of the current `predicted` values against the test targets
# (same units as the target; lower is better).
score = np.sqrt(mean_squared_error(y_test, predicted))

# A bare assignment displays nothing; end the cell with the value so the RMSE is shown.
score

## Hyperparameter tuning
First we standardize the numeric features and one-hot encode the categorical features of the training set.

In [None]:
# Column-wise preprocessing: standardize the numeric columns, one-hot encode the
# object (string) columns, and pass any remaining column through unchanged.
# NOTE(review): ColumnTransformer, StandardScaler, OneHotEncoder and `selector` are not
# imported in this notebook's import cell — presumably they are provided by the %run of
# the EDA notebook (`selector` looks like sklearn's make_column_selector); confirm.
preprocessor = ColumnTransformer(
    transformers=[
        ("scale", StandardScaler(), selector(dtype_include="number")),
        ("one-hot", OneHotEncoder(), selector(dtype_include="object")),
    ],
    remainder="passthrough",
)

# Fit the transformers on the training rows only, then transform those same rows.
X_train_encoded = preprocessor.fit_transform(X_train)

### XGBRegressor parameters
The most commonly configured hyperparameters are the following:
- n_estimators: The number of trees, often increased until no further improvements are seen.
- max_depth: The maximum depth of each tree, often values are between 1 and 10.
- eta: The learning rate used to weight each model, often set to small values such as 0.3, 0.1, 0.01, or smaller.
- subsample: The fraction of samples (rows) used to build each tree, set to a value between 0 and 1, often 1.0 to use all samples.
- colsample_bytree: The fraction of features (columns) used to build each tree, set to a value between 0 and 1, often 1.0 to use all features.

In [None]:
# define model
XGB_model = XGBRegressor()

# define model evaluation method: 10-fold CV repeated 3 times (30 fits in total)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=123)

# define loss function; scikit-learn scorers follow "higher is better", hence the "neg_" prefix
loss = 'neg_mean_absolute_error'

# evaluate the model on the preprocessed *training* data.
# The held-out test set must not be used during model selection, and the grid search
# below is fitted on the same X_train_encoded / y_train pair, so the scores are comparable.
scores = cross_val_score(XGB_model, X_train_encoded, y_train, scoring=loss, cv=cv, n_jobs=-1)

# get positive scores (mean absolute error per fold)
scores = abs(scores)

# display the mean and spread of the cross-validated MAE
scores.mean(), scores.std()


To find the best hyperparameter combination, we perform grid search.

In [None]:
# create grid of hyperparameter values
# NOTE: the key must be 'n_estimators' (plural). The misspelt 'n_estimator' is not an
# XGBRegressor hyperparameter, so the number of trees was never actually varied.
hyper_grid = {
  'n_estimators': (3, 5, 7, 9),
  'max_depth': (4, 5, 6, 7),
  'eta': (0.3, 0.1, 0.001)
  }

# create 10 fold CV object; shuffle with a fixed seed so the folds are reproducible
kfold = KFold(n_splits=10, random_state=123, shuffle=True)

# perform grid search over every combination in hyper_grid, scored with `loss`
# (`XGB_model` and `loss` are defined in the cross-validation cell above)
grid_search = GridSearchCV(XGB_model, hyper_grid, cv=kfold, scoring=loss)
results = grid_search.fit(X_train_encoded, y_train)

# show the best estimator
results.best_estimator_

### Random Forest Regressor parameters
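The most commonly configured hyperparameters are the following:
- n_estimators: The number of trees in the forest.
- max_depth: The maximum depth of each tree; `None` lets the trees grow until no further split is possible.
- max_features: The number of features considered when looking for the best split.
- min_samples_split: The minimum number of samples required to split an internal node.
- min_samples_leaf: The minimum number of samples required at a leaf node.
- bootstrap: Whether each tree is built on a bootstrap sample of the rows or on the whole training set.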

In [None]:
# define model; the seed makes the forest (and therefore the search result) reproducible
RFR_model = RandomForestRegressor(random_state=123)

# grid of hyperparameter values to try.
# max_features: 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for a
# regressor it meant "use all features", which is spelt 1.0 (a fraction) and is accepted
# by old and new versions alike.
hyper_grid = {
 'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': [1.0, 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

# create 5 fold CV object; shuffle with a fixed seed so the folds are reproducible
kfold = KFold(n_splits=5, random_state=123, shuffle=True)

# perform grid search, scored with `loss` (defined in the XGBRegressor cross-validation cell).
# The grid has 2 * 11 * 2 * 3 * 3 * 10 = 3960 combinations, each fitted 5 times, so the
# fits are spread over all cores (n_jobs=-1, as in the cross_val_score call above).
grid_search = GridSearchCV(RFR_model, hyper_grid, cv=kfold, scoring=loss, n_jobs=-1)
results = grid_search.fit(X_train_encoded, y_train)

# show the best estimator
results.best_estimator_

## Conclusion