# Model building
### Source data after feature engineering

In [None]:
# Run the EDA / feature-engineering notebook before anything else.
# NOTE(review): the cells below read `cali` without defining it in this notebook,
# so it is presumably created by the notebook executed here — confirm.
%run eda_lujain.ipynb

### Prerequisites

In [17]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor


  from pandas import MultiIndex, Int64Index


## Data splitting
Split the data into training and testing

In [14]:
# feature matrix: every column of the prepared data except the target
X = cali.drop(columns=["median_house_value"])
X.head()

(17994, 14)

In [15]:
# regression target: the median house value column
Y = cali.loc[:, "median_house_value"]
Y.head()

(17994,)

In [16]:
# 70% of the rows go to training, the rest are held out for testing;
# the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    random_state=123,
)

Since we have a categorical feature in our dataset, we must encode it first before we start the training

In [20]:
# scale the numeric features and one-hot encode the categorical ones

# Each transformer is applied to its own subset of columns and the features it
# generates are concatenated into a single feature space; columns matched by
# neither selector are passed through unchanged.
preprocessor = ColumnTransformer(
  remainder="passthrough",
  transformers=[
    ("scale", StandardScaler(), selector(dtype_include="number")),
    # handle_unknown="ignore": a category that never occurs in the training split
    # is encoded as all zeros instead of raising when the test split is transformed
    ("one-hot", OneHotEncoder(handle_unknown="ignore"), selector(dtype_include="object"))
  ])

# fit the preprocessing on the training split only, then apply it to both splits
# NOTE: X_train / X_test are rebound to the transformed output here, so re-run the
# split cell before re-running this one (otherwise already-transformed data is fed back in)
encoder = preprocessor.fit(X_train)
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

## Model Training
In this section we're going to test two models, XGBRegressor and Random Forest Regressor. We'll train, tune, calculate the accuracy and finally compare the two predictors

### XGBRegressor
XGBRegressor is part of the XGBoost library, which stands for "Extreme Gradient Boosting"; it is an implementation of the gradient-boosted trees algorithm.

In [21]:
# baseline XGBoost regressor with default hyperparameters
xGBR = XGBRegressor()
xGBR.fit(X_train, y_train)

# predictions for the held-out test set
predicted = xGBR.predict(X_test)

### XGBRegressor prediction accuracy

In [24]:
# root mean squared error of the XGBRegressor predictions on the test set
xgbr_mse = mean_squared_error(y_test, predicted)
XGBR_score = np.sqrt(xgbr_mse)
XGBR_score

47703.19572423537

### Random Forest Regressor
A random forest regressor is a meta estimator that fits a number of decision tree regressors on various sub-samples of the dataset and averages their predictions to improve the predictive accuracy and control over-fitting.

In [25]:
# baseline random forest with default hyperparameters; the fixed seed makes the
# fitted forest (and therefore the reported score) reproducible across runs
RFR = RandomForestRegressor(random_state=123).fit(X_train, y_train)

# predictions for the held-out test set
predicted = RFR.predict(X_test)

### Random Forest Regressor prediction accuracy

In [None]:
# root mean squared error of the random forest predictions on the test set
rfr_mse = mean_squared_error(y_test, predicted)
RFR_score = np.sqrt(rfr_mse)
RFR_score

## Hyperparameter tuning

### XGBRegressor parameters
The most commonly configured hyperparameters are the following:
- n_estimators: The number of trees, often increased until no further improvements are seen.
- max_depth: The maximum depth of each tree, often values are between 1 and 10.
- eta: The learning rate used to weight each model, often set to small values such as 0.3, 0.1, 0.01, or smaller.
- subsample: The fraction of samples (rows) used in each tree, set to a value between 0 and 1, often 1.0 to use all samples.
- colsample_bytree: The fraction of features (columns) used in each tree, set to a value between 0 and 1, often 1.0 to use all features.

In [None]:
# NOTE(review): disabled experiment, kept for reference only — every statement in
# this cell is commented out, so nothing runs. If it is re-enabled it also needs
# RepeatedKFold and cross_val_score, which the import cell does not import.

# define model
#XGB_model = XGBRegressor()

# define model evaluation method
#cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=123)

# define loss function
#loss = 'neg_mean_absolute_error'

# evaluate the model
#scores = cross_val_score(XGB_model, X, Y, scoring=loss, cv=cv, n_jobs=-1)

# get positive scores
#scores = abs(scores)


To find the best hyperparameter combination, we perform a cross-validated randomized search over a grid of candidate values.

In [None]:
# define model
XGB_model = XGBRegressor()

# candidate hyperparameter values
# (the key must be 'n_estimators'; the misspelt 'n_estimator' is not the
# tree-count parameter, so the number of trees was never actually tuned)
# NOTE(review): so few trees combined with eta=0.001 will likely underfit —
# consider a wider n_estimators range
hyper_grid = {
  'n_estimators': [3, 5, 7, 9],
  'max_depth': [4, 5, 6, 7],
  'eta': [0.3, 0.1, 0.001]
  }

# number of distinct combinations in the grid (4 * 4 * 3 = 48); asking the
# search for more iterations than that only triggers a warning
n_combinations = int(np.prod([len(values) for values in hyper_grid.values()]))

# create 10 fold CV object
kfold = KFold(n_splits=10, random_state=123, shuffle=True)

# perform random search
# - scoring: negative RMSE, the same metric used to evaluate the models on the test set
# - random_state: makes the sampled combinations reproducible
random_search = RandomizedSearchCV(estimator=XGB_model, param_distributions=hyper_grid,
                n_iter=min(100, n_combinations), cv=kfold,
                scoring="neg_root_mean_squared_error",
                random_state=123, n_jobs=-1)
random_search.fit(X_train, y_train)

### Show best estimator

In [None]:
# hyperparameter combination that scored best in the XGBRegressor search above
random_search.best_params_

### Use best estimator

In [None]:
# build a fresh XGBRegressor from the best combination found by the search
# and train it on the full training split
best_xgb_model = XGBRegressor(**random_search.best_params_)
best_xgb_model.fit(X_train, y_train)

### Evaluate

In [None]:
# predict on the held-out test set with the tuned model
y_pred = best_xgb_model.predict(X_test)

# RMSE on the same test set and with the same metric as the baseline models,
# so the tuned and untuned scores can be compared directly
best_XGBR_score = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Tuned XGBRegressor test RMSE: {best_XGBR_score:,.2f}")

# first few actual vs. predicted values, side by side
pd.DataFrame({'actual': y_test, 'predicted': y_pred}).head()

### Random Forest Regressor parameters
The most commonly configured hyperparameters are the following:
- n_estimators: number of trees in the random forest
- max_features: number of features in consideration at every split
- max_depth: maximum number of levels allowed in each decision tree
- min_samples_split: minimum sample number to split a node
- min_samples_leaf: minimum number of samples required at a leaf node
- bootstrap: method used to sample data points

In [None]:
# define model (seeded so that every candidate forest is fitted reproducibly)
RFR_model = RandomForestRegressor(random_state=123)

# candidate hyperparameter values
hyper_grid = {
 'bootstrap': [True, False],
 'max_depth': [10, 20, 30],
 # 1.0 = consider all features at every split; this is what the 'auto' option
 # meant for regressors before it was removed from scikit-learn
 'max_features': [1.0, 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [5, 20, 50, 100]
}

# create 5 fold CV object
kfold = KFold(n_splits=5, random_state=123, shuffle=True)

# perform random search over 100 sampled combinations
# - scoring: negative RMSE, the same metric used to evaluate the models on the test set
# - random_state: makes the sampled combinations reproducible
random_search = RandomizedSearchCV(estimator=RFR_model, param_distributions=hyper_grid,
                n_iter=100, cv=kfold,
                scoring="neg_root_mean_squared_error",
                random_state=123, n_jobs=-1)
random_search.fit(X_train, y_train)

### Show best estimator

In [None]:
# hyperparameter combination that scored best in the random forest search above
# (`random_search` was rebound to the RandomForestRegressor search in the previous cell)
random_search.best_params_

### Use best estimator

In [None]:
# build a fresh random forest from the best combination found by the search
# and train it on the full training split
best_RFR_model = RandomForestRegressor(**random_search.best_params_)
best_RFR_model.fit(X_train, y_train)

### Evaluate

In [None]:
# predict on the held-out test set with the tuned model
y_pred = best_RFR_model.predict(X_test)

# RMSE on the same test set and with the same metric as the baseline models,
# so the tuned and untuned scores can be compared directly
best_RFR_score = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Tuned RandomForestRegressor test RMSE: {best_RFR_score:,.2f}")

# first few actual vs. predicted values, side by side
pd.DataFrame({'actual': y_test, 'predicted': y_pred}).head()

## Conclusion