# Goal: 
Build a classification model that predicts whether the median house value of a California district is above or below the overall median.

# Import Packages and load dataset

In [0]:
%python
%pip install hyperopt

In [0]:
%python
%restart_python

In [0]:
import numpy as np

from sklearn.datasets import fetch_california_housing
#Evaluate the performance of ML model
from sklearn.model_selection import cross_val_score 

#Classification algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import mlflow

from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK, Trials


In [0]:
# Load the California housing dataset and split it into the predictor
# matrix (X) and the numeric target vector (y).
california_housing = fetch_california_housing()
X, y = california_housing.data, california_housing.target

In [0]:
# Display the predictor matrix taken from the dataset's `data` attribute.
print(X)

In [0]:
# Display the target vector taken from the dataset's `target` attribute.
print(y)

**Dataset Information**
Number of Instances: 20640

**Number of Attributes:** 8 numeric, predictive attributes and the target

**Attribute Information:**

MedInc median income in block

HouseAge median house age in block

AveRooms average number of rooms

AveBedrms average number of bedrooms

Population block population

AveOccup average house occupancy

Latitude house block latitude

Longitude house block longitude

Missing Attribute Values: None



**_The target variable is the median house value for California districts._**

# Feature Engineering

## Scale the features / predictor values

**Why we need to do scaling?**

The predictor columns are median income, house age, average number of rooms in a house, average number of bedrooms, block population, average house occupancy, latitude, and longitude. The ranges of these predictors vary significantly: block population is in the thousands, but the average number of rooms in a house is around 5. To prevent the predictors with large values from dominating the calculations, it's a good idea to normalize the predictor values so they are all on the same scale. To do this, you can use the scikit-learn class StandardScaler.

In [0]:
#Before scaling: mean of each predictor column (axis=0 averages over the rows)
X.mean(axis=0)

In [0]:
#Apply scaling
from sklearn.preprocessing import StandardScaler 
#Create an instance (scaler) of the StandardScaler class
scaler = StandardScaler() 
#Apply the scaler to the data (predictor values)
#Here X contains predictor values and y contains target. So, apply scaling on X
#Note: X is overwritten with the scaled values, so the unscaled predictors
#are no longer available under this name after this cell runs
X = scaler.fit_transform(X)

In [0]:
#After scaling: the mean of each predictor column should now be close to 0
#(compare with the "before scaling" output above)
X.mean(axis=0)

## Convert Numeric target column to discrete column 

The target value in this dataset is the value of the house, a continuous or numeric value. This notebook illustrates the use of classification functions, so the first step is to convert the target value to a categorical value. The next cell converts the original target values into two discrete levels: 0 if the value of the house is below the median, or 1 if the value of the house is above the median. 

In classification problems the target variable will be **_categorical_** (Yes or No)

In [0]:
print(y) #the target is a continuous (numeric) value at this point

In [0]:
#Convert the numeric target into two discrete classes:
#  0 -> house value below the median of y
#  1 -> house value at or above the median of y
#The median (not the mean) is the threshold, matching the description above
#and giving two classes of roughly equal size.
y_discrete = np.where(y < np.median(y), 0, 1)
print(y_discrete)
#Now the target has only 2 values: 0 and 1.

# Hyperopt workflow

## 1. Define function to minimize

In [0]:
def objective(param):
    classifier_type = param['classifier_type']
    del param['classifier_type']
    #Build Classification models
    if classifier_type == 'rf':
        clf = RandomForestRegressor(**param)
    elif classifier_type == 'lr':
        clf = LinearRegression(**param)
    elif classifier_type == 'svm':
        clf = SVC(**param)
    else:
        return 0
    #Use Cross Validation to estimate the performance of the model
    accuracy = cross_val_score(clf, X, y_discrete).mean()

    #fmin function returns a dictionary
    return {'loss': -accuracy, 'status': STATUS_OK}
    

## 2. Define search space over hyperparameter

In [0]:
#We are giving choice, to select best performing model
search_space = hp.choice('classifier_type', [
    {
        'classifier_type': 'rf',
        'max_depth': hp.randint('max_depth',10),
        'criterion': hp.choice('criterion', ['squared_error', 'friedman_mse']),
    },
    {
        'classifier_type': 'logreg',
        'C': hp.lognormal('LR_C', 0, 1.0),
        'solver': hp.choice('solver', ['liblinear', 'lbfgs'])
    },
    {
        'classifier_type': 'svm',
        'C': hp.lognormal('SVM_C',0,1),
        'kernel': hp.choice('kernel', ['linear', 'poly']),
    }
])

## 3. Select Search algorithm

The two main choices are:

**hyperopt.tpe.suggest:** Tree of Parzen Estimators, a Bayesian approach that iteratively and adaptively selects new hyperparameter settings to explore based on previous results

**hyperopt.rand.suggest:** Random search, a non-adaptive approach that samples over the search space

In [0]:
#Use the Tree of Parzen Estimators (TPE) search algorithm described above
algo = tpe.suggest

## 4. Run the tuning algorithm with Hyperopt fmin()

SparkTrials takes 2 optional arguments:

**parallelism:** Number of models to fit and evaluate concurrently. The default is the number of available Spark task slots.

**timeout:** Maximum time (in seconds) that fmin() can run. The default is no maximum time limit.

In [0]:
%python
#Not supported in this cluster
#NOTE(review): spark_trials is created here but is not passed to fmin() in the
#next cell, so the tuning run below does not use it.
from hyperopt import SparkTrials
from pyspark.sql import SparkSession

# Get the current Spark session, or create one if none exists
spark = SparkSession.builder.getOrCreate()

# Create an instance (spark_trials) for the class SparkTrials without specifying the Spark session
spark_trials = SparkTrials()

In [0]:
with mlflow.start_run(): #Run the search inside an MLflow run so that MLflow tracks it.
    best_results = fmin(
        fn = objective,        #function whose returned loss is minimized
        space = search_space,  #hyperparameter space to sample from
        algo = algo,           #search algorithm selected in step 3
        max_evals = 10         #evaluation budget for the search
    )

## 5. Print the hyperparameters that produced the best result

In [0]:
#Show the raw result dictionary returned by fmin()
print("Best value found:", best_results)

In [0]:
#Another method: resolve the raw values in best_results against search_space
#to show the actual hyperparameter settings.
#NOTE(review): hp.choice entries in best_results are expected to be option
#indices rather than the option values -- confirm against the Hyperopt docs.
import hyperopt
print(hyperopt.space_eval(search_space, best_results))