# California Housing Regression Machine Learning

In [1]:
import pandas as pd
import numpy as np
from numpy import mean
import lightgbm as lgb
from lightgbm import LGBMRegressor
from verstack import LGBMTuner
from matplotlib import pyplot as plt
from sklearn.preprocessing import PowerTransformer,StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.datasets import fetch_california_housing

Read input data

In [2]:
# Both splits live in the local input folder; the paths are relative to the
# notebook's working directory.
train_csv_path = r'./input_data/train_extended.csv'
test_csv_path = r'./input_data/test.csv'

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

### Fit the Isolation Forest model

In [3]:
# Fit the outlier detector on the training frame. Per the scikit-learn docs,
# contamination=0.05 sets the expected share of outliers and max_samples=0.7
# draws 70% of the rows for each tree; random_state pins the result.
# NOTE(review): at this point `train` still contains its first column (removed
# in a later cell) and the target, so both influence the outlier scores --
# confirm that this is intended.
clf = IsolationForest(contamination=0.05, max_samples=0.7, random_state=0).fit(train)

# Predict on the same DataFrame that was used for fitting. Passing
# `train.values` instead makes scikit-learn warn that the input has no feature
# names although the model was fitted with them; the labels are the same.
# -1 marks an outlier, 1 an inlier.
OD = clf.predict(train)

# Filled with the row numbers of the flagged rows by the outlier-removal cell.
Outlier_rows = []

Delete outliers

In [4]:
# OD holds one IsolationForest label per row of the frame it was computed on.
# If the lengths differ, this cell is most likely being re-run on an already
# filtered frame; stop instead of silently dropping the wrong rows and a
# second column.
if len(OD) != len(train):
    raise RuntimeError(
        "Outlier labels (OD) do not match `train`; "
        "re-run the notebook from the data-loading cell."
    )

# -1 is the label IsolationForest.predict assigns to outliers.
outlier_mask = np.asarray(OD) == -1

# Positional row numbers of the flagged rows (same content the original
# element-by-element loop produced).
Outlier_rows = np.flatnonzero(outlier_mask).tolist()

# Keep the inliers, renumber the rows, then remove the first column, which is
# not used as a model feature.
train = (
    train.loc[~outlier_mask]
    .reset_index(drop=True)
    .drop(columns=train.columns[0])
)
train.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,2.3859,15.0,3.82716,1.1121,1280.0,2.486989,34.6,-120.12,0.98
1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,0.946
2,4.775,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,1.576
3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,1.336
4,6.8075,26.0,6.764372,1.091787,2147.0,2.70354,33.84,-118.31,3.714


Drop test id column

In [5]:
# Remove the identifier column so that only feature columns remain.
# Re-binding `test` with errors='ignore' keeps this cell re-runnable: the
# original in-place drop raised KeyError when executed a second time.
test = test.drop(columns=['id'], errors='ignore')
test.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,1.7062,35.0,4.966368,1.096539,1318.0,2.844411,39.75,-121.85
1,1.3882,22.0,4.187035,1.098229,2296.0,3.180218,33.95,-118.29
2,7.7197,21.0,7.129436,0.959276,1535.0,2.888889,33.61,-117.81
3,4.6806,49.0,4.769697,1.048485,707.0,1.74359,34.17,-118.34
4,3.1284,25.0,3.765306,1.081633,4716.0,2.003827,34.17,-118.29


### Triggering the LGBMTuner

This module tunes the model automatically. To get a stable prediction, we run the LGBMTuner several times and use the mean of the individual predictions as the final prediction.


In [6]:
def stable_prediction(n_trials, train_df=None, test_df=None, tuner_factory=None):
    """Average the test-set predictions of several tuner runs.

    Parameters
    ----------
    n_trials : int
        Number of tuner runs whose predictions are averaged.
    train_df : pandas.DataFrame, optional
        Training frame; the last column is the target, all other columns are
        features. Defaults to the notebook-level ``train``.
    test_df : pandas.DataFrame, optional
        Feature-only frame (no target column) with the features in the same
        order as in ``train_df``. Defaults to the notebook-level ``test``.
    tuner_factory : callable, optional
        ``tuner_factory(seed)`` returns an object with ``fit(X, y)`` and
        ``predict(X)``. Defaults to
        ``LGBMTuner(metric='rmse', trials=30, seed=seed)``.

    Returns
    -------
    Mean_Prediction : list of float
        Row-wise mean of the per-run predictions.
    predictions : pandas.DataFrame
        One column per run (labelled ``0 .. n_trials - 1``), one row per test
        sample.

    Raises
    ------
    ValueError
        If the test frame does not have exactly as many columns as the
        training frame has features.
    """
    if train_df is None:
        train_df = train
    if test_df is None:
        test_df = test
    if tuner_factory is None:
        tuner_factory = lambda seed: LGBMTuner(metric='rmse', trials=30, seed=seed)

    # The tuner is fed a DataFrame / Series, so wrap the raw arrays once, up
    # front, rather than on every run.
    X = pd.DataFrame(train_df.values[:, :-1])
    Y = pd.Series(train_df.values[:, -1])

    # The test frame carries no target column, so every column is a feature.
    # Slicing off the last column here (as the training frame requires) would
    # silently discard a real feature.
    test_features = pd.DataFrame(test_df.values)

    # Fail before the expensive tuning starts, not at predict time.
    if test_features.shape[1] != X.shape[1]:
        raise ValueError(
            "test data has %d columns but the model is trained on %d features"
            % (test_features.shape[1], X.shape[1])
        )

    # Run k uses seed 13 + k. Reusing one seed for every run would, if the
    # tuner is deterministic for a given seed, merely repeat the same search
    # and make the averaging pointless.
    base_seed = 13
    per_run = {}
    for trial in range(n_trials):
        tuner = tuner_factory(base_seed + trial)
        tuner.fit(X, Y)
        per_run[trial] = np.asarray(tuner.predict(test_features)).ravel()

    predictions = pd.DataFrame(per_run, columns=list(range(n_trials)))
    Mean_Prediction = predictions.to_numpy(dtype=float).mean(axis=1).tolist()

    return Mean_Prediction, predictions

In [7]:
# Number of tuner runs to average. `Pre` receives the function's return value:
# a (Mean_Prediction, predictions) pair.
N_STABILITY_RUNS = 6
Pre = stable_prediction(n_trials=N_STABILITY_RUNS)


 * Initiating LGBMTuner.fit
     . Settings:
     .. Trying 30 trials
     .. Evaluation metric: rmse 
     .. Study direction: minimize rmse

     . Trial number: 0 finished
     .. Optimization score (lower-better): rmse: 0.5045329460484543
 ...........................................................................
     . Trial number: 1 finished
     .. Optimization score (lower-better): rmse: 0.5068339950268465
 ...........................................................................


KeyboardInterrupt: 