# Modeling & Evaluation

In [51]:
// Notebook setup: enable the Kotlin DataFrame integration and add Smile's Kotlin API.
// Line magics are written without a space after '%'.
%use dataframe
USE {
    dependencies {
        // Smile machine-learning library (Kotlin bindings), pinned to 3.1.1.
        implementation("com.github.haifengl:smile-kotlin:3.1.1")
    }
}

## Modeling

In [7]:
import smile.regression.randomForest;
import smile.validation.metric.RMSE;
import smile.data.formula.Formula;

In [4]:
// Read the pre-split housing CSVs into Smile data frames.
// The types are fully qualified so they are not confused with Kotlin DataFrame's `DataFrame`.
val trainData: smile.data.DataFrame =
    smile.read.csv("../data/processed/housing_train.csv")
val testData: smile.data.DataFrame =
    smile.read.csv("../data/processed/housing_test.csv")

In [9]:
// Fit a random forest regressor on the training frame.
// Formula.lhs(...) names only the response column ("median_house_value").
val model = Formula.lhs("median_house_value").let { formula ->
    randomForest(formula, trainData)
}

## Evaluation

In [10]:
// Predict on the training frame with the target column removed.
val trainPreds = trainData
    .drop("median_house_value")
    .let { features -> model.predict(features) }

In [11]:
// Observed target values of the training frame, as a primitive double array.
val trainActual: DoubleArray = trainData
    .column("median_house_value")
    .toDoubleArray()

In [12]:
// Root-mean-square error between the observed and predicted training targets.
val trainRMSE: Double = RMSE.of(
    trainActual,
    trainPreds
)
println("Train RMSE: $trainRMSE")

Train RMSE: 39601.476159908605


In [13]:
// Predict on the test frame with the target column removed.
val testPreds = testData
    .drop("median_house_value")
    .let { features -> model.predict(features) }

In [14]:
// Observed target values of the test frame, as a primitive double array.
val testActual: DoubleArray = testData
    .column("median_house_value")
    .toDoubleArray()

In [15]:
// Root-mean-square error between the observed and predicted test targets.
val testRMSE = RMSE.of(testActual, testPreds)
// This is the test-set metric, so it is labelled "Test" rather than "Train".
println("Test RMSE: $testRMSE")

Train RMSE: 48861.98249106867


## Feature Importance

In [101]:
// Build a two-column Kotlin DataFrame pairing each predictor name with the model's importance score.
// NOTE(review): assumes model.importance() is ordered like the training columns minus the target — confirm.
val df = run {
    val featureNames = trainData.drop("median_house_value").names().toList()
    val importanceScores = model.importance().toList()
    dataFrameOf(
        "feature" to featureNames,
        "importance" to importanceScores
    )
}

In [103]:
// Display the features ordered from most to least important.
df.sortByDesc { importance }

feature,importance
median_income,3.9885940953859016e+16
ocean_proximity_INLAND,1.4991606795965346e+16
longitude,1.175700355607063e+16
latitude,1.0512846805515288e+16
housing_median_age,3515038779570896.0
population,3418711271524152.0
total_rooms,3048233297071431.0
ocean_proximity_<1H OCEAN,2286908718989026.5
total_bedrooms,1825183965144728.5
households,1674012982083221.5
