# Preprocessing

In [1]:
// imports: load the dataframe and smile library integrations into the notebook
%use dataframe
%use smile

## Imputation and Encoding

We will

- Impute missing numerical features with the median value
- Encode categorical features using one-hot encoding

In [7]:
import smile.feature.imputation.SimpleImputer;
import smile.data.CategoricalEncoder;
import smile.data.DataFrame as smileDataFrame;

In [3]:
// Load the raw housing dataset from CSV (path is relative to the notebook's working directory).
val housing = read.csv("../data/raw/housing.csv")

In [4]:
// Fit an imputer on the raw frame, then apply it to the same frame to produce `completeData`.
// NOTE(review): the section text says numeric gaps are filled with the median; that relies on
// the defaults of SimpleImputer.fit — confirm against the Smile version in use.
val imputer = SimpleImputer.fit(housing)
val completeData = imputer.apply(housing)

In [5]:
// Last expression of the cell: the notebook renders the imputed frame as the cell output.
completeData

[longitude: double, latitude: double, housing_median_age: double, total_rooms: double, total_bedrooms: double, population: double, households: double, median_income: double, median_house_value: double, ocean_proximity: String]
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  -122.23|   37.88|                41|        880|           129|       322|       126|       8.3252|            452600|       NEAR BAY|
|  -122.22|   37.86|                21|       7099|          1106|      2401|      1138|       8.3014|            358500|       NEAR BAY|
|  -122.24|   37.85|                52|       1467|           190|       496|      

In [8]:
// Mark `ocean_proximity` as a categorical column, then expand the whole frame into a numeric
// matrix using one-hot encoding for categorical columns.
// NOTE(review): the `false` and `null` arguments are presumably "no bias column" and
// "no row-name column" — confirm against the Smile DataFrame.toMatrix documentation.
val oneHotEncodedData = completeData
    .factorize("ocean_proximity")
    .toMatrix(false, CategoricalEncoder.ONE_HOT, null)

// Wrap the encoded matrix back into a Smile data frame, reusing the matrix's column names.
val encodedColumnNames = oneHotEncodedData.colNames()
val oneHotEncodedFrame = smileDataFrame.of(oneHotEncodedData.toArray(), *encodedColumnNames)

In [9]:
// Save the encoded frame; the Train/Test Split section reads this file back in.
write.csv(oneHotEncodedFrame, "../data/processed/housing.csv")

## Train/Test Split

In [10]:
// Re-load the processed (imputed + encoded) data. This rebinds `housing`, which earlier in the
// notebook referred to the raw dataset.
// NOTE(review): `DataFrame` here is presumably the Kotlin DataFrame type from `%use dataframe`
// (the Smile one is imported under the alias `smileDataFrame`) — confirm.
val housing = DataFrame.read("../data/processed/housing.csv")

In [11]:
// Build a random train/test partition of the row indices of `housing`.
// The shuffle is unseeded, so the resulting split differs from run to run.
val numSamples = housing.rowsCount()
val indices = (0 until numSamples).toList()
val indicesShuffled = indices.shuffled()
// Fraction of rows held out for testing.
val testRatio = 0.2
// Number of rows assigned to the training set; everything after that goes to the test set.
val numTrain = ((1 - testRatio) * numSamples).toInt()
// Split the shuffled list at position numTrain so the two sets are disjoint and together cover
// every row. An inclusive range such as 0..numTrain must not be used for the training slice:
// it would also include the element at position numTrain, which is the first test row, and it
// would throw on an empty frame.
val trainIndices = indicesShuffled.take(numTrain)
val testIndices = indicesShuffled.drop(numTrain)

In [12]:
// Select the rows of each partition from the processed frame by their shuffled indices.
val trainData = housing[trainIndices]
val testData = housing[testIndices]

In [13]:
// Persist both partitions as CSV files in the processed-data directory.
trainData.writeCSV("../data/processed/housing_train.csv")
testData.writeCSV("../data/processed/housing_test.csv")