# Exploratory Data Analysis

In [1]:
// imports
%use dataframe
%use lets-plot

## Data Loading

In [2]:
// Read the raw housing CSV into a DataFrame.
val rawHousingPath = "../data/raw/housing.csv"
val df = DataFrame.read(rawHousingPath)

In [3]:
// Preview the first five rows.
df.take(5)

longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Dataset Overview

In [4]:
// Per-column summary: type, count, unique values, nulls, most frequent value and basic statistics.
df.describe()

name,type,count,unique,nulls,top,freq,mean,std,min,median,max
longitude,Double,20640,844,0,-118.310000,162,-119.569704,2.003532,-124.350000,-118.490000,-114.310000
latitude,Double,20640,862,0,34.060000,244,35.631861,2.135952,32.540000,34.260000,41.950000
housing_median_age,Double,20640,52,0,52.000000,1273,28.639486,12.585558,1.000000,29.000000,52.000000
total_rooms,Double,20640,5926,0,1527.000000,18,2635.763081,2181.615252,2.000000,2127.000000,39320.000000
total_bedrooms,Double?,20640,1924,207,280.000000,55,537.870553,421.38507,1.000000,435.000000,6445.000000
population,Double,20640,3888,0,891.000000,25,1425.476744,1132.462122,3.000000,1166.000000,35682.000000
households,Double,20640,1815,0,306.000000,57,499.53968,382.329753,1.000000,409.000000,6082.000000
median_income,Double,20640,12928,0,3.125000,49,3.870671,1.899822,0.499900,3.534800,15.000100
median_house_value,Double,20640,3842,0,500001.000000,965,206855.816909,115395.615874,14999.000000,179700.000000,500001.000000
ocean_proximity,String,20640,5,0,<1H OCEAN,9136,,,<1H OCEAN,INLAND,NEAR OCEAN


The dataset contains 20,640 instances (about 20K) with the following features:

- Numerical:
    - longitude
    - latitude
    - housing_median_age
    - total_rooms
    - total_bedrooms
    - population
    - households
    - median_income
    - median_house_value

- Categorical:
    - ocean_proximity

Let's take a closer look at the values of `ocean_proximity`:

In [5]:
// Count the rows in each `ocean_proximity` category.
df.valueCounts { ocean_proximity }

ocean_proximity,count
<1H OCEAN,9136
INLAND,6551
NEAR OCEAN,2658
NEAR BAY,2290
ISLAND,5


## Features Histogram

Let's plot a histogram of each numerical feature to understand its distribution:

In [6]:
// Convert the frame to a map keyed by column name; this map is what gets passed to letsPlot() below.
val data = df.toMap()

In [7]:
// Names of the numerical feature columns.
// Columns are selected by type instead of by position (dropLast(1)), so the result stays
// correct if the column order changes or further non-numeric columns are added.
// `Number?` is used so that nullable numeric columns such as total_bedrooms (Double?) are kept.
val numericalCols = df.select { colsOf<Number?>() }.columnNames()

In [8]:
// Reshape the numerical columns from wide to long form: every cell becomes one row holding
// the source column name ("key") and its value ("value").
// The former `.cast<Float>()` step was dropped: on a DataFrame, cast<T>() only changes the
// frame's compile-time type argument and leaves the stored values untouched, so it never
// converted anything to Float.
val dfMelted = df
    .select { dropLast(1) } // remove the categorical column
    .gather { numericalCols.first()..numericalCols.last() }
    .into("key", "value")

In [9]:
// Render one 50-bin histogram (500x250) per numerical feature.
numericalCols.forEach { feature ->
    val histogram = letsPlot(data) { x = feature } +
        ggsize(500, 250) +
        geomHistogram(bins = 50)
    histogram.show()
}

## Correlations

In [12]:
// Pairwise correlation matrix of the columns; the second line displays it as the cell result.
// NOTE(review): `total_bedrooms` is absent from the output below — presumably because the column
// is nullable (describe() reports 207 nulls). Confirm, and impute or drop the nulls first if it
// should take part in the correlation analysis.
val corrMatrix = df.corr()
corrMatrix

column,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,median_house_value
longitude,1.0,-0.924664,-0.108197,0.044568,0.099773,0.05531,-0.015176,-0.045967
latitude,-0.924664,1.0,0.011173,-0.0361,-0.108785,-0.071035,-0.079809,-0.14416
housing_median_age,-0.108197,0.011173,1.0,-0.361262,-0.296244,-0.302916,-0.119034,0.105623
total_rooms,0.044568,-0.0361,-0.361262,1.0,0.857126,0.918484,0.19805,0.134153
population,0.099773,-0.108785,-0.296244,0.857126,1.0,0.907222,0.004834,-0.02465
households,0.05531,-0.071035,-0.302916,0.918484,0.907222,1.0,0.013033,0.065843
median_income,-0.015176,-0.079809,-0.119034,0.19805,0.004834,0.013033,1.0,0.688075
median_house_value,-0.045967,-0.14416,0.105623,0.134153,-0.02465,0.065843,0.688075,1.0


In [26]:
// Rank the features by their correlation with median_house_value, strongest positive first.
// sortByDesc states the descending order directly instead of sorting by a negated column.
corrMatrix.select { column and median_house_value }.sortByDesc { median_house_value }

column,median_house_value
median_house_value,1.0
median_income,0.688075
total_rooms,0.134153
housing_median_age,0.105623
households,0.065843
population,-0.02465
longitude,-0.045967
latitude,-0.14416
