## Smoothing

Smoothing can help to discover trends that otherwise might be hard to see in raw data. 

In [25]:
%use kandy(0.5.0-rc-1)
%use dataframe(0.12.0)
@file:Repository("https://packages.jetbrains.team/maven/p/kds/kotlin-ds-maven")
@file:DependsOn("org.jetbrains.kotlinx:kotlin-statistics-jvm:0.0.2")



In [26]:
// Load the mpg sample dataset (CSV) from the lets-plot-kotlin repository.
// Declared as `val`: the frame is never reassigned in this notebook.
val mpgDF = DataFrame.readCSV("https://raw.githubusercontent.com/JetBrains/lets-plot-kotlin/master/docs/examples/data/mpg.csv")
// Preview the first rows. The original referenced `mpg_df`, which is not defined anywhere.
mpgDF.head()

### Linear model

In [31]:
mpgDF.plot {
    // Color of the fitted line.
    val trendColor = Color.hex("#c93b6b")

    // Scatter layer: `displ` on x, `hwy` on y.
    points {
        x(displ)
        y(hwy)
    }
    // Smoothing layer over the same two columns, using the linear method.
    smoothLine(displ, hwy, method = SmoothMethod.Linear()) { color = trendColor }
}

### `LOESS` model (default)

In [8]:
// NOTE(review): Lets-Plot-style cell; `mpg_plot` is not defined in any cell visible here.
// Presumably a leftover from the original Lets-Plot notebook - confirm before running.
mpg_plot + geomPoint() + statSmooth(method="loess", size=1.0)

In [32]:
mpgDF.plot {
    // Color of the fitted curve.
    val trendColor = Color.hex("#c93b6b")

    // Scatter layer: `displ` on x, `hwy` on y.
    points {
        x(displ)
        y(hwy)
    }
    // Smoothing layer over the same two columns, using the LOESS method.
    smoothLine(displ, hwy, method = SmoothMethod.LOESS()) { color = trendColor }
}

### Applying smoothing to groups

Let's map the vehicle drivetrain type (variable `drv`) to the color of the points.

This makes it easy to see that points with the same drivetrain type form groups or clusters.

In [33]:
// Group the rows of `mpgDF` by the `drv` column; the grouped frame is plotted in the cells below.
val groupedDF = mpgDF.groupBy { drv }

In [35]:
groupedDF.plot {
    val loess = SmoothMethod.LOESS()

    // Scatter layer with point color mapped to `drv`.
    points {
        x(displ)
        y(hwy)
        color(drv)
    }
    // LOESS layer on the grouped frame; line color is mapped to the group key `drv`.
    smoothLine(displ, hwy, method = loess) { color(key.drv) }
}

### Apply linear model with 2nd degree polynomial.

As the `LOESS` prediction looks a bit weird, let's try 2nd degree polynomial regression.

In [36]:
groupedDF.plot {
    // Polynomial smoothing method constructed with argument 2 (2nd degree, per the heading above).
    val quadratic = SmoothMethod.Polynomial(2)

    // Scatter layer with point color mapped to `drv`.
    points {
        x(displ)
        y(hwy)
        color(drv)
    }
    // Smoothing layer on the grouped frame; line color is mapped to the group key `drv`.
    smoothLine(displ, hwy, method = quadratic) { color(key.drv) }
}

## Effect of the `span` parameter on the "wiggliness" of the LOESS smoother.

The span is the fraction of points used to fit each local regression.
Small numbers make a wigglier curve, larger numbers make a smoother curve.

In [37]:
import kotlin.math.PI
import kotlin.random.Random

In [51]:
// Number of intervals the x range [-2π, 2π] is divided into.
val n = 150
// Index-based grid of n + 1 evenly spaced points from -2π to 2π.
// Computing each point from its index (instead of repeatedly adding the step and
// cutting off with `<=`) keeps the point count independent of floating-point drift.
val x_range = List(n + 1) { i -> -2 * PI + i * 4 * PI / n }
// sin(x) plus uniform noise in [-0.5, 0.5). Materialised eagerly so the noise is drawn
// exactly once; a lazy sequence would redraw it on every traversal.
val y_range = x_range.map { sin(it) + Random.nextDouble(-0.5, 0.5) }
// Two-column frame ("x", "y") used by the span comparison plots.
val df = dataFrameOf(
    "x" to x_range,
    "y" to y_range,
)

In [22]:
// Lets-Plot (ggplot-style) version of the span comparison: one scatter base plot `p`
// combined with four LOESS smoothers that use different `span` settings, colors and titles.
// NOTE(review): this cell uses ggplot/geomSmooth/GGBunch rather than the Kandy DSL used in the
// other cells - presumably kept from the original Lets-Plot notebook; confirm it still runs here.
val p = ggplot(df) {x="x"; y="y"} + geomPoint(shape=21, fill="yellow", color="#8c564b")
// No explicit `span` is passed here; the title labels it as the default.
val p1 = p + geomSmooth(method="loess", size=1.5, color="#d62728") + ggtitle("default (span = 0.5)")
val p2 = p + geomSmooth(method="loess", span=.2, size=1.5, color="#9467bd") + ggtitle("span = 0.2")
val p3 = p + geomSmooth(method="loess", span=.7, size=1.5, color="#1f77b4") + ggtitle("span = 0.7")
val p4 = p + geomSmooth(method="loess", span=1, size=1.5, color="#2ca02c") + ggtitle("span = 1")

// Combine the four plots into one figure.
// Assumes addPlot takes (plot, x, y, width, height), i.e. a 2x2 layout of 400x300 cells - TODO confirm.
GGBunch()
.addPlot(p1, 0, 0, 400, 300)
.addPlot(p2, 400, 0, 400, 300)
.addPlot(p3, 0, 300, 400, 300)
.addPlot(p4, 400, 300, 400, 300)

In [52]:
import org.jetbrains.kotlinx.kandy.dsl.internal.DataFramePlotContext

/**
 * Adds two layers to the receiving plot: a scatter of the columns "x" and "y",
 * and a LOESS smooth line over the same columns.
 *
 * @param span value forwarded as the `span` argument of [SmoothMethod.LOESS].
 */
fun DataFramePlotContext<*>.pointsAndSmoothLine(span: Double) {
    val pointBorderColor = Color.hex("#8c564b")
    val lineColor = Color.hex("#d62728")
    val loess = SmoothMethod.LOESS(span = span)

    // Yellow-filled circles with a brown border.
    points {
        x("x")
        y("y")
        symbol = Symbol.CIRCLE_FILLED
        fillColor = Color.YELLOW
        color = pointBorderColor
    }
    // Red smoothed curve drawn after (on top of) the points layer.
    smoothLine("x", "y", method = loess) {
        color = lineColor
        width = 1.5
    }
}

In [55]:
// Four plots of the same frame `df`; each calls pointsAndSmoothLine with a different span
// and sets a matching title.
val pA = df.plot {
    pointsAndSmoothLine(span = 0.5)
    layout.title = "span = 0.5 (default)"
}
val pB = df.plot {
    pointsAndSmoothLine(span = 0.2)
    layout.title = "span = 0.2"
}
val pC = df.plot {
    pointsAndSmoothLine(span = 0.7)
    layout.title = "span = 0.7"
}
val pD = df.plot {
    pointsAndSmoothLine(span = 1.0)
    layout.title = "span = 1"
}

// Show the four plots together in a grid with two columns.
plotGrid(listOf(pA, pB, pC, pD), nCol = 2)