In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import math
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
# Notebook-wide random seed; used below as random_state for the train/test split.
SEED = 1
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default plot styling to all following figures.
sns.set()

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#What-is-Machine-Learning" data-toc-modified-id="What-is-Machine-Learning-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>What is Machine Learning</a></span><ul class="toc-item"><li><span><a href="#Basic-Terms" data-toc-modified-id="Basic-Terms-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Basic Terms</a></span></li><li><span><a href="#Assessing-Performance" data-toc-modified-id="Assessing-Performance-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Assessing Performance</a></span></li></ul></li><li><span><a href="#Simple-Regression-Model" data-toc-modified-id="Simple-Regression-Model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Simple Regression Model</a></span><ul class="toc-item"><li><span><a href="#Bias-vs-Variance" data-toc-modified-id="Bias-vs-Variance-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Bias vs Variance</a></span></li></ul></li><li><span><a href="#Break" data-toc-modified-id="Break-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Break</a></span></li><li><span><a href="#Natural-Language-Processing" data-toc-modified-id="Natural-Language-Processing-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Natural Language Processing</a></span></li></ul></div>

# Multidisciplinary Design Program

# Fundamentals of Machine Learning 

<br>
<br>
<br>

Jeremy Castagno

01/30/2019

## What is Machine Learning

* Statistical Learning, Pattern Recognition, Big Data, Data Mining, Expert Systems, Artificial Intelligence (AI), Deep Learning
* Definition - Algorithms and statistical models used to perform a specific task without being explicitly programmed for it
* Broadly speaking there are three categories of ML- **supervised** learning, **unsupervised** learning, and **reinforcement** learning
* In this class we will focus only on **supervised** and **unsupervised** learning
* In almost all cases there is an <font color='blue'>input</font> (x) and an <font color='green'>output</font> (y) for our system
* There is a true $f(x)$ which we seek to approximate with an $\hat{f}(x)$

**MDP Projects**

* Analyzes $\color{blue}{\text{audio inputs}}$, with a goal of determining factors such as $\color{green}{\text{occupants, locations, and state of vehicle}}$
* Process the $\ldots \color{blue}{\text{road itself}} \ldots$ to identify $\color{green}{\text{free paths or drivable surfaces}}$
* Recognize $\ldots \color{blue}{\text{email}} \ldots$ and <font color='green'>direct email to appropriate response functions [classification]</font> as well as $\color{green}{\text{suggesting a generated response}}$ 

### Basic Terms

* Regression vs Classification
  * Regression - Output takes continuous valued variables
  * Classification - Output determines group membership (class)
  * Sometimes the output is **both** - Semantic bounding boxes
* Features, predictors, independent variables - text email, audio stream, video stream
* Classes, labels, ground truth
* Deep learning and neural networks
  * A cascade of multiple layers of processing units
  * Each layer(s) may learn different abstractions of the task

**Supervised Learning**

Learning a function that maps an *input* ($X$) to an *output* ($Y$)

$$
Y=f(X)+ \underbrace{\epsilon}_{error}
$$



$ \mathbf{X}=\overbrace{ \left( \begin{array}{cccc}{x_{11}} & {x_{12}} & {\dots} & {x_{1 p}} \\ {x_{21}} & {x_{22}} & {\dots} & {x_{2 p}} \\ {\vdots} & {\vdots} & {\ddots} & {\vdots} \\ {x_{n 1}} & {x_{n 2}} & {\dots} & {x_{n p}}\end{array}\right)}^{Features}$

$ \mathbf{Y}=\overbrace{\left( \begin{array}{c}{y_{1}} \\ {y_{2}} \\ {\vdots} \\ {y_{n}}\end{array}\right)}^{labels} $

* Main Questions:
    * Which predictors are associated with the response?
        * Should we preprocess our raw data into features? Text $\implies$ Vector
    * What is the relationship between the response and each predictor?
    * What model can be used to estimate f?
        * Parametric Functions vs Non Parametric Functions

* Share an example?
* How do we determine relationships?
  * Cross Covariance, Plotting
![](assets/covariance.png)


### Assessing Performance

* Training Set and Test Set - Split 60/40
    * Your model should never be trained with the test set
* Regression 
  * Mean Squared Error (MSE) = $ MSE=\frac{1}{n} \sum_{i=1}^{n}\left(y_{i}-\hat{f}\left(x_{i}\right)\right)^{2} $
* Classification
  * Training uses Logarithmic Loss - $ \frac{-1}{N} \sum_{i=1}^{N} \sum_{j=1}^{M} y_{i j} * \log \left(p_{i j}\right) $
  * Final Assessment
      * Classification Accuracy = $\frac{\text{# of correct pred.}}{\text{Total # of pred.}}$ 
      * Confusion Matrix - True Positive, True Negative, False Positives, False Negatives


# ![](assets/Precisionrecall.png) 

# ![](assets/Precisionrecall2.png) 

* Many others for specific tasks
  * Image Bounding Boxes - Intersection over Union (IoU)
  * TODO

* k-Fold cross validation - Train model on subsets of your training data
    * Randomly divide data set into *k* folds of equal size.
    * The first fold is treated as a validation set, and the model is trained on the remaining *k* - 1 folds
    * Repeat *k* times, using a different fold as the validation set each time
    * This results in *k* estimates of validation error
    * $ \mathrm{CV}_{(k)}=\frac{1}{k} \sum_{i=1}^{k} \mathrm{MSE}_{i} $


  


## Simple Regression Model

Goals

* Understand bias/variance tradeoff
* How to prevent overfitting
  * More Data
 


![](assets/regression_plot.png)

Question: How would you fit this model?

### Bias vs Variance

$
Y=f(X)+ \underbrace{\epsilon}_{error}
$

$\operatorname{Err}(x)=E\left[(Y-\hat{f}(x))^{2}\right] $

$ \operatorname{Err}(x) = \underbrace{\big(E[\hat{f}(x)]-f(x)\big)^{2}}_{\text{Bias}^2}+ \underbrace{E\left[(\hat{f}(x)-E[\hat{f}(x)])^{2}\right]}_{\text{Variance}}+ \underbrace{\operatorname{Var}(\epsilon)}_{\text{Irreducible Error}} $

* Bias indicates a fundamental mismatch between the actual function, $f(x)$, and the estimated function $\hat{f}(x)$
   * Linear model trying to estimate a quadratic function
* Variance refers to the amount by which $\hat{f}$ would change if we estimated it using a different training data set
  * Each $\hat{f}$ would be the same fundamental model, but have different parameters associated with it.
  * In a perfect world these different estimates of $\hat{f}$ would vary very little. 
  * If a model has high variance, small changes in the training data will make big changes in $\hat{f}$
* $\operatorname{Var}(\epsilon)$ is noise in the data and can never be reduced

* The more flexible the model, the more its variance will increase and its bias will decrease

In [46]:
# Defaults for the synthetic regression data set.
NUM_SAMPLES = 100  # default number of sampled points
START = 0          # lower end of the sampled x range
STOP = 10          # upper end of the sampled x range

def f(a=1, b=2, c=2, sin_m=20):
    """Build the ground-truth function: a quadratic plus a scaled sine.

    Parameters
    ----------
    a, b, c : number
        Constant, linear and quadratic coefficients of the polynomial part.
    sin_m : number
        Multiplier applied to ``sin(x)``.

    Returns
    -------
    callable
        Maps ``x`` to ``c*x**2 + b*x + a + sin_m*sin(x)``.
    """
    # np.poly1d takes coefficients from the highest power down.
    quadratic = np.poly1d([c, b, a])
    return lambda x: quadratic(x) + sin_m * np.sin(x)

def evaluate_deg(x_train, y_train, x_test, y_test, deg=1):
    """Fit a polynomial of degree ``deg`` and score it on both data splits.

    The polynomial is fitted on the training data only and then evaluated
    on the training and the test inputs.

    Returns
    -------
    list
        ``[deg, train MSE, test MSE, fitted np.poly1d]``.
    """
    fitted = np.poly1d(np.polyfit(x_train, y_train, deg=deg))
    train_error = mean_squared_error(y_train, fitted(x_train))
    test_error = mean_squared_error(y_test, fitted(x_test))
    return [deg, train_error, test_error, fitted]

def plot_tradeoff(X, Y_noise, f_true, results, sigma):
    """Draw the fitted curves next to the train/test error per degree.

    Parameters
    ----------
    X, Y_noise : array-like
        Sampled inputs and their noisy responses.
    f_true : callable
        The noise-free function, evaluated on ``X`` for the reference curve.
    results : list
        Entries of the form ``[degree, train MSE, test MSE, fitted polynomial]``.
    sigma : number
        Noise standard deviation; ``sigma**2`` is drawn as a horizontal line
        on the error panel.
    """
    fig, (fit_ax, err_ax) = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

    # Left panel: raw data, the true function, and one curve per fitted model.
    fit_ax.plot(X, Y_noise, '.', label='data')
    fit_ax.plot(X, f_true(X), '-', label=r'$f(x) = 2x^2 + 2x + 1 + 20sin(x)$')
    for entry in results:
        fit_ax.plot(X, entry[3](X), label="Deg {}".format(entry[0]))
    fit_ax.legend()
    fit_ax.set_xlabel("X")
    fit_ax.set_ylabel("Y")

    # Right panel: train and test MSE against polynomial degree.
    degrees, train_errors, test_errors = [], [], []
    for entry in results:
        degrees.append(entry[0])
        train_errors.append(entry[1])
        test_errors.append(entry[2])
    err_ax.plot(degrees, train_errors, c='red', label='Train')
    err_ax.plot(degrees, test_errors, c='gray', label='Test')
    err_ax.axhline(sigma**2, linestyle='--', label=r'$\sigma^2$')
    err_ax.legend()
    err_ax.set_xlabel("Degree")
    err_ax.set_ylabel("MSE")
    # Fixed y-range so the error panel stays comparable between redraws.
    err_ax.set_ylim(0, 2000)

    

def set_up(mu=0, sigma=20, num_samples=NUM_SAMPLES, degrees=(1, 2, 5, 10)):
    """Generate noisy samples of the true function and fit polynomials to them.

    Parameters
    ----------
    mu : number
        Mean of the Gaussian noise added to the true function.
    sigma : number
        Standard deviation of the Gaussian noise.
    num_samples : int
        Number of evenly spaced points sampled between START and STOP.
    degrees : iterable of int
        Polynomial degrees to fit; defaults to 1, 2, 5 and 10.

    Returns
    -------
    tuple
        ``(X, Y_noise, f_true, results)``, where ``results`` holds one
        ``evaluate_deg`` entry per requested degree.
    """
    # Seed from the notebook-wide SEED rather than from sigma. Seeding with
    # sigma drew a different noise realisation for every noise level, which
    # mixed two effects together, and a non-integer sigma is not a valid seed.
    noise = np.random.RandomState(seed=SEED)
    f_true = f()
    X = np.linspace(START, STOP, num_samples)
    Y_noise = f_true(X) + noise.normal(mu, sigma, num_samples)

    # Hold out half of the points; the models never see them during fitting.
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y_noise, test_size=0.50, random_state=SEED)
    results = [evaluate_deg(x_train, y_train, x_test, y_test, deg=deg)
               for deg in degrees]

    return X, Y_noise, f_true, results

def plot_interact(mu=0, sigma=20, num_samples=NUM_SAMPLES):
    """Widget callback: rebuild the data and fits, then redraw both panels."""
    plot_tradeoff(*set_up(mu, sigma, num_samples), sigma=sigma)



In [51]:
# Sliders for the noise level and the sample count; mu stays fixed at 0.
interact(
    plot_interact,
    mu=fixed(0),
    sigma=widgets.IntSlider(value=20, min=0, max=30, step=5),
    num_samples=widgets.IntSlider(value=100, min=100, max=500, step=200),
);

interactive(children=(IntSlider(value=20, description='sigma', max=30, step=5), IntSlider(value=100, descripti…

## Break

* Restroom
* Discuss Projects

![](assets/Oscillating_pendulum.gif)

## Natural Language Processing