\documentclass[12pt,fleqn]{beamer}
\input{beamerStyle.tex}
\input{abbrv.tex}
\title[Intro]{Introduction}
\subtitle{Numerical Methods for Deep Learning}
\date{}
\begin{document}
\makebeamertitle
\section{Introduction to (Deep) Neural Networks} % (fold)
\label{sec:introduction_to_deep_neural_networks}
\begin{frame}\frametitle{Learning From Data: The Core of Science - 1}
Given inputs and outputs, how do we choose the model $f$?
\bigskip
\textbf{Option 1} (Fundamental(?) understanding): For example, Galileo's law of falling bodies
$$ x(t) = \hf g t^2, $$
with the unknown parameter $g$.
\pause
To estimate $g$, observe a falling object:
\begin{center}
\begin{tabular}{cc}
$t$ [s] & $x$ [m] \\ \hline
0 & 0 \\
1 & 4.9 \\
2 & 20.1 \\
3 & 44.1 \\
\end{tabular}
\end{center}
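For example, a least-squares fit of this data,
$$ g^* = \arg\min_g \sum_{j} \left(x_j - \hf g\, t_j^2\right)^2 = \frac{2\sum_j t_j^2 x_j}{\sum_j t_j^4} \approx 9.84, $$
recovers the gravitational acceleration (in m/s$^2$).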
\bigskip
Goal: Derive model from theory, calibrate it using data.
\end{frame}
\begin{frame}\frametitle{Learning From Data: The Core of Science - 2}
Given inputs and outputs, how do we choose the model $f$?
\bigskip
\textbf{Option 2} (Phenomenological models): For example, Archie's law -- how does the electrical resistivity of a rock
relate to its porosity, $\phi$, and saturation, $S_w$?
$$ \rho(\phi,S_w) = a \phi^{n/2} S_w^p $$
with unknown parameters $a$, $n$, $p$.
\bigskip
The parameters are obtained from observed data and lab experiments on rocks.
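For example, taking logarithms linearizes the model,
$$ \log \rho = \log a + \frac{n}{2} \log \phi + p \log S_w, $$
so $a$, $n$, and $p$ can be estimated by linear least squares.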
\bigskip
Goal: Find a model that is consistent with fundamental theory, without deriving it directly from the theory.
\end{frame}
\begin{frame}\frametitle{Phenomenological vs. Fundamental}
\textbf{Fundamental laws} come from understanding(?) the underlying process.
They are {\bf assumed invariant} and can therefore be predictive(?).
\bigskip
\textbf{Phenomenological models} are data-driven. They ``work'' on some given data,
and it is hard to know what their limitations are.
\bigskip
{\bf But ...}
\begin{itemize}
\item models based on understanding can do poorly -- weather, economics, \ldots
\item models based on data can sometimes do better
\item how do we quantify understanding?
\end{itemize}
\end{frame}
\begin{frame}\frametitle{Machine Learning in 3 slides - 1}
{\em Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to perform a specific task without using explicit instructions, relying on patterns and inference instead. (Wikipedia)}
\bigskip\pause
Two common tasks in machine learning:
\begin{itemize}
\item given data, cluster it and detect patterns in it (unsupervised learning)
\item given data and labels, find a functional relation between them (supervised learning)
\end{itemize}
\end{frame}
\begin{frame}\frametitle{Machine Learning in 3 slides - 2}
\begin{center}
\begin{tabular}{cc}
unsupervised & semi-supervised \\
\includegraphics[width=0.4\textwidth]{unsupervised_data}&
\includegraphics[width=0.4\textwidth]{unsupervised_semi}
\end{tabular}
\end{center}
Unsupervised learning -- given the data set $\bfY = [\bfy_1,\ldots,\bfy_n]$,
cluster the data into ``similar'' groups (labels).
\bigskip
\begin{itemize}
\item helps find hidden patterns
\item often exploratory and open-ended
\end{itemize}
\bigskip
Semi-supervised learning -- label the data based on a few labeled examples.
\end{frame}
\begin{frame}\frametitle{Machine Learning in 3 slides - 3}
\begin{center}
\begin{tabular}{cc}
training data & trained model\\
\includegraphics[width=0.4\textwidth]{PeaksStable-train}
&
\includegraphics[width=0.4\textwidth]{PeaksStable-train-cont}
\end{tabular}
\end{center}
Supervised learning -- given the data set $\bfY = [\bfy_1,\ldots,\bfy_n] \in {\cal Y}$
and their labels $\bfC = [\bfc_1,\ldots,\bfc_n] \in {\cal C}$, find the relation $f:{\cal Y}\rightarrow {\cal C}$.
\bigskip
\begin{itemize}
\item models range in complexity
\item older models based on support vector machines (SVM) and kernel methods
\item recently, deep neural networks (DNNs) dominate
\end{itemize}
\end{frame}
\begin{frame}\frametitle{Deep Neural Networks: History}
\begin{itemize}
\item Neural Networks with a particular (deep) architecture
\item Have existed for a long time (the 1970s and even earlier)~\cite{Rosenblatt1958,Rumelhart1986,LeCun1990}
\item Recent revolution -- computational power and lots of data~\cite{bengio2009learning,RainaEtAl2009,lecun2015deep}
\item Can perform very well when trained with lots of data
\item Applications
\begin{itemize}
\item Image recognition~\cite{hinton2012deep,KrizhevskySutskeverHinton2012,lecun2015deep}, segmentation, natural language processing~\cite{BordesEtAl2014,CollobertEtAl2011, JeanEtAl2014}
\end{itemize}
\pause
\item A few recent news articles:
\begin{itemize}
\item
{Apple Is Bringing the AI Revolution to Your iPhone, WIRED 2016}
\item
{Why Deep Learning Is Suddenly Changing Your Life, FORTUNE 2016}
\item Data Scientist: Sexiest Job of the 21st Century, Harvard Business Review 2017
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Learning Objective: Demystify Deep Learning}
\begin{center}
\includegraphics[width=.9\textwidth]{DarkSecret}
\end{center}
Learning objectives of this minicourse:
\begin{itemize}
\item look under the hood of some deep learning examples
\item describe deep learning mathematically (see also~\cite{HighamHigham2018})
\item expose numerical challenges / approaches to improve DL
\end{itemize}
\end{frame}
\begin{frame}\frametitle{DNN - A Quick Overview - 1}
Neural networks are data-driven interpolators/classifiers used when the underlying model
is unknown.
\bigskip
A generic way to write them is
$$ \bfc = f(\bfy,\bftheta). $$
\begin{itemize}
\item the function $f$ is the computational model
\item $\bfy\in\R^{n_f}$ is the input data (e.g., an image)
\item $\bfc\in\R^{n_c}$ is the output (e.g., the class of the image)
\item $\bftheta\in\R^{n_p}$ are parameters of the model $f$
\end{itemize}
In supervised learning we have examples $\{(\bfy_j,\bfc_j) \ : \ j=1,\ldots,n\}$ and the goal
is to estimate or ``learn'' the parameters $\bftheta$.
\end{frame}
\begin{frame}
\frametitle{DNN - A Quick Overview - 2}
\begin{center}
\begin{columns}
\column{.4\textwidth}
\only<1>{\includegraphics[width=\textwidth]{NNpic}}\only<beamer|2->{\includegraphics[width=\textwidth]{NNpic2}}
\column{.6\textwidth}
\invisible<beamer|1>{ $ \left\{
\begin{array}{rcl}
\bfy_{l+1} &=& \sigma( \bfK_l \bfy_l + \bfb_l) \\
\bfy_{l+1} &=& \bfy_l + \sigma( \bfK_l \bfy_l + \bfb_l) \\
\bfy_{l+1} &=& \bfy_l + \sigma\left( \bfL_{l}\sigma( \bfK_{l} \bfy_l + \bfb_{l})\right) \\
&\vdots &
\end{array}
\right.$
}
\end{columns}
\end{center}
\vspace{5mm}
\invisible<beamer|1>{
Here:
\begin{itemize}
\item $l=0,1,2,\ldots,N$ is the layer index
\item $\sigma : \R \to \R$ is the activation function
\item $\bfy_0 = \bfy\in\R^{n_f}$ is the input data (e.g., an image)
\item $\bfc\in\R^{n_c}$ is the output (e.g., the class of the image)
\item $\bfL_l, \bfK_l, \bfb_l$ are parameters of the model $f$
\end{itemize}}
\end{frame}
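\begin{frame}[fragile]\frametitle{DNN - A Quick Overview - A NumPy Sketch}
A minimal NumPy sketch of the first two layer recursions above; the width, weights, and input are hypothetical placeholders, not a network used later in the course.
\begin{verbatim}
import numpy as np

np.random.seed(0)
sigma = np.tanh                    # activation function
y = np.random.randn(4)             # input y_0, here n_f = 4

# layer parameters K_l, b_l (square, so shapes stay fixed)
Ks = [np.random.randn(4, 4) for _ in range(3)]
bs = [np.random.randn(4) for _ in range(3)]

for K, b in zip(Ks, bs):
    # y = sigma(K @ y + b)         # standard layer
    y = y + sigma(K @ y + b)       # residual (ResNet) layer
\end{verbatim}
\end{frame}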
\begin{frame}\frametitle{Generalization - 1}
Suppose that we have examples $\{(\bfy_j,\bfc_j) \ : \ j=1,\ldots,n\}$,
a model $f(\bfy,\bftheta)$, and optimal parameters $\bftheta^*$ computed from these examples.
Let $\{(\bfy^t_j,\bfc^t_j) \ : \ j=1,\ldots,s\}$ be a test set that was not used
to compute $\bftheta^*$.
\bigskip
\pause
Loosely speaking, if
$$ \|f(\bfy^t_j,\bftheta^*) - \bfc_j^t\|_p \text{ is small for all } j, $$
then the model is predictive -- it generalizes well.
\pause
\bigskip
For phenomenological models, there is no reason why the model
should generalize, but in practice it often does.
\end{frame}
\begin{frame}\frametitle{Generalization - 2}
\begin{center}
\begin{tabular}{cc}
\invisible<beamer|1>{\includegraphics[width=.45\textwidth]{generalize_overfit}}
&
\invisible<beamer|-2>{\includegraphics[width=.45\textwidth]{generalize_underfit}}
\end{tabular}
\end{center}
Why would a model generalize poorly?
$$ 1 \ll \|f(\bfy^t_j,\bftheta^*) - \bfc_j^t\|_p $$
\bigskip
\pause
Two common reasons:
\begin{enumerate}
\item Our ``optimal'' $\bftheta^*$ was optimal for the training data but is less so for other data
\pause
\item The chosen computational model $f$ is poor (e.g., a quadratic model for a nonlinear function).
\end{enumerate}
\end{frame}
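\begin{frame}[fragile]\frametitle{Generalization - A Numerical Sketch}
A minimal sketch of both failure modes, using synthetic data and a polynomial stand-in for $f$ (all values hypothetical):
\begin{verbatim}
import numpy as np
np.random.seed(0)

# synthetic 1D data from a nonlinear truth, split train/test
t = np.linspace(-1, 1, 40)
c = np.sin(3 * t) + 0.1 * np.random.randn(40)
ttr, tte = t[::2], t[1::2]
ctr, cte = c[::2], c[1::2]

for degree in (1, 3, 9):
    theta = np.polyfit(ttr, ctr, degree)   # "training"
    etr = np.linalg.norm(np.polyval(theta, ttr) - ctr)
    ete = np.linalg.norm(np.polyval(theta, tte) - cte)
    print(degree, etr, ete)
\end{verbatim}
A large train--test gap indicates reason 1 (overfitting); both errors large indicates reason 2 (a poor model).
\end{frame}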
\begin{frame}\frametitle{Example: Classification of Hand-written Digits }
\begin{itemize}
\item
Let $\bfy_j \in \R^{n_f}$ and let $\bfc_j\in \R^{n_c}$.
\item
The vector $\bfc$ contains the probabilities of $\bfy$ belonging to each of the $n_c$ classes. Clearly, $0\le c_k \le 1$ and $\sum_{k=1}^{n_c} c_k = 1$.
\end{itemize}
\vfill
Examples (MNIST):
\begin{center}
\begin{tabular}{cc}
$\bfy_1$ & $\bfy_2$\\
\includegraphics[width=30mm]{maybe4} &
\includegraphics[width=30mm]{maybe7} \\
$\bfc_1 = [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]^\top$ &
$\bfc_2 = [0, 0.3, 0, 0, 0, 0, 0, 0.7, 0, 0]^\top$
\end{tabular}
\end{center}
\end{frame}
\begin{frame}\frametitle{Example: Classification of Natural Images}
Image classification of natural images
\bigskip
Examples (CIFAR-10):
\begin{center}
\includegraphics[width=9.5cm]{cifar10Sample.jpg}
\end{center}
\end{frame}
\begin{frame}\frametitle{Example: Semantic Segmentation - 1}
\begin{itemize}
\item
let $\bfy_j \in \R^n$ be an RGB or grey-valued image.
\item let $\bfc_j\in \{1,2,3,\ldots \}^k$ contain the label of each pixel.
\end{itemize}
\begin{center}
\begin{tabular}{cc}
$\bfy$, input image & $\bfc$, segmentation (labeled image)\\
\includegraphics[width=5.5cm]{camvidPic.jpg} &
\includegraphics[width=5.5cm]{camvidClass.jpg}
\end{tabular}
Goal: Find map $\bfc = f(\bfy,\bftheta)$
\end{center}
\end{frame}
\begin{frame}\frametitle{Example: Semantic Segmentation - 2}
Problem: Given image $\bfy$ and label $\bfc$, find a map $f(\cdot,\bftheta)$ such that $\bfc \approx f(\bfy,\bftheta)$
\bigskip
\pause
First step: Reduce the dimensionality of the problem.
\begin{itemize}
\item extract features from the image
\item classify in the feature space
\end{itemize}
This reduces the problem of learning from the image to feature
detection and classification.
\pause
\bigskip
Possible features: color, neighbors, edges, \ldots
\bigskip
\end{frame}
\begin{frame}\frametitle{Example: Semantic Segmentation - 3}
A simpler setup:
\begin{itemize}
\item input: $\bfy$ is the RGB value of the pixel (and its neighbors?)
\item output: $\bfc$ is a labeled pixel
\item goal: map $\bfc = f(\bfy,\bftheta)$
\end{itemize}
\begin{tabular}{cc}
\includegraphics[width=5.5cm]{circles.jpg} &
\includegraphics[width=4.5cm]{circlesFeatSpace.jpg} \\
input image and segmentation & 3D representation of RGB values \\
\end{tabular}
\end{frame}
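\begin{frame}[fragile]\frametitle{Example: Semantic Segmentation - Pixel-wise Data}
A minimal NumPy sketch of the pixel-wise setup above; the random image and labels are hypothetical stand-ins for real data:
\begin{verbatim}
import numpy as np
np.random.seed(0)

H, W = 64, 64
img = np.random.rand(H, W, 3)           # RGB image
lbl = np.random.randint(0, 3, (H, W))   # pixel labels

# each pixel becomes one example: y_j in R^3, c_j its label
Y = img.reshape(-1, 3)                  # n = H*W, n_f = 3
C = lbl.reshape(-1)
\end{verbatim}
The classifier $\bfc = f(\bfy,\bftheta)$ then acts on each row of $Y$.
\end{frame}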
\begin{frame}[allowframebreaks]
\frametitle{References}
\bibliographystyle{abbrv}
\bibliography{NumDNN}
\end{frame}
\end{document}