In [6]:
from selenium import webdriver

# Launch a headless Chrome session and fetch the listing page.
chrome_options = webdriver.ChromeOptions()
# The `headless` attribute setter is deprecated in Selenium 4; pass the browser flag instead.
chrome_options.add_argument("--headless=new")
chrome = webdriver.Chrome(options=chrome_options)

# `get()` only navigates and returns None; the rendered HTML lives in `page_source`.
chrome.get("https://www.zillow.com/stanford-ca/sold/")
page = chrome.page_source
# NOTE: call chrome.quit() once the session is no longer needed to release the browser process.

In [22]:
import requests
from bs4 import BeautifulSoup

# Download one Zillow page and parse it.
url = 'https://www.zillow.com/homedetails/19506780_zpid/'
response = requests.get(url, timeout=30)  # never hang the kernel on a stalled connection
soup = BeautifulSoup(response.content, 'html.parser')

# Collect the listing-card links; `href=True` skips anchors that carry no href attribute.
links = [a['href'] for a in soup.find_all('a', class_='list-card-link', href=True)]
# A link ends with ".../<zpid>_zpid/": take the last path segment and keep the part
# before the underscore (splitting on '-' would return the segment unchanged).
ids = [link.rstrip('/').split('/')[-1].split('_')[0] for link in links]

In [26]:
# Pull the sold price / sold date out of the 'ds-home-details-chip' block of `soup`.
result = {}
details_div = soup.find('div', 'ds-home-details-chip')

if not details_div:
    print("Could not find the 'ds-home-details-chip' div.")
else:
    p_tag = details_div.find('p')
    if p_tag:
        sold_items = [span.text for span in p_tag.find_all('span')]

        # Each span is plain text; pick out the pieces that mention the sale.
        for item in sold_items:
            words = item.split(' ')
            if 'Sold:' in item:
                result['Sold Price'] = words[1]   # second space-separated token
            if 'Sold on' in item:
                result['Sold On'] = words[-1]     # last space-separated token

print(result)

Could not find the 'ds-home-details-chip' div.
{}


In [29]:
import requests
import re

# Fetch the content of the Zillow page.
url = 'https://www.zillow.com/homedetails/2626-Iron-St-Bellingham-WA-98225/23624938_zpid/'  # Replace with the actual URL
response = requests.get(url, timeout=30)  # never hang the kernel on a stalled connection
html = response.text

# Extract the image IDs. The pattern targets URLs whose slashes are backslash-escaped
# in the page source (https:\/\/photos...). The capture group is a proper character
# class and the literal dots are escaped.
p = r'https:\\/\\/photos\.zillowstatic\.com\\/fp\\/([\w\-]+)\.jpg'
# A captured name looks like "<image id>-<size suffix>"; keep only the id.
ids = [name.split('-')[0] for name in re.findall(p, html)]

# Construct the image URLs for the large rendition of every image.
urls = [f'https://photos.zillowstatic.com/fp/{image_id}-uncropped_scaled_within_1536_1152.jpg' for image_id in ids]


In [33]:
# Dump the fetched page source to inspect what the server actually returned.
print(html)

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta name="description" content="px-captcha">
    <title>Access to this page has been denied</title>
    
</head>
<body>
<script>
    /* PerimeterX assignments */
    window._pxVid = '';
    window._pxUuid = 'faf9548e-5c20-11ee-a3b0-4695041729ec';
    window._pxAppId = 'PXHYx10rg3';
    window._pxMobile = false;
    window._pxHostUrl = '/HYx10rg3/xhr';
    window._pxCustomLogo = 'https://www.zillowstatic.com/s3/pfs/static/z-logo-default.svg';
    window._pxJsClientSrc = '/HYx10rg3/init.js';
    window._pxFirstPartyEnabled = true;
    var pxCaptchaSrc = '/HYx10rg3/captcha/captcha.js?a=c&u=faf9548e-5c20-11ee-a3b0-4695041729ec&v=&m=0';

    var script = document.createElement('script');
    script.src = pxCaptchaSrc;
    script.onerror = function () {
        script = document.createElement('script');
        script.src = 'https://captcha.px-clou

In [None]:
# rules to check if YouTube comments are spam or ham
def check_out(x):
    """Labeling rule: vote SPAM when the text contains "check out" (case-insensitive), otherwise ABSTAIN."""
    if "check out" in x.lower():
        return SPAM
    return ABSTAIN
def sentiment(x):
    """Labeling rule: vote HAM when sentiment_polarity(x) exceeds 0.9, otherwise ABSTAIN."""
    if sentiment_polarity(x) > 0.9:
        return HAM
    return ABSTAIN
def short_comment(x):
    """Labeling rule: vote HAM when the text has fewer than 5 whitespace-separated words, otherwise ABSTAIN."""
    word_count = len(x.split())
    if word_count < 5:
        return HAM
    return ABSTAIN

## ways to get data
### scraping
### crowd-sourcing
### active learning: chooses an example whose prediction is most uncertain, and give it to the human moderator to label
### self-training: iteratively train models to label unlabeled data
### data programming

# EDA - Exploratory Data Analysis

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython import display
# Ask the inline backend to render figures as SVG.
# NOTE(review): the cell output echoes this line, which looks like a deprecation warning;
# newer IPython versions appear to expect this to be configured through matplotlib_inline — confirm.
display.set_matplotlib_formats('svg')

  display.set_matplotlib_formats('svg')


In [None]:
# Load the house-sale table (pandas reads the zipped CSV directly).
data = pd.read_csv('house_sale.zip')
# Only the last expression of a cell is displayed, so the shape has to be printed explicitly.
print(data.shape)
data.head()

In [None]:
# Count the missing values per column, then list the columns with fewer than 30% missing entries.
null_sum = data.isna().sum()
max_missing = len(data) * 0.3
data.columns[null_sum < max_missing]

In [None]:
# Drop every column with more than 30% missing values.
# The column names are taken from `null_sum` itself (not from the current `data.columns`)
# and already-removed columns are ignored, so re-running this cell is harmless.
sparse_columns = null_sum.index[null_sum > len(data) * 0.3]
data = data.drop(columns=sparse_columns, errors='ignore')

In [None]:
# Show the dtype pandas assigned to each column.
data.dtypes

In [None]:
# Convert the money columns from text to float.
currency = ['Sold Price', 'Listed Price', 'Tax assessed value', 'Annual tax amount']
for c in currency:
    # Strip '$', ',' and '-' (a dash normally means "no data").
    stripped = data[c].replace(r'[$,-]', '', regex=True)
    # Anything that is now blank becomes NaN so the cast to float succeeds.
    blanks_as_nan = stripped.replace(r'^\s*$', np.nan, regex=True)
    data[c] = blanks_as_nan.astype(float)

In [None]:
# Convert the area columns to numeric values, multiplying 'Acres' entries by 43560.
areas = ['Total interior livable area', 'Lot size']
for c in areas:
    # Remember which rows are given in acres before the unit text is removed;
    # na=False maps missing values to False directly.
    acres = data[c].str.contains('Acres', na=False)
    # Strip the unit suffixes (' sqft', ' Acres') and thousands separators.
    col = data[c].replace(
        r'\b sqft\b|\b Acres\b|\b,\b', '', regex=True
    ).astype(float)
    col[acres] *= 43560
    data[c] = col

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Flag rows whose areas[1] value is below 10 or above 1e4, remove them, and show how many were flagged.
second_area = data[areas[1]]
abnormal = second_area.lt(10) | second_area.gt(1e4)
data = data[~abnormal]
abnormal.sum()

In [None]:
# Histogram of log10(sold price); the ticks are relabelled with the un-logged values.
log_price = np.log10(data['Sold Price'])
ax = sns.histplot(log_price)
ax.set_xlim([3, 8])
ax.set_xticks(range(3, 9))
tick_labels = [format(value, '.0e') for value in 10 ** ax.get_xticks()]
ax.set_xticklabels(tick_labels)

In [None]:
# Count how often each distinct 'Type' value occurs and show the first 20 entries.
data['Type'].value_counts()[0:20]

In [None]:
# Density of log10(sold price), one curve per house type, restricted to four types.
types = data['Type'].isin(['SingleFamily', 'Condo', 'MultiFamily', 'Townhouse'])
selected = data[types]
price_by_type = pd.DataFrame({
    'Sold Price': np.log10(selected['Sold Price']),
    'Type': selected['Type'],
})
sns.displot(price_by_type, x='Sold Price', hue='Type', kind='kde')  # kde = density

In [None]:
# Box plot of the price per livable square foot for each selected house type.
data['Price per living sqft'] = data['Sold Price'] / data['Total interior livable area']
# `fliersize=0` hides the outlier markers (`filtersize` is not a boxplot argument).
ax = sns.boxplot(x='Type', y='Price per living sqft', data=data[types], fliersize=0)
ax.set_ylim([0, 2000])
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

In [None]:
# Annotated correlation heatmap of the selected price/tax/school-score columns.
columns = ['Sold Price', 'Listed Price', 'Annual tax amount', 'Price per living sqft', 'Elementary School Score', 'High School Score']
_, ax = plt.subplots(figsize=(6, 6))
correlations = data[columns].corr()
sns.heatmap(correlations, annot=True, cmap='RdYlGn', ax=ax)

# Data Cleaning

In [None]:
# First 20 entries of the per-value counts of the 'Type' column.
data['Type'].value_counts()[0:20]

## tabular data
## Normalization for real-valued columns
### 1. Min-max normalization: linearly map to a new min a and max b
### xi` = (xi - min(x)) / (max(x) - min(x)) * (b - a) + a
### 2. Z-score normalization: 0 mean, 1 std
### xi` = (xi - mean(x)) / std(x)
### 3. Decimal scaling:
### xi` = xi / (10 ** j)
### 4. Log scaling:
### xi` = log(xi)

## Image Transformations
### Downsampling and cropping
### reduce image sizes: save storage and for fast loading
### jpeg will affect the image quality (80% acc)
### image whitening: make input less redundant, model converges faster

## Average video length
### movies - 2h, YouTube videos - 11 min, TikTok short videos - 15 sec
## Preprocessing to balance storage, quality and loading speed
## we often use short video clips (< 10 sec): each clip contains a single event (e.g. a human action)
## Decode a playable video, sample a sequence of frames
### best for loading, but 10x more space
### computation may be cheaper than storage (i.e. use GPU to decode)
### can apply other image transformation to the frames

## Text Transformation
## stemming and lemmatization: a word -> a common base form
### i.e. am, are, is -> be
## Tokenization: text -> a list of tokens (smallest unit to ML algo)
### by word: text.split(' ')
### by char: list(text)  (text.split('') raises ValueError in Python)
### by subwords: unigram, wordpiece, i.e. "a new gpu!" -> "a", "new", "gp", "##u", "!"


## feature engineering
## tabular data features
## int/float: directly use or bin to n unique int values
## categorical data: one-hot encoding, map rare categories into "unknown"
## data-time: a feature list such as [year, month, day, day_of_year, week_of_year, day_of_week]
## feature combination: cartesian product of two feature
### [cat, dog] * [male, female] -> 
### [(cat, male), (cat, female), (dog, male), (dog, female)]

## text features
## represent text as token features
## bag of words (BoW) model
### i.e. dog and cat and dinosaur:  fish, cat, and, dog, unknown -> [0, 1, 2, 1, 1]
### limitations: needs careful vocabulary design, missing context
## word embeddings (e.g. Word2vec):
### vectorizing words such that similar words are placed close together
### trained by predicting target word from context words
## pre-trained language models(e.g. BERT, GPT-3)
### giant transformer models
### trained with large amount of unannotated data
### fine-tuning for downstream tasks

## image/video features
### traditionally extract images by hand-craft features such as SIFT
### now commonly use pre-trained deep neural networks
### resnet: trained with imagenet (image classification)
### I3D: trained with Kinetics (action classification)

## challenges
### trade-off between label quality vs data volume
### data quality: 
### diversity: all relevant aspects are represented
### unbiased: not biased toward a particular side
### fairness: non-discriminating treatment of data and people
## large-scale data management: storage, process, version, security

## decision trees
### pros: explainable, can handle both numerical and categorical features
### cons: very non-robust (ensemble to help), complex trees cause overfitting (prune trees), not easy to be parallelized in computing

## random forest
### train multiple decision trees to improve robustness
### each tree is trained independently
### majority voting for classification, average for regression
## where is the randomness from?
### bagging: randomly sample training examples with replacement i.e. [1,2,3,4,5] -> [1,2,2,3,4]
### randomly select a subset of features

## Gradient Boosting Decision Trees
### train multiple trees sequentially on residuals of error (loss function)

## Linear Regression
### softmax regression to solve classification problem
### 

In [None]:
# Softmax over dim 1 of the output tensor O (assumes O is 2-D with one row per example — TODO confirm).
# An identifier cannot start with a digit, so the letter O is used, not the digit 0.
O_exp = torch.exp(O)
partition = O_exp.sum(1, keepdim=True)  # sum along dim 1; keepdim so the division broadcasts
Y = O_exp / partition

## Mini-batch Stochastic Gradient Descent (SGD)
### w model param, b batch size, nt learning rate at time t
### randomly initialized w1
### repeat t = 1,2,... until convergence
#### randomly samples It <- 1,2,...,n with |I| = b
#### update Wt+1 = Wt - nt * gradient(wt)
### Pros: solve all objectives in this course except for trees
### Cons: sensitive to hyper-parameters b and nt

## Code
### train a linear regression model with mini-batch SGD
## Hyperparameters 
### batch_size, learning_rate, num_epochs

`features` shape is (n, p), `labels` shape is (n, 1)
def data_iter(batch_size, features, labels):
    """Yield shuffled mini-batches of (features, labels).

    Args:
        batch_size: maximum number of examples per batch; the last batch may be smaller.
        features: tensor indexable by an index tensor along dim 0.
        labels: tensor with the same first-dimension length as `features`.

    Yields:
        (features[batch], labels[batch]) pairs; every example appears exactly once per pass.
    """
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)  # read examples in random order
    for start in range(0, num_examples, batch_size):
        # Slicing clamps at the end of the list, so the final short batch needs no special case.
        batch_indices = torch.tensor(indices[start:start + batch_size])
        yield features[batch_indices], labels[batch_indices]

# Parameters of the linear model: weights drawn from N(0, 0.01^2), bias starts at zero.
w = torch.normal(0, 0.01, size=(p, 1), requires_grad=True)
b = torch.zeros(1, requires_grad=True)

for epoch in range(num_epochs):
    for X, y in data_iter(batch_size, features, labels):
        y_hat = X @ w + b
        loss = ((y_hat - y) ** 2 / 2).mean()  # mean squared error over the batch
        loss.backward()  # compute the gradient of the loss w.r.t. w and b
        # The update must not be tracked by autograd: in-place edits of a leaf tensor
        # that requires grad are only allowed under no_grad().
        with torch.no_grad():
            for param in [w, b]:
                param -= learning_rate * param.grad
                # Reset the gradient in place (trailing underscore), because backward()
                # accumulates into .grad on the next mini-batch.
                param.grad.zero_()


# Neural Network
## NN usually requires more data and more computation
## NN architectures to model data structures
### Multilayer perceptron (MLP), Convolutional Neural Networks, Recurrent Neural Networks, Transformer

## MLP
## A dense (fully connected, or linear) layer has parameters W <- R(m*n), b <- R(m), it computes output y = Wx + b <- R(m)
### Linear Regression: dense layer with 1 output
### Softmax regression: dense layer with m outputs + softmax

## Activation is an element-wise non-linear function
### sigmoid(x) = 1 / (1 + exp(-x))
### ReLU(x) = max(x, 0)
### it leads to non-linear models

## Stack multiple hidden layers
### (dense + activation) to get deeper models

## Hyper-parameters
### # hidden layers
### # outputs for each hidden layer

Code

MLP with 1 hidden layer
Hyperparameter: num_hiddens

def relu(x):
    """Element-wise ReLU: return a tensor of the same shape with negatives replaced by 0.

    `torch.max(x, 0)` would instead reduce along dim 0 and return a (values, indices)
    pair, so the element-wise clamp is used.
    """
    return torch.clamp(x, min=0)

# One-hidden-layer MLP: small random weights, zero biases.
init_scale = 0.01
W1 = nn.Parameter(init_scale * torch.randn(num_inputs, num_hiddens))
b1 = nn.Parameter(torch.zeros(num_hiddens))
W2 = nn.Parameter(init_scale * torch.randn(num_hiddens, num_outputs))
b2 = nn.Parameter(torch.zeros(num_outputs))

# Forward pass: dense -> ReLU -> dense.
H = relu(torch.matmul(X, W1) + b1)  # hidden layer output
Y = torch.matmul(H, W2) + b2

# Convolution layer
## Learn ImageNet (300*300 images with 1K classes) by a MLP with a single hidden layer with 10K outputs
### it leads to 1 billion learnable parameters, that's too big!
### fully connected: an output is a weighted sum over all inputs
## recognize objects in images
### translation invariance: similar output no matter where the object is
### locality: pixels are more related to near neighbors
## build the prior knowledge into the model structure
### achieve same model capacity with less # params 

### Locality: an output is computed from k*k input windows
### Translation invariant: outputs use the same k*k weights(kernel)
### number of model params of a conv layer does not depend on input/output sizes
### A kernel may learn to identify a pattern

## Code
### Convolution with single input and output channels

### both input 'X' and weight 'K' are matrices
# Slide the h x w kernel K over X; each output is the sum of the element-wise product.
h, w = K.shape
Y = torch.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))
for i in range(Y.shape[0]):
    for j in range(Y.shape[1]):
        # The kernel is K (upper case); it is applied without flipping,
        # which is why this operation is strictly called cross-correlation.
        Y[i, j] = (X[i:i + h, j:j + w] * K).sum()

## Pooling Layer
### Convolution is sensitive to location
### A pixel shift in the input results in a pixel shift in output
### A pooling layer computes mean/max in k*k windows

# h, w: pooling window height and width
# mode: 'max' or 'avg'
Y = torch.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))
for i in range(Y.shape[0]):
    for j in range(Y.shape[1]):
        # Pooling has no weights: reduce the raw window directly (no kernel multiplication).
        window = X[i:i + h, j:j + w]
        if mode == 'max':
            Y[i, j] = window.max()
        elif mode == 'avg':
            Y[i, j] = window.mean()

# Convolutional Neural Networks (CNN)
## A neural network uses stack of convolution layers to extract features
### Activation is applied after each convolution layer, to add some non-linear transformation
### Using pooling to reduce location sensitivity, adaptable to tiny position change or noises
## Modern CNNs are deep neural network with various hyper-parameters and layer connections (AlexNet, VGG, Inceptions, ResNet, MobileNet) 

i.e. Conv -> Pooling -> Conv -> Pooling -> Dense -> Outputs

# LeNet-style stack: (conv -> sigmoid -> avg-pool) x 2, then three dense layers.
# Sketch only: each `...` stands for the layer's hyper-parameters and must be filled in.
lenet = nn.Sequential(  # the container class is `Sequential` (capital S)
    nn.Conv2d(...),
    nn.Sigmoid(),
    nn.AvgPool2d(...),
    nn.Conv2d(...),
    nn.Sigmoid(),
    nn.AvgPool2d(...),
    nn.Flatten(),
    nn.Linear(...),
    nn.Sigmoid(),
    nn.Linear(...),
    nn.Sigmoid(),
    nn.Linear(...)
)

# Dense Layer -> Recurrent networks

## Language model: predict the next word
### hello -> world;   hello world -> !
## Use MLP naively doesn't handle sequence info well:

# RNN and Gated RNN
## Simple RNN: h(t) = sigma(W(hh) * h(t-1) + W(hx) * x(t) + b(h)), where sigma is an activation function (e.g. tanh)
## Gated RNN (LSTM, GRU): finer control of information flow
### forget input: suppress X(t) when computing h(t)
### forget past: suppress h(t-1) when computing h(t)
### when should we forget the input or the past? we learn separate weights (gates) to compute that

Code:

## Implement Simple RNN
 
W_xh = nn.Parameter(torch.randn(num_inputs, num_hiddens) * 0.01)
# The hidden-to-hidden weights map a hidden state to a hidden state, so the matrix must be
# (num_hiddens, num_hiddens); otherwise `H @ W_hh` fails whenever num_inputs != num_hiddens.
W_hh = nn.Parameter(torch.randn(num_hiddens, num_hiddens) * 0.01)
b_h = nn.Parameter(torch.zeros(num_hiddens))

H = torch.zeros(num_hiddens)  # at time zero there is no history; broadcasts over the batch
outputs = []

for X in inputs:  # 'inputs' shape : (num_steps, batch_size, num_inputs)
    H = torch.tanh(X @ W_xh + H @ W_hh + b_h)
    outputs.append(H)


# Bi-RNN and Deep RNN
we can look at a sentence from 2 directions, from right to left, from left to right

# Deep RNN
t -> RNN -> RNN -> RNN -> outputs
each RNN layer can be bi-directional or LSTM etc.

# Tabular -> Trees/Linear/MLP
# Text/Speech -> RNNS/Transformers
# Images/Audio/Video -> Transformers/CNNs

# Summary
## MLP: stack dense layers with non-linear activations
## CNN: stack convolution, activation and pooling layers to efficiently extract spatial information
## RNN: stack recurrent layers to pass temporal information through hidden state

# Reduce Bias & Variance

## Model generalization error: bias, variance and intrinsic error

## Reduce Bias: A more complex model, i.e. increase # layers, # hidden units in neural network
### Boosting, Stacking
## Reduce Variance: A simpler model, Regularization, i.e. L1,L2 regularizations
### Bagging, Stacking
## Reduce sigma^2: Improve data

## Ensemble Learning: use multiple models to improve predictive performance

# Bagging - Bootstrap Aggregating

## Bagging trains n base learners in parallel
## Make decisions by averaging learners' outputs (regression) or majority voting (classification)
## Each learner is trained on data by bootstrap sampling
### Assume m training examples, then randomly sample m examples with replacement (the same example may be picked repeatedly)
### Around 1 - 1/e = 63% of the unique examples will be sampled; the remaining out-of-bag examples can be used for validation


code: 

class Bagging:
    """Bootstrap-aggregating ensemble that averages its learners' predictions.

    Every learner is an independent copy of `base_learner`, fitted on a bootstrap
    sample (drawn with replacement, same size as the training set).
    """

    def __init__(self, base_learner, n_learners):
        # `clone` is expected to be in scope (presumably sklearn.base.clone — confirm);
        # it gives every ensemble member its own copy of the base learner.
        self.learners = [clone(base_learner) for _ in range(n_learners)]

    def fit(self, X, y):
        """Fit each learner on its own bootstrap sample of (X, y).

        X and y are indexed positionally through `.iloc`, so they must be pandas objects.
        """
        n_examples = len(X)
        for learner in self.learners:
            # Draw n_examples row positions with replacement.
            examples = np.random.choice(n_examples, n_examples, replace=True)
            learner.fit(X.iloc[examples, :], y.iloc[examples])

    def predict(self, X):
        """Return the element-wise mean of the learners' predictions for X."""
        preds = [learner.predict(X) for learner in self.learners]
        return np.array(preds).mean(axis=0)

# Random Forest
## Use decision tree as the base learner
## Often randomly select a subset of features for each learner

# Unstable Learners
## Bagging reduces variance, especially for unstable learners
## Bagging reduces variance more, when base learners are more unstable
## Decision tree is unstable, linear regression is stable

# Boosting
## Boosting combines weak learners into a strong one
### Primarily to reduce bias
## Learn n weak learners sequentially, at step i:
### Train a weak learner h(i), evaluate its errors epsilon(i)
### re-sample data according to epsilon(i) to focus on wrongly predicted examples
## Notable examples include AdaBoost, gradient boosting

# Gradient Boosting
## Denote by H(t)(x) the model at time t, with H(1)(x) = 0
## At step t = 1,2,...
### Train a new model h(t) on residuals: {(x(i), y(i) - H(t)(x(i)))}, i = 1,2,...
### H(t+1)(x) = H(t)(x) + eta * h(t)(x)
#### The learning rate eta regularizes the model by shrinkage
## The residuals equal to -dL/dH if using MSE as the loss
### Other boosting algo (i.e. AdaBoost) can also be gradient descent in the function space

Code:

class GradientBoosting:
    """Gradient boosting for squared loss: each learner fits the residual left by the previous ones."""

    def __init__(self, base_learner, n_learners, learning_rate):
        # `clone` is expected to be in scope (presumably sklearn.base.clone — confirm).
        self.learners = [clone(base_learner) for _ in range(n_learners)]
        self.lr = learning_rate

    def fit(self, X, y):
        """Fit the learners sequentially; `y` itself is never modified."""
        residual = y
        for learner in self.learners:
            learner.fit(X, residual)
            # Out-of-place update: an in-place `-=` raises on integer-typed targets
            # because the shrunken prediction is floating point.
            residual = residual - self.lr * learner.predict(X)

    def predict(self, X):
        """Return the learning-rate-scaled sum of all learners' predictions."""
        preds = [learner.predict(X) for learner in self.learners]
        return np.array(preds).sum(axis=0) * self.lr

# Gradient Boosting Decision Trees (GBDT)
## Use decision tree as the weak learner
### Regularize by a small max_depth and randomly sampling features
## Sequentially constructing trees runs slow
### Popular libs use accelerated algo, i.e. XGBoost, lightGBM, these 2 algos only run faster


# Summary
## Boosting combines weak learners into a strong one to reduce bias
## Gradient boosting learns weak learners by fitting the residuals


# Stacking 
## Combine multiple base learners to reduce variance
### Base learners can be different model types 
### Linearly combine base learners outputs by learned parameters
## Widely used in competitions
## In comparison, bagging
### uses same type models, uses bootstrap to get diversity

# Multi-layer Stacking 
## Stacking base learners in multiple levels to reduce bias
### can use a different set of base learners at each level
## Upper levels (i.e. L2) are trained on the outputs of the below level (i.e. L1)
### Concatenating original inputs helps
## But multi-layer stacking very easily leads to overfitting

# Overfitting in Multi-layer Stacking
## Train learners from different levels on different data to alleviate overfitting 
### Split training data into A and B, train L1 learners on A, predict on B to generate inputs to L2 learners
## Repeated k-fold bagging:
### Train k models as in k-fold cross validation
### Combine predictions of each model on out-of-fold data
### Repeat step 1,2 by n times, average the n predictions of each example for the next level training

# Manual Hyperparameter Tuning
## Start with a good baseline, e.g. default settings in high-quality toolkits, values reported in papers
## Tune a value, retrain the model to see the changes
## Repeat multiple times to gain insights about
### Which hyperparameters are important
### How sensitive the model to hyperparameters
### What are the good ranges

# HPO algorithms
## Hyperparameter Optimization
### Black box -> Grid Search, Random Search, Bayesian Optimization, Simulated Annealing, Genetic Algorithms
### Multi-fidelity Optimization -> Modeling Learning Curve, Bandit Based -> Successive Halving, Hyperband

## Grid Search

def grid_search(search_space, train_and_eval):
    """Evaluate every configuration and return the best one.

    Args:
        search_space: iterable of configurations (all combinations to try).
        train_and_eval: callable mapping a configuration to a score; higher is better.

    Returns:
        (best_config, best_result), or (None, None) when the search space is empty.
    """
    best_config, best_result = None, None
    for config in search_space:
        result = train_and_eval(config)
        # Keep the first configuration that reaches the highest score.
        if best_result is None or result > best_result:
            best_config, best_result = config, result
    return best_config, best_result

### All combinations are evaluated
### Guarantees the best result
### Curse of dimensionality

## Random Search (the most common way)

def random_search(search_space, train_and_eval, n, rng=None):
    """Evaluate `n` randomly drawn configurations and return the best one.

    Args:
        search_space: iterable of candidate configurations.
        train_and_eval: callable mapping a configuration to a score; higher is better.
        n: number of trials (configurations are drawn with replacement).
        rng: optional object with a `choice` method (e.g. random.Random(seed))
            for reproducible draws; defaults to the `random` module.

    Returns:
        (best_config, best_result), or (None, None) when nothing was evaluated.
    """
    import random

    chooser = rng if rng is not None else random
    candidates = list(search_space)
    best_config, best_result = None, None
    if not candidates:
        return best_config, best_result
    for _ in range(n):
        config = chooser.choice(candidates)
        result = train_and_eval(config)
        if best_result is None or result > best_result:
            best_config, best_result = config, result
    return best_config, best_result

### Random combinations are tried
### More efficient than grid search

## Bayesian Optimization (BO)
### BO: Iteratively learn a mapping from HP to objective function. Based on previous trials. Select the next trial based on the current estimation.
### Surrogate model: Estimate how the objective function depends on HP. Probabilistic regression models: Random Forest, Gaussian Process.


## Acquisition function
### Acquisition max means uncertainty and predicted objective are high.
### Sample the next trial according to the acquisition function
### Trade off exploration and exploitation
## Limitation of BO:
### In the initial stages, similar to random search
### Optimization process is sequential

## Successive Halving (SH)
### Save the budget for most promising config
### Randomly pick n configurations to train m epochs
### Repeat until one configuration left:
    keep the best n/2 configuration to train another m epochs
    keep the best n/4 configuration to train another 2m epochs
### select n and m based on training budget and # epoch needed for a full training

## Hyperband (more suitable for deep neural network, because we can select a sample from the data to run it)
### In successive Halving
    n : exploration
    m : exploitation
### Hyperband runs multiple Successive Halving, each time decreases n and increases m
    more exploration first, then do more exploit

# NAS algorithms - Neural Architecture Search
## NAS automates the design of neural network
### how to specify the search space of NN
### how to explore the search space
### performance estimation


## The one-shot approach
### Combines the learning of architecture and model params
### Construct and train a single model presents a wide variety of architectures
### Evaluate candidate architectures
    only care about the candidate ranking
    use a proxy metric: the accuracy after a few epochs
### re-train the most promising candidate from scratch

# Batch Normalization
## Standardizing data makes the loss smoother for linear methods
### smooth: ||df(x) - df(y)||^2 <= beta*||x-y||^2
### a smaller beta allows a larger learning rate
### does not help deep NN
## Batch Normalization (BN) standardizes inputs for internal layers
### improves the smoothness to make training easier
### (still controversial why BN works)

### Reshape input X into 2D (no change for 2D input X -> R(n*p))
    x -> R(n*c*w*h) -> x' -> R(nwh*c) (batch n, channel c, width w, height h)
### Normalize by standardizing each column xj', j = 1,...,c
    xj'(hat) <- (xj' - mean(xj')) / std(xj')
### Recover Y' with yj' = gammaj * xj'(hat) + betaj as the j-th column; gammaj, betaj are learnable params
### Output Y by reshaping Y' to the same shape as before

code:

def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Batch normalization for a 2-D or 4-D input.

    When autograd is disabled (prediction mode) the input is standardized with the
    moving statistics, which are returned unchanged. Otherwise the statistics of the
    current batch are used and blended into the moving averages with `momentum`.

    Returns:
        (Y, moving_mean, moving_var) where Y = gamma * X_hat + beta.
    """
    if not torch.is_grad_enabled():  # prediction mode
        normalized = (X - moving_mean) / torch.sqrt(moving_var + eps)
        return gamma * normalized + beta, moving_mean, moving_var

    rank = len(X.shape)
    assert rank in (2, 4)
    if rank == 2:
        # 2-D input (e.g. NLP / dense layer): statistics over dim 0.
        reduce_dims, keep = 0, False
    else:
        # 4-D convolution input: statistics over dims (0, 2, 3), kept as (1, c, 1, 1).
        reduce_dims, keep = (0, 2, 3), True

    batch_mean = X.mean(dim=reduce_dims, keepdim=keep)
    batch_var = ((X - batch_mean) ** 2).mean(dim=reduce_dims, keepdim=keep)
    normalized = (X - batch_mean) / torch.sqrt(batch_var + eps)

    # Exponential moving averages of the batch statistics.
    moving_mean = momentum * moving_mean + (1.0 - momentum) * batch_mean
    moving_var = momentum * moving_var + (1.0 - momentum) * batch_var
    return gamma * normalized + beta, moving_mean, moving_var

# Layer Normalization
## if apply to RNN, BN needs maintain separated moving statistics for each time step
### problematic for very long sequences during inference
## Layer normalization reshapes input X -> R(n*p) -> X' -> R(p*n)
    X -> R(n*c*w*h) -> X' -> R(cwh*n), rest is same with BN
### Normalizing within each example, up to current time step
### Consistent between training and inference
### Popularized by Transformers 

# More Normalizations
## Modify "reshape", e.g.
### InstanceNorm: n*c*w*h -> wh*cn
### GroupNorm: n*c*w*h -> swh*gn with c = sg
### CrossNorm: swap mean/ std between a pair of features
## Modify "normalize": e.g. whitening
## Modify "recovery": e.g. replace gamma, beta with a dense layer
## Apply to weights or gradients

# Summary
## Normalizing inputs of internal layers makes deep NNs easier to train
## A normalization layer performs three steps: reshape input, normalize data, recovery with learnable params
### Notable examples include Batch Norm for CNNs, Layer Norm for Transformers

# Transfer Learning
## Motivation 
### Exploit a model trained on one task for a related task
### Popular in deep learning as DNNs are data hungry and training cost is high
## Approaches
### Feature extraction (e.g. Word2Vec, ResNet-50 feature, I3D feature)
### Train a model on a related task and reuse it
### Fine-tuning from a pretrained model (focus of this lecture)
## Related to 
### Semi-supervised learning
### In the extreme, zero-shot/ few-shot learning
### Multi-task learning, where some labeled data is available for each task

# Pre-trained Models
## Partition a neural network into:
### A feature extractor (encoder) maps raw pixels into linearly separable features
### A linear classifier (decoder) makes decisions
## Pre-trained model
### a neural network trained on a large-scale and general enough dataset 
### The feature extractor may generalize well to
    other datasets (e.g. medical/ satellite images)
    other tasks (e.g. object detection, segmentation)

# Fine-tuning techniques
## initialize the new model:
### initialize the feature extractor with the feature extractor params of a pre-trained model
### randomly initialize the output layer
### start the params optimization near a local minimum
## Train with a small learning rate with just a few epochs
### regularize the search space

# Freeze Bottom Layers
## Neural Networks learn hierarchical features
### Low-level features are universal, generalize well, e.g. curves/edges/blobs
### high-level features are more task and dataset specific, e.g. classification labels
## freeze bottom layers during fine-tuning; train the top layers from scratch
### keep Low-level universal features intact
### focus on learning task specific features
### a strong regularizer

# where to find pre-trained models
## tensorflow hub: https://tfhub.dev/
### tensorflow models submitted by users
## TIMM: https://github.com/rwightman/pytorch-image-models
### pytorch models collected by Ross Wightman

import timm
from torch import nn

# Load a ResNet-18 with pre-trained weights.
model = timm.create_model('resnet18', pretrained=True)
# fc -> final layer -> fully connected layer
# n_classes is the number of classes for your task
# Replace the head on `model`; `mode` is an undefined name.
model.fc = nn.Linear(model.fc.in_features, n_classes)
# train model as a normal training job

# where to find pre-trained nlp models
## huggingface: a collection of pre-trained transformers models on both pytorch and tensorflow

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# padding="max_length" pads every sentence to the model's maximum length;
# truncation=True cuts longer sentences down to that length (the keyword is
# `truncation`, not `truncated`).
inputs = tokenizer(sentences, padding="max_length", truncation=True)
# num_labels=2: a classification task with 2 classes
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
# train model on inputs as a normal training job