In [1]:
__author__ = "Isaac Cheng"
__purpose__ = "XCS224U Project Literature Review"
__version__ = "03/03 2023"

## Contents
1. [Transformer](#Transformer)
1. [BERT](#BERT)
1. [All The Ways You Can Compress BERT](#All-The-Ways-You-Can-Compress-BERT)
1. [Literature Review](#Literature-Review)

## Transformer

This section is based on [The Annotated Transformer](https://nlp.seas.harvard.edu/annotated-transformer/).

In [None]:
import torch
import torch.nn as nn

In [52]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last axis with a learned gain and offset.

    Each vector along the final dimension is centred on its mean and divided
    by its standard deviation (as returned by ``Tensor.std`` with default
    settings) plus ``eps``. Note that ``eps`` is added to the standard
    deviation itself, not to the variance.

    Args:
        features: Length of the last axis; sizes the ``a_2`` and ``b_2`` vectors.
        eps: Small constant added to the standard deviation before dividing.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.eps = eps
        # Gain starts at one and offset at zero, so the layer initially
        # returns the plain normalised input.
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        """Return ``a_2 * (x - mean) / (std + eps) + b_2`` along the last axis."""
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2

In [53]:
x = torch.FloatTensor([[1, 2], [3, 7], [5, 6]])
normLayer = LayerNorm(features=2)
normLayer(x)

tensor([[-0.7071,  0.7071],
        [-0.7071,  0.7071],
        [-0.7071,  0.7071]], grad_fn=<AddBackward0>)

In [89]:
def subsequent_mask(size):
    """Build a boolean mask that hides later positions.

    Returns a tensor of shape ``(1, size, size)`` whose entry ``[0, i, j]`` is
    ``True`` when ``j <= i`` (on or below the diagonal) and ``False`` for the
    strictly upper triangle.
    """
    # Ones strictly above the main diagonal mark the positions to hide.
    above_diagonal = torch.triu(
        torch.ones((1, size, size)), diagonal=1
    ).type(torch.uint8)
    # Invert: everything that is NOT above the diagonal stays visible.
    return above_diagonal == 0

In [90]:
a = torch.randn(3, 3)
a.triu()

tensor([[ 0.0245, -0.6093, -0.5761],
        [ 0.0000,  0.4098, -2.7355],
        [ 0.0000,  0.0000, -0.7653]])

In [91]:
a

tensor([[ 0.0245, -0.6093, -0.5761],
        [ 0.7395,  0.4098, -2.7355],
        [-1.1971, -0.6291, -0.7653]])

In [92]:
torch.triu(a, diagonal=1)

tensor([[ 0.0000, -0.6093, -0.5761],
        [ 0.0000,  0.0000, -2.7355],
        [ 0.0000,  0.0000,  0.0000]])

In [93]:
a.reshape((-1, a.shape[0], a.shape[1]))

tensor([[[ 0.0245, -0.6093, -0.5761],
         [ 0.7395,  0.4098, -2.7355],
         [-1.1971, -0.6291, -0.7653]]])

In [94]:
torch.triu(a, diagonal=1)

tensor([[ 0.0000, -0.6093, -0.5761],
        [ 0.0000,  0.0000, -2.7355],
        [ 0.0000,  0.0000,  0.0000]])

In [95]:
import pandas as pd

In [96]:
x = 0
y = 0
LS_data = pd.concat(
    [
        pd.DataFrame(
            {
                "Subsequent Mask": ["test"],
                "Window": y,
                "Masking": x,
            }
        )
        for y in range(20)
        for x in range(20)
    ]
)
LS_data

Unnamed: 0,Subsequent Mask,Window,Masking
0,test,0,0
0,test,0,1
0,test,0,2
0,test,0,3
0,test,0,4
...,...,...,...
0,test,19,15
0,test,19,16
0,test,19,17
0,test,19,18


In [179]:
subsequent_mask(2)[0] == 0

tensor([[False,  True],
        [False, False]])

In [100]:
x = 1
y = 2
subsequent_mask(20)[0][x][y]

tensor(False)

In [101]:
subsequent_mask(20)[0][x][y].flatten()

tensor([False])

In [103]:
import altair as alt

In [109]:
def show_example(fn, args=()):
    """Run a demo callable only when executing as the main program.

    Args:
        fn: Callable to invoke.
        args: Positional arguments forwarded to ``fn``. Defaults to an empty
            tuple (an immutable default; lists passed by callers still work).

    Returns:
        ``fn(*args)`` when ``__name__ == "__main__"`` and the module-level
        ``RUN_EXAMPLES`` flag is truthy; otherwise ``None``.
    """
    # RUN_EXAMPLES is not defined in any visible cell, so a bare reference
    # raises NameError inside a notebook. Look it up defensively and treat a
    # missing flag as "enabled"; define RUN_EXAMPLES = False to silence demos.
    run_examples = globals().get("RUN_EXAMPLES", True)
    if __name__ == "__main__" and run_examples:
        return fn(*args)
    return None

In [173]:
def example_mask():
    """Build an Altair heat map of the 20x20 subsequent-position mask.

    One row is produced per ``(x, y)`` pair with ``x`` and ``y`` in
    ``range(20)``: column "Masking" holds ``x``, column "Window" holds ``y``
    and column "Subsequent Mask" holds the mask value at ``[x, y]``.

    Returns:
        An interactive 250x250 ``alt.Chart`` drawn with ``mark_rect`` and
        coloured using the "viridis" scheme.
    """
    # The mask does not depend on the loop variables, so build it once
    # instead of once per cell (400 times).
    mask = subsequent_mask(20)[0]

    LS_data = pd.concat(
        [
            pd.DataFrame(
                {
                    "Subsequent Mask": mask[x, y].flatten(),
                    "Window": y,
                    "Masking": x,
                }
            )
            for y in range(20)
            for x in range(20)
        ]
    )

    return (
        alt.Chart(LS_data)
        .mark_rect()
        .properties(height=250, width=250)
        .encode(
            alt.X("Window:O"),
            alt.Y("Masking:O"),
            alt.Color("Subsequent Mask:Q", scale=alt.Scale(scheme="viridis")),
        )
        .interactive()
    )


show_example(example_mask)

In [111]:
def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention.

    Scores are ``query @ key^T`` (transposing the last two axes of ``key``)
    divided by the square root of the size of ``query``'s last axis. Positions
    where ``mask == 0`` are set to ``-1e9`` before the softmax over the last
    axis. ``dropout``, if given, is applied to the attention weights.

    Returns:
        A pair ``(weights @ value, weights)``.
    """
    scale = math.sqrt(query.size(-1))
    logits = (query @ key.transpose(-2, -1)) / scale

    if mask is not None:
        # A large negative score drives the softmax weight to ~0.
        logits = logits.masked_fill(mask == 0, -1e9)

    weights = torch.softmax(logits, dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    return weights @ value, weights

In [112]:
a = torch.rand(3, 4)

In [116]:
a.size(-1)

4

In [122]:
a

tensor([[0.0108, 0.3177, 0.7418, 0.5982],
        [0.5857, 0.7359, 0.4902, 0.4367],
        [0.8059, 0.4806, 0.8149, 0.3080]])

In [123]:
a.transpose(-2, -1)

tensor([[0.0108, 0.5857, 0.8059],
        [0.3177, 0.7359, 0.4806],
        [0.7418, 0.4902, 0.8149],
        [0.5982, 0.4367, 0.3080]])

In [124]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention built from four ``d_model -> d_model`` linear layers.

    The first three layers project query, key and value; the last one is
    applied to the re-merged head outputs. ``d_model`` must be divisible by
    the number of heads ``h``; each head works on ``d_k = d_model // h``
    features (the value width is assumed equal to ``d_k``).
    """

    def __init__(self, h, d_model, dropout=0.1):
        """Store the head count and model size, and create the sub-layers."""
        super().__init__()
        assert d_model % h == 0
        self.h = h
        self.d_k = d_model // h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        # Holds the attention weights from the most recent forward pass.
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """Project, attend per head, merge the heads, and project once more.

        The first axis of ``query`` is used as the batch size; the inputs are
        reshaped to ``(batch, -1, h, d_k)`` after projection.
        """
        batch_size = query.size(0)
        # An extra axis lets one mask broadcast across all heads.
        head_mask = None if mask is None else mask.unsqueeze(1)

        def split_heads(projection, tensor):
            # (batch, seq, d_model) -> (batch, h, seq, d_k)
            projected = projection(tensor)
            return projected.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

        q_heads = split_heads(self.linears[0], query)
        k_heads = split_heads(self.linears[1], key)
        v_heads = split_heads(self.linears[2], value)

        context, self.attn = attention(
            q_heads, k_heads, v_heads, mask=head_mask, dropout=self.dropout
        )

        # (batch, h, seq, d_k) -> (batch, seq, h * d_k); contiguous() is
        # needed before view() because transpose() only changes strides.
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.h * self.d_k)
        return self.linears[-1](merged)

In [126]:
d_model = 4
m = 3
linears = nn.Linear(d_model, d_model)

query = torch.rand((1, m, d_model))
print(query)

key = torch.rand((1, m, d_model))
print(key)

value = torch.rand((1, m, d_model))
print(value)

tensor([[[0.4390, 0.0707, 0.4652, 0.1999],
         [0.5032, 0.3814, 0.9141, 0.0198],
         [0.8172, 0.7814, 0.3147, 0.4379]]])
tensor([[[0.9709, 0.1316, 0.9062, 0.5892],
         [0.5745, 0.7466, 0.4297, 0.7534],
         [0.1304, 0.6124, 0.4037, 0.0398]]])
tensor([[[0.2198, 0.0372, 0.4074, 0.2016],
         [0.9907, 0.1577, 0.0249, 0.3091],
         [0.9093, 0.8080, 0.0172, 0.5201]]])


In [133]:
import copy

def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas

In [134]:
linears = clones(nn.Linear(d_model, d_model), 4)
for lin, x in zip(linears, (query, key, value)):
    print(lin)
    print(x)

Linear(in_features=4, out_features=4, bias=True)
tensor([[[0.4390, 0.0707, 0.4652, 0.1999],
         [0.5032, 0.3814, 0.9141, 0.0198],
         [0.8172, 0.7814, 0.3147, 0.4379]]])
Linear(in_features=4, out_features=4, bias=True)
tensor([[[0.9709, 0.1316, 0.9062, 0.5892],
         [0.5745, 0.7466, 0.4297, 0.7534],
         [0.1304, 0.6124, 0.4037, 0.0398]]])
Linear(in_features=4, out_features=4, bias=True)
tensor([[[0.2198, 0.0372, 0.4074, 0.2016],
         [0.9907, 0.1577, 0.0249, 0.3091],
         [0.9093, 0.8080, 0.0172, 0.5201]]])


In [135]:
lin = nn.Linear(d_model, d_model)

In [138]:
lin(query).size()

torch.Size([1, 3, 4])

In [141]:
h = 2
d_k = d_model // h
nbatches = query.size(0)
lin(x).view(nbatches, -1, h, d_k)

tensor([[[[ 0.6419,  0.7506],
          [-0.4159,  0.5317]],

         [[ 1.0187,  0.6749],
          [-0.5196,  0.6189]],

         [[ 1.3463,  0.6330],
          [-0.5291,  0.5539]]]], grad_fn=<ViewBackward0>)

In [142]:
lin(x).view(nbatches, -1, h, d_k).size()

torch.Size([1, 3, 2, 2])

In [143]:
lin(x).view(nbatches, -1, h, d_k).transpose(1, 2).size()

torch.Size([1, 2, 3, 2])

In [144]:
lin(x).view(nbatches, -1, h, d_k).transpose(1, 2)

tensor([[[[ 0.6419,  0.7506],
          [ 1.0187,  0.6749],
          [ 1.3463,  0.6330]],

         [[-0.4159,  0.5317],
          [-0.5196,  0.6189],
          [-0.5291,  0.5539]]]], grad_fn=<TransposeBackward0>)

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    A table of shape ``(1, max_len, d_model)`` is computed once and stored as
    the non-trainable buffer ``pe``. Even feature columns hold
    ``sin(position * div_term)`` and odd columns hold
    ``cos(position * div_term)``.

    Args:
        d_model: Size of the feature axis. May be odd; the last column then
            holds a sine term with no matching cosine column.
        dropout: Dropout probability applied after the encodings are added.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the cosine term to fit instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Add the first ``x.size(1)`` position rows to ``x`` and apply dropout."""
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

In [153]:
max_len = 10
d_model = 40

pe = torch.zeros(max_len, d_model)
pe

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,


In [147]:
torch.arange(0, max_len)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [148]:
torch.arange(0, max_len).size()

torch.Size([10])

In [150]:
torch.arange(0, max_len).unsqueeze(1).shape

torch.Size([10, 1])

In [154]:
import math

div_term = torch.exp(
    torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
)
div_term

tensor([1.0000e+00, 6.3096e-01, 3.9811e-01, 2.5119e-01, 1.5849e-01, 1.0000e-01,
        6.3096e-02, 3.9811e-02, 2.5119e-02, 1.5849e-02, 1.0000e-02, 6.3096e-03,
        3.9811e-03, 2.5119e-03, 1.5849e-03, 1.0000e-03, 6.3096e-04, 3.9811e-04,
        2.5119e-04, 1.5849e-04])

In [155]:
torch.arange(0, d_model, 2)

tensor([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34,
        36, 38])

In [158]:
position = torch.arange(0, max_len).unsqueeze(1)
print(position.size())

torch.Size([10, 1])


In [159]:
position * torch.arange(0, d_model, 2)

tensor([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0],
        [  0,   2,   4,   6,   8,  10,  12,  14,  16,  18,  20,  22,  24,  26,
          28,  30,  32,  34,  36,  38],
        [  0,   4,   8,  12,  16,  20,  24,  28,  32,  36,  40,  44,  48,  52,
          56,  60,  64,  68,  72,  76],
        [  0,   6,  12,  18,  24,  30,  36,  42,  48,  54,  60,  66,  72,  78,
          84,  90,  96, 102, 108, 114],
        [  0,   8,  16,  24,  32,  40,  48,  56,  64,  72,  80,  88,  96, 104,
         112, 120, 128, 136, 144, 152],
        [  0,  10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
         140, 150, 160, 170, 180, 190],
        [  0,  12,  24,  36,  48,  60,  72,  84,  96, 108, 120, 132, 144, 156,
         168, 180, 192, 204, 216, 228],
        [  0,  14,  28,  42,  56,  70,  84,  98, 112, 126, 140, 154, 168, 182,
         196, 210, 224, 238, 252, 266],
        [  0,  16,  32,  48,  64,  80,  96, 112,

In [160]:
position * div_term

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [1.0000e+00, 6.3096e-01, 3.9811e-01, 2.5119e-01, 1.5849e-01, 1.0000e-01,
         6.3096e-02, 3.9811e-02, 2.5119e-02, 1.5849e-02, 1.0000e-02, 6.3096e-03,
         3.9811e-03, 2.5119e-03, 1.5849e-03, 1.0000e-03, 6.3096e-04, 3.9811e-04,
         2.5119e-04, 1.5849e-04],
        [2.0000e+00, 1.2619e+00, 7.9621e-01, 5.0238e-01, 3.1698e-01, 2.0000e-01,
         1.2619e-01, 7.9621e-02, 5.0238e-02, 3.1698e-02, 2.0000e-02, 1.2619e-02,
         7.9621e-03, 5.0238e-03, 3.1698e-03, 2.0000e-03, 1.2619e-03, 7.9621e-04,
         5.0238e-04, 3.1698e-04],
        [3.0000e+00, 1.8929e+00, 1.1943e+00, 7.5357e-01, 4.7547e-01, 3.0000e-01,
         1.8929e-01, 1.1943e-01, 7.5357e-02, 4.7547e-02, 3.0000e-02, 1.8929e-02,
       

In [161]:
torch.sin(position * div_term)

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 8.4147e-01,  5.8992e-01,  3.8767e-01,  2.4856e-01,  1.5783e-01,
          9.9833e-02,  6.3054e-02,  3.9800e-02,  2.5116e-02,  1.5848e-02,
          9.9998e-03,  6.3095e-03,  3.9811e-03,  2.5119e-03,  1.5849e-03,
          1.0000e-03,  6.3096e-04,  3.9811e-04,  2.5119e-04,  1.5849e-04],
        [ 9.0930e-01,  9.5267e-01,  7.1471e-01,  4.8151e-01,  3.1170e-01,
          1.9867e-01,  1.2586e-01,  7.9537e-02,  5.0217e-02,  3.1693e-02,
          1.9999e-02,  1.2619e-02,  7.9621e-03,  5.0238e-03,  3.1698e-03,
          2.0000e-03,  1.2619e-03,  7.9621e-04,  5.0238e-04,  3.1698e-04],
        [ 1.4112e-01,  9.4858e-01,  9.2997e-01,  6.8424e-01,  4.5775e-01,
          2.9552e-01,  1.8816e-01, 

In [162]:
torch.sin(position * div_term).size()

torch.Size([10, 20])

In [163]:
pe

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,


In [170]:
pe[:, 0::2]

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 8.4147e-01,  5.8992e-01,  3.8767e-01,  2.4856e-01,  1.5783e-01,
          9.9833e-02,  6.3054e-02,  3.9800e-02,  2.5116e-02,  1.5848e-02,
          9.9998e-03,  6.3095e-03,  3.9811e-03,  2.5119e-03,  1.5849e-03,
          1.0000e-03,  6.3096e-04,  3.9811e-04,  2.5119e-04,  1.5849e-04],
        [ 9.0930e-01,  9.5267e-01,  7.1471e-01,  4.8151e-01,  3.1170e-01,
          1.9867e-01,  1.2586e-01,  7.9537e-02,  5.0217e-02,  3.1693e-02,
          1.9999e-02,  1.2619e-02,  7.9621e-03,  5.0238e-03,  3.1698e-03,
          2.0000e-03,  1.2619e-03,  7.9621e-04,  5.0238e-04,  3.1698e-04],
        [ 1.4112e-01,  9.4858e-01,  9.2997e-01,  6.8424e-01,  4.5775e-01,
          2.9552e-01,  1.8816e-01, 

In [171]:
pe[:]

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 8.4147e-01,  0.0000e+00,  5.8992e-01,  0.0000e+00,  3.8767e-01,
          0.0000e+00,  2.4856e-01,  0.0000e+00,  1.5783e-01,  0.0000e+00,
          9.9833e-02,  0.0000e+00,  6.3054e-02,  0.0000e+00,  3.9800e-02,
          0.0000e+00,  2.5116e-02,  0.0000e+00,  1.5848e-02,  0.0000e+00,
          9.9998e-03,  0.0000e+00,  6.3095e-03,  0.0000e+00,  3.9811e-03,
          0.0000e+00,  2.5119e-03,  0

## BERT

This section is based on the [original BERT paper](https://arxiv.org/abs/1810.04805). Here is the [GitHub repository for the original BERT paper](https://github.com/google-research/bert).


## All The Ways You Can Compress BERT

This section is based on the [work by Mitchell G. Gordon](https://mitchgordon.me/machine/learning/2019/11/18/all-the-ways-to-compress-BERT.html).

## Literature Review

Remember to submit the lit-review by **03/10** and apply for the penalty waiver for late submission.


### Reading list:
1. [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942)
2. [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108)
3. [MobileBERT: Task-Agnostic Compression of BERT by Progressive Knowledge Transfer](https://arxiv.org/abs/2004.02984)
4. [Q-BERT: Hessian Based Ultra Low Precision Quantization of BERT](https://arxiv.org/abs/1909.05840)
5. [Reducing Transformer Depth on Demand with Structured Dropout](https://arxiv.org/pdf/1909.11556.pdf)
6. [GLUE: A MULTI-TASK BENCHMARK AND ANALYSIS PLATFORM FOR NATURAL LANGUAGE UNDERSTANDING](https://openreview.net/pdf?id=rJ4km2R5t7)
7. [The Annotated Transformer](https://nlp.seas.harvard.edu/annotated-transformer/)
8. [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)
9. [WELL-READ STUDENTS LEARN BETTER: ON THE IMPORTANCE OF PRE-TRAINING COMPACT MODELS](https://arxiv.org/pdf/1908.08962.pdf)
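10. [A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference (MNLI paper)](https://arxiv.org/abs/1704.05426)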

### Submission
[My Literature Review](https://docs.google.com/document/d/101v9k7yc5brOkMI_MOUL7h28HKIlp6Ir3DAokwcxa3k)

## Schedule:
- ~**03/06**: 2 papers~  
- ~**03/07**: 2 papers~  
- ~**03/08**: 2 papers~  
- ~**03/09**: write lit-review~
- ~**03/10**: submit lit-review~ 
- ~**03/11**: experiment~
- ~**03/13**: experiment~
- ~**03/14**: experiment _Hypothesis, Dataset, Metrics_~
- ~**03/15**: experiment _Models, general reasoning_~
- ~**03/16**: experiment _Models, general reasoning_~
- ~**03/17**: experiment _Metrics, Models, general reasoning_~
- ~**03/18**: experiment _Models, general reasoning_~
- ~**03/19**: experiment _Models, general reasoning_~
- ~**03/20**: experiment and submit (*XCS330 1st day!*)~
- ~**03/21**: paper~
- ~**03/22**: paper~
- ~**03/23**: paper~
- ~**03/24**: paper~
- **03/25**: paper
- **03/27**: paper
- **03/28**: paper
- **03/29**: paper
- **03/30**: paper
- **03/31**: paper
- **04/01**: submit