In [1]:
import pandas as pd
import pickle
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import os
os.chdir("C:\\Users\\Jeremy\\Documents\\Code\\ml-tools-and-examples")
from metrics import *
import tensorflow as tf

# The Lottery Ticket Hypothesis: 
## Finding Sparse, Trainable Neural Networks
https://arxiv.org/pdf/1803.03635.pdf  

* Dense, randomly-initialized feed forward networks contain subnetworks (winning tickets) that reach test accuracy comparable to original network in similar number of iterations. 

Details:
- use early stopping for ending training (min validation loss)

Steps:
- 1. Randomly initialize neural network f(x; theta0)
- 2. Train network for j iterations, arriving at parameters thetaj
- 3. Prune percentage of parameters in theataj, creating a mask mask
- 4. Reset remaining parameters to values in theta0

Goals:
- 1. Improve training performance (prune as early as possible)
- 2. Design better networks: winning tickets = sparse architectures and initializations that are good at learning
- 3. Improve understanding of neural networks

Pruning strategy:
* layer-wise pruning heuristic (remove percentage of weights with lowest magnitudes within each layer)
* connections to outputs pruned at half rate of rest of network

## Data

In [2]:
seq_length = 600
vocab_size = 20000

In [None]:
def sklearn_to_df(sklearn_dataset):
    label_map = {index: x for index, x in enumerate(sklearn_dataset['target_names'])}
    df = pd.DataFrame(
        list(zip(sklearn_dataset['data'], sklearn_dataset['target'])),
        columns=["text", "label_id"],
    )
    df["label_name"] = df["label_id"].map(label_map)
    return df


In [1]:
def load_train_test():
    df_train = sklearn_to_df(fetch_20newsgroups(subset='train', shuffle=True,))
    df_test = sklearn_to_df(fetch_20newsgroups(subset='test', shuffle=True,))
    return df_train, df_test

## Feature Extraction

In [None]:
from keras.preprocessing.sequence import pad_sequences
from tokenizers import (ByteLevelBPETokenizer,
                            CharBPETokenizer,
                            SentencePieceBPETokenizer,
                            BertWordPieceTokenizer)
import tokenizers
# from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

### Hugging face tokenizers

In [None]:
def encode_text(tokenizer, text: str, padding: int):
    output = tokenizer.encode(text)
    seqs = pad_sequences([output.ids], padding, padding="pre")
    return seqs
    
def create_vocab_file(texts: list, filename: str="vocab.txt"):
    raw_vocab = " ".join(texts)
    raw_vocab = raw_vocab.encode("utf-8")
    file1 = open(filename,"w")
    file1.write(str(raw_vocab))
    return filename

def train_bytebpe_tokenizer(vocab_size: int, vocab_filename: str):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train([vocab_filename], vocab_size=vocab_size)
#     tokenizer.save("examples", "20newsgroups_bytebpe-tokenizer")
    return tokenizer

def get_bert_wp_tokenizer(vocab_filename: str = "bert-large-uncased-vocab.txt"):
    tokenizer = BertWordPieceTokenizer(vocab_filename)
    return tokenizer

vocab_filename = "examples/20newsgroups_vocab.txt"

# tokenizer = get_bert_wp_tokenizer("D:\\Data\\vocabularies\\bert-large-cased-vocab.txt.txt")
tokenizer = train_bytebpe_tokenizer(vocab_size, vocab_filename)

### Load a tokenizer:

In [None]:
# vocab = '20newsgroups_bytebpe-tokenizer-vocab.json'
# merges = '20newsgroups_bytebpe-tokenizer-merges.txt'
# bpe = tokenizers.models.BPE.from_files(vocab, merges)
# tokenizer = tokenizers.Tokenizer(bpe)

In [None]:
tokenizer

In [None]:
def prepare_x(encoded_values: np.ndarray):
    x = np.stack(encoded_values.tolist())
    x = x.reshape((-1, x.shape[2]))
    return x
    
df_train['encoded_ids'] = df_train['text'].apply(lambda x: encode_text(tokenizer, x, seq_length))
df_test['encoded_ids'] = df_test['text'].apply(lambda x: encode_text(tokenizer, x, seq_length))


x_train = prepare_x(df_train['encoded_ids'])
x_test = prepare_x(df_test['encoded_ids'])

In [None]:
# df_train.head()
def prepare_y(df, label_id_col:str):
    y = tf.keras.backend.one_hot(df[label_id_col], df[label_id_col].nunique())
    return y

y_train = prepare_y(df_train, 'label_id')
y_test = prepare_y(df_test, 'label_id')

In [None]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)


### Save all training data:

In [None]:
with open("train-data.pkl", "wb") as fp:
    pickle.dump((x_train, x_test, y_train, y_test), fp)

### Load all training data:

In [3]:
with open("train-data.pkl", "rb") as fp:
    x_train, x_test, y_train, y_test = pickle.load(fp)

In [4]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(11314, 600)
(7532, 600)
(11314, 20)
(7532, 20)


In [5]:
def plot_df_content(df):
    print(df['encoded_num_tokens'].describe())
    print(df['encoded_num_tokens'].plot.hist(bins=[100, 200,300,400,500,1000,1500,2000,3000,5000,20000, 50000]))
    print(df['label_id'].value_counts().plot(kind='bar'))
    
# plot_df_content(df_train)

## Model

In [6]:
import wandb
from wandb.keras import WandbCallback
import tensorflow as tf
from tensorflow.keras.layers import Dense, BatchNormalization, Embedding, AveragePooling1D, GlobalAveragePooling1D

In [7]:
def get_model(input_size: int, 
              num_words: int, 
              embedding_size:int=64,
              dense_1:int = 200,
              dense_2:int = 100,
#               pooling_1 = 100,
              output_shape: int=20
             ):
    model = tf.keras.Sequential()
    model.add(Embedding(num_words, embedding_size, input_length=input_size))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(dense_1, activation='relu'))
#     model.add(Dense(dense_2, activation='relu'))
#     model.add(BatchNormalization())
    model.add(Dense(output_shape, activation=tf.nn.softmax))
    model.compile(
        optimizer='SGD', 
        loss='categorical_crossentropy', 
        metrics=[
#             tf.keras.metrics.Recall(),
            tf.keras.metrics.Accuracy(),
#             tf.keras.metrics.Precision(),
        ]
    )
    return model

In [8]:
train_config = dict(
    epochs=50,
    batch_size=100,
)

config = dict(
#     batch_size=128,
    num_words = vocab_size,
    input_size=seq_length,
    embedding_size=100,
    dense_1=200,
    dense_2=100,
    output_shape=y_train.shape[1],
)

In [None]:
wandb.init(project="lottery-ticket_newsgroups", config=config)

# tf.executing_eagerly()

model = get_model(**config)

history = model.fit(
    x_train, y_train, 
    batch_size=train_config['batch_size'], 
    epochs=train_config['epochs'], 
    validation_split=0.2, 
#     steps_per_epoch=int(len(x_train)/config['batch_size'])-1,
    callbacks=[
        WandbCallback(),
      tf.keras.callbacks.EarlyStopping('val_loss'),
])

scores = model.evaluate(x_test, y_test, batch_size=32)

wandb.log({'test_'+metric: scores[index] for index, metric in enumerate(model.metrics_names)})
print("\n*** Finished! ***")

Train on 9051 samples, validate on 2263 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
 300/9051 [..............................] - ETA: 9s - loss: 2.9941 - accuracy: 0.0000e+00