In [1]:
import string
import re
import math
from itertools import chain
import pandas
from tqdm import tqdm_notebook as tqdm

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
# Path to the spam dataset CSV, relative to the notebook's working directory.
DATA_FILE = '../data/spam.csv'
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
def load_data(path):
    """
    Read the comma-separated, latin-1 encoded file at ``path`` and keep
    only the ``v1`` and ``v2`` columns.

    :param path: location of the CSV file
    :return: pandas DataFrame with exactly the columns ``v1`` and ``v2``
    """
    wanted_columns = ['v1', 'v2']
    full_table = pandas.read_table(path, sep=',', encoding='latin-1')
    return full_table[wanted_columns]

In [76]:
dataset = load_data(DATA_FILE)
print(dataset.head())

# Spot-check the preprocessing on a single message (row label 116).
# Select the text by row label and column name: `dataset.loc[116][1]` relied on
# the positional fallback of Series.__getitem__, which pandas has deprecated.
# NOTE: preprocess() must already be defined in the kernel when this cell runs.
x = dataset.loc[116, 'v2']
print(x)
preprocess(x)

     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
You are a winner U have been specially selected 2 receive å£1000 or a 4* holiday (flights inc) speak to a live operator 2 claim 0871277810910p/min (18+) 
you are a winner u have been specially selected 2 receive å	s	dollarsign		s	number	 or a 4	 holiday 	flights inc	 speak to a live operator 2 claim 		longdigit		p	min 		number			 
['you', 'are', 'a', 'winner', 'u', 'have', 'been', 'specially', 'selected', '2', 'receive', 'å', 's', 'dollarsign', 's', 'number', 'or', 'a', '4', 'holiday', 'flights', 'inc', 'speak', 'to', 'a', 'live', 'operator', '2', 'claim', 'longdigit', 'p', 'min', 'number']


'winner u specially selected 2 receive å dollarsign number 4 holiday flight inc speak live operator 2 claim longdigit p min number'

In [41]:
# NLTK English stopwords, stored as a set for the membership test in preprocess().
STOPWORDS = set(stopwords.words('english'))
# Patterns that preprocess() replaces with placeholder tokens.
EMAIL_REGEX = r"[\w\.-]+@[\w\.-]+\.\w+"
# Matches only the scheme and host part of an http(s) URL.
URL_REGEX = r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+"
# Digit runs, applied by preprocess() in this order (most specific first):
# eight or more digits delimited by word boundaries.
LONG_DIGIT_REGEX = r"\b\d\d\d\d\d\d\d\d+\b"
# A leading 0 followed by at least three more digits.
MED_DIGIT_REGEX = r"\b0\d\d\d+\b"
# A leading 0 followed by at least one more digit.
SMALL_DIGIT_REGEX = r"\b0\d+\b"
# NOTE(review): `|` has the lowest precedence, so this reads as `\b0` OR
# `[1-9][0-9]+\b` -- the first branch has no trailing boundary, the second has
# no leading boundary and needs two or more digits (single digits 1-9 never match).
# Presumably `\b(?:0|[1-9][0-9]*)\b` was intended -- TODO confirm before changing,
# since preprocess() output depends on the current behaviour.
NUMBER_REGEX = r"\b0|[1-9][0-9]+\b"
# Token pattern handed to CountVectorizer: words of two or more characters,
# `<...>` tags, and the single characters ? " '
TOKEN_PATTERN = r"(?u)\b\w\w+\b|<\w*>|\?|\"|\'"
# Shared WordNet lemmatizer instance used by preprocess().
LEMMATIZER = WordNetLemmatizer()

In [77]:
# Scratch check of the long-digit pattern without its trailing word boundary,
# so that a digit run glued to a letter (e.g. "...910p") is still replaced.
# NOTE: this rebinds the module-level LONG_DIGIT_REGEX that preprocess() reads.
LONG_DIGIT_REGEX = r"\b\d\d\d\d\d\d\d\d+"
long_digit_regex = re.compile(LONG_DIGIT_REGEX, flags=re.IGNORECASE)

test = '188888888888p fjdkjfkdj'
text = long_digit_regex.sub(r'\t<longdigit>\t', test)
print(text)



	<longdigit>	p fjdkjfkdj


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [75]:
def preprocess(text):
    """Normalise one raw message into a space-separated string of tokens.

    Steps, in order: lowercase; replace ``?``, ``$``, ``£`` and ``!`` with
    placeholder words; replace e-mail addresses, URLs and digit runs with
    placeholder tokens; turn every remaining punctuation character into a tab;
    split on whitespace; lemmatize each token; drop English stopwords.

    Relies on the module-level regex constants, ``LEMMATIZER`` and
    ``STOPWORDS``.

    :param text: raw message text (str)
    :return: the cleaned tokens joined by single spaces (str)
    """
    text = text.lower()

    # Placeholders are wrapped in tabs so they end up as stand-alone tokens.
    # The '$', '£' and '!' replacements used to be wrapped in '\s', which is not
    # a string escape: it inserted a literal backslash + 's', producing the
    # token 'sregexdollarsign' and gluing an 's' onto the following word.
    for symbol, placeholder in (
        ('?', 'regexquestionmark'),
        ('$', 'regexdollarsign'),
        ('£', 'regexdollarsign'),
        ('!', 'regexexclamationmark'),
    ):
        text = text.replace(symbol, '\t' + placeholder + '\t')

    # The stray '>' that used to trail the e-mail placeholder is removed; it
    # was stripped by the punctuation pass anyway, so the token is unchanged.
    text = re.sub(EMAIL_REGEX, r'\tregexemailaddress\t', text, flags=re.IGNORECASE)
    text = re.sub(URL_REGEX, r'\t<url>\t', text, flags=re.IGNORECASE)

    # Tag single letters that are already isolated between two tabs.
    for letter in string.ascii_lowercase:
        text = re.sub(
            r"\t[{}]\t".format(letter),
            r'\t<{}>\t'.format(letter),
            text,
            flags=re.IGNORECASE,
        )

    # Digit runs, most specific pattern first so longer runs win.
    for pattern, token in (
        (LONG_DIGIT_REGEX, r'\t<longdigit>\t'),
        (MED_DIGIT_REGEX, r'\t<mediumdigit>\t'),
        (SMALL_DIGIT_REGEX, r'\t<smalldigit>\t'),
        (NUMBER_REGEX, r'\t<number>\t'),
    ):
        text = re.sub(pattern, token, text, flags=re.IGNORECASE)

    # Every punctuation character becomes a separator. This also strips the
    # '<' and '>' around the tags inserted above, leaving bare words.
    text = text.translate(str.maketrans({mark: '\t' for mark in string.punctuation}))

    lemmas = (LEMMATIZER.lemmatize(word) for word in text.split())
    return ' '.join(lemma for lemma in lemmas if lemma not in STOPWORDS)

In [48]:
# Encode the class labels in column v1 as integers.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(dataset.v1)

# Bag-of-words counts over the preprocessed messages. Lowercasing is done by
# preprocess(); CountVectorizer skips its own lowercase step when a custom
# preprocessor is supplied.
vectorizer = CountVectorizer(
    min_df=2,
    ngram_range=(1, 1),
    lowercase=True,
    preprocessor=preprocess,
    token_pattern=TOKEN_PATTERN,
)
inputs = vectorizer.fit_transform(dataset.v2).toarray()

DATA_SIZE = len(dataset)
VOCAB_SIZE = len(vectorizer.vocabulary_)
print(VOCAB_SIZE)
# Show a small alphabetical sample instead of dumping the whole vocabulary dict.
print(sorted(vectorizer.vocabulary_)[:20])

error: bad escape \s at position 0

In [16]:
# Contiguous split in dataset order: 70 % train, 15 % validation, remainder test.
TRAIN_SIZE = int(0.7 * DATA_SIZE)
VALID_SIZE = int(0.15 * DATA_SIZE)
END_VALID = TRAIN_SIZE + VALID_SIZE
TEST_SIZE = DATA_SIZE - END_VALID


def _as_float_tensor(array):
    """Convert a NumPy array to a float32 torch tensor."""
    return torch.from_numpy(array).type(torch.float32)


# Only the inputs are kept for the training split; labels are kept for
# validation and test.
train_inputs = _as_float_tensor(inputs[:TRAIN_SIZE])

valid_inputs = _as_float_tensor(inputs[TRAIN_SIZE:END_VALID])
valid_labels = _as_float_tensor(labels[TRAIN_SIZE:END_VALID])

test_inputs = _as_float_tensor(inputs[END_VALID:])
test_labels = _as_float_tensor(labels[END_VALID:])

In [17]:
def xavier_initialization(layer):
    """Re-initialise an ``nn.Linear`` layer in place; ignore any other module.

    Weights are drawn from U(-b, b) with b = 1 / sqrt(in_features) and the
    bias, when the layer has one, is set to zero. Intended to be passed to
    ``module.apply(...)``.

    :param layer: any ``nn.Module``; only ``nn.Linear`` instances are modified
    :return: None
    """
    if not isinstance(layer, nn.Linear):
        return

    bound = 1 / math.sqrt(layer.in_features)
    # nn.init functions run without gradient tracking, so no `.data` access is needed.
    nn.init.uniform_(layer.weight, -bound, bound)
    # Layers built with bias=False have `bias is None`; previously this crashed.
    if layer.bias is not None:
        nn.init.zeros_(layer.bias)

In [18]:
class Encoder(nn.Module):
    """Probabilistic encoder for q(z|x).

    Passes the input through two hidden blocks (Linear -> ReLU -> Dropout ->
    BatchNorm, widths 128 and 64) and reads out two vectors of size
    ``latent_dim``: the mean and the log-variance of the latent variable.
    The prior of z is assumed to be normal(0, I).

    :param input_dim: number of input features
    :param latent_dim: size of the latent variable

    :return: ``(mean, logvar)`` from ``forward``
    """
    def __init__(self, input_dim, latent_dim):
        super().__init__()

        hidden_blocks = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.BatchNorm1d(128),

            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.BatchNorm1d(64),
        ]
        self.encoder_network = nn.Sequential(*hidden_blocks)

        # Two separate linear read-outs from the shared 64-unit representation.
        self.read_mu = nn.Linear(64, latent_dim)
        self.read_logvar = nn.Linear(64, latent_dim)

        # Re-initialise every Linear layer registered above.
        self.apply(xavier_initialization)

    def forward(self, inputs):
        shared = self.encoder_network(inputs)
        return self.read_mu(shared), self.read_logvar(shared)

In [19]:
class Decoder(nn.Module):
    """Decoder mapping a latent vector z to one value per input feature.

    The output is the product of two branches computed from z:
    a Tanh-bounded network (``decoder``) and a scale ``alpha`` equal to half of
    a ReLU6-activated linear read-out (``read_alpha``). Callers in this
    notebook use the result as the log-rate of a Poisson likelihood.

    :param input_dim: number of features to produce
    :param latent_dim: size of the latent vector z
    """
    def __init__(self, input_dim, latent_dim):
        super().__init__()

        direction_layers = [
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.BatchNorm1d(64),

            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.BatchNorm1d(128),

            nn.Linear(128, input_dim),
            nn.Tanh(),
        ]
        self.decoder = nn.Sequential(*direction_layers)

        self.read_alpha = nn.Sequential(
            nn.Linear(latent_dim, input_dim),
            nn.ReLU6(),
        )

        # Re-initialise every Linear layer registered above.
        self.apply(xavier_initialization)

    def forward(self, z):
        # ReLU6 output lies in [0, 6], so the scale lies in [0, 3].
        scale = 0.5 * self.read_alpha(z)
        return scale * self.decoder(z)

In [20]:
class VAE(nn.Module):
    """
    Variational auto-encoder:
    x --> (mu, logvar) --> z = mu + exp(logvar / 2) * eps --> decoder output

    :param input_dim: number of input features
    :param latent_dim: size of the latent variable z
    :param device: device both sub-networks are moved to on construction
    """
    def __init__(self, input_dim, latent_dim, device=torch.device('cpu')):
        super().__init__()
        self.encoder = Encoder(input_dim, latent_dim).to(device)
        self.decoder = Decoder(input_dim, latent_dim).to(device)

    # No `parameters()` override: encoder and decoder are registered submodules,
    # so nn.Module.parameters() already yields encoder parameters followed by
    # decoder parameters. The previous override dropped the `recurse` argument
    # of the base-class signature.

    def sample_z(self, mu, logvar):
        """Draw z = mu + sigma * eps with eps ~ N(0, I) and sigma = exp(logvar / 2)."""
        epsilon = torch.randn_like(logvar)
        sigma = torch.exp(logvar / 2)
        return mu + sigma * epsilon

    def forward(self, inputs):
        """Encode, sample a latent vector, decode.

        :return: ``(decoder_output, mu, logvar)``
        """
        mu, logvar = self.encoder(inputs)
        latent = self.sample_z(mu, logvar)
        theta = self.decoder(latent)
        return theta, mu, logvar

In [21]:
def kl_divergence(mu, logvar):
    """Gaussian KL-divergence KL(N(mu, exp(logvar)) || N(0, I)), summed over all elements.

    Closed form from Appendix B of Kingma and Welling,
    "Auto-Encoding Variational Bayes", ICLR 2014 (https://arxiv.org/abs/1312.6114):
    KL = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)

    :param mu: tensor of means
    :param logvar: tensor of log-variances, same shape as ``mu``
    :return: scalar tensor
    """
    variance = logvar.exp()
    per_element = 1 + logvar - mu.pow(2) - variance
    return -0.5 * torch.sum(per_element)

def loss_fn(loglambda, inputs, *latent_params):
    """Poisson negative log-likelihood + KL divergence, summed over all elements and batch.

    :param loglambda: predicted log-rates (``poisson_nll_loss`` treats its input
        as a log by default)
    :param inputs: observed counts used as the target
    :param latent_params: forwarded unchanged to ``kl_divergence`` (mu, logvar)
    :return: scalar tensor
    """
    # Reconstruction term first, then the KL regulariser.
    return (
        F.poisson_nll_loss(loglambda, inputs, reduction='sum')
        + kl_divergence(*latent_params)
    )

In [23]:
# Shuffled mini-batches of 128 rows drawn from the training inputs.
train_loader = DataLoader(dataset=train_inputs, shuffle=True, batch_size=128)

In [14]:
# Number of training epochs, and size of the VAE latent vector.
MAX_ITER, LATENT_DIM = 100, 20

In [15]:
# Build the VAE over the bag-of-words vocabulary and place it on DEVICE.
model = VAE(input_dim=VOCAB_SIZE, latent_dim=LATENT_DIM, device=DEVICE)
model = model.to(DEVICE)

# Adam over all model parameters.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [16]:
# Progress bar with one tick per epoch; eval_step advances it via pbar.update().
pbar = tqdm(range(MAX_ITER), unit=' epoch', ncols=1000)
# Validation-loss tracker, starting at +inf.
best_valid_loss = float('Inf')

def train_step(train_x):
    """Run a single full-batch optimisation step on ``train_x``.

    Uses the module-level ``model``, ``optimizer``, ``loss_fn`` and ``DEVICE``.

    :param train_x: tensor of training inputs; moved to ``DEVICE`` here
    :return: None
    """
    model.train()
    batch = train_x.to(DEVICE)

    optimizer.zero_grad()
    loglambda, *latent_params = model(batch)
    loss_fn(loglambda, batch, *latent_params).backward()
    optimizer.step()
    
def eval_step(train_x, valid_x):
    """Evaluate the model on the training and validation inputs.

    Computes the per-row Poisson NLL (PCE) and KL divergence (KLD) for both
    sets without gradient tracking, shows them in the progress-bar
    description, advances the bar by one tick and returns the validation
    loss (PCE + KLD).

    :param train_x: training inputs; moved to ``DEVICE`` as a whole
    :param valid_x: validation inputs; moved to ``DEVICE`` as a whole
    :return: scalar CPU tensor with the per-row validation loss
    """
    def per_row_terms(outputs, targets):
        # outputs is the (decoder_output, mu, logvar) triple returned by the model
        loglambda, mu, logvar = outputs
        n_rows = targets.shape[0]
        kld = kl_divergence(mu, logvar).cpu() / n_rows
        pce = F.poisson_nll_loss(loglambda, target=targets, reduction='sum').cpu() / n_rows
        return pce, kld

    train_x, valid_x = train_x.to(DEVICE), valid_x.to(DEVICE)
    with torch.no_grad():
        model.eval()
        # Forward passes in the same order as before: training set, then validation set.
        train_output = model(train_x)
        valid_output = model(valid_x)

        train_pce, train_kld = per_row_terms(train_output, train_x)
        valid_pce, valid_kld = per_row_terms(valid_output, valid_x)

        valid_loss = valid_pce + valid_kld

        template = 'Train PCE: {:.2f} | Train KLD: {:.2f} | Valid PCE: {:.2f} | Valid KLD {:.2f}'
        pbar.set_description(template.format(train_pce, train_kld, valid_pce, valid_kld))
        pbar.update()
    return valid_loss

for epoch in range(MAX_ITER):

    train_step(train_inputs)
    valid_loss = eval_step(train_inputs, valid_inputs)

    # Track the best (lowest) validation loss seen so far.
    # The comparison used to be `>`, which is never true against the initial
    # +inf, so best_valid_loss was never updated.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # TODO: snapshot the weights here, e.g. best_weights = deepcopy(model.state_dict())

HBox(children=(IntProgress(value=0, layout=Layout(flex='2')), HTML(value='')), layout=Layout(display='inline-f…

RuntimeError: CUDA out of memory. Tried to allocate 150.00 MiB (GPU 0; 2.00 GiB total capacity; 1.04 GiB already allocated; 19.39 MiB free; 162.19 MiB cached)

# Fresh progress bar with one tick per epoch; eval_step advances it via pbar.update().
pbar = tqdm(range(MAX_ITER), unit=' epoch', ncols=1000)
# Reset the validation-loss tracker to +inf for this run.
best_valid_loss = float('Inf')

def train_step(loader):
    """Train for one pass over ``loader``, one optimiser step per batch.

    Uses the module-level ``model``, ``optimizer``, ``loss_fn`` and ``DEVICE``.

    :param loader: iterable yielding input batches (tensors)
    :return: None
    """
    model.train()
    for host_batch in loader:
        # Move one batch at a time to DEVICE.
        batch = host_batch.to(DEVICE)

        optimizer.zero_grad()
        loglambda, *latent_params = model(batch)
        loss_fn(loglambda, batch, *latent_params).backward()
        optimizer.step()
    
def eval_step(train_x, valid_x):
    """Report per-row losses on the training and validation inputs.

    Runs both sets through the model in eval mode without gradient tracking,
    writes the per-row Poisson NLL (PCE) and KL divergence (KLD) of each set
    into the progress-bar description, advances the bar by one tick, and
    returns the per-row validation loss (PCE + KLD).

    :param train_x: training inputs; moved to ``DEVICE`` as a whole
    :param valid_x: validation inputs; moved to ``DEVICE`` as a whole
    :return: scalar CPU tensor with the per-row validation loss
    """
    def summarise(outputs, targets):
        # outputs is the (decoder_output, mu, logvar) triple returned by the model
        n_rows = targets.shape[0]
        kld = kl_divergence(*outputs[1:]).cpu() / n_rows
        pce = F.poisson_nll_loss(outputs[0], target=targets, reduction='sum').cpu() / n_rows
        return pce, kld

    train_x = train_x.to(DEVICE)
    valid_x = valid_x.to(DEVICE)
    with torch.no_grad():
        model.eval()
        # Keep the forward-pass order: training set first, then validation set.
        train_output = model(train_x)
        valid_output = model(valid_x)

        train_pce, train_kld = summarise(train_output, train_x)
        valid_pce, valid_kld = summarise(valid_output, valid_x)

        valid_loss = valid_pce + valid_kld

        template = 'Train PCE: {:.2f} | Train KLD: {:.2f} | Valid PCE: {:.2f} | Valid KLD {:.2f}'
        pbar.set_description(template.format(train_pce, train_kld, valid_pce, valid_kld))
        pbar.update()
    return valid_loss

for epoch in range(MAX_ITER):

    train_step(train_loader)
    valid_loss = eval_step(train_inputs, valid_inputs)

    # Track the best (lowest) validation loss seen so far.
    # The comparison used to be `>`, which is never true against the initial
    # +inf, so best_valid_loss was never updated.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # TODO: snapshot the weights here, e.g. best_weights = deepcopy(model.state_dict())