## RBM Name Generation

In [1]:
import argparse
import pickle
from sklearn.model_selection import train_test_split
from GridEncoder import GridEncoder
import Utils
from ShortTextCodec import ShortTextCodec, BinomialShortTextCodec
from RBM import BernoulliRBM
import Sampling
import sample
import sys
import colorama
import numpy as np

In [22]:
# Switch read by CharBernoulliSoftmax._sample_visibles: when truthy, the visible bias is
# divided by min(1.0, temperature) instead of by temperature itself, so temperatures above
# 1.0 no longer scale the bias term down. With the falsy value 0 the bias is always
# divided by the temperature.
BIASED_PRIOR = 0

class CharBernoulli(BernoulliRBM):
    """BernoulliRBM that carries the text codec used to build its input vectors.

    The codec is kept on the instance so fantasy particles can be decoded back to
    text, and its shape is cached as ``softmax_shape`` for ``corrupt`` and for
    subclasses that sample the visible layer group-wise.
    """

    def __init__(self, **kwargs):
        """
        Build the model.

        ``codec`` (required keyword argument) is the ShortTextCodec used to create
        the vectors being fit. The most important function of the codec is as a
        proxy to the shape of the softmax units in the visible layer (if you're
        using the CharBernoulliSoftmax subclass). It's also used to decode and
        print fantasy particles at the end of each epoch.

        ``codec`` is removed from ``kwargs``; every remaining keyword argument is
        forwarded unchanged to ``BernoulliRBM.__init__``.

        Raises
        ------
        KeyError
            If no ``codec`` keyword argument is supplied.
        """
        # Attaching this to the object is really helpful later on when models
        # are loaded from pickle in visualize.py and sample.py
        self.codec = kwargs.pop("codec")
        # Use the codec that was actually passed in. Reading a module-level
        # `codec` here would only work when a global of that name happens to
        # exist, and could silently pick up a different codec.
        self.softmax_shape = self.codec.shape()
        # Explicit base-class call, kept as in the original implementation.
        BernoulliRBM.__init__(self, **kwargs)

    def wellness_check(self, epoch, duration, train, validation):
        """Run the base-class check, then print three decoded fantasy samples.

        The samples are drawn from the first three rows of ``self.h_samples_`` at
        temperature 0.1, decoded with the codec and joined with ``|``.
        """
        BernoulliRBM.wellness_check(self, epoch, duration, train, validation)
        fantasy_samples = '|'.join([self.codec.decode(vec) for vec in
                                    self._sample_visibles(self.h_samples_[:3], temperature=0.1)])
        print("Fantasy samples: {}".format(fantasy_samples))

    def corrupt(self, v):
        """Change one stored column index per row of ``v``, in place.

        ``v`` is accessed through ``v.indices`` and is modified in place.
        NOTE(review): the index arithmetic assumes ``v`` stores exactly
        ``softmax_shape[0]`` entries per row (one active option per softmax
        unit), laid out row after row in ``v.indices`` - confirm against the
        codec's output format.

        Returns
        -------
        (v, state) : ``v`` is the same (now modified) object; ``state`` is the
            pair ``(positions, offsets)`` that ``uncorrupt`` needs to undo the change.
        """
        n_softmax, n_opts = self.softmax_shape
        # Select a random index in to the indices of the non-zero values of each input
        # TODO: In the char-RBM case, if I wanted to really challenge the model, I would avoid selecting any
        # trailing spaces here. Cause any dumb model can figure out that it should assign high energy to
        # any instance of /  [^ ]/
        meta_indices_to_corrupt = self.rng_.randint(0, n_softmax, v.shape[0]) + np.arange(0, n_softmax * v.shape[0], n_softmax)

        # Offset these indices by a random amount (but not 0 - we want to actually change them)
        offsets = self.rng_.randint(1, n_opts, v.shape[0])
        # Also, do some math to make sure we don't "spill over" into a different softmax.
        # E.g. if n_opts=5, and we're corrupting index 3, we should choose offsets from {-3, -2, -1, +1}
        # 1-d array that matches with meta_i_t_c but which contains the indices themselves
        indices_to_corrupt = v.indices[meta_indices_to_corrupt]
        # Wherever (position within the group + offset) would reach n_opts, subtract
        # n_opts from the offset so the shifted index wraps around inside its own group.
        offsets = offsets - (n_opts * (((indices_to_corrupt % n_opts) + offsets.ravel()) >= n_opts))

        v.indices[meta_indices_to_corrupt] += offsets
        return v, (meta_indices_to_corrupt, offsets)

    def uncorrupt(self, visibles, state):
        """Undo ``corrupt`` in place, given the ``state`` it returned. Returns None."""
        mitc, offsets = state
        visibles.indices[mitc] -= offsets
        
class CharBernoulliSoftmax(CharBernoulli):
    """CharBernoulli variant whose visible layer is sampled softmax-group by softmax-group."""

    def __init__(self, **kwargs):
        # Nothing extra to set up; construction is delegated to CharBernoulli.
        CharBernoulli.__init__(self, **kwargs)

    def _sample_visibles(self, h, temperature=1.0):
        """Draw visible configurations from P(v|h) under the softmax constraint.

        Every returned configuration v satisfies sum(v) == softmax_shape[0].

        Parameters
        ----------
        h : array-like, shape (n_samples, n_components)
            Hidden layer values to condition on.
        temperature : float
            Divides the weights; it also divides the visible bias, except that
            when BIASED_PRIOR is truthy the bias divisor is capped at 1.0.

        Returns
        -------
        v : array-like, shape (n_samples, n_features)
            Sampled values of the visible layer.
        """
        if BIASED_PRIOR:
            bias_divisor = min(1.0, temperature)
        else:
            bias_divisor = temperature

        activations = np.dot(h, self.components_ / temperature)
        activations += self.intercept_visible_ / bias_divisor

        flat_shape = activations.shape
        # View each row as (softmax units, options per unit) for the sampler,
        # then flatten the result back to the original 2-d layout.
        grouped = np.reshape(activations, (flat_shape[0],) + self.softmax_shape)
        sampled = Utils.softmax_and_sample(grouped)
        return sampled.reshape((flat_shape[0], flat_shape[1]))


In [24]:
# Codec used both to encode the text files and (attached to the model) to decode samples.
# NOTE(review): the positional arguments are opaque here; 10 looks like the maximum
# string length (the fantasy samples printed during training are 10 characters wide) -
# confirm all five against ShortTextCodec's signature.
codec_kls = ShortTextCodec
codec = codec_kls('', 10, 0, True, False)
codec.debug_description()

# Everything in this mapping is passed straight to the model constructor.
model_kwargs = dict(
    codec=codec,
    n_components=200,
    learning_rate=0.1,
    lr_backoff=False,
    n_iter=20,
    verbose=1,
    batch_size=10,
    weight_cost=0.0001,
)

kls = CharBernoulliSoftmax
rbm = kls(**model_kwargs)

## English Names File

In [25]:
# Build the codec-encoded vectors from the English names file.
vecs = Utils.vectors_from_txtfile("./names.txt", codec)
# 50/50 train/validation split. The fixed random_state makes the split (and therefore
# the training run that follows) reproducible after a kernel restart.
train, validation = train_test_split(vecs, test_size=0.5, random_state=42)
print(train.shape, validation.shape)

(28429, 530) (28429, 530)


In [26]:
# Train the model on the training half. The validation half is passed alongside it; the
# per-iteration log reports an energy for each ("E(vali)" / "E(train)") and their difference.
rbm.fit(train,validation)

[CharBernoulliSoftmax] Iteration 1/20	t = 10.38s
Pseudo-log-likelihood sum: -41056.42	Average per instance: -1.44
E(vali):	-27.73	E(train):	-27.72	difference: -0.01
Fantasy samples: Sareieee$$|Seeeleen$$|Sarl$$$$$$
[CharBernoulliSoftmax] Iteration 2/20	t = 11.17s
Pseudo-log-likelihood sum: -37722.40	Average per instance: -1.33
E(vali):	-24.23	E(train):	-24.23	difference: 0.00
Fantasy samples: Marerla$$$|Collinger$|Born$$$$$$
[CharBernoulliSoftmax] Iteration 3/20	t = 13.96s
Pseudo-log-likelihood sum: -34612.72	Average per instance: -1.22
E(vali):	-23.19	E(train):	-23.23	difference: 0.03
Fantasy samples: Marth$$$$$|Marring$$$|Marke$$$$$
[CharBernoulliSoftmax] Iteration 4/20	t = 11.15s
Pseudo-log-likelihood sum: -32326.75	Average per instance: -1.14
E(vali):	-22.46	E(train):	-22.53	difference: 0.06
Fantasy samples: Sonne$$$$$|Eller$$$$$|Bronanerl$
[CharBernoulliSoftmax] Iteration 5/20	t = 9.96s
Pseudo-log-likelihood sum: -30894.13	Average per instance: -1.09
E(vali):	-23.74	E(train):	-23.

CharBernoulliSoftmax()

In [6]:
# Snapshots recorded by horizontal_cb, one entry per callback invocation.
SAMPLES = []


def horizontal_cb(strings, i, energy=None):
    """Sampling callback: record one snapshot of the fantasy particles in SAMPLES.

    strings : decoded particles for this snapshot.
    i       : accepted for the callback signature; not used here.
    energy  : optional per-particle energies. When given, the snapshot is stored as a
              list of (string, energy) pairs; otherwise ``strings`` is stored as-is.
    """
    if energy is not None:
        # Materialise the pairs: in Python 3 zip() is a one-shot iterator that can
        # neither be indexed nor read twice, which print_columns needs to do.
        SAMPLES.append(list(zip(strings, energy)))
    else:
        SAMPLES.append(strings)


def _format_particle(entry, col_width):
    """Render one SAMPLES cell: a plain string, or a (string, energy) pair."""
    if isinstance(entry, tuple):
        text, energy = entry
        return "{}{:<10.2f}".format(text.ljust(col_width), energy)
    return entry.ljust(col_width)


def print_columns(maxlen):
    """Print SAMPLES as a table: one row per particle, one column per snapshot.

    Each string is left-justified to ``maxlen + 2`` characters. Prints nothing when
    no snapshot has been recorded.
    """
    col_width = maxlen + 2
    if not SAMPLES:
        return
    for fantasy_index in range(len(SAMPLES[0])):
        print("".join(_format_particle(s[fantasy_index], col_width) for s in SAMPLES))
# Index 999 is the last of the 1000 iterations requested below.
# NOTE(review): 30 and 1000 are taken to be the particle count and the iteration count
# (30 names are printed) - confirm against Sampling.sample_model's signature.
sample_indices = [999]
kwargs = {
    'start_temp': 1.0,
    'final_temp': 1.0,
    'sample_energy': False,
    'callback': horizontal_cb,
}

vis = Sampling.sample_model(rbm, 30, 1000, sample_indices, **kwargs)

# Show what the callback collected, then summarise the free energy of the returned particles.
print_columns(rbm.codec.maxlen)
fe = rbm._free_energy(vis)
print('Final energy: {:.2f} (stdev={:.2f})\n'.format(fe.mean(), fe.std()))

Rydser      
Cobp        
Elihan      
Amirh       
Haon        
Hora        
Ebi         
Idul        
Adki        
Cyzel       
Bieh        
Adag        
Finn        
Iavan       
Ten         
Menterberg  
Eso         
Ajrr        
Weover      
Ecoma       
Iner        
Meag        
Art         
Felkue      
Shum        
Klox        
Ligek       
Akhley      
Naldr       
Enae        
Final energy: -33.61 (stdev=2.90)



## Spanish Names File

In [21]:
# Build the codec-encoded vectors from the Spanish file. This rebinds `vecs`, `train`
# and `validation`, replacing the English data loaded earlier.
vecs = Utils.vectors_from_txtfile("./spanish.txt", codec)
# 50/50 train/validation split. The fixed random_state makes the split (and therefore
# the training run that follows) reproducible after a kernel restart.
train, validation = train_test_split(vecs, test_size=0.5, random_state=42)
print(train.shape, validation.shape)

(26088, 530) (26088, 530)


In [8]:
# NOTE(review): this fits the same `rbm` object that was fitted on the English names
# earlier in the notebook, and the recorded output begins with "Reusing existing weights
# and biases" - so this continues training that model on the Spanish data rather than
# starting from a fresh one. Confirm that is intended.
rbm.fit(train,validation)

Reusing existing weights and biases
[CharBernoulliSoftmax] Iteration 1/10	t = 13.71s
Pseudo-log-likelihood sum: -19258.85	Average per instance: -0.74
E(vali):	-37.71	E(train):	-37.82	difference: 0.12
Fantasy samples: conruruear|sorantar$$|desgambear
[CharBernoulliSoftmax] Iteration 2/10	t = 16.22s
Pseudo-log-likelihood sum: -16216.99	Average per instance: -0.62
E(vali):	-39.56	E(train):	-39.72	difference: 0.15
Fantasy samples: resrar$$$$|reqriar$$$|crafentear
[CharBernoulliSoftmax] Iteration 3/10	t = 14.03s
Pseudo-log-likelihood sum: -16578.11	Average per instance: -0.64
E(vali):	-38.99	E(train):	-39.10	difference: 0.11
Fantasy samples: supodar$$$|aperronar$|embrenar$$
[CharBernoulliSoftmax] Iteration 4/10	t = 16.56s
Pseudo-log-likelihood sum: -15322.08	Average per instance: -0.59
E(vali):	-41.39	E(train):	-41.60	difference: 0.21
Fantasy samples: lamponear$|acotar$$$$|recar$$$$$
[CharBernoulliSoftmax] Iteration 5/10	t = 13.62s
Pseudo-log-likelihood sum: -15067.42	Average per instance: 

CharBernoulliSoftmax()

In [20]:
# Start from an empty snapshot list for this sampling run.
# NOTE(review): the helpers below are defined again here although an earlier sampling
# cell already defines functions with the same names; consider defining them once and
# only resetting SAMPLES in this cell.
SAMPLES = []


def horizontal_cb(strings, i, energy=None):
    """Sampling callback: record one snapshot of the fantasy particles in SAMPLES.

    strings : decoded particles for this snapshot.
    i       : accepted for the callback signature; not used here.
    energy  : optional per-particle energies. When given, the snapshot is stored as a
              list of (string, energy) pairs; otherwise ``strings`` is stored as-is.
    """
    if energy is not None:
        # Materialise the pairs: in Python 3 zip() is a one-shot iterator that can
        # neither be indexed nor read twice, which print_columns needs to do.
        SAMPLES.append(list(zip(strings, energy)))
    else:
        SAMPLES.append(strings)


def _format_particle(entry, col_width):
    """Render one SAMPLES cell: a plain string, or a (string, energy) pair."""
    if isinstance(entry, tuple):
        text, energy = entry
        return "{}{:<10.2f}".format(text.ljust(col_width), energy)
    return entry.ljust(col_width)


def print_columns(maxlen):
    """Print SAMPLES as a table: one row per particle, one column per snapshot.

    Each string is left-justified to ``maxlen + 2`` characters. Prints nothing when
    no snapshot has been recorded.
    """
    col_width = maxlen + 2
    if not SAMPLES:
        return
    for fantasy_index in range(len(SAMPLES[0])):
        print("".join(_format_particle(s[fantasy_index], col_width) for s in SAMPLES))
# Index 999 is the last of the 1000 iterations requested below.
# NOTE(review): 30 and 1000 are taken to be the particle count and the iteration count
# (30 strings are printed) - confirm against Sampling.sample_model's signature.
sample_indices = [999]
kwargs = {
    'start_temp': 1.0,
    'final_temp': 1.0,
    'sample_energy': False,
    'callback': horizontal_cb,
}

vis = Sampling.sample_model(rbm, 30, 1000, sample_indices, **kwargs)

# Show what the callback collected, then summarise the free energy of the returned particles.
print_columns(rbm.codec.maxlen)
fe = rbm._free_energy(vis)
print('Final energy: {:.2f} (stdev={:.2f})\n'.format(fe.mean(), fe.std()))

todurar     
espurar     
espurar     
antumar     
sucurar     
encacar     
traspar     
zigurar     
espurar     
mizucar     
escurar     
espurar     
endorar     
empesar     
empubar     
esturar     
escurar     
visurar     
eszurar     
nequiar     
estupar     
lrandar     
empelar     
escurar     
livurar     
enturar     
enhular     
espurar     
excorar     
enlodar     
Final energy: -65.09 (stdev=3.84)



## Test with RBM Implementation

In [10]:
import sys
import os
import inspect

# Directory of the file associated with the currently executing frame.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
# Build the path with os.path.join: the literal '\RBM_Git' relied on the invalid escape
# sequence '\R' and hard-coded the Windows separator, so it broke on other platforms.
parent = os.path.join(currentdir, 'RBM_Git')
# Put the RBM_Git directory first on the import path so `import rbm` resolves to it.
sys.path.insert(0, parent)
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import time
import matplotlib.pyplot as plt
import numexpr  as ne
import profile
import rbm as Rbm
import pandas
from random import randint
from timeit import default_timer as timer

In [11]:
# Re-create the codec and reload the English names file; this rebinds `codec` and `vecs`.
codec_kls = ShortTextCodec
codec = codec_kls('',10,0,True,False)
vecs = Utils.vectors_from_txtfile("./names.txt", codec)
# Width of one encoded vector; passed to Rbm.RBM as visible_dim.
visible_dim = vecs.shape[1]
# Passed to Rbm.RBM as hidden_dim.
hidden_dim = 200
# NOTE(review): `epochs` and `batch_size` are not referenced by the fit cells that
# follow - those pass literal values (epochs=1/500, batch_size=300/128) instead.
epochs = 100
# Passed to fit() as K=.
K = 1
# Passed to fit() as lr= by the first fit cell only; the later ones pass lr=0.01.
lr = 0.1
batch_size = 10

In [12]:
# Instantiate the RBM class from the local `rbm` module (imported as `Rbm`).
# NOTE(review): mu/sigma presumably parameterise the random weight initialisation and
# seed fixes it - confirm against Rbm.RBM.
rbm_ = Rbm.RBM(visible_dim=visible_dim,
               hidden_dim=hidden_dim,
               seed=42,
               mu=0, 
               sigma=0.3,
               monitor_time=True)
# Cell output: shape of W, shape of b, and the object itself, as a sanity check.
rbm_.W.shape, rbm_.b.shape, rbm_

((530, 200), (530,), <rbm.RBM at 0x25bd93e9c88>)

In [13]:
# Dense float64 copy of the encoded vectors; this is the array handed to rbm_.fit below.
test_Data_Vector_Aux = vecs.toarray().astype("float64")

In [14]:
%%time
# Timing run: a single epoch with method='CDK' (the recorded wall time is 1min 39s).
rbm_.fit(test_Data_Vector_Aux, 
         method='CDK',
         K=K,
         lr=lr,
         epochs=1,
         batch_size=300,
         plot_weights=False)

	Last epoch:ime per epoch: 99.01	total time: 99.02 0 	time per epoch: 99.01	total time: 99.02
	Training finished


Wall time: 1min 39s


In [15]:
%%time
# Timing run: a single epoch with method='vectorized_CDK' (the recorded wall time is 2.85 s).
# NOTE(review): lr and batch_size differ from the 'CDK' run (0.01 / 128 here versus
# lr / 300 there), so the two timings are not a like-for-like comparison.
rbm_.fit(test_Data_Vector_Aux, 
         method='vectorized_CDK',
         K=K,
         lr=0.01,
         epochs=1,
         batch_size=128,
         plot_weights=False)

	Last epoch:ime per epoch: 2.83	total time: 2.83 0 	time per epoch: 2.83	total time: 2.83
	Training finished


Wall time: 2.85 s


In [16]:
%%time
# Main training run: 500 epochs with method='vectorized_CDK' on the same rbm_ object
# that the two timing cells already fitted for one epoch each.
# NOTE(review): whether fit() resumes from the current weights or re-initialises them
# cannot be told from here - confirm against Rbm.RBM.fit.
rbm_.fit(test_Data_Vector_Aux, 
         method='vectorized_CDK',
         K=K,
         lr=0.01,
         epochs=500,
         batch_size=128,
         plot_weights=False)

	epoch: 0 	time per epoch: 2.83	total time: 2.84 1 	time per epoch: 3.01	total time: 5.86 2 	time per epoch: 2.66	total time: 8.53 3 	time per epoch: 2.64	total time: 11.17 4 	time per epoch: 2.51	total time: 13.68 5 	time per epoch: 2.53	total time: 16.21 6 	time per epoch: 2.51	total time: 18.73 7 	time per epoch: 2.59	total time: 21.32 8 	time per epoch: 2.49	total time: 23.82 9 	time per epoch: 2.55	total time: 26.37 10 	time per epoch: 2.48	total time: 28.86 11 	time per epoch: 2.61	total time: 31.47 12 	time per epoch: 2.48	total time: 33.96 13 	time per epoch: 2.55	total time: 36.52 14 	time per epoch: 2.51	total time: 39.03 15 	time per epoch: 2.61	total time: 41.64 16 	time per epoch: 2.96	total time: 44.60 17 	time per epoch: 2.97	total time: 47.58 18 	time per epoch: 2.68	total time: 50.28 19 	time per epoch: 2.80	total time: 53.08 20 	time per epoch: 2.81	total time: 55.90 21 	time per epoch: 2.82	total time: 58.73 22 	time per epoch: 2.78	total time: 61.51 23 	time per epo

	epoch: 186 	time per epoch: 3.04	total time: 597.98 187 	time per epoch: 2.77	total time: 600.76 188 	time per epoch: 2.66	total time: 603.43 189 	time per epoch: 2.88	total time: 606.31 190 	time per epoch: 2.73	total time: 609.04 191 	time per epoch: 2.68	total time: 611.73 192 	time per epoch: 2.68	total time: 614.41 193 	time per epoch: 2.60	total time: 617.02 194 	time per epoch: 2.66	total time: 619.68 195 	time per epoch: 2.73	total time: 622.42 196 	time per epoch: 2.62	total time: 625.05 197 	time per epoch: 2.66	total time: 627.71 198 	time per epoch: 3.16	total time: 630.88 199 	time per epoch: 2.66	total time: 633.55 200 	time per epoch: 3.05	total time: 636.61 201 	time per epoch: 3.00	total time: 639.61 202 	time per epoch: 2.88	total time: 642.51 203 	time per epoch: 2.62	total time: 645.13 204 	time per epoch: 2.61	total time: 647.75 205 	time per epoch: 2.84	total time: 650.60 206 	time per epoch: 2.74	total time: 653.34 207 	time per epoch: 2.94	total time: 656.29 20

	Last epoch:poch: 2.52	total time: 1101.06 368 	time per epoch: 2.63	total time: 1103.69 369 	time per epoch: 2.52	total time: 1106.21 370 	time per epoch: 2.59	total time: 1108.81 371 	time per epoch: 2.57	total time: 1111.38 372 	time per epoch: 3.08	total time: 1114.47 373 	time per epoch: 2.60	total time: 1117.08 374 	time per epoch: 2.53	total time: 1119.61 375 	time per epoch: 2.69	total time: 1122.30 376 	time per epoch: 2.55	total time: 1124.86 377 	time per epoch: 2.58	total time: 1127.45 378 	time per epoch: 2.54	total time: 1129.99 379 	time per epoch: 2.59	total time: 1132.59 380 	time per epoch: 2.54	total time: 1135.13 381 	time per epoch: 2.62	total time: 1137.75 382 	time per epoch: 2.52	total time: 1140.28 383 	time per epoch: 2.59	total time: 1142.87 384 	time per epoch: 2.60	total time: 1145.48 385 	time per epoch: 2.69	total time: 1148.18 386 	time per epoch: 3.02	total time: 1151.21 387 	time per epoch: 2.78	total time: 1154.00 388 	time per epoch: 2.71	total time:

In [17]:
# Encode a seed word with the codec; the printed shape shows it is a flat 1-d vector.
word = codec.encode_onehot("Josh")
print(word.shape)

(530,)


In [18]:
# Sample a visible vector starting from the seed word, with n_gibbs=2000.
# NOTE(review): x_hat is decoded as the sample below; x_hat_p presumably holds the
# corresponding probabilities - confirm against Rbm.RBM.sample_visible_from_visible.
x_hat, x_hat_p = rbm_.sample_visible_from_visible(word, n_gibbs=2000)
print(x_hat.shape, x_hat_p.shape)

(530,) (530,)


In [19]:
# Decode the sampled visible vector back to text and show it.
fW = codec.decode(x_hat)
print(fW)
# Disabled experiment: draw ten further samples, each starting again from x_hat.
# for i in range(10):
#     x_hat1,x_hat_p1 = rbm_.sample_visible_from_visible(x_hat, n_gibbs=2000)
#     print(codec.decode(x_hat1))

Dr??t?$$$$
