## RBM Name Generation

In [1]:
import argparse
import pickle
from sklearn.model_selection import train_test_split
from GridEncoder import GridEncoder
import Utils
from ShortTextCodec import ShortTextCodec, BinomialShortTextCodec
from RBM import BernoulliRBM
import Sampling
import sample
import sys
import colorama
import numpy as np

In [3]:
# Switch read by CharBernoulliSoftmax._sample_visibles: when truthy, the visible
# bias is divided by min(1.0, temperature) instead of by temperature itself.
# With the value 0 the bias is always scaled by the plain temperature.
BIASED_PRIOR = 0

class CharBernoulli(BernoulliRBM):
    """BernoulliRBM that carries the text codec used to encode its inputs.

    The codec is kept on the instance so fantasy particles can be decoded for
    display, and its shape is cached as `softmax_shape`, which corrupt()
    unpacks as (n_softmax, n_opts).
    """

    def __init__(self, **kwargs):
        """
        codec is the ShortTextCodec used to create the vectors being fit. The
        most important function of the codec is as a proxy to the shape of the
        softmax units in the visible layer (if you're using the
        CharBernoulliSoftmax subclass). It's also used to decode and print
        fantasy particles at the end of each epoch.

        All remaining keyword arguments are forwarded to BernoulliRBM.__init__.

        Raises
        ------
        KeyError
            If no `codec` keyword argument is supplied.
        """
        # Attaching this to the object is really helpful later on when models
        # are loaded from pickle in visualize.py and sample.py
        self.codec = kwargs.pop("codec")
        # Use the codec that was actually passed in. Reading a module-level
        # `codec` here would raise NameError (or silently pick up a different
        # codec) whenever the class is used outside this notebook's state.
        self.softmax_shape = self.codec.shape()
        # Old-style class :(
        BernoulliRBM.__init__(self, **kwargs)

    def wellness_check(self, epoch, duration, train, validation):
        """Run the parent's per-epoch report, then print three decoded fantasy samples.

        All four arguments are forwarded unchanged to
        BernoulliRBM.wellness_check.
        """
        BernoulliRBM.wellness_check(self, epoch, duration, train, validation)
        # Decode visibles sampled from the first three hidden samples at a low
        # temperature (0.1).
        fantasy_samples = '|'.join([self.codec.decode(vec) for vec in
                                    self._sample_visibles(self.h_samples_[:3], temperature=0.1)])
        print("Fantasy samples: {}".format(fantasy_samples))

    def corrupt(self, v):
        """Move one active unit per row to a different option of the same softmax.

        `v` is modified in place through its `indices` array.
        NOTE(review): this presumably expects a scipy CSR matrix holding exactly
        n_softmax non-zeros per row, stored in softmax order -- confirm against
        the codec's encoder.

        Returns
        -------
        (v, state) where state = (meta_indices_to_corrupt, offsets) is what
        uncorrupt() needs to reverse the change.
        """
        n_softmax, n_opts = self.softmax_shape
        # Select a random index in to the indices of the non-zero values of each input
        # TODO: In the char-RBM case, if I wanted to really challenge the model, I would avoid selecting any
        # trailing spaces here. Cause any dumb model can figure out that it should assign high energy to
        # any instance of /  [^ ]/
        meta_indices_to_corrupt = self.rng_.randint(0, n_softmax, v.shape[0]) + np.arange(0, n_softmax * v.shape[0], n_softmax)

        # Offset these indices by a random amount (but not 0 - we want to actually change them)
        offsets = self.rng_.randint(1, n_opts, v.shape[0])
        # Also, do some math to make sure we don't "spill over" into a different softmax.
        # E.g. if n_opts=5, and we're corrupting index 3, we should choose offsets from {-3, -2, -1, +1}
        # 1-d array that matches with meta_i_t_c but which contains the indices themselves
        indices_to_corrupt = v.indices[meta_indices_to_corrupt]
        # Wrap around: any offset that would land past the end of its softmax
        # is shifted back by n_opts so it stays inside the same group.
        offsets = offsets - (n_opts * (((indices_to_corrupt % n_opts) + offsets.ravel()) >= n_opts))

        v.indices[meta_indices_to_corrupt] += offsets
        return v, (meta_indices_to_corrupt, offsets)

    def uncorrupt(self, visibles, state):
        """Reverse corrupt() in place using the `state` tuple it returned.

        Returns None; `visibles.indices` is restored by subtracting the offsets.
        """
        mitc, offsets = state
        visibles.indices[mitc] -= offsets
        
class CharBernoulliSoftmax(CharBernoulli):
    """CharBernoulli variant whose visible layer is sampled one softmax group at a time."""

    def __init__(self, **kwargs):
        """Forward every keyword argument to CharBernoulli.__init__."""
        CharBernoulli.__init__(self, **kwargs)

    def _sample_visibles(self, h, temperature=1.0):
        """Draw visible configurations from P(v|h) under the softmax constraint.

        Every returned configuration v satisfies sum(v) == softmax_shape[0].

        Parameters
        ----------
        h : array-like, shape (n_samples, n_components)
            Hidden-layer values to condition on.
        temperature : float
            Divides the weights, and (unless BIASED_PRIOR is truthy) the
            visible bias as well.

        Returns
        -------
        v : array-like, shape (n_samples, n_features)
            Sampled visible-layer values.
        """
        # With BIASED_PRIOR set, the bias divisor is capped at 1.0.
        if BIASED_PRIOR:
            bias_divisor = min(1.0, temperature)
        else:
            bias_divisor = temperature

        activations = np.dot(h, self.components_ / temperature)
        activations += self.intercept_visible_ / bias_divisor

        n_rows, n_cols = activations.shape
        # View each row as one block per softmax unit so sampling happens
        # independently within every block, then flatten back.
        grouped = np.reshape(activations, (n_rows,) + self.softmax_shape)
        sampled = Utils.softmax_and_sample(grouped)
        return sampled.reshape((n_rows, n_cols))


In [4]:
# Build the codec shared by every dataset in this notebook.
# NOTE(review): the five positional arguments are passed through as-is; their
# meaning is defined by ShortTextCodec's constructor -- confirm there.
codec_kls = ShortTextCodec
codec = codec_kls('', 10, 0, True, False)
codec.debug_description()

# Hyper-parameters handed to the RBM constructor.
model_kwargs = dict(
    codec=codec,
    n_components=100,
    learning_rate=0.1,
    lr_backoff=False,
    n_iter=5,
    verbose=1,
    batch_size=10,
    weight_cost=0.0001,
)

kls = CharBernoulliSoftmax
rbm = kls(**model_kwargs)

### English Names File

In [14]:
# Encode the English names file with the shared codec and hold out 30% of the
# rows for validation.
vecs = Utils.vectors_from_txtfile("./names.txt", codec)
# A fixed random_state puts the same rows in train/validation on every run, so
# the energies reported during training are comparable between runs.
train, validation = train_test_split(vecs, test_size=0.3, random_state=0)
print(train.shape, validation.shape)

(39800, 530) (17058, 530)


In [15]:
# Train the RBM on the English split; the validation set is handed to fit()
# together with the training set.
rbm.fit(train,validation)

Reusing existing weights and biases
[CharBernoulliSoftmax] Iteration 1/5	t = 5.97s
Pseudo-log-likelihood sum: -44249.33	Average per instance: -1.11
E(vali):	-11.48	E(train):	-11.52	difference: 0.04
Fantasy samples: Corinski$$|Stote$$$$$|Sleilan$$$
[CharBernoulliSoftmax] Iteration 2/5	t = 6.51s
Pseudo-log-likelihood sum: -41458.82	Average per instance: -1.04
E(vali):	-15.78	E(train):	-15.81	difference: 0.02
Fantasy samples: Maromarse$|Wintlar$$$|Sillimana$
[CharBernoulliSoftmax] Iteration 3/5	t = 6.52s
Pseudo-log-likelihood sum: -39801.02	Average per instance: -1.00
E(vali):	-18.65	E(train):	-18.70	difference: 0.05
Fantasy samples: Dee$$$$$$$|Kielea$$$$|Basgert$$$
[CharBernoulliSoftmax] Iteration 4/5	t = 6.50s
Pseudo-log-likelihood sum: -37956.15	Average per instance: -0.95
E(vali):	-21.23	E(train):	-21.31	difference: 0.08
Fantasy samples: Beyeman$$$|Willer$$$$|Marnelli$$


CharBernoulliSoftmax()

In [16]:
# Buffer filled by horizontal_cb: one entry per reported iteration, each entry
# holding one item per fantasy particle.
SAMPLES = []


def horizontal_cb(strings, i, energy=None):
    """Sampling callback that stores one iteration's particles in SAMPLES.

    strings : sequence of str
        The particles for this iteration.
    i : presumably the iteration index passed by the sampler; unused here.
    energy : sequence, optional
        Per-particle energies. When given, each stored item is a
        (string, energy) tuple instead of a bare string.
    """
    if energy is not None:
        # zip() returns a one-shot iterator in Python 3, which print_columns
        # cannot index; store a real list of pairs instead.
        SAMPLES.append(list(zip(strings, energy)))
    else:
        SAMPLES.append(strings)


def _format_particle(item, col_width):
    """Render one SAMPLES item as fixed-width text (string, or string plus energy)."""
    if isinstance(item, tuple):
        text, energy = item
        # NOTE(review): assumes each energy is a scalar that accepts the
        # '.2f' format spec -- confirm against Sampling.sample_model.
        return text.ljust(col_width) + "{:.2f}".format(energy).ljust(col_width)
    return item.ljust(col_width)


def print_columns(maxlen):
    """Print SAMPLES as a table: one row per particle, one column per stored entry.

    maxlen : int
        Longest string length; each column is padded to maxlen + 2 characters.

    Prints nothing when SAMPLES is empty.
    """
    if not SAMPLES:
        return
    col_width = maxlen + 2
    for fantasy_index in range(len(SAMPLES[0])):
        print("".join(_format_particle(s[fantasy_index], col_width) for s in SAMPLES))
# Report only index 999 to the callback.
# NOTE(review): 30 and 1000 are presumably the particle count and the number of
# sampling iterations -- confirm against Sampling.sample_model.
sample_indices = [1000 - 1]
kwargs = {
    'start_temp': 1.0,
    'final_temp': 1.0,
    'sample_energy': False,
    'callback': horizontal_cb,
}

vis = Sampling.sample_model(rbm, 30, 1000, sample_indices, **kwargs)

# Show the collected particles, then summarise the free energy of the returned visibles.
print_columns(rbm.codec.maxlen)
fe = rbm._free_energy(vis)
print('Final energy: {:.2f} (stdev={:.2f})\n'.format(fe.mean(), fe.std()))

Pahes       
Ezams       
Tpeomen     
Yot         
Dittermand  
Lyyen       
Lish        
Rotte       
Renn        
Tabey       
Jeren       
Bonkerman   
Futterger   
Rakat       
Rei         
Elher       
Davet       
Rena        
Wuwen       
Hayf        
Chy         
Plat        
Luttn       
Fine        
Nella       
Meany       
Histeboale  
Neal        
Bloss       
Mihel       
Final energy: -29.02 (stdev=3.30)



### Spanish Names File

In [17]:
# Encode the Spanish file with the same codec and hold out 30% of the rows for
# validation. This rebinds vecs/train/validation from the English section.
vecs = Utils.vectors_from_txtfile("./spanish.txt", codec)
# A fixed random_state puts the same rows in train/validation on every run, so
# the energies reported during training are comparable between runs.
train, validation = train_test_split(vecs, test_size=0.3, random_state=0)
print(train.shape, validation.shape)

(36523, 530) (15653, 530)


In [18]:
# Fit on the Spanish split. `rbm` is the same object that was already fit on the
# English names, and the log below reports "Reusing existing weights and biases".
# NOTE(review): this looks like it continues from the English-trained weights --
# build a new model first if an independent Spanish model is intended.
rbm.fit(train,validation)

Reusing existing weights and biases
[CharBernoulliSoftmax] Iteration 1/5	t = 5.63s
Pseudo-log-likelihood sum: -24017.08	Average per instance: -0.66
E(vali):	-38.12	E(train):	-38.18	difference: 0.07
Fantasy samples: carucar$$$|atecuntar$|pubetear$$
[CharBernoulliSoftmax] Iteration 2/5	t = 6.01s
Pseudo-log-likelihood sum: -21687.40	Average per instance: -0.59
E(vali):	-42.84	E(train):	-42.77	difference: -0.06
Fantasy samples: empiluchir|preparizar|altertar$$
[CharBernoulliSoftmax] Iteration 3/5	t = 5.99s
Pseudo-log-likelihood sum: -21348.57	Average per instance: -0.58
E(vali):	-43.01	E(train):	-43.14	difference: 0.12
Fantasy samples: poldar$$$$|enriipar$$|rescitar$$
[CharBernoulliSoftmax] Iteration 4/5	t = 6.02s
Pseudo-log-likelihood sum: -22206.58	Average per instance: -0.61
E(vali):	-48.46	E(train):	-48.52	difference: 0.06
Fantasy samples: brinar$$$$|maquetear$|cerrintear


CharBernoulliSoftmax()

In [21]:
# horizontal_cb and print_columns are already defined in the English sampling
# cell earlier in this notebook; identical second copies here would only let the
# two versions drift apart. Both look SAMPLES up at call time, so resetting the
# buffer is all that is needed to keep the Spanish particles separate.
SAMPLES = []
# Report only index 999 to the callback.
# NOTE(review): 30 and 1000 are presumably the particle count and the number of
# sampling iterations -- confirm against Sampling.sample_model.
sample_indices = [1000 - 1]
kwargs = {
    'start_temp': 1.0,
    'final_temp': 1.0,
    'sample_energy': False,
    'callback': horizontal_cb,
}

vis = Sampling.sample_model(rbm, 30, 1000, sample_indices, **kwargs)

# Show the collected particles, then summarise the free energy of the returned visibles.
print_columns(rbm.codec.maxlen)
fe = rbm._free_energy(vis)
print('Final energy: {:.2f} (stdev={:.2f})\n'.format(fe.mean(), fe.std()))

travacar    
chamanar    
ajeyar      
emplefar    
emprocar    
trafanar    
zarrars     
embranar    
trajar      
adilar      
mubtirar    
sartabar    
emblolar    
embracar    
pretener    
promojir    
tretar      
subrecir    
empracar    
prorificar  
clobar      
silgonizar  
arajar      
merrerar    
ahasar      
pondar      
zablar      
tansuear    
atarar      
jirrar      
Final energy: -64.08 (stdev=4.31)

