<a href="https://colab.research.google.com/github/JuanJoseMV/neuraltextgen/blob/main/GCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
#Install pytorch and fastai (see https://docs.fast.ai/)
!conda update conda --yes
!conda install -c pytorch pytorch-nightly cuda90 --yes
!conda install -c fastai torchvision-nightly --yes
!conda install -c fastai fastai --yes
!conda install -c anaconda jupyter unzip cython cupy seaborn --yes
#!pip install GCNN_textfuncs
#!git clone --recursive https://github.com/DavidWBressler/GCNN

In [2]:
from fastai import *
from torch.utils.data.dataset import Dataset
from torch.utils.data.sampler import Sampler
import numpy as np

#create adaptive-softmax language-model dataset
class LMDataset_GCNN(Dataset):
    def __init__(self, tokens):
        self.tokens=tokens
    def __getitem__(self,index):
        #token_list=torch.FloatTensor(self.tokens[index]).cuda()
        token_list=torch.LongTensor(self.tokens[index])
        label=torch.FloatTensor([1])
        #label=torch.ones(len(token_list)-1).float()
        return token_list,label
    def __len__(self):
        return len(self.tokens)
    
    
class SortSampler_GCNN(Sampler): #inspired by fast.ai sortsampler... pass in something like key=lambda x: len(val_clas[x])
    def __init__(self, data_source, key): self.data_source,self.key = data_source,key
    def __len__(self): return len(self.data_source)
    def __iter__(self):
        return iter(sorted(range(len(self.data_source)), key=self.key, reverse=True))#return iterator in reverse order, sorted by input key (e.g. length)
    
#this sortishsampler does the following:
    # 1) get a list of randomized indices of length of entire dataset
    # 2) break that into a list of sublists, each sublist of size bs*50
    # 3) create a new list that is sorted within each of those chunks
    # 4) break that sorted list into chunks of size bs
    # 5) create new list by randomizing order of all those bs-chunks (with bs-chunk w/ largest key first)
#this will give batches sorted by key (e.g. length) within the batch
class SortishSampler_GCNN(Sampler): #inspired by fast.ai sortishsampler... pass in something like key=lambda x: len(val_clas[x])
    def __init__(self, data_length, key,bs): self.data_length,self.key,self.bs = data_length,key,bs
    def __len__(self): return self.data_length
    def __iter__(self):
        idxs = np.random.permutation(self.data_length)#random permutation of length of entire dataset
        sz = self.bs*50 #chunk size is bs*50
        #range(0, len(idxs), sz) : go through length of entire dataset, with stepsize=chunk_size
        #idxs[i:i+sz] :within that chunk's range, get all the indices of the random permutation above
        #this creates a list of lists... basically just splitting up idxs into a bunch of chunks
        ck_idx = [idxs[i:i+sz] for i in range(0, len(idxs), sz)]
        #for s in ck_idx: go through each sublist in the big list
        #sorted(s, key=self.key, reverse=True): sort the sublist in reverse order according to the key (e.g. length)
        #np.concatenate: concatenate all the sorted chunk sublists together
        sort_idx = np.concatenate([sorted(s, key=self.key, reverse=True) for s in ck_idx])
        sz = self.bs #now set size to bs
        #similar as before, this creates a list of lists, splitting up sort_idx into chunks of size bs
        ck_idx = [sort_idx[i:i+sz] for i in range(0, len(sort_idx), sz)]
        # go through each bs-chunk, get the key of the first entry (which should be the largest of the chunk)...
        # then do argmax to find the chunk with the largest key
        max_ck = np.argmax([self.key(ck[0]) for ck in ck_idx])  
        #switch spots bw the first chunk and the chunk w/ the max key
        ck_idx[0],ck_idx[max_ck] = ck_idx[max_ck],ck_idx[0]
        #now randomize the order of all the bs-chunks (except the first), then concatenate together
        sort_idx = np.concatenate(np.random.permutation(ck_idx[1:]))
        sort_idx = np.concatenate((ck_idx[0], sort_idx))# concatenate the first (largest key val) chunk to the rest
        return iter(sort_idx)

#inspired by fast.ai's pad_collate
def pad_collate_GCNN(samples, pad_idx=1):
    #go through all the sentences (found in s[0]), find length of longest
    max_len = max([len(s[0]) for s in samples]) 
    #create a tensor of size [max_len,n_samples], and set all values to pad_idx
    res = torch.zeros(max_len, len(samples)).long() + pad_idx 
    #for each line in res, set so corresponding sentence is aligned to the left edge
        #(right-padded: keep padding on the right edge)
    for i,s in enumerate(samples): res[:len(s[0]),i] = LongTensor(s[0]) #right-padded
    #return res as the padded tensor, and another tensor composed of the labels (found in s[1])
    return res.cuda(), torch.FloatTensor(np.array([s[1] for s in samples])).squeeze().cuda()

In [3]:
from fastai import *
from fastai.text import * 

import torch.utils.data as data_utils
from torch.utils.data.dataset import Dataset
from torch.utils.data.sampler import Sampler
from torch.autograd import Variable

import time
import importlib
import seaborn as sns

#os.chdir("/content/GCNN/")
#from GCNN_textfuncs import *

In [4]:
DATAPATH = Path('/data/GCNN/')

In [5]:
sns.set() #set graph formatting to seaborn

In [6]:
#download wikitext-2 dataset and GloVe embeddings
!wget https://s3.amazonaws.com/fast-ai-nlp/wikitext-2.tgz -P /data
!tar xzf /data/wikitext-2.tgz -C /data
!mv /data/wikitext-2/ /data/GCNN/
!wget http://nlp.stanford.edu/data/glove.6B.zip -P /data/GCNN/
!unzip /data/GCNN/glove.6B.zip -d /data/GCNN/

--2021-08-27 17:23:12--  https://s3.amazonaws.com/fast-ai-nlp/wikitext-2.tgz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.166.77
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.166.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4070055 (3.9M) [application/x-tar]
Saving to: ‘/data/wikitext-2.tgz’


2021-08-27 17:23:13 (5.94 MB/s) - ‘/data/wikitext-2.tgz’ saved [4070055/4070055]

--2021-08-27 17:23:13--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-08-27 17:23:14--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs

In [7]:
#Do some preprocessing of the data:

#put data into df's w/ columns for 'labels' and 'text'
df_trn = pd.read_csv(DATAPATH/'train.csv',header=None,names=['text'])
df_test = pd.read_csv(DATAPATH/'test.csv',header=None,names=['text'])
df_trn['labels']=0
df_test['labels']=0
df_trn=df_trn[['labels','text']]
df_test=df_test[['labels','text']]

#split data into paragraphs, then remove paragraphs <10 or >300 words
trn_paragraphs=[]
for docnum in range(len(df_trn)):
    trn_paragraphs.extend([x for x in df_trn.iloc[docnum].text.split('\n')])
trn_paragraphs.sort(key=len)
trn_paragraphs=[par for par in trn_paragraphs if (len(par.split(' '))<300 and len(par.split(' '))>10)]#remove paragraphs >300 and <10 words
trn_paragraphs=[par+'xxeos ' for par in trn_paragraphs] #add EOS token at end of each paragraph

test_paragraphs=[]
for docnum in range(len(df_test)):
    test_paragraphs.extend([x for x in df_test.iloc[docnum].text.split('\n')])
test_paragraphs.sort(key=len)
test_paragraphs=[par for par in test_paragraphs if (len(par.split(' '))<300 and len(par.split(' '))>10)]#remove paragraphs >300 and <10 words
test_paragraphs=[par+'xxeos ' for par in test_paragraphs] #add EOS token at end of each paragraph

#put data into csv's
df_trn_par = pd.DataFrame({'text':trn_paragraphs})
df_test_par = pd.DataFrame({'text':test_paragraphs})

df_trn_par['labels']=0
df_test_par['labels']=0
df_trn_par=df_trn_par[['labels','text']]
df_test_par=df_test_par[['labels','text']]

df_trn_par.to_csv(DATAPATH/'train_proc_par2.csv', header=False, index=False)
df_test_par.to_csv(DATAPATH/'test_proc_par2.csv', header=False, index=False)

# Create modeler class

In [8]:
class modeler():
    def __init__(self,trn_dl,val_dl,module,modelvals=None):
        self.trn_dl, self.val_dl, self.module = trn_dl, val_dl, module
        self.modelvals=modelvals
        self.model=self.module.cuda()
    def model_fit(self):
        samp_n=self.modelvals['samp_n']#the number of iterations in an epoch
        starttime=time.time()
        train_loss_list=[]; val_loss_list=[]
        for epoch in range(0, self.modelvals['epochs']):
            pbar=0#progressbar
            for batch_idx, (data, target) in enumerate(self.trn_dl):
                
                #GRAB MINIBATCH OF INPUTS AND TARGETS, SET OPTIMIZER
                data = Variable(data)
                pbar+=self.modelvals['bs'] #how many iterations have we done in the epoch so far
                if self.modelvals['opttype']=='sgd':
                    self.optimizer = optim.SGD(self.model.parameters(), lr=self.modelvals['lr'], 
                                       momentum=self.modelvals['mom'], weight_decay=self.modelvals['wd'],
                                              nesterov=self.modelvals['nesterov'])
                elif self.modelvals['opttype']=='adam':
                    self.optimizer = optim.Adam(self.model.parameters(), lr=self.modelvals['lr'], 
                                        betas=(self.modelvals['mom'], 0.999))
                self.optimizer.zero_grad()
                
                #FORWARD PASS
                output = self.model(data)
                
                #CALCULATE AND BACKPROP THE LOSS
                loss= output.loss
                loss.backward()
                if self.modelvals['grad_clip']!=0: #gradient clipping
                    torch.nn.utils.clip_grad_value_(self.model.parameters(), self.modelvals['grad_clip'])
                    
                #UPDATE THE WEIGHTS
                self.optimizer.step()
                
                #PRINT OUT TRAINING UPDATES
                train_loss_list.append([epoch,pbar+epoch*samp_n,loss.data.item(),self.modelvals['lr']])
                if batch_idx % 100 == 0:
                    elapsed_time=time.time()-starttime
                    train_update_format_string = 'Train Epoch: {}'
                    train_update_format_string += '\tTotal_its: {:.2f}M [{:.2f}M/{:.2f}M]'
                    train_update_format_string += '\tPercdone: {:.2f}'
                    train_update_format_string += '\tLoss: {:.4f}'
                    train_update_format_string += '\tTime: {:.2f}'
                    train_update_format_string += '\tLR: {:.4f}'
                    train_update_string=train_update_format_string.format(
                            epoch,
                            (pbar + epoch * samp_n) / 1000000, pbar / 1000000, samp_n / 1000000,
                            pbar / samp_n,
                            loss.data.item(),
                            elapsed_time / 60,
                            self.modelvals['lr'])
                    print(train_update_string)
            final_train_loss=loss.data.item()
            
            #NOW TEST VALIDATION SET
            val_loss=[]
            self.model.eval() #important to set to eval mode for testing, so that eg batchnorm and dropout aren't used
            for batch_idx, (data, target) in enumerate(self.val_dl):
                data = Variable(data)
                self.optimizer.zero_grad()
                #ONLY NEED FORWARD PASS... NO BACKPROP
                output = self.model(data)
                loss= output.loss
                output=output.output
                val_loss.append(loss.data.item())
            self.model.train() #set back to training mode
            ave_val_loss=sum(val_loss) / len(val_loss)
            val_update_string='Validation Loss: {:.4f}\tPerp: {:.4f}'.format(
                ave_val_loss,np.exp(ave_val_loss))
            print(val_update_string)
            val_loss_list.append([epoch,ave_val_loss, np.exp(ave_val_loss),elapsed_time/60])
        self.modelvals['val_loss_list']=val_loss_list
        self.modelvals['train_loss_list']=train_loss_list
        print('The end! {:.2f} minutes'.format((time.time()-starttime)/60))

# Set hyperparameters and build embeddings

In [9]:
#set hyperparameters
bs=50 #batch-size
emb_sz=300 #size of the embedding matrix
nl=4 #number of layers
nh=600 #number hidden units
lr=1 #learning rate
mom=.95 #momentum
wd=5e-5 #weight-decay. Only has effect if opttype==sgd
epochs=50
nesterov=True #Nesterov momentum. only has effect if opttype==sgd
grad_clip=0.07 #gradient clipping value. Set to 0 for no effect. See nn.utils.clip_grad_value_
opttype='sgd' #adam, sgd
k=4 #kernel_width
downbot=20# in the bottleneck layers, how much to decrease channel depth?

In [10]:
#Use fast.ai to create a TextLMDataBunch object. See http://docs.fast.ai/text.data.html#class-textlmdatabunch
#This tokenizes and numericalizes the data
data_lm = TextLMDataBunch.from_csv(path=DATAPATH, csv_name='train_proc_par2.csv', test='test_proc_par2.csv')
itos=data_lm.train_ds.vocab.itos# the vocab
vs=len(itos)# vs is the length of the vocab

#Grab the numericalized data from the TextLMDataBunch dataset, then construct new custom dataset using LMDataset_GCNN
trn_tokens=[data_lm.train_ds[i][0].data for i in range(len(data_lm.train_ds))]
traindataset=LMDataset_GCNN(trn_tokens)
valid_tokens=[data_lm.valid_ds[i][0].data for i in range(len(data_lm.valid_ds))]
validdataset=LMDataset_GCNN(valid_tokens)

#Create data loaders for training and validation sets
trn_samp=SortishSampler_GCNN(data_length=len(traindataset),key=lambda x:len(traindataset[x][0]), bs=bs)
val_samp=SortSampler_GCNN(validdataset,key=lambda x:len(validdataset[x][0]))
train_loader = data_utils.DataLoader(traindataset, batch_size=bs, collate_fn=pad_collate_GCNN, sampler=trn_samp)
val_loader = data_utils.DataLoader(validdataset, batch_size=bs, collate_fn=pad_collate_GCNN,sampler=val_samp)
samp_n=len(traindataset)
val_samp_n=len(validdataset)

  return np.array(a, dtype=dtype, **kwargs)


In [11]:
#put hyperparameters into a dictionary
def get_modelvals():
    modelvals=dict((name,eval(name)) for name in [
        'lr','mom','wd','opttype','epochs','samp_n','val_samp_n',
        'bs','emb_sz','vs', 'nh', 'nl','DATAPATH','nesterov','grad_clip',
        'k','downbot'] )
    return modelvals

modelvals=get_modelvals()

In [12]:
#grab GloVe embeddings:
#create vocab itos2 from downloaded glove file
words = []
idx = 0
word2idx = {}
vectors = []
with open('/data/GCNN/glove.6B.300d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vectors.append(line[1:])
itos2=words

#grab the glove embeddings we need, based on the words in our vocab
stoi2 = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(itos2)}) #default -1 means its not in glove's itos2
row_m = vectors[-1] #this is default vector... for <unk>
new_w = np.zeros((vs, emb_sz), dtype=np.float32)#initialize new weights to zeros of size (vocab_size,embedding size) e.g. (60002,300)... we're creating an embedding matrix 
for i,w in enumerate(itos): #for index,word in our itos dict, get r index of the word in word2vec's dict. r will be -1 if it doesn't exist in word2vec's dict
    r = stoi2[w]#r index of the word in word2vec's dict
    new_w[i] = vectors[r] if r>=0 else row_m #for our new embedding matrix, set the embedding at the index from our dict equal to the embedding from index r from word2vec's dict
np.save(DATAPATH/'emb_wgts300_proc_par2.npy', new_w) #save the embedding weights

# Run GCNN

In [13]:
class GLUblock(nn.Module):
    def __init__(self, k, in_c, out_c, downbot):
        super().__init__()
        #only need to change shape of the residual if num_channels changes (i.e. in_c != out_c)
        #[bs,in_c,seq_length]->conv(1,in_c,out_c)->[bs,out_c,seq_length]
        if in_c == out_c:
            self.use_proj=0
        else:
            self.use_proj=1
        self.convresid=nn.utils.weight_norm(nn.Conv2d(in_c, out_c, kernel_size=(1,1)),name='weight',dim=0)
        
        self.leftpad = nn.ConstantPad2d((0,0,k-1,0),0)#(paddingLeft, paddingRight, paddingTop, paddingBottom)

        #[bs,in_c,seq_length+(k-1)]->conv(1,in_c,in_c/downbot)->[bs,in_c/downbot,seq_length+(k-1)]
        self.convx1a = nn.utils.weight_norm(nn.Conv2d(in_c, int(in_c/downbot), kernel_size=(1,1)),name='weight',dim=0)
        self.convx2a = nn.utils.weight_norm(nn.Conv2d(in_c, int(in_c/downbot), kernel_size=(1,1)),name='weight',dim=0)
        #[bs,in_c/downbot,seq_length+(k-1)]->conv(k,in_c/downbot,in_c/downbot)->[bs,in_c/downbot,seq_length]
        self.convx1b = nn.utils.weight_norm(nn.Conv2d(int(in_c/downbot), int(in_c/downbot), kernel_size=(k,1)),name='weight',dim=0)
        self.convx2b = nn.utils.weight_norm(nn.Conv2d(int(in_c/downbot), int(in_c/downbot), kernel_size=(k,1)),name='weight',dim=0)
        #[bs,in_c/downbot,seq_length]->conv(1,in_c/downbot,out_c)->[bs,out_c,seq_length]
        self.convx1c = nn.utils.weight_norm(nn.Conv2d(int(in_c/downbot), out_c, kernel_size=(1,1)),name='weight',dim=0)
        self.convx2c = nn.utils.weight_norm(nn.Conv2d(int(in_c/downbot), out_c, kernel_size=(1,1)),name='weight',dim=0)

    def forward(self, x):
        residual = x
        if self.use_proj==1:# if in_c != out_c, need to change size of residual
            residual=self.convresid(residual)
        x=self.leftpad(x) # [bs,in_c,seq_length+(k-1),1]
        x1 = self.convx1c(self.convx1b(self.convx1a(x))) # [bs,out_c,seq_length,1]
        x2 = self.convx2c(self.convx2b(self.convx2a(x))) # [bs,out_c,seq_length,1]
        x2 = torch.sigmoid(x2)
        x=torch.mul(x1,x2) # [bs,out_c,seq_length,1]
        return x+residual

In [14]:
class GCNNmodel(nn.Module):
    def __init__(self, vs, emb_sz, k, nh, nl,downbot):
    #def __init__(self, vs, emb_sz, k, nh, nl,dw,cutoffs):
        super().__init__()
        
        self.embed = nn.Embedding(vs, emb_sz)
        
        self.inlayer=GLUblock(k,emb_sz,nh,downbot)
        self.GLUlayers=self.make_GLU_layers(k,nh,nl,downbot)
        self.out=nn.AdaptiveLogSoftmaxWithLoss(nh, vs, cutoffs=[round(vs/25),round(vs/5)],div_value=4)

    def make_GLU_layers(self, k, nh, nl, downbot):
        layers = [GLUblock(k, nh, nh, downbot) for i in range(nl)]
        return nn.Sequential(*layers)
        
    def forward(self, x):
        
        target=x[1:,:]
        target=target.contiguous().view(target.size()[0]*target.size()[1])#[seq_length*bs,out_c]
        x=x[:-1,:]
        
        #first block
        x = self.embed(torch.t(x)) # x -> [seq_length,bs] -> [bs,seq_length] -> [bs,seq_length,emb_sz] ... i.e. transpose 1st
        x=torch.transpose(x, 1, 2) #[bs,emb_sz,seq_length]    
        x = x.unsqueeze(3)  # [bs,emb_sz,seq_length,1]
        x=self.inlayer(x) #[bs,nh,seq_length,1]
             
        #residual GLU blocks
        x=self.GLUlayers(x) # [bs,nh,seq_length,1]
        
        #out
        x=torch.squeeze(x,3) #[bs,out_c,seq_length]
        x=torch.transpose(x, 1, 2) #[bs,seq_length,out_c]
        x=torch.transpose(x, 0, 1) #[seq_length,bs,out_c]
        x=x.contiguous().view(-1,x.size()[2])#[seq_length*bs,out_c]
        outta=self.out(x,target)
        
        return    outta

In [15]:
#create GCNN 
GCNNnet=modeler(train_loader,val_loader,
                           GCNNmodel(vs, emb_sz, k, nh, nl, downbot),modelvals)
print(GCNNnet.model)

GCNNmodel(
  (embed): Embedding(28168, 300)
  (inlayer): GLUblock(
    (convresid): Conv2d(300, 600, kernel_size=(1, 1), stride=(1, 1))
    (leftpad): ConstantPad2d(padding=(0, 0, 3, 0), value=0)
    (convx1a): Conv2d(300, 15, kernel_size=(1, 1), stride=(1, 1))
    (convx2a): Conv2d(300, 15, kernel_size=(1, 1), stride=(1, 1))
    (convx1b): Conv2d(15, 15, kernel_size=(4, 1), stride=(1, 1))
    (convx2b): Conv2d(15, 15, kernel_size=(4, 1), stride=(1, 1))
    (convx1c): Conv2d(15, 600, kernel_size=(1, 1), stride=(1, 1))
    (convx2c): Conv2d(15, 600, kernel_size=(1, 1), stride=(1, 1))
  )
  (GLUlayers): Sequential(
    (0): GLUblock(
      (convresid): Conv2d(600, 600, kernel_size=(1, 1), stride=(1, 1))
      (leftpad): ConstantPad2d(padding=(0, 0, 3, 0), value=0)
      (convx1a): Conv2d(600, 30, kernel_size=(1, 1), stride=(1, 1))
      (convx2a): Conv2d(600, 30, kernel_size=(1, 1), stride=(1, 1))
      (convx1b): Conv2d(30, 30, kernel_size=(4, 1), stride=(1, 1))
      (convx2b): Conv2d(

In [16]:
#load the glove-vectors into the model
new_w=np.load(DATAPATH/'emb_wgts300_proc_par2.npy') #load embedding weights
GCNNnet.model.embed.weight.data=torch.FloatTensor(new_w).cuda()

In [17]:
GCNNnet.model_fit()

  return array(a, dtype, copy=False, order=order)


RuntimeError: ignored