## Utilities

In [1]:
import sys
def drawProgressBar(shell_out,
                    begin, k, out_of, end, barLen =25):
    """Render a one-line text progress bar and optionally print it in place.

    Args:
        shell_out: when truthy, the bar is written to stdout preceded by a
            carriage return so it overwrites the previous bar.
        begin: text placed before the counter.
        k: number of completed items.
        out_of: total number of items.
        end: text placed after the percentage.
        barLen: width of the bar in characters.

    Returns:
        The rendered text, e.g. ``"1/4[=>__](25.00%)"`` for ``barLen=4``.
    """
    # A total of zero is reported as 0% instead of raising ZeroDivisionError.
    percent = k / float(out_of) if out_of else 0.0
    filled = int(barLen * percent)
    progress = "".join(
        "=" if i < filled else (">" if i == filled else "_")
        for i in range(barLen)
    )
    text = "%s%d/%d[%s](%.2f%%)%s" % (begin, k, out_of, progress, percent * 100, end)
    if shell_out:
        # Only touch stdout when asked to; the carriage return rewinds the
        # cursor so successive calls redraw the same line.
        sys.stdout.write("\r" + text)
        sys.stdout.flush()
    return text

## Data

The corpus (DSLCC v2.0) is read from a local checkout of the DSL-Task repository: https://github.com/Simdiva/DSL-Task

In [2]:
# Label codes; a label's position in this list is its class index.
languages = [
    "bg", "mk", "bs", "hr", "sr", "cz", "sk",
    "es-ES", "es-AR", "pt-BR", "pt-PT", "id", "my", "xx",
]

# Lookups in both directions between a label string and its class index.
label_to_index = dict((label, idx) for idx, label in enumerate(languages))
index_to_label = dict(enumerate(languages))

# print label_to_index['bs']
# print index_to_label[2]

In [3]:
# Locations of the three corpus splits, relative to the notebook's working directory.
train_path, dev_path, test_path = (
    "./DSL-Task/data/DSLCC-v2.0/train-dev/train.txt",
    "./DSL-Task/data/DSLCC-v2.0/train-dev/devel.txt",
    "./DSL-Task/data/DSLCC-v2.0/test/test.txt",
)

In [4]:
import codecs
def load(path):
    """Read a UTF-8 encoded file and return its content split on "\\n".

    The split is a plain ``str.split("\\n")``, so a file ending with a newline
    yields a trailing empty string.
    """
    handle = codecs.open(path, 'rb', 'utf-8')
    try:
        return handle.read().split("\n")
    finally:
        handle.close()

In [5]:
def character_splitting(text):
    """Return the characters of ``text`` separated by single spaces.

    Each space character in the input is replaced by the token ``<sp>`` so it
    stays distinguishable from the separators.
    """
    tokens = ['<sp>' if ch == ' ' else ch for ch in text]
    return ' '.join(tokens)
#print character_splitting('This is a test')

In [6]:
def preprocess_lines(text_lines):
    """Turn raw corpus lines into ``(text, label)`` tuples.

    Each line is stripped of surrounding whitespace; lines that are empty
    afterwards are skipped. The label is whatever follows the last tab on the
    line. A line without any tab is kept with an empty label.

    Args:
        text_lines: iterable of strings, one per corpus line.

    Returns:
        A list of ``(text, label)`` tuples in input order.
    """
    text_labels = []
    for text_line in text_lines:
        # Strip before testing for emptiness so whitespace-only lines
        # (e.g. a stray "\r") do not turn into ("", "") samples.
        text_line = text_line.strip()
        if not text_line:
            continue
        if "\t" in text_line:
            # Split on the last tab so a tab inside the sentence does not
            # cost the sample its label.
            text, label = text_line.rsplit("\t", 1)
        else:
            text, label = text_line, ""
        text_labels.append((text, label))
    return text_labels

In [7]:
# Read the raw lines of the train, dev and test splits.
train_lines, dev_lines, test_lines = map(load, (train_path, dev_path, test_path))

In [8]:
# Parse every split into (text, label) tuples and report the split sizes.
train_samples, dev_samples, test_samples = map(
    preprocess_lines, (train_lines, dev_lines, test_lines))
print("train_samples: %d" % len(train_samples))
print("dev_samples: %d" % len(dev_samples))
print("test_samples: %d" % len(test_samples))

train_samples: 252000
dev_samples: 28000
test_samples: 14000


## Build Lookup Table

In [9]:
# Reserved vocabulary ids: three consecutive integers starting at n.
n = 0
SOS_token, EOS_token, PAD_token = n, n + 1, n + 2

from collections import defaultdict
class Lang:
    """Vocabulary that assigns an integer id to every distinct token.

    The ids ``SOS_token``, ``EOS_token`` and ``PAD_token`` are pre-registered
    in ``index2word``; regular tokens receive ids starting at the number of
    reserved entries.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = defaultdict(int)
        self.index2word = {
            SOS_token: "SOS",
            EOS_token: "EOS",
            PAD_token: "PAD",
        }
        # Next free id; starts after the reserved entries.
        self.n_words = len(self.index2word)

    def index_words(self, sentence):
        """Register every space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.index_word(token)

    def index_word(self, word):
        """Register one token: count it, assigning a new id on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_id = self.n_words
        self.word2index[word] = new_id
        self.word2count[word] = 1
        self.index2word[new_id] = word
        self.n_words = new_id + 1

# lang = Lang('all')
# lang.index_words('This is a test')
# print "number of words:%d"%lang.n_words

In [10]:
# Build one shared vocabulary over the texts of all three splits.
lang = Lang('languages')
for sample_text, _label in train_samples + dev_samples + test_samples:
    lang.index_words(sample_text)

print("number of vocabularies= %d" % lang.n_words)

number of vocabularies= 1108619


## Replace words of sentences with indices

In [11]:
from torch.autograd import Variable
# Fixed length of every encoded sentence, SOS and EOS included.
MAX_TEXT_LEN = 5
def indicies_from_sentence(lang, sentence):
    """Encode ``sentence`` as a list of exactly ``MAX_TEXT_LEN`` vocabulary ids.

    The result starts with ``SOS_token``, followed by the ids of the
    space-separated tokens, cut so that ``EOS_token`` still fits, and is
    filled up with ``PAD_token`` after the EOS when the sentence is short.
    Raises ``KeyError`` for a token missing from ``lang.word2index``.
    """
    token_ids = [SOS_token] + [lang.word2index[tok] for tok in sentence.split(' ')]
    # Keep at most MAX_TEXT_LEN - 1 ids so the end marker always fits.
    encoded = token_ids[:MAX_TEXT_LEN - 1] + [EOS_token]
    # A negative multiplier yields an empty list, so full-length input gets no padding.
    encoded += [PAD_token] * (MAX_TEXT_LEN - len(encoded))
    return encoded

def variable_from_sentence(lang, sentence):
    """Encode ``sentence`` and wrap the ids in a column-shaped LongTensor Variable."""
    index_tensor = torch.LongTensor(indicies_from_sentence(lang, sentence))
    # view(-1, 1) turns the flat id list into a single column.
    return Variable(index_tensor.view(-1, 1))

def variable_from_label(label):
    """Return the class index of ``label`` as a 1x1 LongTensor Variable.

    Raises ``KeyError`` when ``label`` is not a key of ``label_to_index``.
    """
    class_index = label_to_index[label]
    label_var = Variable(torch.LongTensor([class_index]))
    return label_var.view(-1, 1)

#print "indicies are:", indicies_from_sentence(lang,'this is')
# print "variable is:", variable_from_sentence(lang,'this is')

# CNN

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN_Text(nn.Module):
    """Convolutional text classifier.

    Token ids are embedded, passed through one Conv2d branch per kernel size
    (each kernel spans the full embedding width), max-pooled over the sequence
    axis, concatenated, regularised with dropout and projected to class logits.

    ``params`` keys read by the constructor: ``embed_num``, ``embed_dim``,
    ``class_num``, ``kernel_num``, ``kernel_sizes`` and ``dropout``.
    """

    def __init__(self,params):
        super(CNN_Text,self).__init__()

        vocab_size = params['embed_num']
        embed_dim = params['embed_dim']
        num_classes = params['class_num']
        in_channels = 1
        num_filters = params['kernel_num']
        kernel_heights = params['kernel_sizes']

        self.embed = nn.Embedding(vocab_size, embed_dim)

        # One convolution per kernel height; each covers `height` consecutive
        # tokens across the whole embedding dimension.
        branches = []
        for height in kernel_heights:
            branches.append(nn.Conv2d(in_channels, num_filters, (height, embed_dim)))
        self.convs1 = nn.ModuleList(branches)

        self.dropout = nn.Dropout(params['dropout'])
        self.fc1 = nn.Linear(len(kernel_heights) * num_filters, num_classes)

    def conv_and_pool(self, x, conv):
        """Apply ``conv`` + ReLU to ``x`` and max-pool over the sequence axis."""
        activated = F.relu(conv(x)).squeeze(3)  # (N, Co, W')
        return F.max_pool1d(activated, activated.size(2)).squeeze(2)  # (N, Co)

    def forward(self, x):
        """Map a batch of token-id sequences to class logits of shape (N, C)."""
        embedded = self.embed(x).unsqueeze(1)  # (N, Ci, W, D)
        pooled = [self.conv_and_pool(embedded, conv) for conv in self.convs1]
        features = self.dropout(torch.cat(pooled, 1))  # (N, len(Ks)*Co)
        return self.fc1(features)

In [13]:
import random
random.seed(0)
def get_batches(data_samples, batch_size):
    """Shuffle the samples and group them into fixed-size tensor batches.

    Samples that do not fill a complete batch are dropped. The caller's list
    is left untouched; shuffling happens on a copy.

    Args:
        data_samples: list of ``(text, label)`` tuples.
        batch_size: number of samples per batch (positive int).

    Returns:
        A list of ``(batch_texts, batch_labels)`` pairs, each element being
        the ``torch.stack`` (along dim 0) of the per-sample variables produced
        by ``variable_from_sentence`` / ``variable_from_label``.
    """
    # Work on a copy so callers do not see their list reordered.
    shuffled = list(data_samples)
    random.shuffle(shuffled)

    # Floor division keeps the batch count an integer on Python 2 and 3.
    num_batches = len(shuffled) // batch_size

    batches = []
    for k in range(num_batches):
        batch = shuffled[k * batch_size:(k + 1) * batch_size]
        batch_texts = torch.stack(
            [variable_from_sentence(lang, sample[0]) for sample in batch], dim=0)
        batch_labels = torch.stack(
            [variable_from_label(sample[1]) for sample in batch], dim=0)
        batches.append((batch_texts, batch_labels))
    return batches

# b0 =  get_batches(train_samples,5)[0]
# b0_texts ,b0_labels = b0[0],b0[1]
# print torch.stack(b0_labels).size()

## Evaluation Function

In [14]:
def eval(data_samples, model,batch_size):
    """Return the classification accuracy (in percent) of ``model``.

    The model is switched to evaluation mode while scoring, so dropout is
    disabled, and is put back into its previous mode afterwards. Accuracy is
    measured over the samples that actually end up in a batch;
    ``get_batches`` drops any remainder that does not fill a batch.

    NOTE: the name shadows the builtin ``eval``; it is kept because other
    cells call it by this name.

    Args:
        data_samples: list of ``(text, label)`` tuples.
        model: module mapping a ``(batch_size, seq_len)`` id tensor to logits.
        batch_size: number of samples per evaluation batch.

    Returns:
        Accuracy as a float in ``[0, 100]``; ``0.0`` when no full batch
        could be formed.
    """
    data_batches = get_batches(data_samples, batch_size)
    evaluated = len(data_batches) * batch_size
    if evaluated == 0:
        return 0.0

    use_cuda = torch.cuda.is_available()
    was_training = model.training
    model.eval()  # disable dropout while measuring accuracy
    corrects = 0
    try:
        for input_var, label_var in data_batches:
            input_var = input_var.view(batch_size, -1)
            label_var = label_var.view(batch_size)

            if use_cuda:
                input_var, label_var = input_var.cuda(), label_var.cuda()

            predictions = model(input_var)
            predicted = torch.max(predictions, 1)[1].view(label_var.size())
            # int() normalises the count whether sum() returns a Python
            # number or a 0-dim tensor.
            corrects += int((predicted.data == label_var.data).sum())
    finally:
        # Give the model back in the mode the caller had it in.
        model.train(was_training)

    return 100.0 * corrects / evaluated

    

## Train

In [17]:
import os
import sys
import torch
import torch.autograd as autograd
import torch.nn.functional as F
# Use the second GPU only when it exists; otherwise keep the default device
# so the notebook still runs on CPU-only and single-GPU machines.
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    torch.cuda.set_device(1)

def train(train_samples, dev_samples, model,
          batch_size=32, lr=0.1, num_epochs=25,
          eval_every=20, report_every=5):
    """Train ``model`` with plain SGD and cross-entropy loss.

    The training batches are built once and reused for every epoch. Every
    ``eval_every`` optimisation steps the accuracy on the full training and
    dev samples is recomputed with batch size 1. A progress bar with the
    running mean loss and the latest accuracies is drawn during every
    ``report_every``-th epoch, starting with the first.

    Args:
        train_samples: list of ``(text, label)`` tuples used for training.
        dev_samples: list of ``(text, label)`` tuples used for validation.
        model: the network to optimise in place.
        batch_size: samples per training batch.
        lr: SGD learning rate.
        num_epochs: number of passes over the training batches.
        eval_every: steps between accuracy evaluations (must be positive).
        report_every: epoch interval at which progress is displayed
            (must be positive).
    """
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    train_batches = get_batches(train_samples, batch_size)
    use_cuda = torch.cuda.is_available()

    steps = 0
    # Loss summed over all steps so far (not reset per epoch); divided by
    # `steps` below to display a running mean.
    epoch_loss = 0.0
    train_acc = 0.0
    dev_acc = 0.0

    for epoch in range(1, num_epochs + 1):
        show_progress = (epoch - 1) % report_every == 0

        for bi, batch in enumerate(train_batches):
            input_var, label_var = batch[0], batch[1]

            if use_cuda:
                input_var, label_var = input_var.cuda(), label_var.cuda()

            optimizer.zero_grad()

            predictions = model(input_var.view(batch_size, -1))
            loss = F.cross_entropy(predictions, label_var.view(batch_size))
            loss.backward()
            optimizer.step()

            epoch_loss += loss.data[0]
            steps += 1

            if steps % eval_every == 0:
                train_acc = eval(train_samples, model, 1)
                dev_acc = eval(dev_samples, model, 1)

            if show_progress:
                drawProgressBar(True, 'epoch:%d, ' % (epoch), bi + 1, len(train_batches),
                                ' loss:%.4f train_acc:%.2f, dev_acc:%.2f' % (
                                    epoch_loss / float(steps),
                                    train_acc,
                                    dev_acc))
        if show_progress:
            print("\n")
            

# Model hyperparameters; the vocabulary size and class count come from the data.
params = {
    'embed_num': lang.n_words,
    'embed_dim': 200,
    'class_num': len(languages),
    'kernel_num': 100,
    'kernel_sizes': [3, 4, 5],
    'dropout': 0.5,
}
cnn = CNN_Text(params)

if torch.cuda.is_available():
    cnn = cnn.cuda()

# Train on the first 10000 training samples, validating on dev samples 100-199.
train(train_samples[:10000], dev_samples[100:200], cnn)






KeyboardInterrupt: 