In [1]:
import numpy as np
import random

class DataReader:
    """Loads an encoded classification file and serves shuffled mini-batches.

    Every non-blank line of the file is split on ``'<fff>'``; the first field
    is parsed as the integer label and the last field as a whitespace-separated
    list of integer token ids.  All documents are shuffled once at load time
    and again at the start of every subsequent epoch.

    Attributes set by the constructor:
        x, y        numpy arrays of encoded documents / labels (same order).
        batch_size  number of documents per batch.
        n_doc       number of documents read.
        n_batch     number of batches per epoch (last one may be shorter).
        batch_id    index of the batch the next ``next_batch()`` call returns.
        epoch       number of completed passes over the data.

    Raises:
        ValueError: if ``batch_size`` is not positive, or a label / token id
            cannot be parsed as an integer.
    """
    def __init__(self,path,batch_size):
        if batch_size<=0:
            raise ValueError('batch_size must be a positive integer, got {}'.format(batch_size))
        with open(path,'r') as f:
            # Blank lines (e.g. a trailing newline artefact) carry no document.
            d_line=[line for line in f.read().splitlines() if line.strip()]
        self.batch_size=batch_size
        labels=[]
        vectors=[]
        for d in d_line:
            fea=d.split('<fff>')
            labels.append(int(fea[0]))
            vectors.append(np.array([int(number) for number in fea[-1].split()]))

        self.n_doc=len(d_line)
        self.n_batch=int(np.ceil(self.n_doc/batch_size))
        self.x=np.array(vectors)
        self.y=np.array(labels)
        self._shuffle()

        self.batch_id=0
        self.epoch=0
        # Set when an epoch has just finished; the reshuffle itself is delayed
        # until the next call so the batch being returned is never disturbed.
        self._reshuffle_pending=False

    def _shuffle(self):
        """Applies one random permutation to ``x`` and ``y`` together."""
        rd=random.sample(range(self.n_doc),self.n_doc)
        self.x=self.x[rd]
        self.y=self.y[rd]

    def next_batch(self):
        """Returns the next ``(x, y)`` batch and advances the reader.

        Every epoch serves batches ``0 .. n_batch-1`` of one fixed ordering, so
        each document is returned exactly once per epoch.  After the call,
        ``batch_id`` is the index of the batch that will be served next; it
        reads 0 immediately after the last batch of an epoch, at which point
        ``epoch`` has already been incremented.
        """
        if self._reshuffle_pending:
            self._shuffle()
            self._reshuffle_pending=False

        start=self.batch_id*self.batch_size
        end=start+self.batch_size
        batch=(self.x[start:end],self.y[start:end])

        self.batch_id+=1
        if self.batch_id>=self.n_batch:
            self.batch_id=0
            self.epoch+=1
            self._reshuffle_pending=True
        return batch

In [2]:
# Hard-coded locations of the encoded train/test sets and the vocabulary file.
path_train='/content/drive/My Drive/Datasets/trainencode.txt'
path_test='/content/drive/My Drive/Datasets/testencode.txt'
path_vocab='/content/drive/My Drive/Datasets/vocab.txt'

# vocab_size is the number of lines in the vocabulary file
# (presumably one entry per line -- confirm against the encoder).
with open(path_vocab,'r') as vocab_file:
    vocab_lines=vocab_file.read().splitlines()
vocab_size=len(vocab_lines)


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class MyLSTM(nn.Module):
    """Text classifier: embedding -> LSTMCell unrolled over time -> max over time -> linear.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_size: dimension of each token embedding (LSTM input size).
        hidden_size: dimension of the LSTM hidden and cell states.
        num_class: number of output classes.
        dropout: probability for the ``nn.Dropout`` module.  The module is
            constructed but is not applied anywhere in ``forward``.
    """
    def __init__(self,vocab_size,embed_size,hidden_size,num_class,dropout):
        super(MyLSTM,self).__init__()
        # Kept only for backward compatibility: forward() no longer reads it and
        # instead follows whatever device the parameters/inputs actually live on.
        self._device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self._num_class=num_class
        self._hidden_size=hidden_size
        self._embed_size=embed_size
        self._embed=nn.Embedding(vocab_size,embed_size)
        self._lstm=nn.LSTMCell(
            input_size=self._embed_size,
            hidden_size=self._hidden_size,
        )
        self._FC=nn.Linear(hidden_size,num_class)
        self._dropout=nn.Dropout(dropout)

    def forward(self,x):
        """Computes class scores for a batch of encoded sentences.

        Args:
            x: integer tensor of shape (batch_size, sentence_len) holding token
                ids; sentence_len must be at least 1 because the hidden states
                of all time steps are stacked and max-pooled.

        Returns:
            Tensor of shape (batch_size, num_class) with unnormalised scores.
        """
        batch_size,sentence_len=x.size()
        # (batch, time, embed) -> (time, batch, embed) so each step is one slice.
        embedded=self._embed(x).permute(1,0,2)
        # Allocate the initial states next to the embeddings (same device and
        # dtype) instead of on a hard-coded GPU.
        h=embedded.new_zeros(batch_size,self._hidden_size)
        c=embedded.new_zeros(batch_size,self._hidden_size)
        hs=[]
        for step in range(sentence_len):
            h,c=self._lstm(embedded[step],(h,c))
            hs.append(h)
        # Element-wise maximum of the hidden state over all time steps.
        pooled,_=torch.max(torch.stack(hs,0),0)
        return self._FC(pooled)


class RNN_text_classification:
    """Trains a ``MyLSTM`` classifier on files read by ``DataReader`` and reports accuracy.

    Args:
        vocab_size: size of the embedding table passed to ``MyLSTM``.
        embed_size: embedding dimension passed to ``MyLSTM``.
        batch_size: number of documents per training / evaluation batch.
        num_class: number of target classes.
        hidden_size: LSTM hidden size passed to ``MyLSTM``.
        lr: learning rate for the Adam optimizer.
        dropout: dropout probability passed to ``MyLSTM``.
    """
    def __init__(self,vocab_size,embed_size,batch_size,num_class,hidden_size,lr=0.001,dropout=0.5):
        self._vocab_size=vocab_size
        self._batch_size=batch_size
        self._num_class=num_class
        self._embed_size=embed_size
        self._hidden_size=hidden_size
        self._lr=lr
        self._dropout=dropout
        self._data_reader=None
        self._data_test=None
        self._labels=None
        self._net=None
        # Public alias of the network; it points at the same object as _net
        # once train_and_pre() has built the model.
        self.net=None
        # path -> DataReader, so score() does not re-parse a file on every call.
        self._readers={}
        # Fall back to the CPU when no GPU is present instead of failing.
        self._device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    def _to_tensor(self,array):
        """Converts a numpy integer array to an int64 tensor on the working device."""
        return torch.from_numpy(array).long().to(self._device)

    def _get_reader(self,path):
        """Returns the cached ``DataReader`` for ``path``, loading the file on first use."""
        reader=self._readers.get(path)
        if reader is None:
            reader=DataReader(path,self._batch_size)
            self._readers[path]=reader
        return reader

    def train_and_pre(self,path_train,path_test,max_iter):
        """Builds the network and runs ``max_iter`` optimisation steps.

        Train/test accuracy is printed each time the training reader's
        ``batch_id`` reads 0, and the current loss each time it is a multiple
        of 20.

        Raises:
            ValueError: if the training file contains no documents.
        """
        # load data
        self._data_reader=DataReader(path_train,self._batch_size)
        self._data_test=DataReader(path_test,self._batch_size)
        self._readers={path_train:self._data_reader,path_test:self._data_test}
        if self._data_reader.n_doc==0:
            raise ValueError('no training documents found in {}'.format(path_train))
        # build net
        self._net=MyLSTM(
            vocab_size=self._vocab_size,
            embed_size=self._embed_size,
            hidden_size=self._hidden_size,
            num_class=self._num_class,
            dropout=self._dropout
        )
        self.net=self._net
        self._net.to(self._device)
        # loss function and optimizer
        self._criterion=nn.CrossEntropyLoss()
        self._optimizer=torch.optim.Adam(self._net.parameters(),self._lr)
        self._net.train() # train state
        for _ in range(max_iter):
            # read batch and convert data to tensor
            train_data,train_labels=self._data_reader.next_batch()
            train_data=self._to_tensor(train_data)
            train_labels=self._to_tensor(train_labels)
            # one optimisation step: clear gradients, forward, loss, backward, update
            self._optimizer.zero_grad()
            labels_pre=self._net(train_data)
            loss=self._criterion(labels_pre,train_labels)
            loss.backward()
            self._optimizer.step()

            if(self._data_reader.batch_id==0):
                print('Acc train = ',self.score(path_train).cpu().numpy(),
                      ' Acc test = ',self.score(path_test).cpu().numpy()
                )

            if(self._data_reader.batch_id%20==0):
                print("epoch {}, step {} loss = {}".format(
                    self._data_reader.epoch,self._data_reader.batch_id,loss.item()))

    def score(self,path_test):
        """Returns the accuracy (in percent) of the current network on a file.

        Every document of the file is evaluated exactly once by slicing the
        reader's arrays directly.  The network is switched to eval mode for the
        duration of the call and its previous train/eval mode is restored
        afterwards, so calling this in the middle of training does not leave
        the model in eval mode.

        Args:
            path_test: path of an encoded file; the parsed file is cached, so
                later changes to the file on disk are not picked up.

        Returns:
            0-dim float tensor on the working device.

        Raises:
            RuntimeError: if no network has been built yet.
            ValueError: if the file contains no documents.
        """
        if self._net is None:
            raise RuntimeError('score() called before train_and_pre() built the network')
        reader=self._get_reader(path_test)
        if reader.n_doc==0:
            raise ValueError('no documents found in {}'.format(path_test))

        was_training=self._net.training
        self._net.eval() # test state
        num_pre_true=torch.zeros((),dtype=torch.long,device=self._device)
        try:
            # No gradients are needed for evaluation.
            with torch.no_grad():
                for start in range(0,reader.n_doc,self._batch_size):
                    end=start+self._batch_size
                    batch_test_data=self._to_tensor(reader.x[start:end])
                    batch_test_labels=self._to_tensor(reader.y[start:end])
                    predicted=self._net(batch_test_data).argmax(dim=1)
                    num_pre_true+=(predicted==batch_test_labels).sum()
        finally:
            self._net.train(was_training)

        return num_pre_true*100.0/reader.n_doc

In [None]:
import time
# Time the whole training run with the wall clock.
start_time=time.time()
model=RNN_text_classification(
    vocab_size=vocab_size,
    embed_size=100,
    batch_size=50,
    num_class=20,
    hidden_size=50,
    lr=0.01,
    dropout=0,
)
model.train_and_pre(path_train,path_test,max_iter=5000)
print('time train= ',time.time()-start_time)

epoch 0, step 20 loss = 2.9239461421966553
epoch 0, step 40 loss = 2.8760311603546143
epoch 0, step 60 loss = 2.8160345554351807
epoch 0, step 80 loss = 2.5497121810913086
epoch 0, step 100 loss = 2.3964099884033203
epoch 0, step 120 loss = 2.1210341453552246
epoch 0, step 140 loss = 1.941153883934021
epoch 0, step 160 loss = 1.573564052581787
epoch 0, step 180 loss = 1.1989617347717285
epoch 0, step 200 loss = 1.3283618688583374
epoch 0, step 220 loss = 1.2352972030639648
Acc train =  73.78469  Acc test =  59.174187
epoch 1, step 0 loss = 0.7683546543121338
epoch 1, step 20 loss = 0.7651944160461426
epoch 1, step 40 loss = 0.8229925632476807
epoch 1, step 60 loss = 0.8404043316841125
epoch 1, step 80 loss = 0.8041346073150635
epoch 1, step 100 loss = 0.7095690965652466
epoch 1, step 120 loss = 0.6430947780609131
epoch 1, step 140 loss = 1.0236449241638184
epoch 1, step 160 loss = 0.7456151843070984
epoch 1, step 180 loss = 0.848235547542572
epoch 1, step 200 loss = 0.7775197625160217
