In [None]:
import numpy as np
import random

class DataReader:
    """Load an encoded corpus from a text file and serve it in shuffled mini-batches.

    Every non-blank line is split on the ``<fff>`` separator. The first field is
    parsed as the integer class label and the last field as a whitespace-separated
    list of integer token ids; any fields in between are ignored.

    Attributes:
        x: array of encoded documents in the current shuffled order.
        y: array of labels aligned with ``x``.
        batch_size: number of documents per batch.
        n_doc: number of documents loaded.
        n_batch: number of batches per epoch (the last one may be smaller).
        batch_id: index of the batch the next ``next_batch()`` call will return.
        epoch: number of complete passes over the data served so far.
    """

    def __init__(self,path,batch_size):
        """Read ``path``, parse every document and shuffle the corpus once.

        Args:
            path: text file with one ``<fff>``-separated document per line.
            batch_size: positive number of documents per batch.

        Raises:
            ValueError: if ``batch_size`` is not positive, or if a label/token
                cannot be parsed as an integer.
        """
        if batch_size<=0:
            raise ValueError('batch_size must be a positive integer')
        with open(path,'r') as f:
            d_line=f.read().splitlines()
        self.batch_size=batch_size

        vectors=[]
        labels=[]
        for d in d_line:
            # Tolerate stray blank lines instead of failing on int('').
            if not d.strip():
                continue
            fea=d.split('<fff>')
            labels.append(int(fea[0]))
            vectors.append(np.array([int(number) for number in fea[-1].split()]))

        # NOTE(review): assumes every document holds the same number of ids
        # (e.g. already padded) so that a rectangular array can be built -- confirm.
        self.x=np.array(vectors)
        self.y=np.array(labels)
        self.n_doc=len(labels)
        self.n_batch=int(np.ceil(self.n_doc/batch_size))
        self._shuffle()

        self.batch_id=0
        self.epoch=0

    def _shuffle(self):
        """Apply one random permutation to both ``x`` and ``y`` so they stay aligned."""
        rd=random.sample(range(self.n_doc),self.n_doc)
        self.x=self.x[rd]
        self.y=self.y[rd]

    def next_batch(self):
        """Return the next ``(documents, labels)`` batch.

        Batches are served in order starting from batch 0, so one epoch covers
        every document exactly once. After the last batch of an epoch has been
        taken, ``epoch`` is incremented, ``batch_id`` goes back to 0 and the
        corpus is reshuffled for the following pass.
        """
        start=self.batch_id*self.batch_size
        end=start+self.batch_size
        # Slice before advancing the cursor; advancing first would skip batch 0.
        batch_x,batch_y=self.x[start:end],self.y[start:end]

        self.batch_id+=1
        if self.batch_id>=self.n_batch:
            self.batch_id=0
            self.epoch+=1
            # Fancy indexing builds new arrays, so the slices returned below
            # keep pointing at the pre-shuffle data.
            self._shuffle()
        return batch_x,batch_y

In [None]:
# Locations of the encoded train/test corpora and of the vocabulary file.
path_train='/content/drive/My Drive/Datasets/trainencode.txt'
path_test='/content/drive/My Drive/Datasets/testencode.txt'
path_vocab='/content/drive/My Drive/Datasets/vocab.txt'

# The vocabulary size is taken to be the number of lines in the vocabulary file
# (presumably one token per line -- confirm against the encoder that wrote it).
with open(path_vocab,'r') as f:
    vocab_lines=f.read().splitlines()
vocab_size=len(vocab_lines)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ConvNet(nn.Module):
    """Single-width convolutional text classifier.

    Token ids are embedded, convolved with kernels that span the whole embedding
    width, max-pooled over the remaining (sequence) axis, regularised with
    dropout and projected to ``num_class`` logits.
    """

    def __init__(self,num_class,vocab_size,embed_size,kernel_height):
        """Create the layers.

        Args:
            num_class: number of output logits.
            vocab_size: number of rows in the embedding table.
            embed_size: width of each embedding vector.
            kernel_height: number of consecutive tokens covered by each kernel.
        """
        super().__init__()
        n_filters=500  # number of feature maps produced by the convolution
        self.embed=nn.Embedding(vocab_size,embed_size)
        # Each kernel is as wide as an embedding, so it only slides along the sequence.
        self.conv=nn.Conv2d(1,n_filters,kernel_size=(kernel_height,embed_size),stride=1)
        self.dropout=nn.Dropout(p=0.5)
        self.fc=nn.Linear(n_filters,num_class)

    def forward(self,x):
        """Map a batch of token ids ``[batch_size,sentence_len]`` to class logits."""
        # [batch_size,sentence_len,embed_size] -> [batch_size,1,sentence_len,embed_size]
        embedded=self.embed(x).unsqueeze(1)
        # Width collapses to 1 and is dropped: [batch_size,channel_size,heights_out]
        feature_maps=F.relu(self.conv(embedded)).squeeze(3)
        # Max over every position: [batch_size,channel_size]
        pooled=F.max_pool1d(feature_maps,feature_maps.size(2)).squeeze(2)
        return self.fc(self.dropout(pooled))


class CNN_text_classification:
    """Train and evaluate a ``ConvNet`` on corpora read through ``DataReader``.

    Attributes:
        _vocab_size, _embed_size, _num_class, _kernel_heights: network shape
            settings forwarded to ``ConvNet``.
        _batch_size: number of documents per batch for training and evaluation.
        _lr: learning rate handed to the Adam optimizer.
        _device: device the network and the batches are moved to.
        _data_reader: training ``DataReader`` (``None`` until ``fit`` runs).
        _labels: initialised to ``None``; not used by the methods of this class.
        _net: the network (``None`` until ``fit`` runs).
        test_datareader: reader created by the most recent ``score`` call.
    """

    def __init__(self,vocab_size,embed_size,batch_size,num_class
        ,kernel_heights,lr=0.001):
        """Store the hyper-parameters and pick the compute device.

        Args:
            vocab_size: number of rows of the embedding table.
            embed_size: width of each embedding vector.
            batch_size: number of documents per batch.
            num_class: number of target classes.
            kernel_heights: kernel height passed to ``ConvNet``.
            lr: learning rate for Adam.
        """
        self._vocab_size=vocab_size
        self._batch_size=batch_size
        self._num_class=num_class
        self._embed_size=embed_size
        self._kernel_heights=kernel_heights
        self._lr=lr
        # Use the first GPU when there is one; otherwise run on the CPU instead
        # of failing the first time a tensor is moved to the device.
        self._device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self._data_reader=None
        self._labels=None
        self._net=None

    @property
    def net(self):
        """Public read-only alias of the underlying network (``None`` before ``fit``)."""
        return self._net

    def fit(self,path_train,max_iter):
        """Train a freshly initialised network for ``max_iter`` mini-batch steps.

        Args:
            path_train: path of the encoded training corpus.
            max_iter: number of optimisation steps to run.
        """
        # read data with the batch size this model was configured with
        self._data_reader=DataReader(path_train,self._batch_size)
        self._net=ConvNet(self._num_class,self._vocab_size,self._embed_size,self._kernel_heights)
        # move the network to the selected device
        self._net.to(self._device)
        # loss function
        self._criterion=nn.CrossEntropyLoss()
        # optimizer Adam
        self._optimizer=torch.optim.Adam(self._net.parameters(),self._lr)
        # train state (enables dropout)
        self._net.train()
        it=0
        while it<max_iter:
            # read data
            train_data,train_labels=self._data_reader.next_batch()
            # convert data to int64 tensors: both the embedding lookup and the
            # cross-entropy targets are index tensors
            train_data=torch.from_numpy(train_data).long().to(self._device)
            train_labels=torch.from_numpy(train_labels).long().to(self._device)

            # clear gradient
            self._optimizer.zero_grad()

            # forward pass
            labels_pre=self._net(train_data)

            # calculate loss
            loss=self._criterion(labels_pre,train_labels)

            # calculate gradient
            loss.backward()

            # update parameters
            self._optimizer.step()

            if(self._data_reader.batch_id%50==0):
                print("epoch{}, step {} loss = {}".format(
                    self._data_reader.epoch,self._data_reader.batch_id,loss.item()))
            it+=1

    def score(self,path_test):
        """Return the accuracy (in percent) of the trained network on ``path_test``.

        Every document of the file is evaluated exactly once.

        Args:
            path_test: path of the encoded corpus to evaluate on.

        Returns:
            0-d NumPy array holding the percentage of correctly classified documents.

        Raises:
            RuntimeError: if called before ``fit``.
            ValueError: if the file contains no documents.
        """
        if self._net is None:
            raise RuntimeError('fit() must be called before score()')
        self._net.eval() # evaluation state (disables dropout)
        self.test_datareader=DataReader(path_test,self._batch_size) # read data
        test_x=self.test_datareader.x
        test_y=self.test_datareader.y
        n_doc=len(test_y)
        if n_doc==0:
            raise ValueError('no documents to score in {}'.format(path_test))

        num_pre_true=torch.zeros((),dtype=torch.long,device=self._device)
        # Walk over the reader's arrays directly rather than through its epoch
        # counter, so no document is skipped or counted twice.
        with torch.no_grad():
            for start in range(0,n_doc,self._batch_size):
                end=start+self._batch_size
                # convert data to tensor
                batch_test_data=torch.from_numpy(test_x[start:end]).long().to(self._device)
                batch_test_labels=torch.from_numpy(test_y[start:end]).long().to(self._device)
                pre_labels=self._net(batch_test_data).argmax(dim=1)
                num_pre_true+=(pre_labels==batch_test_labels).sum()

        return (num_pre_true*100.0/n_doc).cpu().numpy()

In [None]:
import time

# Train the classifier and report how long training took.
t=time.time()
model=CNN_text_classification(
    vocab_size=vocab_size+2,  # presumably two extra ids beyond the vocabulary (e.g. padding/unknown) -- confirm
    embed_size=100,
    batch_size=50,
    num_class=20,
    kernel_heights=3,
    lr=0.001,
)
model.fit(path_train,max_iter=6000)
elapsed=time.time()-t
print('time train= ',elapsed)

# Accuracy (in percent) on the training corpus and on the held-out corpus.
train_accuracy=model.score(path_train)
print('accuracy train= ',train_accuracy)
test_accuracy=model.score(path_test)
print('accuracy test= ',test_accuracy)

epoch0, step 50 loss = 3.1127166748046875
epoch0, step 100 loss = 2.8797240257263184
epoch0, step 150 loss = 2.7763733863830566
epoch0, step 200 loss = 2.7854814529418945
epoch1, step 0 loss = 2.6540956497192383
epoch1, step 50 loss = 2.1089468002319336
epoch1, step 100 loss = 2.09087872505188
epoch1, step 150 loss = 2.417496681213379
epoch1, step 200 loss = 1.8510149717330933
epoch2, step 0 loss = 1.6250605583190918
epoch2, step 50 loss = 1.8634411096572876
epoch2, step 100 loss = 1.3512171506881714
epoch2, step 150 loss = 1.9497559070587158
epoch2, step 200 loss = 1.3958373069763184
epoch3, step 0 loss = 1.0822604894638062
epoch3, step 50 loss = 0.9988605976104736
epoch3, step 100 loss = 1.4059709310531616
epoch3, step 150 loss = 1.381475806236267
epoch3, step 200 loss = 1.1934213638305664
epoch4, step 0 loss = 1.0425304174423218
epoch4, step 50 loss = 0.9299280643463135
epoch4, step 100 loss = 0.9522096514701843
epoch4, step 150 loss = 1.0394963026046753
epoch4, step 200 loss = 1.03