In [1]:
import torch
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.datasets import fetch_20newsgroups

In [2]:
categories = ['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [3]:
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

print('total texts in train:',len(newsgroups_train.data))
print('total texts in test:',len(newsgroups_test.data))

total texts in train: 11314
total texts in test: 7532


In [4]:
vocab = Counter()

for text in newsgroups_train.data:
    for word in text.split(' '):
        vocab[word.lower()]+=1
        
for text in newsgroups_test.data:
    for word in text.split(' '):
        vocab[word.lower()]+=1
        
total_words = len(vocab)

def get_word_2_index(vocab):
    word2index = {}
    for i,word in enumerate(vocab):
        word2index[word.lower()] = i
        
    return word2index

word2index = get_word_2_index(vocab)

In [5]:
def get_batch(df,i,batch_size):
    batches = []
    results = []
    texts = df.data[i*batch_size:i*batch_size+batch_size]
    categories = df.target[i*batch_size:i*batch_size+batch_size]
    for text in texts:
        layer = np.zeros(total_words,dtype=float)
        for word in text.split(' '):
            layer[word2index[word.lower()]] += 1
            
        batches.append(layer)
        
    for category in categories:
        index_y = -1
        if category == 0:
            index_y = 0
        elif category == 1:
            index_y = 1
        else:
            index_y = 2
        results.append(index_y)
            
     
    return np.array(batches),np.array(results)

In [6]:
# Parameters
learning_rate = 0.01
num_epochs = 10
batch_size = 150
display_step = 1

# Network Parameters
hidden_size = 100      # 1st layer and 2nd layer number of features
input_size = total_words # Words in vocab
num_classes = 20        # Categories: graphics, sci.space and baseball

In [7]:
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

In [9]:
class OurNet(nn.Module):
     def __init__(self, input_size, hidden_size, num_classes):
        super(OurNet, self).__init__()
        self.layer_1 = nn.Linear(input_size,hidden_size, bias=True)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.output_layer = nn.Linear(hidden_size, num_classes, bias=True)
 
     def forward(self, x):
        out = self.layer_1(x)
        out = self.relu(out)
        out = self.layer_2(out)
        out = self.relu(out)
        out = self.output_layer(out)
        return out


In [42]:


# Loss and Optimizer
criterion = nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)  
valid_loss_min = np.inf

# Train the Model
for epoch in range(num_epochs):
    total_batch = int(len(newsgroups_train.data)/batch_size)
    # Loop over all batches
    net.train()
    for i in range(total_batch):
        batch_x,batch_y = get_batch(newsgroups_train,i,batch_size)
        articles = Variable(torch.FloatTensor(batch_x))
        labels = Variable(torch.LongTensor(batch_y))
        #print("articles",articles)
        #print(batch_x, labels)
        #print("size labels",labels.size())
        
        # Forward + Backward + Optimize
        optimizer.zero_grad()  # zero the gradient buffer
        outputs = net(articles)
        loss_train = criterion(outputs, labels)
        loss_train.backward()
        optimizer.step()
    
    total_batch = int(len(newsgroups_test.data)/batch_size)
    # Loop over all batches
    net.eval()
    for i in range(total_batch):
        batch_x,batch_y = get_batch(newsgroups_test,i,batch_size)
        articles = Variable(torch.FloatTensor(batch_x))
        labels = Variable(torch.LongTensor(batch_y))
        #print("articles",articles)
        #print(batch_x, labels)
        #print("size labels",labels.size())
        
        # Forward + Backward + Optimize
      # zero the gradient buffer
        outputs = net(articles)
        loss_test = criterion(outputs, labels)
        loss_test.backward()
        optimizer.step()
    print(loss_train, loss_test)
    
        
#         if (i+1) % 4 == 0:
#             print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' 
#                    %(epoch+1, num_epochs, i+1, len(newsgroups_train.data)//batch_size, loss.item()))
    if loss_test <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        loss_test))
        torch.save(net.state_dict(), 'wieghts.pt')
        valid_loss_min = loss_test

tensor(0.0812, grad_fn=<NllLossBackward>) tensor(2.5008, grad_fn=<NllLossBackward>)
Validation loss decreased (inf --> 2.500799).  Saving model ...
tensor(2.0129, grad_fn=<NllLossBackward>) tensor(1.3882, grad_fn=<NllLossBackward>)
Validation loss decreased (2.500799 --> 1.388152).  Saving model ...
tensor(5.0470, grad_fn=<NllLossBackward>) tensor(0.8424, grad_fn=<NllLossBackward>)
Validation loss decreased (1.388152 --> 0.842373).  Saving model ...
tensor(0.3011, grad_fn=<NllLossBackward>) tensor(1.4784, grad_fn=<NllLossBackward>)
tensor(0.4252, grad_fn=<NllLossBackward>) tensor(1.2162, grad_fn=<NllLossBackward>)
tensor(0.6616, grad_fn=<NllLossBackward>) tensor(0.9796, grad_fn=<NllLossBackward>)
tensor(0.4460, grad_fn=<NllLossBackward>) tensor(0.5684, grad_fn=<NllLossBackward>)
Validation loss decreased (0.842373 --> 0.568381).  Saving model ...
tensor(0.5125, grad_fn=<NllLossBackward>) tensor(0.8270, grad_fn=<NllLossBackward>)
tensor(0.7114, grad_fn=<NllLossBackward>) tensor(1.0414, 

In [10]:
net = OurNet(input_size, hidden_size, num_classes)
net.load_state_dict(torch.load('wieghts.pt'))

<All keys matched successfully>

In [16]:
correct = 0
total = 0
total_batch = int(len(newsgroups_test.data)/batch_size)
for i in range(total_batch):
    batch_x,batch_y = get_batch(newsgroups_test,i,batch_size)
    articles = Variable(torch.FloatTensor(batch_x))
    labels = Variable(torch.LongTensor(batch_y))
        #print("articles",articles)
        #print(batch_x, labels)
        #print("size labels",labels.size())
        
        # Forward + Backward + Optimize
      # zero the gradient buffer
    outputs = net(articles)
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum()
#     print(loss_train, loss_test)

In [17]:
print('Accuracy of the network on the 7532 test articles: %d %%' % (100 * correct / total))

Accuracy of the network on the 7532 test articles: 91 %


In [19]:
correct = 0
total = 0
total_batch = int(len(newsgroups_train.data)/batch_size)
for i in range(total_batch):
    batch_x,batch_y = get_batch(newsgroups_train,i,batch_size)
    articles = Variable(torch.FloatTensor(batch_x))
    labels = Variable(torch.LongTensor(batch_y))
        #print("articles",articles)
        #print(batch_x, labels)
        #print("size labels",labels.size())
        
        # Forward + Backward + Optimize
      # zero the gradient buffer
    outputs = net(articles)
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum()
#     print(loss_train, loss_test)
print('Accuracy of the network on the 11314 train articles: %d %%' % (100 * correct / total))

Accuracy of the network on the 11314 train articles: 90 %
