In [None]:
# Make necessary imports and seed every random number generator used by the
# notebook, so that a fresh "Restart & Run All" reproduces the same numbers.
import random

import numpy as np
import torch

random_state = 0
random.seed(random_state)
np.random.seed(random_state)
# PyTorch has its own generator (weight initialisation, torch.randn, ...);
# seeding `random` and NumPy does not affect it.
torch.manual_seed(random_state)

import torch
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.datasets import fetch_20newsgroups 

In [None]:
# Newsgroups to load: twenty names, written one topic family per line
categories = [
    'alt.atheism',
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x',
    'misc.forsale',
    'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey',
    'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
    'soc.religion.christian',
    'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc',
    'talk.religion.misc',
]

# Load the training and the testing subset, restricted to the list above
newsgroups_train = fetch_20newsgroups(categories=categories, subset='train')
newsgroups_test = fetch_20newsgroups(categories=categories, subset='test')

# Report how many documents each subset holds
print('Total samples in training data:', len(newsgroups_train.data))
print('Total samples in testing data:', len(newsgroups_test.data))

Total samples in training data: 11314
Total samples in testing data: 7532


In [None]:
# Distinct integer label ids that occur in the training subset
print("Unique data targets: ", np.unique(newsgroups_train.target))

# Newsgroup names as reported by the dataset object
print("Unique data target names: \n", newsgroups_train.target_names)

Unique data targets:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
Unique data target names: 
 ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [None]:
# Assign a column index to every vocabulary word
def get_word_2_index(vocab):
    """Return a dict mapping each lower-cased word of ``vocab`` to its position.

    Positions follow the iteration order of ``vocab``. When two entries
    differ only by case, the position of the later one is kept.
    """
    return {token.lower(): position for position, token in enumerate(vocab)}

def get_embeddings_using_bow(newsgroups_train, newsgroups_test):
    """Count the words of both subsets and index the resulting vocabulary.

    Every document in ``.data`` of the two arguments is split on single
    space characters and each token is lower-cased before being counted.
    The vocabulary size is printed as a side effect.

    Returns:
        tuple: ``(vocab, word2index)`` where ``vocab`` is a ``Counter`` of
        token frequencies and ``word2index`` is the mapping produced by
        ``get_word_2_index(vocab)``.
    """
    vocab = Counter()

    # Training documents are counted first, then the testing documents
    for subset in (newsgroups_train, newsgroups_test):
        for document in subset.data:
            vocab.update(token.lower() for token in document.split(' '))

    print("Vocabulary size [Bag-of-words]: ", len(vocab))

    return vocab, get_word_2_index(vocab)

# Dictionary to map classes to numbers
# dict_categories = {'alt.atheism': 0,
#                    'comp.graphics': 1, 
#                    'comp.os.ms-windows.misc': 1,
#                    'comp.sys.ibm.pc.hardware': 1,  
#                    'comp.sys.mac.hardware': 1,
#                    'comp.windows.x': 1,
#                    'misc.forsale': 2, 
#                    'rec.autos': 3, 
#                    'rec.motorcycles': 3, 
#                    'rec.sport.baseball': 3, 
#                    'rec.sport.hockey': 3,
#                    'sci.crypt': 4, 
#                    'sci.electronics': 4, 
#                    'sci.med': 4, 
#                    'sci.space': 4,
#                    'soc.religion.christian': 5, 
#                    'talk.politics.guns': 6,
#                    'talk.politics.mideast': 6, 
#                    'talk.politics.misc': 6, 
#                    'talk.religion.misc': 6}

# Merge similar classes together: original label id -> merged class id
dict_categories = {
    0: 0,                            # alt.atheism
    1: 1, 2: 1, 3: 1, 4: 1, 5: 1,    # comp.*
    6: 2,                            # misc.forsale
    7: 3, 8: 3, 9: 3, 10: 3,         # rec.*
    11: 4, 12: 4, 13: 4, 14: 4,      # sci.*
    15: 5,                           # soc.religion.christian
    16: 6, 17: 6, 18: 6, 19: 6,      # talk.*
}

def get_batch(df, i, batch_size, vocab, word2index, category_map=None):
    """Build the bag-of-words matrix and the merged labels for batch ``i``.

    Args:
        df: Object exposing sliceable ``data`` (document strings) and
            ``target`` (label ids) attributes.
        i: Zero-based batch number; the batch covers the rows
            ``[i * batch_size, (i + 1) * batch_size)``.
        batch_size: Maximum number of documents in the batch.
        vocab: Sized container; only ``len(vocab)`` is used, as the number
            of feature columns.
        word2index: Mapping from lower-cased token to column index.
        category_map: Optional mapping from original label id to merged
            class id. Defaults to the module-level ``dict_categories``.

    Returns:
        tuple: ``(features, labels)``. ``features`` is a float array of
        shape ``(n_documents, len(vocab))`` holding token counts (an empty
        slice yields shape ``(0, len(vocab))``); ``labels`` is the array of
        merged class ids, one per document.

    Raises:
        KeyError: If a label of the batch is missing from ``category_map``.
    """
    if category_map is None:
        category_map = dict_categories

    start = i * batch_size
    stop = start + batch_size
    texts = df.data[start:stop]
    categories = df.target[start:stop]

    # Allocate the whole batch once instead of stacking per-document rows
    # afterwards, which would briefly hold the matrix in memory twice.
    features = np.zeros((len(texts), len(vocab)), dtype=float)
    for row, text in enumerate(texts):
        # Same tokenisation as the vocabulary builder: split on single spaces
        for word in text.split(' '):
            column = word2index.get(word.lower())
            # Tokens missing from the vocabulary are skipped rather than
            # aborting the batch with a KeyError.
            if column is not None:
                features[row, column] += 1

    labels = np.array([category_map[category] for category in categories])

    return features, labels

In [None]:
# Training parameters
learning_rate = 0.01
num_epochs = 10
batch_size = 150
display_step = 1

# Network parameters
hidden_size = 100      # Number of features in each of the two hidden layers

vocab, word2index = get_embeddings_using_bow(newsgroups_train, newsgroups_test)

input_size = len(vocab)  # One input feature per vocabulary word

# Number of output classes after merging. Derived from the mapping so that
# every merged class id is a valid output index (7 for the current mapping).
num_classes = max(dict_categories.values()) + 1

from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
class OurNet(nn.Module):
    """Feed-forward classifier: two ReLU hidden layers and a linear output."""

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the three linear layers.

        Args:
            input_size: Number of input features.
            hidden_size: Number of features of both hidden layers.
            num_classes: Number of output scores.
        """
        super().__init__()
        self.layer_1 = nn.Linear(input_size, hidden_size, bias=True)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.output_layer = nn.Linear(hidden_size, num_classes, bias=True)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for ``x``."""
        hidden_1 = self.relu(self.layer_1(x))
        hidden_2 = self.relu(self.layer_2(hidden_1))
        return self.output_layer(hidden_2)
# Sanity check of nn.CrossEntropyLoss on random data:
#   input  -> raw class scores, shape [batch_size, n_classes]
#   target -> one class index per item, shape [batch_size]
# NOTE: `input` shadows the Python builtin; the name is kept in case other
# cells reference it.
loss = nn.CrossEntropyLoss()
input = torch.randn(2, 7, requires_grad=True)
# Take both sizes from the tensor so the message always matches the data
print(">>> batch of size %d and %d possible classes" % tuple(input.shape))
print(input)
target = torch.empty(2, dtype=torch.long).random_(7)
print(">>> array of size 'batch_size' with the index of the maxium label for each item")
print(target)
output = loss(input, target)
# Back-propagate to confirm that gradients reach the scores
output.backward()

Vocabulary size [Bag-of-words]:  591946
>>> batch of size 2 and 5 possible classes
tensor([[-1.3295,  1.3881, -0.2037,  1.4983,  0.6790,  0.7792, -0.2432],
        [-0.2431,  0.2079, -1.2231,  0.4054, -0.0385, -0.9551, -0.1980]],
       requires_grad=True)
>>> array of size 'batch_size' with the index of the maxium label for each item
tensor([0, 3])


In [None]:
net = OurNet(input_size, hidden_size, num_classes)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

# Number of full batches per epoch, computed once. Integer division means a
# trailing partial batch (fewer than `batch_size` documents) is not visited.
total_batch = len(newsgroups_train.data) // batch_size
log_every = 4  # Print the loss every `log_every` steps

# Train the model
for epoch in range(num_epochs):
    # Loop over all batches
    for i in range(total_batch):
        batch_x, batch_y = get_batch(newsgroups_train, i, batch_size, vocab, word2index)
        # Plain tensors take part in autograd directly; no Variable wrapper needed
        articles = torch.FloatTensor(batch_x)
        labels = torch.LongTensor(batch_y)

        # Forward + backward + optimize
        optimizer.zero_grad()  # Zero the gradient buffers
        outputs = net(articles)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i + 1) % log_every == 0:
            # .item() extracts the scalar loss as a Python float
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, total_batch, loss.item()))


        

Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Epoch [1/10], Step [4/75], Loss: 2.0522
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Epoch [1/10], Step [8/75], Loss: 1.6054
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Epoch [1/10], Step [12/75], Loss: 1.1112
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Epoch [1/10], Step [16/75], Loss: 0.9405
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Epoch [1/10], Step [20/75], Loss: 0.4541
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Epoch [1/10], Step [24/75], Loss: 0.6930
Check:  (150, 591946) (150,)
Check:  (150, 591946) (150,)
Ch