In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import re

from sklearn.datasets import fetch_20newsgroups

In [2]:
# Provide list of categories to consider
categories = ['alt.atheism',
              'comp.graphics', 
              'comp.os.ms-windows.misc',
              'comp.sys.ibm.pc.hardware',  
              'comp.sys.mac.hardware',
              'comp.windows.x', 
              'misc.forsale', 
              'rec.autos', 
              'rec.motorcycles', 
              'rec.sport.baseball', 
              'rec.sport.hockey', 
              'sci.crypt', 
              'sci.electronics', 
              'sci.med', 
              'sci.space', 
              'soc.religion.christian', 
              'talk.politics.guns', 
              'talk.politics.mideast', 
              'talk.politics.misc', 
              'talk.religion.misc']

# Training subset
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
# Testing subset
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

print('Total samples in training data:', len(newsgroups_train.data))
print('Total samples in testing data:', len(newsgroups_test.data))

Total samples in training data: 11314
Total samples in testing data: 7532


In [45]:
# glove_dir = './glove.6B.50d.txt'
# input_dim = 50
glove_dir = './glove.6B.300d.txt'
input_dim = 300

vocab = {}
with open(glove_dir, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        vocab[word] = vector
f.close()

print('Found %s word vectors.' %len(vocab))

Found 400000 word vectors.


In [47]:
def get_embeddings(text):
    embedding = np.zeros(input_dim, dtype=float)
    i = 0
    for word in text.split(' '):
        if word in vocab:
            i += 1
            embedding += vocab[word]
    
    if i != 0:
        embedding /= i
        
    return embedding

In [48]:
def clean(text):
    """ Function to clean the text """
    text = text.lower()
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"",texter)
    texter = re.sub('&#39;', "\"", texter)
    texter = re.sub('\n', " ", texter)
    texter = re.sub(' u '," you ", texter)
    texter = re.sub('`',"", texter)
    texter = re.sub(' +', ' ', texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub('\r', ' ',texter)
    
    texter = re.sub(r'[^\w\s]', '', texter)
    
    # Remove numbers from string
    texter = re.sub(pattern=r"[+-]?\d+(?:\.\d+)?", repl="", string=texter, count=0, flags=0)
    texter = texter.replace("  ", " ")
    clean = re.compile('<.*?>')
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    texter = re.sub(clean, '', texter)
    if texter == "":
        texter = ""
    return texter

In [49]:
dict_categories = {0: 0,
                   1: 1, 
                   2: 1,
                   3: 1,  
                   4: 1,
                   5: 1,
                   6: 2, 
                   7: 3, 
                   8: 3, 
                   9: 3, 
                   10: 3,
                   11: 4, 
                   12: 4, 
                   13: 4, 
                   14: 4,
                   15: 5, 
                   16: 6,
                   17: 6, 
                   18: 6, 
                   19: 6}

def transform(df):
    X = []
    y = []

    texts = df.data
    categories = df.target

    for text in texts:
        text = clean(text)

        X.append(get_embeddings(text))

    for category in categories:
        index_y = dict_categories[category]
        y.append(index_y)

    return np.array(X), np.array(y)

In [50]:
# Hyperparameters
num_epochs = 25
batch_size = 128
learning_rate = 0.001

In [51]:
class DNN(nn.Module):
    def __init__(self, input_dim, hidden_layers, hidden_dim, output_dim=20, dropout=0.5):
        super(DNN, self).__init__()
        self.hidden_layers = hidden_layers
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.hidden = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for i in range(hidden_layers)])
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
        # Define proportion or neurons to dropout
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        for layer in self.hidden:
            x = F.relu(layer(x))
            x = self.dropout(x)
        x = self.fc2(x)
#         x = F.softmax(self.fc2(x), dim=1)
        return x

In [52]:
# def build_random_dnn(input_dim, min_hidden_layer, max_hidden_layer, min_nodes, max_nodes, output_dim):

#     layer = list(range(min_hidden_layer,max_hidden_layer))
#     node = list(range(min_nodes, max_nodes))

#     num_layers = np.random.choice(layer)
#     num_nodes = np.random.choice(node)

#     dnn = DNN(input_dim, num_layers, num_nodes, output_dim)

#     return dnn

num_layers = 1
num_nodes = 300
output_dim = 7
dropout = 0.25

dnn = DNN(input_dim, num_layers, num_nodes, output_dim, dropout)
print(dnn)

DNN(
  (fc1): Linear(in_features=300, out_features=300, bias=True)
  (hidden): ModuleList(
    (0): Linear(in_features=300, out_features=300, bias=True)
  )
  (fc2): Linear(in_features=300, out_features=7, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)


In [53]:
# Loss function
criterion = nn.CrossEntropyLoss()

# create your optimizer
optimizer = optim.Adam(dnn.parameters(), lr=learning_rate)

In [54]:
X_train, y_train = transform(newsgroups_train)

## create iterator objects for train datasets
X_tr = torch.tensor(X_train, dtype=torch.float)
y_tr = torch.tensor(y_train, dtype=torch.long)
train = TensorDataset(X_tr, y_tr)
trainloader = DataLoader(train, batch_size=batch_size)

In [55]:
# Training
for epoch in range(num_epochs):
    running_loss = 0
    for i, data in enumerate(trainloader):
        inputs, labels = data

        outputs = dnn(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        optimizer.zero_grad()

#         running_loss += loss.item()
        
        if (i+1) % 50 == 0:
            print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                   %(epoch+1, num_epochs, i+1, len(newsgroups_train.data)//batch_size, loss.data))

Epoch [1/25], Step [50/88], Loss: 1.0967
Epoch [2/25], Step [50/88], Loss: 0.6972
Epoch [3/25], Step [50/88], Loss: 0.5364
Epoch [4/25], Step [50/88], Loss: 0.5077
Epoch [5/25], Step [50/88], Loss: 0.4848
Epoch [6/25], Step [50/88], Loss: 0.4654
Epoch [7/25], Step [50/88], Loss: 0.4400
Epoch [8/25], Step [50/88], Loss: 0.4335
Epoch [9/25], Step [50/88], Loss: 0.3915
Epoch [10/25], Step [50/88], Loss: 0.4017
Epoch [11/25], Step [50/88], Loss: 0.3890
Epoch [12/25], Step [50/88], Loss: 0.3727
Epoch [13/25], Step [50/88], Loss: 0.4005
Epoch [14/25], Step [50/88], Loss: 0.3499
Epoch [15/25], Step [50/88], Loss: 0.3593
Epoch [16/25], Step [50/88], Loss: 0.2990
Epoch [17/25], Step [50/88], Loss: 0.3142
Epoch [18/25], Step [50/88], Loss: 0.2686
Epoch [19/25], Step [50/88], Loss: 0.2959
Epoch [20/25], Step [50/88], Loss: 0.2816
Epoch [21/25], Step [50/88], Loss: 0.2889
Epoch [22/25], Step [50/88], Loss: 0.2628
Epoch [23/25], Step [50/88], Loss: 0.2781
Epoch [24/25], Step [50/88], Loss: 0.2591
E

In [56]:
X_test, y_test = transform(newsgroups_test)

## create iterator objects for valid datasets
X_te = torch.tensor(X_test, dtype=torch.float)
y_te = torch.tensor(y_test, dtype=torch.long)
test = TensorDataset(X_te, y_te)
testloader = DataLoader(test)

In [57]:
total, correct = 0, 0
with torch.no_grad():
    for data in testloader:
        inputs, labels = data

        outputs = dnn(inputs)

        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        
        correct += (predicted == labels).sum()

print(f'Accuracy of the model is: {100*correct/total:.2f}%')

Accuracy of the model is: 81.32%
