In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import re

from sklearn.datasets import fetch_20newsgroups

In [2]:
class DNN(nn.Module):
    def __init__(self, input_dim, hidden_layers, hidden_dim, output_dim=20, dropout=0.5):
        super().__init__()
        self.hidden_layers = hidden_layers
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.hidden = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for i in range(hidden_layers)])
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        for layer in self.hidden:
            x = F.relu(layer(x))
#         x = self.fc2(x)
        x = F.softmax(self.fc2(x), dim=1)
        return x

In [3]:
# def build_random_dnn(input_dim, min_hidden_layer, max_hidden_layer, min_nodes, max_nodes, output_dim):

#     layer = list(range(min_hidden_layer,max_hidden_layer))
#     node = list(range(min_nodes, max_nodes))

#     num_layers = np.random.choice(layer)
#     num_nodes = np.random.choice(node)

#     dnn = DNN(input_dim, num_layers, num_nodes, output_dim)

#     return dnn

dnn = DNN(50, 1, 100, 7)
print(dnn)

DNN(
  (fc1): Linear(in_features=50, out_features=100, bias=True)
  (hidden): ModuleList(
    (0): Linear(in_features=100, out_features=100, bias=True)
  )
  (fc2): Linear(in_features=100, out_features=7, bias=True)
)


In [16]:
# Hyperparameters
num_epochs = 10
batch_size = 64
dropout = 0.5
learning_rate = 0.001

In [5]:
# Loss function
criterion = nn.CrossEntropyLoss()

# create your optimizer
optimizer = optim.Adam(dnn.parameters(), lr=learning_rate)

In [6]:
# Provide list of categories to consider
categories = ['alt.atheism',
              'comp.graphics', 
              'comp.os.ms-windows.misc',
              'comp.sys.ibm.pc.hardware',  
              'comp.sys.mac.hardware',
              'comp.windows.x', 
              'misc.forsale', 
              'rec.autos', 
              'rec.motorcycles', 
              'rec.sport.baseball', 
              'rec.sport.hockey', 
              'sci.crypt', 
              'sci.electronics', 
              'sci.med', 
              'sci.space', 
              'soc.religion.christian', 
              'talk.politics.guns', 
              'talk.politics.mideast', 
              'talk.politics.misc', 
              'talk.religion.misc']

# Training subset
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
# Testing subset
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

print('Total samples in training data:',len(newsgroups_train.data))
print('Total samples in testing data:',len(newsgroups_test.data))

Total samples in training data: 11314
Total samples in testing data: 7532


In [7]:
glove_dir = './glove.6B.50d.txt'

vocab = {}
with open(glove_dir, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        vocab[word] = vector
f.close()

print('Found %s word vectors.' %len(vocab))

Found 400000 word vectors.


In [8]:
def get_embeddings(text):
    embedding = [0]*50
    i = 0
    for word in text.split(' '):
        if word in vocab:
            i += 1
            embedding += vocab[word]
    
    if i != 0:
        embedding /= i
        
    return embedding

In [9]:
def clean(text):
    """ Function to clean the text """
    text = text.lower()
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"",texter)
    texter = re.sub('&#39;', "\"", texter)
    texter = re.sub('\n', " ", texter)
    texter = re.sub(' u '," you ", texter)
    texter = re.sub('`',"", texter)
    texter = re.sub(' +', ' ', texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub('\r', ' ',texter)
    
    
    texter = re.sub(r'[^\w\s]', '', texter)
    
    # Remove numbers from string
    texter = re.sub(pattern=r"[+-]?\d+(?:\.\d+)?", repl="", string=texter, count=0, flags=0)
    texter = texter.replace("  ", " ")
    clean = re.compile('<.*?>')
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    texter = re.sub(clean, '', texter)
    if texter == "":
        texter = ""
    return texter

In [10]:
def transform(X):
    embeddings = []
    for item in X:
        item = clean(item)
        embedding = get_embeddings(item)
        embeddings.append(embedding)
    
    return embeddings

In [11]:
dict_categories = {0: 0,
                   1: 1, 
                   2: 1,
                   3: 1,  
                   4: 1,
                   5: 1,
                   6: 2, 
                   7: 3, 
                   8: 3, 
                   9: 3, 
                   10: 3,
                   11: 4, 
                   12: 4, 
                   13: 4, 
                   14: 4,
                   15: 5, 
                   16: 6,
                   17: 6, 
                   18: 6, 
                   19: 6}

def transform_labels(y):
    results = []
    for item in y:
        results.append(dict_categories[item])
        
    return results

In [17]:
X_train = newsgroups_train.data
y_train = newsgroups_train.target

## Add steps for preprocessing and embeddings
X_train = np.array(transform(X_train))
y_train = transform_labels(y_train)

## create iterator objects for train datasets
X_tr = torch.tensor(X_train, dtype=torch.float)
y_tr = torch.tensor(y_train)
train = TensorDataset(X_tr, y_tr)
trainloader = DataLoader(train, batch_size=batch_size)

In [18]:
# Training
for epoch in range(num_epochs):
    running_loss = 0
    for data in trainloader:
        inputs, labels = data

        outputs = dnn(inputs)
        loss = criterion(outputs, labels)
        print('Loss: %.4f' %loss.data.item())

        loss.backward()
        optimizer.step()

        optimizer.zero_grad()

        running_loss += loss.item()

Loss: 1.4300
Loss: 1.5564
Loss: 1.4488
Loss: 1.5058
Loss: 1.4837
Loss: 1.4248
Loss: 1.5259
Loss: 1.5031
Loss: 1.4444
Loss: 1.5058
Loss: 1.3780
Loss: 1.5084
Loss: 1.4617
Loss: 1.5372
Loss: 1.4739
Loss: 1.5082
Loss: 1.4635
Loss: 1.4525
Loss: 1.5011
Loss: 1.5416
Loss: 1.3944
Loss: 1.3570
Loss: 1.4673
Loss: 1.4818
Loss: 1.4374
Loss: 1.5008
Loss: 1.4113
Loss: 1.4307
Loss: 1.3850
Loss: 1.4726
Loss: 1.4620
Loss: 1.5049
Loss: 1.4835
Loss: 1.4137
Loss: 1.5111
Loss: 1.4436
Loss: 1.5278
Loss: 1.4276
Loss: 1.4993
Loss: 1.4252
Loss: 1.3489
Loss: 1.4392
Loss: 1.4164
Loss: 1.4664
Loss: 1.4361
Loss: 1.4182
Loss: 1.3493
Loss: 1.5018
Loss: 1.4772
Loss: 1.5707
Loss: 1.4162
Loss: 1.4540
Loss: 1.3990
Loss: 1.4632
Loss: 1.4117
Loss: 1.4749
Loss: 1.4838
Loss: 1.4070
Loss: 1.5418
Loss: 1.5105
Loss: 1.5626
Loss: 1.4676
Loss: 1.4324
Loss: 1.3839
Loss: 1.4872
Loss: 1.4032
Loss: 1.4078
Loss: 1.3936
Loss: 1.4128
Loss: 1.4648
Loss: 1.4482
Loss: 1.5003
Loss: 1.4937
Loss: 1.4106
Loss: 1.4658
Loss: 1.3667
Loss: 1.5601

Loss: 1.3481
Loss: 1.4412
Loss: 1.4025
Loss: 1.4562
Loss: 1.4752
Loss: 1.4021
Loss: 1.3715
Loss: 1.5024
Loss: 1.4038
Loss: 1.4477
Loss: 1.4258
Loss: 1.4656
Loss: 1.3995
Loss: 1.5975
Loss: 1.4011
Loss: 1.3773
Loss: 1.4578
Loss: 1.4866
Loss: 1.4459
Loss: 1.4555
Loss: 1.4062
Loss: 1.4036
Loss: 1.4169
Loss: 1.4651
Loss: 1.4017
Loss: 1.4607
Loss: 1.4740
Loss: 1.4403
Loss: 1.4729
Loss: 1.5101
Loss: 1.4551
Loss: 1.4327
Loss: 1.4769
Loss: 1.4138
Loss: 1.4631
Loss: 1.4592
Loss: 1.4064
Loss: 1.5300
Loss: 1.3254
Loss: 1.4788
Loss: 1.4144
Loss: 1.4905
Loss: 1.4661
Loss: 1.3470
Loss: 1.4263
Loss: 1.4863
Loss: 1.4097
Loss: 1.5371
Loss: 1.4828
Loss: 1.4199
Loss: 1.4495
Loss: 1.5229
Loss: 1.4932
Loss: 1.3608
Loss: 1.4071
Loss: 1.4028
Loss: 1.3411
Loss: 1.5957
Loss: 1.4862
Loss: 1.4824
Loss: 1.4613
Loss: 1.5174
Loss: 1.4325
Loss: 1.4759
Loss: 1.4689
Loss: 1.3969
Loss: 1.4965
Loss: 1.4624
Loss: 1.4141
Loss: 1.5613
Loss: 1.4468
Loss: 1.5043
Loss: 1.4768
Loss: 1.4154
Loss: 1.5159
Loss: 1.4914
Loss: 1.4399

Loss: 1.4866
Loss: 1.4338
Loss: 1.3553
Loss: 1.3981
Loss: 1.4127
Loss: 1.4122
Loss: 1.4787
Loss: 1.4323
Loss: 1.3867
Loss: 1.4672
Loss: 1.5797
Loss: 1.4362
Loss: 1.5535
Loss: 1.4233
Loss: 1.4799
Loss: 1.4016
Loss: 1.4097
Loss: 1.3890
Loss: 1.4547
Loss: 1.3113
Loss: 1.3738
Loss: 1.3987
Loss: 1.5146
Loss: 1.5013
Loss: 1.5011
Loss: 1.3533
Loss: 1.4214
Loss: 1.3784
Loss: 1.4234
Loss: 1.4911
Loss: 1.4545
Loss: 1.4010
Loss: 1.3437
Loss: 1.4387
Loss: 1.4032
Loss: 1.4501
Loss: 1.4733
Loss: 1.3887
Loss: 1.3787
Loss: 1.5043
Loss: 1.4079
Loss: 1.4472
Loss: 1.4169
Loss: 1.4596
Loss: 1.3961
Loss: 1.5968
Loss: 1.4019
Loss: 1.3669
Loss: 1.4527
Loss: 1.4958
Loss: 1.4381
Loss: 1.4553
Loss: 1.4028
Loss: 1.4073
Loss: 1.3998
Loss: 1.4582
Loss: 1.3991
Loss: 1.4539
Loss: 1.4701
Loss: 1.4348
Loss: 1.4534
Loss: 1.5085
Loss: 1.4541
Loss: 1.4286
Loss: 1.4761
Loss: 1.4105
Loss: 1.4596
Loss: 1.4464
Loss: 1.3957
Loss: 1.5266
Loss: 1.3267
Loss: 1.4753
Loss: 1.4182
Loss: 1.4837
Loss: 1.4712
Loss: 1.3491
Loss: 1.3994

In [19]:
X_test = newsgroups_test.filenames
y_test = newsgroups_test.target

## Add steps for preprocessing and embeddings
X_test = np.array(transform(X_test))
y_test = transform_labels(y_test)

## create iterator objects for valid datasets
X_te = torch.tensor(X_test, dtype=torch.float)
y_te = torch.tensor(y_test)
test = TensorDataset(X_te, y_te)
testloader = DataLoader(test)

In [20]:
total, correct = 0, 0
with torch.no_grad():
    for data in testloader:
        inputs, labels = data

        outputs = dnn(inputs)

        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        
        correct += (predicted == labels).sum()

print(f'Accuracy of the model is: {100*correct/total:.2f}%')

Accuracy of the model is: 21.11%
