<a href="https://colab.research.google.com/github/NNDLProject/ARDL/blob/main/DNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [120]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import re

from sklearn.datasets import fetch_20newsgroups

In [166]:
class DNN(nn.Module):
    def __init__(self, input_dim, hidden_layers, hidden_dim, output_dim=20, dropout=0.5):
        super().__init__()
        self.hidden_layers = hidden_layers
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.hidden = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for i in range(hidden_layers)])
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        for layer in self.hidden:
            x = F.relu(layer(x))
        x = self.fc2(x)
#         x = F.softmax(self.fc2(x), dim=1)
        return x

In [168]:
# def build_random_dnn(input_dim, min_hidden_layer, max_hidden_layer, min_nodes, max_nodes, output_dim):

#     layer = list(range(min_hidden_layer,max_hidden_layer))
#     node = list(range(min_nodes, max_nodes))

#     num_layers = np.random.choice(layer)
#     num_nodes = np.random.choice(node)

#     dnn = DNN(input_dim, num_layers, num_nodes, output_dim)

#     return dnn

dnn = DNN(50, 1, 100, 20)
print(dnn)

DNN(
  (fc1): Linear(in_features=50, out_features=100, bias=True)
  (hidden): ModuleList(
    (0): Linear(in_features=100, out_features=100, bias=True)
  )
  (fc2): Linear(in_features=100, out_features=20, bias=True)
)


In [169]:
# Hyperparameters
num_epochs = 10
batch_size = 64
dropout = 0.5
learning_rate = 0.001

In [170]:
# Loss function
criterion = nn.CrossEntropyLoss()

# create your optimizer
optimizer = optim.Adam(dnn.parameters(), lr=learning_rate)

In [145]:
# Provide list of categories to consider
categories = ['alt.atheism',
              'comp.graphics', 
              'comp.os.ms-windows.misc',
              'comp.sys.ibm.pc.hardware',  
              'comp.sys.mac.hardware',
              'comp.windows.x', 
              'misc.forsale', 
              'rec.autos', 
              'rec.motorcycles', 
              'rec.sport.baseball', 
              'rec.sport.hockey', 
              'sci.crypt', 
              'sci.electronics', 
              'sci.med', 
              'sci.space', 
              'soc.religion.christian', 
              'talk.politics.guns', 
              'talk.politics.mideast', 
              'talk.politics.misc', 
              'talk.religion.misc']

# Dictionary for merging similar classes together
dict_categories = {0: 0,
                   1: 1, 
                   2: 1,
                   3: 1,  
                   4: 1,
                   5: 1,
                   6: 2, 
                   7: 3, 
                   8: 3, 
                   9: 3, 
                   10: 3,
                   11: 4, 
                   12: 4, 
                   13: 4, 
                   14: 4,
                   15: 5, 
                   16: 6,
                   17: 6, 
                   18: 6, 
                   19: 6}

# Training subset
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
# Testing subset
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

print('Total samples in training data:',len(newsgroups_train.data))
print('Total samples in testing data:',len(newsgroups_test.data))

Total samples in training data: 11314
Total samples in testing data: 7532


In [150]:
X_train = newsgroups_train.data
y_train = newsgroups_train.target

## Add steps for preprocessing and embeddings
X_train = transform(X_train)

## create iterator objects for train datasets
X_tr = torch.tensor(X_train, dtype=torch.float)
y_tr = torch.tensor(y_train)
train = TensorDataset(X_tr, y_tr)
trainloader = DataLoader(train, batch_size=batch_size)

<torch.utils.data.dataloader.DataLoader object at 0x0000029167522520>


In [171]:
# Training
for epoch in range(num_epochs):
    running_loss = 0
    for data in trainloader:
        inputs, labels = data

        outputs = dnn(inputs)
        loss = criterion(outputs, labels)
        print(loss.data)

        loss.backward()
        optimizer.step()

        optimizer.zero_grad()

        running_loss += loss.item()

tensor(3.0044)
tensor(2.9936)
tensor(3.0022)
tensor(3.0011)
tensor(2.9997)
tensor(3.0107)
tensor(2.9965)
tensor(2.9846)
tensor(2.9855)
tensor(2.9942)
tensor(3.0027)
tensor(2.9821)
tensor(2.9804)
tensor(3.0020)
tensor(2.9961)
tensor(2.9780)
tensor(2.9805)
tensor(2.9806)
tensor(2.9963)
tensor(2.9761)
tensor(2.9765)
tensor(2.9936)
tensor(2.9995)
tensor(2.9668)
tensor(2.9766)
tensor(2.9628)
tensor(2.9670)
tensor(2.9790)
tensor(2.9535)
tensor(2.9618)
tensor(2.9718)
tensor(2.9446)
tensor(2.9691)
tensor(2.9423)
tensor(2.9610)
tensor(2.9490)
tensor(2.9524)
tensor(2.9514)
tensor(2.9427)
tensor(2.9433)
tensor(2.9549)
tensor(2.9379)
tensor(2.9468)
tensor(2.9464)
tensor(2.9233)
tensor(2.9366)
tensor(2.9253)
tensor(2.9483)
tensor(2.9198)
tensor(2.9247)
tensor(2.9429)
tensor(2.9228)
tensor(2.8966)
tensor(2.8999)
tensor(2.8930)
tensor(2.8689)
tensor(2.8857)
tensor(2.8676)
tensor(2.9290)
tensor(2.8806)
tensor(2.8541)
tensor(2.8479)
tensor(2.8500)
tensor(2.8514)
tensor(2.8128)
tensor(2.8362)
tensor(2.7

tensor(1.4426)
tensor(1.5859)
tensor(1.3877)
tensor(1.4560)
tensor(1.7587)
tensor(1.4593)
tensor(1.5825)
tensor(1.3879)
tensor(1.6079)
tensor(1.2795)
tensor(1.7045)
tensor(1.6679)
tensor(1.6531)
tensor(1.4458)
tensor(1.4151)
tensor(1.4142)
tensor(1.4981)
tensor(1.8153)
tensor(1.5086)
tensor(1.6936)
tensor(1.2174)
tensor(1.6663)
tensor(1.6521)
tensor(1.4986)
tensor(1.6231)
tensor(1.5266)
tensor(1.3805)
tensor(1.5316)
tensor(1.5283)
tensor(1.4161)
tensor(1.4590)
tensor(1.2844)
tensor(1.8383)
tensor(1.4964)
tensor(1.6766)
tensor(1.4160)
tensor(1.5203)
tensor(1.5284)
tensor(1.5927)
tensor(1.4261)
tensor(1.3523)
tensor(1.6707)
tensor(1.4289)
tensor(1.3559)
tensor(1.8334)
tensor(1.3796)
tensor(1.4629)
tensor(1.6185)
tensor(1.4640)
tensor(1.7777)
tensor(1.5653)
tensor(1.4610)
tensor(1.5102)
tensor(1.4366)
tensor(1.4913)
tensor(1.3309)
tensor(1.3795)
tensor(1.6425)
tensor(1.5221)
tensor(1.4991)
tensor(1.4633)
tensor(1.6371)
tensor(1.3505)
tensor(1.4246)
tensor(1.3100)
tensor(1.5954)
tensor(1.6

tensor(1.2867)
tensor(1.4302)
tensor(1.2360)
tensor(1.1376)
tensor(1.4899)
tensor(1.2154)
tensor(1.1925)
tensor(1.5436)
tensor(1.2211)
tensor(1.3217)
tensor(1.4585)
tensor(1.2348)
tensor(1.5405)
tensor(1.4319)
tensor(1.3031)
tensor(1.3193)
tensor(1.2807)
tensor(1.2803)
tensor(1.1710)
tensor(1.2484)
tensor(1.4138)
tensor(1.3248)
tensor(1.3617)
tensor(1.3100)
tensor(1.5434)
tensor(1.2246)
tensor(1.2573)
tensor(1.1297)
tensor(1.4422)
tensor(1.4121)
tensor(1.1778)
tensor(1.1438)
tensor(1.3456)
tensor(1.1649)
tensor(1.3364)
tensor(1.2312)
tensor(1.4118)
tensor(1.4088)
tensor(1.4258)
tensor(1.1984)
tensor(1.3208)
tensor(1.2695)
tensor(1.4281)
tensor(1.2454)
tensor(1.3680)
tensor(1.1849)
tensor(1.1383)
tensor(1.2941)
tensor(1.4414)
tensor(1.3452)
tensor(1.2637)
tensor(1.1589)
tensor(1.3437)
tensor(1.2248)
tensor(1.0894)
tensor(1.4689)
tensor(1.4575)
tensor(1.2118)
tensor(1.2198)
tensor(1.2673)
tensor(1.5087)
tensor(1.1825)
tensor(1.1250)
tensor(1.3685)
tensor(1.0403)
tensor(1.4883)
tensor(1.6

tensor(1.4688)
tensor(1.1616)
tensor(1.1562)
tensor(1.0332)
tensor(1.3696)
tensor(1.3041)
tensor(1.0777)
tensor(1.0492)
tensor(1.2723)
tensor(0.9837)
tensor(1.2359)
tensor(1.1495)
tensor(1.3346)
tensor(1.3499)
tensor(1.3835)
tensor(1.1201)
tensor(1.2450)
tensor(1.1965)
tensor(1.3339)
tensor(1.1323)
tensor(1.2796)
tensor(1.0753)
tensor(1.1143)
tensor(1.1924)
tensor(1.3427)
tensor(1.2198)
tensor(1.1973)
tensor(1.0775)
tensor(1.2583)
tensor(1.1758)
tensor(0.9958)
tensor(1.4264)
tensor(1.3354)
tensor(1.1012)
tensor(1.1400)
tensor(1.1922)
tensor(1.3742)
tensor(1.1229)
tensor(1.0610)
tensor(1.3093)
tensor(0.9745)
tensor(1.4418)
tensor(1.5332)
tensor(1.0467)
tensor(1.2511)
tensor(1.2058)
tensor(1.2175)
tensor(1.2200)
tensor(1.4573)
tensor(1.4785)
tensor(1.1399)
tensor(1.1180)
tensor(1.2244)
tensor(1.0114)
tensor(1.2919)
tensor(1.2030)
tensor(1.3071)
tensor(1.2651)
tensor(1.3591)
tensor(1.1276)
tensor(1.1331)
tensor(1.3834)
tensor(1.1133)
tensor(1.1980)
tensor(1.2069)
tensor(1.2027)
tensor(1.0

In [163]:
X_test = newsgroups_test.filenames
y_test = newsgroups_test.target

## Add steps for preprocessing and embeddings
X_test = transform(X_test)

## create iterator objects for valid datasets
X_te = torch.tensor(X_test, dtype=torch.float)
y_te = torch.tensor(y_test)
test = TensorDataset(X_te, y_te)
testloader = DataLoader(test)

In [172]:
total, correct = 0, 0
with torch.no_grad():
    for data in testloader:
        inputs, labels = data

        outputs = dnn(inputs)

        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum()

print(f'Accuracy of the model is: {100*correct/total} %')

Accuracy of the model is: 5.284121036529541%


In [64]:
glove_dir = './glove.6B.50d.txt'

vocab = {}
with open(glove_dir, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        vocab[word] = vector
f.close()

print('Found %s word vectors.' %len(vocab))

Found 400000 word vectors.


In [102]:
def get_embeddings(text):
    embedding = [0]*50
    i = 0
    for word in text.split(' '):
        if word in vocab:
            i += 1
            embedding += vocab[word]
    
    if i != 0:
        embedding /= i
        
    return embedding

In [99]:
def clean(text):
    """ Function to clean the text """
    text = text.lower()
    text = text.replace(':', '')
    text = text.replace(',', '')
    text = text.replace('-', '')
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"",texter)
    texter = re.sub('&#39;', "\"", texter)
    texter = re.sub('\n', " ", texter)
    texter = re.sub(' u '," you ", texter)
    texter = re.sub('`',"", texter)
    texter = re.sub(' +', ' ', texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub('\r', ' ',texter)
    
    
    texter = re.sub(r'[^\w\s]', '', texter)
    
    # Remove numbers from string
    texter = re.sub(pattern=r"[+-]?\d+(?:\.\d+)?", repl="", string=texter, count=0, flags=0)
    texter = texter.replace("  ", " ")
    clean = re.compile('<.*?>')
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    texter = re.sub(clean, '', texter)
    if texter == "":
        texter = ""
    return texter

In [100]:
def transform(X):
    embeddings = []
    for item in X:
        item = clean(item)
        embedding = get_embeddings(item)
        embeddings.append(embedding)
    
    return embeddings