In [43]:
from nltk.corpus      import stopwords
from nltk.stem        import WordNetLemmatizer
from collections      import Counter
from torch.utils.data import DataLoader

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from gensim.parsing.preprocessing import (
    preprocess_string, 
    strip_tags, 
    strip_punctuation, 
    remove_stopwords, 
    strip_numeric, 
    strip_non_alphanum
)

import torch
import torch.nn as nn

### Hyper Paramenters:
    1) Sequence Length

In [2]:
# !pip install nltk
# !pip install gensim
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip

## Class for Pre-Processing the News Text

In [3]:
class PreprocessNews:
    """
    A Class to clean the news text data
    """    
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.filters = [
            strip_tags,
            strip_numeric,
            strip_punctuation,
            strip_non_alphanum,
            lambda x: x.lower(),
            remove_stopwords
        ]
        
    def __call__(self, sentence):
        processed_sentence = self.clean(sentence)
        return processed_sentence
    
    def clean(self, sentence):
        clean_words = [ 
            self.wnl.lemmatize(word, 'v') 
            for word in preprocess_string(sentence, self.filters) 
        ]
        return " ".join(clean_words)

## Utililty Functions

In [4]:
def get_tag_index(tags):
    """
    A Fucntion that returns a 'tag to int' dictionary containig every unique tag
    """            
    all_tags = {
        t
        for tag in tags
        for t in tag.split(',')
    }
    return { tag: idx for idx, tag in enumerate(all_tags)  }

def get_tags_vector(tags):
    """
    A Function that converts a list of tags to a vector representation
    """    
    tag_index = get_tag_index(all_tags)
    tag_vector = np.zeros((len(tag_index),), dtype=int)
    for tag in tags.split(','):
        tag_vector[tag_index[tag]] = 1
        
    return tag_vector

In [5]:
def get_all_words(news):
    """
    A Function that returns a set of all unique words in the news Text
    """    
    all_words = set()
    for each_news in news:
        for word in each_news.split():
            all_words.add(word)
    
    return all_words

def tokenize(news, vocab_to_int):
    """
    A Function that converts words in text news into integers based on the vocab_to_int dictionary 
    """    
    tokenized_news = list()
    for each_news in news:
        temp = list()
        for word in each_news.split():
            temp.append(vocab_to_int[word])
        tokenized_news.append(np.array(temp))
        
    return np.array(tokenized_news)

def pad(news, required_length):
    """
    A Function that makes all news examples of required_length, padding the smaller
    and truncates the larger ones, from the end
    """    
    padded_news = list()
    for i, each_news in enumerate(news):
        
        each_news_len = len(each_news)
        if (required_length >= each_news_len):
            zeros   = np.zeros(required_length-each_news_len) 
            padded_news.append(np.concatenate([each_news, zeros]))
        else:
            padded_news.append(each_news[:required_length])
            pass
            
    return np.array(padded_news)

In [6]:
def create_embedding_matrix(vocab_to_int, embedding_dict, dimension):
    
    embedding_matrix = np.zeros((len(vocab_to_int)+1, dimension))

    for word, index in vocab_to_int.items():
        if word in embedding_dict:
            embedding_matrix[index] = embedding_dict[word]
        
    return embedding_matrix

In [7]:
classes = {
    0: 'Barely True',
    1: 'False '     ,
    2: 'Half True'  ,
    3: 'Mostly True',
    4: 'True'       ,
    5: 'Not Known'  ,
}

## Loading data

In [8]:
path = "./archive/train.csv"

In [9]:
df     = pd.read_csv(path)
df.dropna(inplace=True)

labels   = df['Labels'].to_numpy()
news     = df['Text'].apply(PreprocessNews()).to_numpy()
all_tags = df['Text_Tag'].to_numpy()

## Tokenizing and Padding News Text

In [10]:
all_words    = get_all_words(news)
count_words  = Counter(all_words)
total_words  = len(all_words)
sorted_words = count_words.most_common(total_words)

#We are starting our vocab_to_int conversion dictionary from 1 because we will use '0' for padding
vocab_to_int = { word:i+1 for i,(word, count) in enumerate(sorted_words) }

tokenized_news = tokenize(news, vocab_to_int)

In [11]:
#Example of Tokenized News
tokenized_news[8080]

array([1976, 6901, 7764, 5202,  941, 5398, 8150, 2844, 8453])

In [12]:
#Getting some statistics from the news data to find a suitable sequence_length
news_len = [ len(each_news) for each_news in tokenized_news ]

min_len  = np.min (news_len)
max_len  = np.max (news_len)
avg_len  = np.mean(news_len)
len_std  = np.std (news_len)
print(
        f"Minimum News Length: {min_len}\n" +
        f"Maximum News Length: {max_len}\n" +
        f"Average News Length: {avg_len}\n" +
        f"Standard Deviation of Length: {len_std}"
     )

Minimum News Length: 1
Maximum News Length: 316
Average News Length: 9.840398515335027
Standard Deviation of Length: 5.633679829817566


In [13]:
sequence_length = int(avg_len+len_std)
print(f"Using Sequence Length: {sequence_length}")

Using Sequence Length: 15


In [14]:
padded_news = pad(tokenized_news, sequence_length)

In [15]:
#Example of Padded Tokenized News
padded_news[8080]

array([1976., 6901., 7764., 5202.,  941., 5398., 8150., 2844., 8453.,
          0.,    0.,    0.,    0.,    0.,    0.])

## Creating the Embedding Matrix

In [16]:
glove           = pd.read_csv('./glove.6b/glove.6b.50d.txt', sep=" ", quoting=3, header=None, index_col=0)
glove_embedding = { key: val.values for key, val in glove.T.items() }

In [17]:
embedding_dimension = 50 
embedding_matrix = create_embedding_matrix(vocab_to_int, glove_embedding, embedding_dimension)

In [18]:
#Example of Embedding
word = 'selective'
embedding_matrix[vocab_to_int[word]]

array([-0.46358 , -1.0009  , -0.43789 , -0.65805 , -0.25003 , -0.16574 ,
        0.68172 , -0.62092 ,  0.13461 ,  1.7284  ,  0.13523 , -0.2918  ,
        0.78272 , -0.39106 , -0.52691 ,  0.3621  , -0.43799 , -0.30998 ,
       -0.02848 , -0.065929,  0.54238 , -0.91495 , -0.053648,  0.18225 ,
       -0.83547 , -0.78016 ,  0.36905 , -0.06426 ,  0.14134 , -0.31794 ,
        1.8934  , -0.29255 ,  0.33738 , -1.2918  ,  0.59186 , -0.3477  ,
        0.48815 , -0.37124 , -0.46158 , -0.37741 , -0.58314 , -0.64557 ,
       -0.32752 ,  1.3523  ,  0.97032 , -0.87089 ,  0.89687 ,  0.6589  ,
        0.68262 ,  0.65271 ])

## Creating the Model

In [19]:
vocab_size  = embedding_matrix.shape[0]
vector_size = embedding_matrix.shape[1]

In [27]:
num_tags = len(get_tag_index(all_tags))

In [29]:
class LSTMNetwork(nn.Module):
    
    #constructor
    def __init__(self, hidden_1, hidden_2, hidden_3, num_layers, bidirectional, output_dim, p):
        super(NeuralNetwork, self).__init__()
        
        #Embedding Layer
        self.embedding        = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vector_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        
        #Recurrent Layers
        self.lstm = nn.LSTM(
                            input_size   =embedding_dimension, 
                            hidden_size  =hidden_1,
                            num_layers   =num_layers, 
                            bidirectional=False,
                            batch_first  =True,
                            dropout      =p
                           )
        
        #Fully Connected Layers
        self.linear1 = nn.Linear(hidden_1+num_tags, hidden_2)
        self.linear2 = nn.Linear(hidden_2, hidden_3)
        self.linear2 = nn.Linear(hidden_3, output_dim)
        self.relu    = nn.ReLU()
        
    
    #forward pass
    def forward(self, text, tags):
        embeddings = self.embeddings(text)
        x1         = self.lstm()(embeddings)
        print(type)
        return outputs

In [30]:
embedding   = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vector_size)

In [32]:
padded_news[0].shape


(15,)

In [41]:
embedding(torch.LongTensor(padded_news[0])).shape

torch.Size([15, 50])

In [66]:
len(padded_news)

10238

In [69]:
split_ratio = 0.8

train_news      = padded_news[:int(len(padded_news)*split_ratio)] 
validation_news = padded_news[int(len(padded_news)*split_ratio):]
train_news.shape

(8190, 15)

In [73]:
batch1 = train_news[:32]
batch1 = embedding(torch.LongTensor(batch1))
batch1.shape

torch.Size([32, 15, 50])

In [55]:
batch_size = 16
#Creating the DataLoadres
train_loader      = DataLoader(dataset=train_news, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(dataset=validation_news, batch_size=batch_size,   shuffle=True)

In [56]:
np.expand_dims(train_loader, 0)

array([<torch.utils.data.dataloader.DataLoader object at 0x000002286330CA60>],
      dtype=object)

In [51]:
embedding(torch.LongTensor(train_loader)).shape

TypeError: new(): data must be a sequence (got DataLoader)

In [71]:
for i in train_loader:
    print(type(i))
    break

<class 'list'>
