In [1]:
# Relevant imports

# Miscellaneous
import numpy as np 
import pandas as pd

# To encode values
from sklearn.preprocessing import LabelEncoder
# Convert a collection of text documents to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer
# confusion matrix
from sklearn.metrics import confusion_matrix
# train test split
from sklearn.model_selection import train_test_split

# Deep learning
import torch

# Others
import os
import re

## Dataset Description

The file contains 202,372 records. Each json record contains following attributes:

-  `category`: Category article belongs to
-  `headline`: Headline of the article 
-  `authors`: Person authored the article
-  `link`: Link to the post
-  `short_description`: Short description of the article
-  `date`: Date the article was published

In [2]:
# load data
df = pd.read_json('dataset/News_Category_Dataset_v2.json', lines=True)
df.head(10)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26
5,ENTERTAINMENT,Morgan Freeman 'Devastated' That Sexual Harass...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,"""It is not right to equate horrific incidents ...",2018-05-26
6,ENTERTAINMENT,Donald Trump Is Lovin' New McDonald's Jingle I...,Ron Dicker,https://www.huffingtonpost.com/entry/donald-tr...,"It's catchy, all right.",2018-05-26
7,ENTERTAINMENT,What To Watch On Amazon Prime That’s New This ...,Todd Van Luling,https://www.huffingtonpost.com/entry/amazon-pr...,There's a great mini-series joining this week.,2018-05-26
8,ENTERTAINMENT,Mike Myers Reveals He'd 'Like To' Do A Fourth ...,Andy McDonald,https://www.huffingtonpost.com/entry/mike-myer...,"Myer's kids may be pushing for a new ""Powers"" ...",2018-05-26
9,ENTERTAINMENT,What To Watch On Hulu That’s New This Week,Todd Van Luling,https://www.huffingtonpost.com/entry/hulu-what...,You're getting a recent Academy Award-winning ...,2018-05-26


In [3]:
# Describe info of the dataset
df.describe()

  


Unnamed: 0,category,headline,authors,link,short_description,date
count,200853,200853,200853.0,200853,200853.0,200853
unique,41,199344,27993.0,200812,178353.0,2309
top,POLITICS,Sunday Roundup,,https://www.huffingtonpost.comhttp://stylelike...,,2013-01-17 00:00:00
freq,32739,90,36620.0,2,19712.0,100
first,,,,,,2012-01-28 00:00:00
last,,,,,,2018-05-26 00:00:00


In [4]:
# Checking for NaNs
df.isna().sum()

category             0
headline             0
authors              0
link                 0
short_description    0
date                 0
dtype: int64

In [5]:
#List unique values in the df['name'] column
df.category.unique()

array(['CRIME', 'ENTERTAINMENT', 'WORLD NEWS', 'IMPACT', 'POLITICS',
       'WEIRD NEWS', 'BLACK VOICES', 'WOMEN', 'COMEDY', 'QUEER VOICES',
       'SPORTS', 'BUSINESS', 'TRAVEL', 'MEDIA', 'TECH', 'RELIGION',
       'SCIENCE', 'LATINO VOICES', 'EDUCATION', 'COLLEGE', 'PARENTS',
       'ARTS & CULTURE', 'STYLE', 'GREEN', 'TASTE', 'HEALTHY LIVING',
       'THE WORLDPOST', 'GOOD NEWS', 'WORLDPOST', 'FIFTY', 'ARTS',
       'WELLNESS', 'PARENTING', 'HOME & LIVING', 'STYLE & BEAUTY',
       'DIVORCE', 'WEDDINGS', 'FOOD & DRINK', 'MONEY', 'ENVIRONMENT',
       'CULTURE & ARTS'], dtype=object)

In [6]:
df.iloc[7]["short_description"]

"There's a great mini-series joining this week."

In [7]:
news_articles = df[df['date'] >= pd.Timestamp(2017,5,5)]
news_articles.shape

(27487, 6)

In [8]:
#List unique values in the df['name'] column
news_articles.category.unique()

array(['CRIME', 'ENTERTAINMENT', 'WORLD NEWS', 'IMPACT', 'POLITICS',
       'WEIRD NEWS', 'BLACK VOICES', 'WOMEN', 'COMEDY', 'QUEER VOICES',
       'SPORTS', 'BUSINESS', 'TRAVEL', 'MEDIA', 'TECH', 'RELIGION',
       'SCIENCE', 'LATINO VOICES', 'EDUCATION', 'COLLEGE', 'PARENTS',
       'ARTS & CULTURE', 'STYLE', 'GREEN', 'TASTE', 'HEALTHY LIVING',
       'THE WORLDPOST', 'GOOD NEWS'], dtype=object)

In [9]:
# Filter headlines with less than 5 characters
news_articles = news_articles[news_articles['headline'].apply(lambda x: len(x.split())>5)]
news_articles.sort_values('headline',inplace=True, ascending=False)
duplicated_articles_series = news_articles.duplicated('headline', keep = False)
news_articles = news_articles[~duplicated_articles_series]
print("Total number of articles after removing duplicates:", news_articles.shape[0])

Total number of articles after removing duplicates: 26384


In [10]:
print("Total number of articles : ", news_articles.shape[0])
print("Total number of authors : ", news_articles["authors"].nunique())
print("Total number of unqiue categories : ", news_articles["category"].nunique())

Total number of articles :  26384
Total number of authors :  3990
Total number of unqiue categories :  28


In [11]:
# Do Barplot per category

https://www.kaggle.com/vikashrajluhaniwal/recommending-news-articles-based-on-read-articles

In [12]:
# Articles per month
news_articles_per_month = news_articles.resample('m',on = 'date')['headline'].count()
news_articles_per_month

date
2017-05-31    2198
2017-06-30    2474
2017-07-31    2329
2017-08-31    2434
2017-09-30    2171
2017-10-31    2248
2017-11-30    2111
2017-12-31    1934
2018-01-31    2065
2018-02-28    1694
2018-03-31    1778
2018-04-30    1580
2018-05-31    1368
Freq: M, Name: headline, dtype: int64

### Filtering Data

In [13]:
# Below libraries are for text processing using NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Below libraries are for feature representation using sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# remove stop words
stop_words = set(stopwords.words('english'))
# List of stopwords to filter
print(stop_words)

{'such', 'only', "needn't", 'here', "shan't", "don't", 's', 'most', 'these', 'again', 'o', 'under', 'whom', 'each', 'd', 'it', "you're", 'm', 'have', 'into', 'very', 'so', 'to', 'were', 'won', 'more', 'now', 'she', 'the', 'until', 'your', 'aren', 'ours', "weren't", 'just', 'theirs', "haven't", 'ourselves', 'ain', 'herself', 'when', 'own', 'because', 'i', 'shan', "hasn't", 'shouldn', 'who', 'below', 'no', 'itself', 't', 'some', 'why', 'with', 'can', "won't", 'himself', 'them', 'not', 'against', 'ma', 'didn', 'wasn', 'there', 'he', 'during', 'all', 'don', 'doing', 'mustn', 'after', 'as', 'at', 'from', "doesn't", "shouldn't", 'had', 'before', 'is', 'being', 'you', 'our', 'few', 'that', 'and', 'will', 'which', 'those', 'if', 'hadn', "mustn't", 'having', 'do', 'y', 'doesn', 'other', 'its', 'did', 'mightn', 'yourself', 'between', "hadn't", 'a', 'my', "wouldn't", 'his', 'what', 'her', 'does', "she's", 'but', 'both', 'same', 'too', "you'd", "aren't", "didn't", "it's", 'myself', 'any', 'yours',

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sherlock/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sherlock/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sherlock/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [15]:
# removing the stopwords from text
def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop_words:
            final_text.append(i.strip())
    return " ".join(final_text)

In [16]:
# removing non alphanumeric character
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)

In [17]:
#https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
# Lemmatize words
lemmatizer = WordNetLemmatizer()
def rem_lemmatizer(text):
    final_text = []
    string_l = ""
    for w in word_tokenize(text):
        string_l += lemmatizer.lemmatize(w, pos = "v") + " "
    final_text.append(string_l.strip())
    return " ".join(final_text)

In [18]:
news_articles.head()

Unnamed: 0,category,headline,authors,link,short_description,date
21194,HEALTHY LIVING,“To The Bone” Didn’t Teach Me Glamour. It Taug...,"Mycah Hazel, Contributorblogger, equal opportu...",https://www.huffingtonpost.com/entry/to-the-bo...,"Oftentimes, films or TV shows about eating dis...",2017-07-18
2932,QUEER VOICES,‘Will & Grace’ Creator To Donate Gay Bunny Boo...,Elyse Wanshel,https://www.huffingtonpost.com/entry/will-grac...,It's about to be a lot easier for kids in Mike...,2018-04-02
25186,WORLD NEWS,‘We Are the Same Blood’: The Invisible Lives O...,"Sara Hylton, Women & Girls Hub",https://www.huffingtonpost.com/entry/we-are-th...,"A girl from the Dalit village of Harirajpur, i...",2017-05-31
26210,POLITICS,‘WannaCry’ Ransomware Attack Raises Alarm Bell...,"Stateline, ContributorStateline provides daily...",https://www.huffingtonpost.com/entry/wannacry-...,While the recent global cyberattack has spared...,2017-05-19
20973,ENTERTAINMENT,‘Walking Dead’ Reportedly Cancels Comic-Con Pr...,Bill Bradley,https://www.huffingtonpost.com/entry/walking-d...,But the panel will still happen.,2017-07-20


In [19]:
# Headlines TODO filter points...
news_articles['headline'] = news_articles['headline'].str.lower()
news_articles['headline'] = news_articles['headline'].apply(remove_stopwords)
news_articles['headline'] = news_articles['headline'].apply(alpha_num)
news_articles['headline'] = news_articles['headline'].apply(rem_lemmatizer)
# Description
news_articles['short_description'] = news_articles['short_description'].str.lower()
news_articles['short_description'] = news_articles['short_description'].apply(remove_stopwords)
news_articles['short_description'] = news_articles['short_description'].apply(alpha_num)
news_articles['short_description'] = news_articles['short_description'].apply(rem_lemmatizer)

news_articles['category'] = news_articles['category'].str.lower()
#news_articles['category'] = news_articles['category'].apply(lambda x: ''.join(x.split()))

news_articles.head()

Unnamed: 0,category,headline,authors,link,short_description,date
21194,healthy living,to bone didnt teach glamour teach respect,"Mycah Hazel, Contributorblogger, equal opportu...",https://www.huffingtonpost.com/entry/to-the-bo...,oftentimes film tv show eat disorder try convi...,2017-07-18
2932,queer voices,will grace creator donate gay bunny book every...,Elyse Wanshel,https://www.huffingtonpost.com/entry/will-grac...,lot easier kid mike pences home state read a d...,2018-04-02
25186,world news,we blood invisible live indias dalit women,"Sara Hylton, Women & Girls Hub",https://www.huffingtonpost.com/entry/we-are-th...,girl dalit village harirajpur odisha chase kit...,2017-05-31
26210,politics,wannacry ransomware attack raise alarm bell ci...,"Stateline, ContributorStateline provides daily...",https://www.huffingtonpost.com/entry/wannacry-...,recent global cyberattack spar federal governm...,2017-05-19
20973,entertainment,walk dead reportedly cancel comiccon press eve...,Bill Bradley,https://www.huffingtonpost.com/entry/walking-d...,panel still happen,2017-07-20


In [20]:
# Unique new categories
news_articles.category.unique()

array(['healthy living', 'queer voices', 'world news', 'politics',
       'entertainment', 'comedy', 'black voices', 'green', 'impact',
       'latino voices', 'business', 'women', 'weird news', 'taste',
       'crime', 'media', 'parents', 'travel', 'arts & culture', 'tech',
       'style', 'education', 'sports', 'religion', 'science',
       'the worldpost', 'college', 'good news'], dtype=object)

In [21]:
# most frequent unigrams of news belongs 'SCIENCE' category
def category_ngram(category, n):
    
    temp_df = news_articles[news_articles['category'] == category]
    
    word_vectorizer = CountVectorizer(ngram_range=(n, n), analyzer='word')
    sparse_matrix = word_vectorizer.fit_transform(temp_df['headline'])
    
    frequencies = sum(sparse_matrix).toarray()[0]
    
    return pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency'])\
            .sort_values(by='frequency', ascending=False) \
            .reset_index() \
            .head(10)

category_ngram('politics', 1)

Unnamed: 0,index,frequency
0,trump,3411
1,say,640
2,donald,620
3,gop,478
4,house,441
5,white,384
6,us,365
7,new,344
8,health,325
9,bill,311


In [22]:
# Reset indexes
news_articles = news_articles.reset_index(drop=True)
# Drop unwanted columns
news_articles.drop('authors', inplace=True, axis=1)
news_articles.drop('link', inplace=True, axis=1)
news_articles.drop('date', inplace=True, axis=1)
news_articles.drop('headline', inplace=True, axis=1)
news_articles.head()
# Encoding output
#labels_encoded = pd.get_dummies(news_articles.category, prefix='category')
#concatenated_dataframes = pd.concat([news_articles, labels_encoded], axis=1)

Unnamed: 0,category,short_description
0,healthy living,oftentimes film tv show eat disorder try convi...
1,queer voices,lot easier kid mike pences home state read a d...
2,world news,girl dalit village harirajpur odisha chase kit...
3,politics,recent global cyberattack spar federal governm...
4,entertainment,panel still happen


In [23]:
# create train and validation set \
# https://towardsdatascience.com/how-to-use-torchtext-for-neural-machine-translation-plus-hack-to-make-it-5x-faster-77f3884d95
train, val = train_test_split(news_articles, test_size = 0.2)
train.to_csv("train.csv", index=False)
val.to_csv("val.csv", index=False)

In [77]:
# https://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/
import spacy
import torchtext
from torchtext.data import Field, LabelField, BucketIterator, TabularDataset, Iterator
en = spacy.load('en')

def tokenize_en(sentence):
    return [tok.text for tok in en.tokenizer(sentence)]

TEXT = Field(tokenize = tokenize_en, batch_first = False)
LABEL = LabelField(dtype = torch.long, batch_first = False)
#LABEL = Field(sequential=False, use_vocab=False, lower = True)



In [78]:
# associate the text in the 'English' column with the EN_TEXT field,
# and 'French' with FR_TEXT , ('category', LABEL)
#data_fields = [('category', LABEL), ('headline', TEXT), ('short_description', TEXT)]
data_fields = [('c', LABEL), ('s', TEXT)]

train, val = TabularDataset.splits(path='./', train='train.csv', validation='val.csv',
                                       format='csv', skip_header=True, fields = data_fields)



In [79]:
print(train[0].__dict__.keys())
print(train[0].__dict__.values())

dict_keys(['c', 's'])
dict_values(['world news', ['second', 'group', 'refugees', 'arrive', 'trump', 'call', 'dumb', 'deal']])


In [80]:
train[10].c[:10]

'politics'

In [81]:
vars(train[-1])

{'c': 'politics',
 's': ['nobody', 'put', 'hillary', 'corner', 'kamala', 'maxine', 'elizabeth']}

In [82]:
# Building the mapping
TEXT.build_vocab(train, min_freq = 10, vectors = "glove.6B.100d") # max_size=10000
LABEL.build_vocab(train, min_freq = 5)

In [83]:
print(TEXT.vocab.stoi['thank'])
print(TEXT.vocab.freqs.most_common(20))
print(LABEL.vocab.stoi['politics'])
print(LABEL.vocab.freqs.most_common(30))

553
[('say', 2501), ('trump', 1535), ('president', 946), ('i', 917), ('people', 873), ('new', 835), ('one', 817), ('nt', 809), ('us', 787), ('make', 775), ('get', 726), ('s', 659), ('state', 642), ('time', 640), ('go', 603), ('like', 602), ('take', 574), ('would', 551), ('come', 504), ('know', 488)]
0
[('politics', 7376), ('entertainment', 2936), ('world news', 1696), ('queer voices', 1101), ('comedy', 859), ('black voices', 780), ('healthy living', 732), ('parents', 631), ('media', 583), ('women', 565), ('sports', 480), ('weird news', 374), ('crime', 316), ('style', 315), ('green', 312), ('taste', 275), ('business', 255), ('impact', 236), ('latino voices', 224), ('religion', 201), ('travel', 167), ('the worldpost', 165), ('arts & culture', 164), ('education', 140), ('tech', 105), ('science', 79), ('college', 33), ('good news', 7)]


In [84]:
print(LABEL.vocab.stoi)

defaultdict(None, {'politics': 0, 'entertainment': 1, 'world news': 2, 'queer voices': 3, 'comedy': 4, 'black voices': 5, 'healthy living': 6, 'parents': 7, 'media': 8, 'women': 9, 'sports': 10, 'weird news': 11, 'crime': 12, 'style': 13, 'green': 14, 'taste': 15, 'business': 16, 'impact': 17, 'latino voices': 18, 'religion': 19, 'travel': 20, 'the worldpost': 21, 'arts & culture': 22, 'education': 23, 'tech': 24, 'science': 25, 'college': 26, 'good news': 27})


In [85]:
# Iterator

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#train_iter = BucketIterator(train, batch_size = 20, sort_key= lambda x: len(x.short_description), shuffle=True)
batch_size = 32
train_iter, val_iter = BucketIterator.splits(
 (train, val), # we pass in the datasets we want the iterator to draw data from
 batch_size = batch_size,
 device = device, # if you want to use the GPU, specify the GPU number here
 sort_key=lambda x: len(x.s), # the BucketIterator needs to be told what function it should use to group the data.
 sort_within_batch=False
 #repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)



In [86]:
for batch in train_iter:
    print(batch.c)
    #print(batch.headline)
    print(batch.s)
    print(batch.c.shape)
    #print(batch.headline.shape)
    print(batch.s.shape)
    break

tensor([ 0,  1,  2,  1,  0,  0,  5, 10,  2,  0,  0, 15,  3,  3,  0, 11,  0,  0,
         0,  0, 21,  6, 14,  2,  9,  0,  1, 19,  0,  1,  4,  6],
       device='cuda:0')
tensor([[ 600, 2987,    0, 2620,   22, 2585,  106,    0,  268,  107,  564,  296,
          264,    0,    8,   25, 1030, 1064,   12,    0, 2488,  371,  297,  678,
          549,   10,  951,    5,    3,    0, 1342, 1558],
        [ 737,    0,   32,   29,   16,  734,  962,  175,  915,  756,  933,  325,
            0,    8,  247, 1065,    0, 1545, 3007,   52,  323, 1645,   15,  547,
         1014,  266,  326,   17,   96,  528,   35,   96],
        [ 740,    0,    0,   70,  507,    0, 1740,  217,  175,  327, 2927, 3033,
          299, 2093,  619,   11, 2147,  336,  625, 2122,   59,  164,    0, 2047,
         1279, 2191,   42,   35,   11,    0,    0, 1380],
        [ 236, 1819,    0,   90,   38,   11,  182, 2432,   16,   52,    0, 1965,
            0,  117,   20,  105,   39, 1038, 2146, 1186,   84,    1,    1,  398,
         

In [37]:
batch = next(iter(train_iter))
#batch = next(iter(val_iter))
print(batch.c.shape)
print(batch.c.T)

torch.Size([32])
tensor([ 1,  3,  0,  0,  1, 15,  2,  1,  7,  0,  1,  1,  4,  1, 10,  1,  8,  3,
         0,  0,  1,  0,  0, 13, 13,  8,  1,  0, 25, 13,  1, 11],
       device='cuda:0')


In [35]:
# Wrapping the iterator
class BatchWrapper:
      def __init__(self, dl, x_var, y_vars):
            self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x 

      def __iter__(self):
            for batch in self.dl:
                  x_1 = getattr(batch, self.x_var[0]) # we assume only one input in this wrapper
                  x_2 = getattr(batch, self.x_var[1]) # we assume only one input in this wrapper
                  #print(x_1.shape)
                  #print(x_2.shape)
                  x = torch.cat((x_1, x_2), 0) # Concatenating variables
                  y = getattr(batch, self.y_vars) # Concatenating variables

                  yield (x, y)

      def __len__(self):
            return len(self.dl)

train_dl = BatchWrapper(train_iter, ["headline", "short_description"], "category")
valid_dl = BatchWrapper(val_iter, ["headline", "short_description"], "category")

In [36]:
print(next(train_dl.__iter__()))
print(next(valid_dl.__iter__()))

AttributeError: 'Batch' object has no attribute 'headline'

In [87]:
# Vocab size
print(len(LABEL.vocab))
print(len(TEXT.vocab))

28
3035


In [146]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchsummary import summary
from torch.autograd import Variable

class SimpleLSTMBaseline(nn.Module):
    def __init__(self, emb_dim = 400, hidden_dim = 264, layers = 2, drop_prob = 0.5):
        super(SimpleLSTMBaseline, self).__init__() # don't forget to call this!
        # Input
        self.input_size = len(TEXT.vocab)
        # Classifier
        self.output_size = len(LABEL.vocab)
        # Stacked LSTM
        self.n_layers = layers
        # Hidden units
        self.hidden_dim = hidden_dim
        # Embedding dim
        self.embedding_dim = emb_dim
        self.train_gpu = torch.cuda.is_available()
        
        self.embedding = nn.Embedding(self.input_size, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim, hidden_dim, num_layers = self.n_layers, dropout = drop_prob)
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, self.output_size)
        # Activation function
        #self.smax =  nn.Softmax(dim = 1)

    def forward(self, x, hidden):
        
        batch_size = x.size(1)
        # embeddings and lstm_out
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        
        # Remove sequence dimension
        #print('before',lstm_out.shape)
        lstm_out = lstm_out[-1, :, :]
        #print('after',lstm_out.shape)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        soft_out = self.fc(out)
        # Softmax function
        #soft_out =F.log_softmax(out, dim = 1)
        
        # reshape to be batch_size first
        #sig_out = sig_out.view(batch_size, -1)
        #sig_out = sig_out[:, -1] # get last batch of labels
        # reshape to be batch_size first
        #print('before soft', soft_out.shape)
        #soft_out = soft_out.view(-1, batch_size)
        #print('before soft', soft_out.shape)
        #soft_out = soft_out[-1, :] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return soft_out, hidden

    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        if (self.train_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

embedding_dim = 400
hidden_dim = 128
hidden_layers = 1
dropout_prob = 0.5
model = SimpleLSTMBaseline(emb_dim = embedding_dim, hidden_dim = hidden_dim, 
                             layers = hidden_layers, drop_prob = dropout_prob)
print(f'The model has {count_parameters(model):,} trainable parameters')
#Initialize the pretrained embedding
#pretrained_embeddings = TEXT.vocab.vectors
#model.embedding.weight.data.copy_(pretrained_embeddings)
model.to(device)

The model has 1,488,972 trainable parameters


SimpleLSTMBaseline(
  (embedding): Embedding(3035, 400)
  (lstm): LSTM(400, 128, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=128, out_features=28, bias=True)
)

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchsummary import summary
from torch.autograd import Variable

class SimpleLSTMBaseline(nn.Module):
    def __init__(self, emb_dim = 400, hidden_dim = 264, layers = 2, drop_prob = 0.5):
        super(SimpleLSTMBaseline, self).__init__() # don't forget to call this!
        # Input
        self.input_size = len(TEXT.vocab)
        # Classifier
        self.output_size = len(LABEL.vocab)
        # Stacked LSTM
        self.n_layers = layers
        # Hidden units
        self.hidden_dim = hidden_dim
        # Embedding dim
        self.embedding_dim = emb_dim
        self.train_gpu = torch.cuda.is_available()
        
        self.embedding = nn.Embedding(self.input_size, self.embedding_dim)
        #self.lstm = nn.LSTM(self.embedding_dim, hidden_dim, num_layers = self.n_layers, dropout = drop_prob,
        #                   batch_first = True)
        self.fc1 = nn.Linear(self.embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim, hidden_dim)
        #self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, self.output_size)
        # dropout prevents overfitting of data
        self.dropout = nn.Dropout(0.1)
        # Activation function
        #self.smax =  nn.Softmax(dim = 1)

    def forward(self, x):
        
        batch_size = x.size(0)
        #print('Batch size', batch_size)
        # embeddings and lstm_out
        x = x.long()
        x = self.embedding(x)
        #print('Embedding', x.shape)
        #lstm_out, (hidden, cell) = self.lstm(x)
        # remove sequence
        #x = torch.squeeze(x, 1)
        x = x[:, -1, :]
        #print('Removing sequence', x.shape)
        # add hidden layer, with relu activation function
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        # add dropout layer
        x = self.dropout(x)
        # add output layer
        #hidden = torch.cat((hidden[-2,:,:],hidden[-1,:,:]), dim = 1)
        #print('concat hidden', hidden)
        x = self.fc(x)
        #print('Final output', x.shape)

        return x

    

def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

embedding_dim = 100
hidden_dim = 264
hidden_layers = 1
dropout_prob = 0.5
model = SimpleLSTMBaseline(emb_dim = embedding_dim, hidden_dim = hidden_dim, 
                             layers = hidden_layers, drop_prob = dropout_prob)
print(f'The model has {count_parameters(model):,} trainable parameters')
#Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
model.to(device)

In [147]:
import tqdm

opt = optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.CrossEntropyLoss()

counter = 0
print_every = 100

clip = 5 # gradient clipping
epochs = 10
model.train() # turn on training mode

for epoch in range(1, epochs + 1):
    # initialize hidden state
    h = model.init_hidden(batch_size)
    running_loss = 0.0
    running_corrects = 0
    for batch in train_iter:
        y = batch.c
        #x1 = batch.headline
        x = batch.s
        #x = torch.cat((x1, x2), 0) # Concatenating variables
        if x.shape[1] != batch_size:
            continue
        counter += 1
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])
        opt.zero_grad()
        #model.zero_grad()
        # get the output from the model
        output, h = model(x, h)
        
        # calculate the loss and perform backprop
        loss = loss_func(output, Variable(y))
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        opt.step()
        
        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            for batch in val_iter:
                labels = batch.c
                #x1 = batch.headline
                inputs = batch.s
                if inputs.shape[1] != batch_size:
                    continue
                
                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                output, val_h = model(inputs, val_h)
                val_loss = loss_func(output.view(batch_size, -1), labels.view(batch_size, -1).squeeze())

                val_losses.append(val_loss.item())

            model.train()
            print("Epoch: {}/{}...".format(epoch + 1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))


Epoch: 2/10... Step: 100... Loss: 2.340255... Val Loss: 2.616587
Epoch: 2/10... Step: 200... Loss: 2.891931... Val Loss: 2.624966
Epoch: 2/10... Step: 300... Loss: 2.910921... Val Loss: 2.604806
Epoch: 2/10... Step: 400... Loss: 2.501232... Val Loss: 2.662229
Epoch: 2/10... Step: 500... Loss: 2.530216... Val Loss: 2.740890
Epoch: 2/10... Step: 600... Loss: 2.602314... Val Loss: 2.754738
Epoch: 3/10... Step: 700... Loss: 2.264573... Val Loss: 2.760938
Epoch: 3/10... Step: 800... Loss: 2.853158... Val Loss: 2.859050
Epoch: 3/10... Step: 900... Loss: 2.428112... Val Loss: 2.790881
Epoch: 3/10... Step: 1000... Loss: 2.660060... Val Loss: 2.685615
Epoch: 3/10... Step: 1100... Loss: 2.315199... Val Loss: 2.705386
Epoch: 3/10... Step: 1200... Loss: 2.331552... Val Loss: 2.680952
Epoch: 3/10... Step: 1300... Loss: 2.574005... Val Loss: 2.731640
Epoch: 4/10... Step: 1400... Loss: 2.124772... Val Loss: 2.747995
Epoch: 4/10... Step: 1500... Loss: 2.668533... Val Loss: 2.715187
Epoch: 4/10... Step

## Not come here yet

In [383]:
# container for sentences
headline_arr = np.array([headline for headline in news_articles['headline']])
description_arr = np.array([headline for headline in news_articles['short_description']])
# Stack features
features = np.vstack((headline_arr, description_arr)).T
# container for labels
labels = np.array([label for label in news_articles['category']])

In [384]:
print(features.shape)
print(labels.shape)
print(features[0])

(8485, 2)
(8485,)
['will grace creator donate gay bunny book every grade school indiana'
 'lot easier kid mike pences home state read a day life marlon bundo']


In [385]:
# Enconde labels as 0, 1, 2..

# Label encoding news category
enc = LabelEncoder()
enc.fit(labels)
print(enc.classes_)
labels = enc.transform(labels)
# enc.inverse_transform([0, 0, 1, 2])
print(labels)
print(labels[0])

['ARTS & CULTURE' 'BLACK VOICES' 'BUSINESS' 'COLLEGE' 'COMEDY' 'CRIME'
 'EDUCATION' 'ENTERTAINMENT' 'GREEN' 'HEALTHY LIVING' 'IMPACT'
 'LATINO VOICES' 'MEDIA' 'PARENTS' 'POLITICS' 'QUEER VOICES' 'RELIGION'
 'SCIENCE' 'SPORTS' 'STYLE' 'TASTE' 'TECH' 'TRAVEL' 'WEIRD NEWS' 'WOMEN'
 'WORLD NEWS']
[15 15 15 ...  4  7 14]
15


In [386]:
# count unique elements
unique, counts = np.unique(labels, return_counts=True)
print(unique)
print(counts)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
[  13  406   85    1  443  170   31 1699   28   15   73   83  290   32
 3042  451   63   40  364   34    9   53   72  205  226  557]


In [387]:
# Find 3 and remove it to avoid errors
idx = np.where(labels == 3)
features = np.delete(features, idx, axis=0)
labels = np.delete(labels, idx)

In [388]:
unique, counts = np.unique(labels, return_counts=True)
print(unique)
print(counts)
print(features.shape)
print(labels.shape)

[ 0  1  2  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25]
[  13  406   85  443  170   31 1699   28   15   73   83  290   32 3042
  451   63   40  364   34    9   53   72  205  226  557]
(8484, 2)
(8484,)


In [410]:
from sklearn.preprocessing import OneHotEncoder

one_labels = OneHotEncoder().fit(labels.reshape(-1,1))
print(one_labels)

OneHotEncoder()
