<h3> Imports

In [1]:
import torch
import torch.nn.functional as F 

import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import time
import re
import string
import nltk
import os
import random

In [2]:
import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset

<h3>Loading News Data

In [3]:
data = pd.read_csv('../Data/news-article-categories.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6877 entries, 0 to 6876
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  6877 non-null   object
 1   title     6877 non-null   object
 2   body      6872 non-null   object
dtypes: object(3)
memory usage: 161.3+ KB


In [5]:
data.head()

Unnamed: 0,category,title,body
0,ARTS & CULTURE,Modeling Agencies Enabled Sexual Predators For...,"In October 2017, Carolyn Kramer received a dis..."
1,ARTS & CULTURE,Actor Jeff Hiller Talks “Bright Colors And Bol...,This week I talked with actor Jeff Hiller abou...
2,ARTS & CULTURE,New Yorker Cover Puts Trump 'In The Hole' Afte...,The New Yorker is taking on President Donald T...
3,ARTS & CULTURE,Man Surprises Girlfriend By Drawing Them In Di...,"Kellen Hickey, a 26-year-old who lives in Huds..."
4,ARTS & CULTURE,This Artist Gives Renaissance-Style Sculptures...,There’s something about combining the traditio...


In [6]:
data_sans_na = data.dropna()

<h3>Max text length

In [7]:
print("Longest length is:\n",data.body.str.len().max())

Longest length is:
 72850.0


In [8]:
MAX_LEN = 72850

<h3> Train / Test split

In [9]:
X = data_sans_na.drop(["category"], axis =1)
X.head()

Unnamed: 0,title,body
0,Modeling Agencies Enabled Sexual Predators For...,"In October 2017, Carolyn Kramer received a dis..."
1,Actor Jeff Hiller Talks “Bright Colors And Bol...,This week I talked with actor Jeff Hiller abou...
2,New Yorker Cover Puts Trump 'In The Hole' Afte...,The New Yorker is taking on President Donald T...
3,Man Surprises Girlfriend By Drawing Them In Di...,"Kellen Hickey, a 26-year-old who lives in Huds..."
4,This Artist Gives Renaissance-Style Sculptures...,There’s something about combining the traditio...


In [10]:
y = data_sans_na["category"]
y.head()

0    ARTS & CULTURE
1    ARTS & CULTURE
2    ARTS & CULTURE
3    ARTS & CULTURE
4    ARTS & CULTURE
Name: category, dtype: object

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

In [12]:
train_dataset = pd.concat([X_train, y_train], axis=1)
test_dataset = pd.concat([X_test, y_test], axis=1)

<h3> Genenal Settings

In [13]:
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

VOCABULARY_SIZE = 20000
LEARNING_RATE = 0.005
BATCH_SIZE = 128
NUM_EPOCHS = 15

EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_CLASSES = 14

<h3> Tokenizer

In [14]:
tokenizer = get_tokenizer("basic_english")

In [15]:
test = tokenizer("This is a test, how are you Marcelino")

In [16]:
test

['this', 'is', 'a', 'test', ',', 'how', 'are', 'you', 'marcelino']

In [17]:
def build_vocabulary(datasets):
    for dataset in datasets:
        for _, title, body, category in dataset:
            print(body)
            #yield tokenizer(body)

In [18]:
# Define a function that takes in a sentence and returns the tokenized version
def tokenize_sentence(sentence):
  tokens = tokenizer(sentence)
  return tokens

In [19]:
train_dataset.head()

Unnamed: 0,title,body,category
3448,"In First Vote, Columbia College Students Back ...",73.7 percent of voting Columbia College studen...,ENVIRONMENT
6279,Facebook Is Developing A Camera App To Rival S...,(Reuters) - Facebook Inc is developing a stand...,TECH
2738,The Obamas Are Coming To Your Netflix Queue,Former President Barack Obama and first lady M...,ENTERTAINMENT
6678,When Sexual Harassment Extends From The Workpl...,After being inundated with the stories of the ...,WOMEN
3374,"Kitten Meets Bearded Dragon, Freaks Out Ninja-...",Bearded dragons were only introduced to the Un...,ENVIRONMENT


In [20]:
# Apply the function to each row in the dataframe
train_dataset['token'] = train_dataset['body'].apply(tokenize_sentence)
test_dataset['token'] = test_dataset['body'].apply(tokenize_sentence)

In [21]:
token_list = pd.concat([train_dataset['token'],test_dataset['token']])

In [22]:
vocab = build_vocab_from_iterator(token_list, min_freq=1, specials=["<UNK>"])

vocab.set_default_index(vocab["<UNK>"])

In [23]:
len(vocab)


114811

In [24]:
indexes = vocab(test)

test, indexes
#Last word Marcelino unknown so token is 0

(['this', 'is', 'a', 'test', ',', 'how', 'are', 'you', 'marcelino'],
 [22, 10, 7, 831, 1, 78, 19, 25, 0])

<h3> Target classes

In [25]:
target_classes = ["ART & CULTURE","BUSINESS","COMEDY","CRIME","EDUCATION","ENTERTAINMENT","ENVIRONMENT","MEDIA","POLITICS","RELIGION","SCIENCE","SPORTS","TECH","WOMEN"]

<h2> RNN

In [26]:
max_words = 25

def vectorize_batch(batch):
    Y, X = list(zip(*batch))
    X = [vocab(tokenizer(text)) for text in X]
    X = [tokens+([0]* (max_words-len(tokens))) if len(tokens)<max_words else tokens[:max_words] for tokens in X] ## Bringing all samples to max_words length.

    return torch.tensor(X, dtype=torch.int32), torch.tensor(Y) - 1 ## We have deducted 1 from target names to get them in range [0,1,2,3] from [1,2,3,4]


train_loader = DataLoader(train_dataset, batch_size=1024, collate_fn=vectorize_batch, shuffle=True)
test_loader  = DataLoader(test_dataset , batch_size=1024, collate_fn=vectorize_batch)

In [27]:
for X, Y in train_loader:
    print(X.shape, Y.shape)
    break

KeyError: 2879