In [1]:
import re
from argparse import Namespace
from functools import partial
from collections import Counter,OrderedDict


# torch
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import DataLoader,default_collate
from torch.utils.data.backward_compatibility import worker_init_fn
## other
from torchdata import datapipes as dp
from torchtext import vocab
from torchmetrics import Accuracy

# manipulation
import numpy as np

# visualization 
import matplotlib.pyplot as plt 

# other 
from tqdm import tqdm       

In [2]:
args = Namespace(
    # data
    data_base_path = "../data/news/",
    data_filename = "news_with_splits.csv",
    dataset = ["train","val","test"],
    
    # vocab
    min_freq = 25,
    unk_tkn = "<UNK>",
    mask_tkn = "<MASK>",
    begin_seq_tkn = "<BEGIN>",
    end_seq_tkn = "<END>",
    
    # model
    model_base_path = "../models/news/",
    model_filename = "model.pth",
    glove_filepath = "../data/glove/glove.6B.100d.txt",
    hidden_dim = 100,
    num_channels = 100,
    
    # training
    num_epochs = 100,
    learning_rate = 0.001,
    dropout_rate = 0.1,
    batch_size = 128,
    early_stopping_criteria = 5,
    
    # runtime
    cuda = torch.cuda.is_available(),
    device = "cuda" if torch.cuda.is_available() else "cpu",    
)

# Datapipes

In [3]:
def classification_fn(args,row):
    return args.dataset.index(row[1])

def preprocess_text(text):
    text = ' '.join(word.lower() for word in text.split(" "))
    text = re.sub(r"([!\"#$%&\'()*+,-./:;<=>?@[\\\]^_`{|}~])", r" ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

def preprocess_row(row):
    return (preprocess_text(row[0]),preprocess_text(row[2]))

In [29]:
def build_pipe_dict(args):
    pipe = dp.iter.FileOpener([args.data_base_path+args.data_filename])
    pipe = pipe.parse_csv(skip_lines=1)
    pipes = pipe.demux(num_instances=len(args.dataset),
                      classifier_fn=partial(classification_fn,args),
                      buffer_size=1_000_000)
    return {k:pipe.map(preprocess_row)
            for k,pipe in zip(args.dataset,pipes)}

In [5]:
pipe_dict = build_pipe_dict(args)

In [6]:
def cat_token_fn(row):
    return [row[0]]

def title_token_fn(row):
    return row[1].split(" ")

In [7]:
def build_vocab_dict(train_pipe,args=args):
    cat_token = train_pipe.map(cat_token_fn)
    cat_vocab = vocab.build_vocab_from_iterator(cat_token)
    
    # title_token = train_pipe.map(title_token_fn)
    # title_vocab = vocab.build_vocab_from_iterator(title_token,
    #                                               min_freq=args.min_freq,
    #                                               specials=[args.unk_tkn,
    #                                                         args.mask_tkn,
    #                                                         args.begin_seq_tkn,
    #                                                         args.end_seq_tkn])
    max_length = -1
    counter = Counter()
    for row in pipe_dict["train"]:
        max_length = max(max_length,len(split:=row[1].split(" ")))
        counter.update(split)
        
    sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: (-x[1], x[0]))    
    ordered_dict = OrderedDict(sorted_by_freq_tuples)
    title_vocab = vocab.vocab(ordered_dict=ordered_dict,min_freq=args.min_freq,
                              specials=[args.unk_tkn,
                                        args.mask_tkn,
                                        args.begin_seq_tkn,
                                        args.end_seq_tkn])
    title_vocab.set_default_index(title_vocab[args.unk_tkn])
    title_vocab.max_length = max_length
    freq = [count for _,count in sorted_by_freq_tuples]
    title_vocab.class_weights = 1.0 / torch.tensor(freq,dtype=torch.float32)
    
             
    return {"cat":cat_vocab,
            "title":title_vocab}

In [8]:
vocab_dict = build_vocab_dict(pipe_dict["train"],args)
vocab_dict



{'cat': Vocab(), 'title': Vocab()}

In [9]:
vocab_dict["title"].max_length

22

In [10]:
def vectorize(title,vocab,args=args):
    indices = [vocab[args.begin_seq_tkn]]
    indices.extend(vocab.lookup_indices(title.split(" ")))
    indices.append(vocab[args.end_seq_tkn])
    
    vector = np.zeros(shape=vocab.max_length+2,dtype=np.int64)
    vector[:(a:=len(indices))] = indices
    vector[a:] = vocab[args.mask_tkn]
    
    return vector


def create_dataset(vocab_dict,args,row):
    title_vector = vectorize(row[1],vocab_dict["title"],args)
    cat_index = vocab_dict["cat"][row[0]]
    return {"x":title_vector,
            "y":cat_index}

In [11]:
def build_dataset_dict(pipe_dict,vocab_dict,args):
    dataset_dict = {}
    fn = partial(create_dataset,vocab_dict,args)
    for name,pipe in pipe_dict.items():
        if name == "train":
            pipe = pipe.shuffle()
        pipe = pipe.map(fn)
        pipe = pipe.batch(args.batch_size)
        dataset_dict[name] = pipe
    return dataset_dict

In [12]:
dataset_dict = build_dataset_dict(pipe_dict,vocab_dict,args)

In [13]:
def collate_fn(args,x):
    return {k:v.to(args.device)
            for x_ in default_collate(x)
            for k ,v in x_.items()}

In [14]:
def generate_batches(dataset,shuffle=False,args=args):
    dataloader = DataLoader(dataset=dataset,
                            batch_size=args.batch_size,
                            shuffle=shuffle,
                            worker_init_fn=worker_init_fn,
                            collate_fn = partial(collate_fn,args),
                            drop_last=True)
    for batch in dataloader:
        yield batch

In [15]:
sample = iter(generate_batches(dataset_dict["train"],
                               shuffle=True))

In [16]:
next(sample)

{'x': tensor([[   2,   69,  918,  ...,    1,    1,    1],
         [   2,  868,    0,  ...,    1,    1,    1],
         [   2,   27, 3257,  ...,    1,    1,    1],
         ...,
         [   2,  826, 2231,  ...,    1,    1,    1],
         [   2,   27,    8,  ...,    1,    1,    1],
         [   2,  343,    0,  ...,    1,    1,    1]]),
 'y': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0])}

# Models

## model helper functions

In [17]:
def load_glove_from_file(glove_filepath):
    word_to_idx = {}
    embedding_matrix = []
    with open(glove_filepath,"r") as f:
        for idx,line in enumerate(f):
            line = line.split(" ")
            word_to_idx[line[0]] = idx
            embedding = [float(val) for val in line[1:]]
            embedding_matrix.append(embedding)
    return word_to_idx,np.stack(embedding_matrix)

def create_embedding(glove_filepath,title_vocab):
    word_to_idx , pre_trained_embedding = load_glove_from_file(glove_filepath)
    embedding_size = pre_trained_embedding.shape[1]
    
    final_embeddings = np.zeros(shape=(len(title_vocab),embedding_size))
    
    for idx,word in enumerate(title_vocab.get_itos()):
        if word in word_to_idx:
            final_embeddings[idx,:] = pre_trained_embedding[word_to_idx[word]]
        else:
            embedding_idx = torch.ones(1,embedding_size)
            nn.init.xavier_uniform_(embedding_idx)
            final_embeddings[idx,:] = embedding_idx
    
    return final_embeddings
    

In [18]:
class NewsClassifier(nn.Module):
    def __init__(self,embedding_size,num_embedding,
                 num_class,num_channels,hidden_dim,
                 dropout_rate,pretrained_embedding=None,padding_idx = 0) -> None:
        super().__init__()
        if pretrained_embedding is not None:
            weights = torch.from_numpy(pretrained_embedding).float()
            self.embedding = nn.Embedding.from_pretrained(embeddings=weights,
                                                          padding_idx=padding_idx)
        else:
            self.embedding = nn.Embedding(num_embeddings=num_embedding,
                                          embedding_dim=embedding_size,
                                          padding_idx=padding_idx)
            
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels=embedding_size,
                      out_channels=num_channels,kernel_size=3),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels,
                      out_channels=num_channels,kernel_size=3,stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels,
                      out_channels=num_channels,kernel_size=3,stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels,
                      out_channels=num_channels,kernel_size=3),
            nn.ELU(),
        )
        
        self.dropout_rate = dropout_rate
        self.fc1 = nn.Linear(in_features=num_channels,
                             out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim,
                             out_features=num_class)
        
    def forward(self,input,apply_softmax=False):
        # input.shape [batch,max_seq_length]
        embed = self.embedding(input)
        # shape [batch,max_seq_length,embedding]
        #? making the embedding as channels
        embed = embed.permute(0,2,1)
        # shape [batch,embedding,max_seq_length]
        features = self.convnet(embed)
        # shape [batch,num_channels,remain_size]
        remaining_size = features.size(dim=2)
        features = F.avg_pool1d(input=features,kernel_size=remaining_size).squeeze(dim=2)
        features = F.dropout(features,p=self.dropout_rate)
        # shape [batch,num_channel]
        intermediate_vector = F.relu(F.dropout(self.fc1(features),self.dropout_rate))
        # shape [batch.hidden_dim]
        prediction_vector = self.fc2(intermediate_vector)
        # shape [batch,num_classes]
        
        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector,dim=1)
            
        return prediction_vector            

In [19]:
sample_batch = next(iter(generate_batches(dataset_dict["train"])))["x"]

In [20]:
embeddings = create_embedding(args.glove_filepath,vocab_dict["title"])

In [21]:
classifier = NewsClassifier(embedding_size=embeddings.shape[1],
                            num_embedding=embeddings.shape[0],
                            num_channels=args.num_channels,
                            hidden_dim=args.hidden_dim,
                            num_class=len(vocab_dict["cat"]),
                            pretrained_embedding=embeddings,
                            dropout_rate=args.dropout_rate)

In [22]:
classifier(sample_batch).shape

torch.Size([128, 4])

# Training

## helper function

In [23]:
def make_train_state(args=args):
    return {"stop_early":False,
            "early_stopping_step":0,
            "early_stopping_val":1e8,
            "epoch_index":0,
            "model_filepath":args.model_base_path+args.model_filename,
            "train_loss":[],
            "train_acc":[],
            "val_loss":[],
            "val_acc":[],
            "test_loss":-1,
            "test_acc":-1}

def update_train_state(train_state,model,args=args):
    if train_state["epoch_index"] == 0:
        torch.save(model.state_dict(),train_state["model_filepath"])
        train_state["stop_early"] = False
    
    elif train_state["epoch_index"] >= 1:
        loss_tm1 , loss_t = train_state["val_loss"][-2:]
        if loss_t >= train_state["early_stopping_val"]:
            train_state["early_stopping_step"] +=1
        else:
            torch.save(model.state_dict(),train_state["model_filepath"])
            train_state["early_stopping_step"] = 0
            
        train_state["stop_early"] = train_state["early_stopping_step"] >= args.early_stopping_criteria
    
    return train_state

## initialize

In [30]:
# data
pipe_dict = build_pipe_dict(args)
vocab_dict = build_vocab_dict(pipe_dict["train"],args=args)
dataset_dict = build_dataset_dict(pipe_dict=pipe_dict,vocab_dict=vocab_dict,args=args)


In [None]:
# model
embeddings = create_embedding(args.glove_filepath,vocab_dict["title"])
classifier = NewsClassifier(embedding_size=embeddings.shape[1],
                            num_embedding=len(vocab_dict["title"]),
                            num_channels=args.num_channels,
                            hidden_dim=args.hidden_dim,
                            num_class=len(vocab_dict["cat"]),
                            pretrained_embedding=embeddings,
                            dropout_rate=args.dropout_rate).to(args.device)
# training
loss_fn = nn.CrossEntropyLoss()
acc_fn = Accuracy(task="multiclass",num_classes=len(vocab_dict["cat"])).to(args.device)
optimizer = optim.Adam(params=classifier.parameters(),
                       lr= args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode="min",factor=0.5,patience=1)
train_state = make_train_state(args)

## training loop

In [25]:
for epoch_index in tqdm(range(args.num_epochs)):
    train_state["epoch_index"] = epoch_index
    
    # get the data
    batch_generator = generate_batches(dataset=dataset_dict["train"],shuffle=True)
    classifier.train()
    running_loss = 0.0
    running_acc = 0.0
    
    for batch_idx,batch_dict in enumerate(batch_generator):
        classifier.zero_grad()
        logits = classifier(batch_dict["x"])
        
        # loss
        loss = loss_fn(logits,batch_dict["y"])
        loss_t = loss.item()
        running_loss = (loss_t - running_loss) /(batch_idx+1)
        
        # acc
        acc = acc_fn(logits,batch_dict["y"])
        acc_t = acc.item()
        running_acc = (acc_t - running_acc) / (batch_idx+1)
        
        loss.backward()
        optimizer.step()
        
    train_state["train_loss"].append(running_loss)
    train_state["train_acc"].append(running_acc)
    
    # iterate over the val 
    batch_generator = generate_batches(dataset_dict["val"])
    running_loss = 0.0
    running_acc = 0.0
    classifier.eval()
    
    for batch_idx,batch_dict in enumerate(batch_generator):
        with torch.inference_mode():
            logits = classifier(batch_dict["x"])
            
            # loss
            loss = loss_fn(logits,batch_dict["y"])
            loss_t = loss.item()
            running_loss = (loss_t - running_loss) /(batch_idx+1)
            
            # acc
            acc = acc_fn(logits,batch_dict["y"])
            acc_t = acc.item()
            running_acc = (acc_t - running_acc) / (batch_idx+1)   
            
    train_state["val_loss"].append(running_loss)         
    train_state["val_acc"].append(running_acc)
    
    scheduler.step(train_state["val_loss"][-1])
    
    if train_state["stop_early"]:
        break          

100%|██████████| 100/100 [10:38<00:00,  6.38s/it]


## testing loop

In [31]:
batch_generator = generate_batches(dataset_dict["test"])
classifier.eval()
running_acc = 0.0 
running_loss = 0.0

for batch_idx,batch_dict in enumerate(batch_generator):
    with torch.inference_mode():
        logits = classifier(batch_dict["x"])
        # loss
        loss = loss_fn(logits,batch_dict["y"])
        loss_t = loss.item()
        running_loss = (loss_t - running_loss) /(batch_idx+1)
        
        # acc
        acc = acc_fn(logits,batch_dict["y"])
        acc_t = acc.item()
        running_acc = (acc_t - running_acc) / (batch_idx+1) 
        
train_state["test_loss"]= running_loss
train_state["test_acc"] = running_acc

In [32]:
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

Test loss: 0.6771782040596008;
Test Accuracy: 0.78125


# Inference

In [39]:
def predict_category(title,classifier,vocab_dict):
    title = preprocess_text(title)
    vectorized = torch.tensor(vectorize(title,vocab_dict["title"]))
    results = classifier(vectorized.unsqueeze(0),apply_softmax=True)
    prob,indices = results.max(dim=1)
    pred_cat = vocab_dict["cat"].lookup_token(indices.item())
    return pred_cat,prob.item()

In [40]:
text = """Shape and Dynamic Nature of Carbon-Based Molecules Are Different Than Scientists Thought"""
predict_category(text,classifier,vocab_dict)

('sci tech', 0.7965487837791443)

In [41]:
import feedparser

feed = feedparser.parse("https://scitechdaily.com/feed/")

In [50]:
for entry in feed.entries:
    print(entry.title)
    print(predict_category(entry.title,classifier,vocab_dict))

Shape and Dynamic Nature of Carbon-Based Molecules Are Different Than Scientists Thought
('sci tech', 0.7157201170921326)
Sleep and Vaccination: The Critical Connection You Need to Know About
('sci tech', 0.6424444317817688)
Seek & Destroy: Black Widow Spiders Are Being Actively Hunted by Brown Widows
('sci tech', 0.454619824886322)
Scientists Warn: Food Coloring Nanoparticles May Damage Human Gut
('sci tech', 0.7301842570304871)
Scientists Discover Previously Unknown Anatomical Structure in the Brain
('sci tech', 0.8048786520957947)
Nanofiber-Hydrogel Shows Success Treating Severe Complication of Crohn’s Disease
('sci tech', 0.5179975628852844)
Digital Rectal Exams Are NOT Useful To Early Detect Prostate Cancers
('sci tech', 0.9008622765541077)
Beware of Fungi in Flour: It Won’t Turn You Into a Zombie, but It Can Make You Sick
('sports', 0.8400651216506958)
Resisting Treatment: Cancer Cells Shrink or Super-Size To Survive
('sci tech', 0.8618530631065369)
('sci tech', 0.483915984630584

In [54]:
sports = feedparser.parse("https://sports.ndtv.com/rss/cricket")
for entry in sports.entries[:15]:
    print(entry.title)
    print(predict_category(entry.title,classifier,vocab_dict))

DC vs RCB, Women's Premier League, Live Score Updates: Match Hangs In Balance As Delhi Capitals Go 4 Down In Chase
('sports', 0.9560973644256592)
Watch: How New Zealand Helped India Reach World Test Championship Final After Beating Sri Lanka In Last-Ball Thriller
('sports', 0.6674612164497375)
"Madness...": Steve Smith On How Delhi And Ahmedabad Tests Were Different
('sports', 0.5860716104507446)
How Bad Was Shreyas Iyer's Injury? Rohit Sharma Shares Grave Observation
('sports', 0.7295311093330383)
"I Was Called Black Monkey": Mohammed Siraj Recalls Racism Incident On Australia Tour
('sports', 0.3564854860305786)
"Don't Believe What You See On Social Media": Rohit Sharma Provides Update On Virat Kohli's Health
('sci tech', 0.6168894171714783)
Watch: Virat Kohli Teases Nitin Menon Over LBW Decision. Umpire's Reaction Goes Viral
('sports', 0.4983079731464386)
Will IPL Schedule Impact India's WTC Final Chances? Rahul Dravid's Clear Answer
('sports', 0.5032055974006653)
"Test Cricket Is Ha