<h1> Imports

In [954]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F 
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import time
import re
import string
import nltk
import os
import random

In [955]:
# !pip install torchtext==0.10.0 --user

In [956]:
# !pip install torch==1.11.0 torchtext==0.12.0 --user

In [957]:
import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset

<h1>Import Data

In [958]:
max_words = 100

In [959]:
data = pd.read_csv('../Data/news-article-categories-clean.csv')

<h1> Vocabulary

In [960]:
tokenizer = get_tokenizer("basic_english")

data['tok_body'] = data.body.apply(lambda x: tokenizer(x))

vocab = build_vocab_from_iterator(data['tok_body'], min_freq=1, specials=["<UNK>"])

vocab.set_default_index(vocab["<UNK>"])

len(vocab)

66936

<h1> Tokenization

In [961]:
data["cat"] = pd.Categorical(data.category)
data['cat_code'] = data.cat.cat.codes

In [962]:
data["body_tok"] = data["body"].apply(lambda x: vocab(tokenizer(x)))

In [963]:
data["body_max_len"] = data["body_tok"].apply(lambda x: x+([0]* (max_words-len(x))) if len(x)<max_words else x[:max_words])

In [964]:
data.head()

Unnamed: 0,category,title,body,tok_body,cat,cat_code,body_tok,body_max_len
0,ARTS & CULTURE,Modeling Agencies Enabled Sexual Predators For...,october carolyn kramer received disturbing pho...,"[october, carolyn, kramer, received, disturbin...",ARTS & CULTURE,0,"[1298, 8062, 5640, 555, 2648, 404, 151, 120, 4...","[1298, 8062, 5640, 555, 2648, 404, 151, 120, 4..."
1,ARTS & CULTURE,Actor Jeff Hiller Talks “Bright Colors And Bol...,week talked actor jeff hiller hit broadway pla...,"[week, talked, actor, jeff, hiller, hit, broad...",ARTS & CULTURE,0,"[58, 1515, 575, 1825, 23777, 531, 1370, 161, 2...","[58, 1515, 575, 1825, 23777, 531, 1370, 161, 2..."
2,ARTS & CULTURE,New Yorker Cover Puts Trump 'In The Hole' Afte...,new yorker taking president donald trump asked...,"[new, yorker, taking, president, donald, trump...",ARTS & CULTURE,0,"[8, 2833, 366, 34, 174, 12, 199, 9, 1646, 613,...","[8, 2833, 366, 34, 174, 12, 199, 9, 1646, 613,..."
3,ARTS & CULTURE,Man Surprises Girlfriend By Drawing Them In Di...,kellen hickey year old life hudson wisconsin g...,"[kellen, hickey, year, old, life, hudson, wisc...",ARTS & CULTURE,0,"[37533, 10231, 3, 90, 24, 8104, 3072, 1278, 84...","[37533, 10231, 3, 90, 24, 8104, 3072, 1278, 84..."
4,ARTS & CULTURE,This Artist Gives Renaissance-Style Sculptures...,something combining traditional uptight look r...,"[something, combining, traditional, uptight, l...",ARTS & CULTURE,0,"[81, 5485, 941, 33242, 95, 6082, 799, 933, 25,...","[81, 5485, 941, 33242, 95, 6082, 799, 933, 25,..."


In [965]:
data_sans_na = data.dropna()
data_sans_na.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6871 entries, 0 to 6870
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   category      6871 non-null   object  
 1   title         6871 non-null   object  
 2   body          6871 non-null   object  
 3   tok_body      6871 non-null   object  
 4   cat           6871 non-null   category
 5   cat_code      6871 non-null   int8    
 6   body_tok      6871 non-null   object  
 7   body_max_len  6871 non-null   object  
dtypes: category(1), int8(1), object(6)
memory usage: 336.3+ KB


<h1>Target Classes

In [966]:
target_classes = ["ART & CULTURE","BUSINESS","COMEDY","CRIME","EDUCATION","ENTERTAINMENT","ENVIRONMENT","MEDIA","POLITICS","RELIGION","SCIENCE","SPORTS","TECH","WOMEN"]

In [967]:
target_classes_int = [i for i in range(14)]

<h1>Train / Test Split

In [968]:
X = data_sans_na.drop(["category", "cat_code", "cat"], axis =1)
y = data_sans_na["cat_code"]

In [969]:
X.head()

Unnamed: 0,title,body,tok_body,body_tok,body_max_len
0,Modeling Agencies Enabled Sexual Predators For...,october carolyn kramer received disturbing pho...,"[october, carolyn, kramer, received, disturbin...","[1298, 8062, 5640, 555, 2648, 404, 151, 120, 4...","[1298, 8062, 5640, 555, 2648, 404, 151, 120, 4..."
1,Actor Jeff Hiller Talks “Bright Colors And Bol...,week talked actor jeff hiller hit broadway pla...,"[week, talked, actor, jeff, hiller, hit, broad...","[58, 1515, 575, 1825, 23777, 531, 1370, 161, 2...","[58, 1515, 575, 1825, 23777, 531, 1370, 161, 2..."
2,New Yorker Cover Puts Trump 'In The Hole' Afte...,new yorker taking president donald trump asked...,"[new, yorker, taking, president, donald, trump...","[8, 2833, 366, 34, 174, 12, 199, 9, 1646, 613,...","[8, 2833, 366, 34, 174, 12, 199, 9, 1646, 613,..."
3,Man Surprises Girlfriend By Drawing Them In Di...,kellen hickey year old life hudson wisconsin g...,"[kellen, hickey, year, old, life, hudson, wisc...","[37533, 10231, 3, 90, 24, 8104, 3072, 1278, 84...","[37533, 10231, 3, 90, 24, 8104, 3072, 1278, 84..."
4,This Artist Gives Renaissance-Style Sculptures...,something combining traditional uptight look r...,"[something, combining, traditional, uptight, l...","[81, 5485, 941, 33242, 95, 6082, 799, 933, 25,...","[81, 5485, 941, 33242, 95, 6082, 799, 933, 25,..."


In [970]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: cat_code, dtype: int8

In [971]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

In [972]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

<h1> Vectorize Batch

In [973]:
# def vectorize_batch(batch):
#     Y = []
#     X = []
#     for item in batch:
#         Y.append(item["label"])
#         text = item["text"]
#         text_tok = [vocab(tokenizer(text)) for text in text]
#         text = [tokens+([0]* (max_words-len(tokens))) if len(tokens)<max_words else tokens[:max_words] for tokens in text_tok] ## Bringing all samples to max_words length.
#         X.append(text)
#     return torch.tensor(X,dtype=torch.int32), torch.tensor(Y)

In [974]:
# max_words = 25

# def vectorize_batch(batch):
#     Y, X = list(zip(*batch))
#     X = [vocab(tokenizer(text)) for text in X]
#     X = [tokens+([0]* (max_words-len(tokens))) if len(tokens)<max_words else tokens[:max_words] for tokens in X]
#     return torch.tensor(X,dtype=torch.int32), torch.tensor(Y)

In [975]:
def vectorize_batch(batch):
    # print("batch type :", type(batch))
    # print(len(batch))
    # print(batch[0])
    # print(len(batch[0]))
    Y = tuple(map(lambda x: x["label"], batch))
    X = tuple(map(lambda x: x["text"], batch))
    X_t = torch.tensor(X,dtype=torch.long)
    Y_t = torch.tensor(Y, dtype=torch.long)
    # print(X_t)
    # print(Y_t)
    #Y_t = Y_t.unsqueeze(1)
    return X_t, Y_t
   

In [976]:
tuple1 = tuple(map(lambda x: x["label"], liste))
tuple2 = tuple(map(lambda x: x["text"], liste))

<h1> Load Data

In [977]:
# Définir une classe pour transformer un Dataframe en Dataset
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        text = row['body_max_len']
        label = row['cat_code']
        return {'label': label, 'text': text}

# Créer une instance de la classe Dataset personnalisée
train_dataset = CustomDataset(train)
test_dataset = CustomDataset(test)

train_dataset, test_dataset  = to_map_style_dataset(train_dataset), to_map_style_dataset(test_dataset)

In [978]:
# train_dataset = train[["body_max_len", "cat_code"]]
# test_dataset= test[["body_max_len", "cat_code"]]

In [979]:
train_loader = DataLoader(train_dataset, batch_size=1024, collate_fn= vectorize_batch, shuffle=True)
test_loader  = DataLoader(test_dataset , batch_size=1024, collate_fn=vectorize_batch)

In [980]:
for X, Y in train_loader:
    print(X.shape, Y.shape)
    break

torch.Size([1024, 100]) torch.Size([1024])


<h1> RNN

In [981]:
#Paramètres du modèle
EMBED_LEN = 50
HIDDEN_SIZE = 50
OUTPUT_SIZE = len(target_classes_int)
N_LAYERS = 1

In [982]:
#Classe pour le modèle RNN
class RNNClassifier(nn.Module):
    def __init__(self):
        super(RNNClassifier, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=len(vocab), embedding_dim=EMBED_LEN)
        self.rnn = nn.RNN(input_size=EMBED_LEN, hidden_size=HIDDEN_SIZE, num_layers=N_LAYERS, batch_first=True)
        self.linear = nn.Linear(HIDDEN_SIZE, OUTPUT_SIZE)

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, hidden = self.rnn(embeddings, torch.randn(N_LAYERS, len(X_batch), HIDDEN_SIZE))
        return self.linear(output[:,-1])

In [983]:
rnn_classifier = RNNClassifier()

rnn_classifier

RNNClassifier(
  (embedding_layer): Embedding(66936, 50)
  (rnn): RNN(50, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=14, bias=True)
)

In [984]:
for layer in rnn_classifier.children():
    print("Layer : {}".format(layer))
    print("Parameters : ")
    for param in layer.parameters():
        print(param.shape)
    print()

Layer : Embedding(66936, 50)
Parameters : 
torch.Size([66936, 50])

Layer : RNN(50, 50, batch_first=True)
Parameters : 
torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50])

Layer : Linear(in_features=50, out_features=14, bias=True)
Parameters : 
torch.Size([14, 50])
torch.Size([14])



In [985]:
out = rnn_classifier(torch.randint(0, len(vocab), (1024, max_words)))

out.shape

torch.Size([1024, 14])

In [986]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import gc

def calc_val_loss_and_accuracy(model, loss_fn, val_loader):
    with torch.no_grad():
        Y_shuffled, Y_preds, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_shuffled.append(Y)
            Y_preds.append(preds.argmax(dim=-1))

        Y_shuffled = torch.cat(Y_shuffled)
        Y_preds = torch.cat(Y_preds)

        print("Valid Loss : {:.3f}".format(torch.tensor(losses).mean()))
        print("Valid Acc  : {:.3f}".format(accuracy_score(Y_shuffled.detach().numpy(), Y_preds.detach().numpy())))


def train_model(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
    for i in range(1, epochs+1):
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds = model(X)

            loss = loss_fn(Y_preds, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        calc_val_loss_and_accuracy(model, loss_fn, val_loader)

In [987]:
# model = rnn_classifier
# loss_fn = nn.CrossEntropyLoss()
# for X, Y in train_loader:
#     Y_preds = model(X)
#     print(Y_preds.type)
#     print(Y.type())
#     loss = loss_fn(Y_preds, Y)
#     # losses.append(loss.item())

#     # optimizer.zero_grad()
#     # loss.backward()
#     # optimizer.step()

In [988]:
# # Example of target with class indices
# loss = nn.CrossEntropyLoss()
# input = torch.randn(3, 5, requires_grad=True)
# print(input)
# target = torch.empty(3, dtype=torch.long).random_(5)
# print(target)
# output = loss(input, target)
# output.backward()
# print(output)

# # Example of target with class probabilities
# input = torch.randn(3, 5, requires_grad=True)
# target = torch.randn(3, 5).softmax(dim=1)
# output = loss(input, target)
# output.backward()

In [989]:
from torch.optim import Adam

epochs = 15
learning_rate = 1e-3

loss_fn = nn.CrossEntropyLoss()
rnn_classifier = RNNClassifier()
optimizer = Adam(rnn_classifier.parameters(), lr=learning_rate)

In [990]:
train_model(rnn_classifier, loss_fn, optimizer, train_loader, test_loader, epochs)

100%|██████████| 6/6 [00:00<00:00,  7.93it/s]


Train Loss : 2.660
Valid Loss : 2.636
Valid Acc  : 0.072


100%|██████████| 6/6 [00:00<00:00,  7.73it/s]


Train Loss : 2.604
Valid Loss : 2.606
Valid Acc  : 0.097


100%|██████████| 6/6 [00:00<00:00,  7.44it/s]


Train Loss : 2.570
Valid Loss : 2.588
Valid Acc  : 0.100


100%|██████████| 6/6 [00:00<00:00,  7.14it/s]


Train Loss : 2.542
Valid Loss : 2.576
Valid Acc  : 0.114


100%|██████████| 6/6 [00:00<00:00,  6.84it/s]


Train Loss : 2.519
Valid Loss : 2.564
Valid Acc  : 0.121


100%|██████████| 6/6 [00:00<00:00,  7.25it/s]


Train Loss : 2.489
Valid Loss : 2.551
Valid Acc  : 0.131


100%|██████████| 6/6 [00:00<00:00,  7.00it/s]


Train Loss : 2.472
Valid Loss : 2.542
Valid Acc  : 0.142


100%|██████████| 6/6 [00:00<00:00,  7.30it/s]


Train Loss : 2.458
Valid Loss : 2.539
Valid Acc  : 0.157


100%|██████████| 6/6 [00:00<00:00,  7.41it/s]


Train Loss : 2.440
Valid Loss : 2.536
Valid Acc  : 0.160


100%|██████████| 6/6 [00:00<00:00,  7.01it/s]


Train Loss : 2.426
Valid Loss : 2.535
Valid Acc  : 0.161


100%|██████████| 6/6 [00:00<00:00,  7.65it/s]


Train Loss : 2.414
Valid Loss : 2.534
Valid Acc  : 0.160


100%|██████████| 6/6 [00:00<00:00,  7.17it/s]


Train Loss : 2.404
Valid Loss : 2.533
Valid Acc  : 0.159


100%|██████████| 6/6 [00:00<00:00,  7.51it/s]


Train Loss : 2.393
Valid Loss : 2.533
Valid Acc  : 0.159


100%|██████████| 6/6 [00:00<00:00,  7.55it/s]


Train Loss : 2.380
Valid Loss : 2.533
Valid Acc  : 0.155


100%|██████████| 6/6 [00:00<00:00,  7.38it/s]

Train Loss : 2.374
Valid Loss : 2.533
Valid Acc  : 0.156





In [991]:
def MakePredictions(model, loader):
    Y_shuffled, Y_preds = [], []
    for X, Y in loader:
        preds = model(X)
        Y_preds.append(preds)
        Y_shuffled.append(Y)
    gc.collect()
    Y_preds, Y_shuffled = torch.cat(Y_preds), torch.cat(Y_shuffled)

    return Y_shuffled.detach().numpy(), F.softmax(Y_preds, dim=-1).argmax(dim=-1).detach().numpy()

Y_actual, Y_preds = MakePredictions(rnn_classifier, test_loader)

In [992]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Test Accuracy : {}".format(accuracy_score(Y_actual, Y_preds)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds))

Test Accuracy : 0.15636363636363637

Classification Report : 
               precision    recall  f1-score   support

ART & CULTURE       0.15      0.70      0.25       193
     BUSINESS       0.06      0.02      0.03        91
       COMEDY       0.32      0.69      0.43        71
        CRIME       0.00      0.00      0.00        67
    EDUCATION       0.13      0.15      0.14       109
ENTERTAINMENT       0.12      0.02      0.04        92
  ENVIRONMENT       0.08      0.01      0.02        85
        MEDIA       0.00      0.00      0.00        66
     POLITICS       0.00      0.00      0.00       112
     RELIGION       0.05      0.02      0.03        90
      SCIENCE       0.14      0.01      0.02        87
       SPORTS       0.00      0.00      0.00       108
         TECH       0.09      0.02      0.03       101
        WOMEN       0.09      0.05      0.06       103

     accuracy                           0.16      1375
    macro avg       0.09      0.12      0.08      1375
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
