<h1> Imports

In [627]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F 
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import time
import re
import string
import nltk
import os
import random

In [628]:
# !pip install torchtext==0.10.0 --user

In [629]:
# !pip install torch==1.11.0 torchtext==0.12.0 --user

In [630]:
import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset

<h1>Import Data

In [631]:
data = pd.read_csv('../Data/news-article-categories-clean.csv')

data["cat"] = pd.Categorical(data.category)
data['cat_code'] = data.cat.cat.codes

data_sans_na = data.dropna()

<h1>Populate Vocabulary

In [632]:
tokenizer = get_tokenizer("basic_english")

data['tok_body'] = data.body.apply(lambda x: tokenizer(x))

vocab = build_vocab_from_iterator(data['tok_body'], min_freq=1, specials=["<UNK>"])

vocab.set_default_index(vocab["<UNK>"])

len(vocab)

66936

<h1>Target Classes

In [633]:
target_classes = ["ART & CULTURE","BUSINESS","COMEDY","CRIME","EDUCATION","ENTERTAINMENT","ENVIRONMENT","MEDIA","POLITICS","RELIGION","SCIENCE","SPORTS","TECH","WOMEN"]

<h1>Train / Test Split

In [634]:
X = data_sans_na.drop(["category", "cat_code", "cat"], axis =1)
# y = data_sans_na["category"]
y = data_sans_na["cat_code"]

In [635]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

In [636]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

<h1> Vectorize Batch

In [654]:
max_words = 25

def vectorize_batch(batch):
    Y = []
    X = []
    print(test)
    for row in batch:
        Y.append(row["label"])
        text = row["text"]
        text_tok = [vocab(tokenizer(text)) for text in text]
        text = [tokens+([0]* (max_words-len(tokens))) if len(tokens)<max_words else tokens[:max_words] for tokens in text_tok] ## Bringing all samples to max_words length.
        X.append(text)
    return torch.tensor(X,dtype=torch.int32), torch.tensor(Y)

<h1> Load Data

In [655]:
# Définir une classe pour transformer un Dataframe en Dataset
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        text = row['body']
        # label = row['category']
        label = row['cat_code']
        return {'label': label, 'text': text}
        #return text, label

# Créer une instance de la classe Dataset personnalisée
train_dataset = CustomDataset(train)
test_dataset = CustomDataset(test)

# train_dataset, test_dataset  = to_map_style_dataset(train_dataset), to_map_style_dataset(test_dataset)

In [656]:
item1 = train_dataset[0]
item2 = train_dataset[2]

In [657]:
list = [item1,item2]

In [658]:
list[0]

{'label': 11,
 'text': 'spain scooped first winter olympics medal since thursday regino hern ndez mart bronze men snowboard cross event enough send eurosport commentator jos manuel hern ndez tallada mini meltdown ufffmomentazo eh jmtallada reginoherma eurosport dmax pic twitter com yo wyoqhqg sportscaster contain excitement countryman historic performance video eurosport colleague fernando ruiz shared online see race reached thrilling conclusion hern ndez tallada jumped punched air inside studio watching livestream game video since gone viral prior hern ndez mart medal winning run spain last ascended podium winter olympics year ago albertville france blanca fern ndez ochoa secured third spot woman alpine skiing event'}

In [659]:
train_loader = DataLoader(train_dataset, batch_size=1024, collate_fn=vectorize_batch, shuffle=True)
test_loader  = DataLoader(test_dataset , batch_size=1024, collate_fn=vectorize_batch)

In [660]:
for i, (labels, data) in enumerate(train_loader):
    # afficher les dimensions des données et des étiquettes
    print(f"Batch {i}: data shape = {data.shape}, labels shape = {labels.shape}")
    
    # afficher quelques exemples de données et d'étiquettes
    print(f"Data example:\n{data[:2]}\nLabels example:\n{labels[:2]}")

                                                  title  \
4849                  Too Mormon, Or Not Mormon Enough?   
5307  Extensive Coral Reef Found Hidden At The Mouth...   
2244  I Am An Eagle Scout. I Would Join Girl Scouts ...   
1475           5 Times to Pick Up the Telephone Instead   
2498  This Bronx Educator Reminds Us Why Good Teache...   
...                                                 ...   
5425  Olympic Doping Whistleblower Fights Back Again...   
6474  Woman Claims Pastor Abused Her As A Teen. He C...   
6553             30 Important Things I've Learned By 30   
5298    Dolphins Chatter More When Solving Tricky Tasks   
1631  Trevor Noah Mercilessly Mocks 'Shady' Sean Han...   

                                                   body  cat_code  
4849  lot mormon tell mormon enough criticizing chur...         9  
5307  researcher discovered massive coral reef muddy...        10  
2244  boy scout america recently announced opening d...         4  
1475  version artic

ValueError: expected sequence of length 3231 at dim 1 (got 1566)

<h1> RNN

In [643]:
#Paramètres du modèle
EMBED_LEN = 50
HIDDEN_SIZE = 50
OUTPUT_SIZE = 2
N_LAYERS = 1

In [644]:
#Classe pour le modèle RNN
class RNNClassifier(nn.Module):
    def __init__(self):
        super(RNNClassifier, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=len(vocab), embedding_dim=EMBED_LEN)
        self.rnn = nn.RNN(input_size=EMBED_LEN, hidden_size=HIDDEN_SIZE, num_layers=N_LAYERS, batch_first=True)
        self.linear = nn.Linear(HIDDEN_SIZE, len(target_classes))

    def forward(self, X_batch):
        # h0 = torch.zeros(N_LAYERS, x.size(0), HIDDEN_SIZE).to(x.device)
        # out, hn = self.rnn(x, h0)
        # out = self.fc(out[:, -1, :])
        # return out
        embeddings = self.embedding_layer(X_batch)
        output, hidden = self.rnn(embeddings, torch.randn(N_LAYERS, len(X_batch), HIDDEN_SIZE))
        return self.linear(output[:,-1])

In [645]:
rnn_classifier = RNNClassifier()

rnn_classifier

RNNClassifier(
  (embedding_layer): Embedding(66936, 50)
  (rnn): RNN(50, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=14, bias=True)
)

In [646]:
for layer in rnn_classifier.children():
    print("Layer : {}".format(layer))
    print("Parameters : ")
    for param in layer.parameters():
        print(param.shape)
    print()

Layer : Embedding(66936, 50)
Parameters : 
torch.Size([66936, 50])

Layer : RNN(50, 50, batch_first=True)
Parameters : 
torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50])

Layer : Linear(in_features=50, out_features=14, bias=True)
Parameters : 
torch.Size([14, 50])
torch.Size([14])



In [647]:
out = rnn_classifier(torch.randint(0, len(vocab), (1024, max_words)))

out.shape

torch.Size([1024, 14])

In [648]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import gc

def CalcValLossAndAccuracy(model, loss_fn, val_loader):
    with torch.no_grad():
        Y_shuffled, Y_preds, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_shuffled.append(Y)
            Y_preds.append(preds.argmax(dim=-1))

        Y_shuffled = torch.cat(Y_shuffled)
        Y_preds = torch.cat(Y_preds)

        print("Valid Loss : {:.3f}".format(torch.tensor(losses).mean()))
        print("Valid Acc  : {:.3f}".format(accuracy_score(Y_shuffled.detach().numpy(), Y_preds.detach().numpy())))


def TrainModel(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
    for i in range(1, epochs+1):
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds = model(X)

            loss = loss_fn(Y_preds, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        CalcValLossAndAccuracy(model, loss_fn, val_loader)

In [649]:
from torch.optim import Adam

epochs = 15
learning_rate = 1e-3

loss_fn = nn.CrossEntropyLoss()
rnn_classifier = RNNClassifier()
optimizer = Adam(rnn_classifier.parameters(), lr=learning_rate)

TrainModel(rnn_classifier, loss_fn, optimizer, train_loader, test_loader, epochs)

  0%|          | 0/6 [00:38<?, ?it/s]


ValueError: expected sequence of length 662 at dim 1 (got 5836)