In [224]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [197]:
vectorizer_choice="CountVectorizer"
model_choice="Dense"

In [198]:
df=pd.read_csv("bbc-news-data.csv", sep="\t")


In [199]:
df.head(10)

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
5,business,006.txt,Japan narrowly escapes recession,Japan's economy teetered on the brink of a te...
6,business,007.txt,Jobs growth still slow in the US,The US created fewer jobs than expected in Ja...
7,business,008.txt,India calls for fair trade rules,"India, which attends the G7 meeting of seven ..."
8,business,009.txt,Ethiopia's crop production up 24%,Ethiopia produced 14.27 million tonnes of cro...
9,business,010.txt,Court rejects $280bn tobacco case,A US government claim accusing the country's ...


In [200]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9\\s]", " ", text)
    text = re.sub(r"\\s+", " ", text).strip()
    return text

In [201]:
df["clean_text"] = df["content"].fillna("").apply(clean_text)

le = LabelEncoder()
df["label_enc"] = le.fit_transform(df["category"])

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"], df["label_enc"], test_size=0.2, random_state=42, stratify=df["label_enc"]
)

In [202]:
def CountVectorizer_function():
    vectorizer = CountVectorizer(max_features=2000, ngram_range=(1,2))
    return (vectorizer)
def tfidf_function():
    vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
    return (vectorizer)

In [203]:
if vectorizer_choice=="CountVectorizer":
    vectorizer=CountVectorizer_function()
if vectorizer_choice=="Tfidf":
    vectorizer=tfidf_function()

In [204]:

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print(f"ðŸ§® Shape train: {X_train_vec.shape}, test: {X_test_vec.shape}")

ðŸ§® Shape train: (1780, 2000), test: (445, 2000)


In [205]:
def LogisticRegression_function():
    model = LogisticRegression(max_iter=1000)
    return(model)
def mlp_function():
    model = MLPClassifier(hidden_layer_sizes=(100,100), activation='relu', max_iter=300, random_state=42)
    return (model)
def Dense_function():
    model = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', max_iter=300, random_state=42)
    return (model)
def Svm_function():
    model = LinearSVC(max_iter=10000)
    return (model)
    

In [206]:
if model_choice== "LogisticRegression":
    model=LogisticRegression_function()
if model_choice=="mlp":
    model= mlp_function()
if model_choice=="Dense":
    model=Dense_function()
if model=="svm":
    model=Svm_function()
    

In [207]:
model.fit(X_train_vec, y_train)

In [208]:
y_pred = model.predict(X_test_vec)

In [209]:
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average="macro")
rec = recall_score(y_test, y_pred, average="macro")
f1 = f1_score(y_test, y_pred, average="macro")

In [210]:
print("\n Result")
print(f"Accuracy       : {acc:.4f}")
print(f"Precision (avg): {prec:.4f}")
print(f"Recall (avg)   : {rec:.4f}")
print(f"F1-score (avg) : {f1:.4f}")

print(classification_report(y_test, y_pred, target_names=le.classes_))





 Result
Accuracy       : 0.9685
Precision (avg): 0.9689
Recall (avg)   : 0.9679
F1-score (avg) : 0.9681
               precision    recall  f1-score   support

     business       0.95      0.97      0.96       102
entertainment       0.95      0.99      0.97        77
     politics       0.97      0.92      0.94        84
        sport       0.98      0.99      0.99       102
         tech       0.99      0.97      0.98        80

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445



In [211]:
example = "The European Central Bank (ECB) announced today a new set of measures aimed at curbing inflation in the Eurozone, which remains above the 2% target despite several successive interest rate hikes. According to ECB President Christine Lagarde, price pressures persist due to volatility in energy markets and rising raw material costs. Specifically, the ECB raised its key interest rate by 25 basis points, bringing it to 3%, a decision widely anticipated by financial markets. This measure is expected to increase the euroâ€™s attractiveness against the dollar and slow credit growth, while maintaining a delicate balance to avoid hampering economic activity. Analysts emphasize that the resilience of European banks to rising rates will be crucial. Financial institutions, facing a higher funding cost environment, may adjust lending to businesses and households, potentially impacting consumption and investment. Additionally, European stock markets reacted positively to the announcement, with a 1.2% average gain across major indices. Investors appear to be betting on the ECBâ€™s cautious approach, which could manage inflation without triggering a recession. Finally, the ECB indicated it will continue to closely monitor key economic indicators, such as unemployment, GDP growth, and wage pressures, to adjust its monetary policy accordingly. The coming months will be decisive in determining whether these measures are sufficient to stabilize prices without slowing economic recovery in the region."

In [212]:
vec = vectorizer.transform([clean_text(example)])
pred_label = le.inverse_transform(model.predict(vec))[0]
print(f"ðŸ”® '{example}' â†’ {pred_label}")


ðŸ”® 'The European Central Bank (ECB) announced today a new set of measures aimed at curbing inflation in the Eurozone, which remains above the 2% target despite several successive interest rate hikes. According to ECB President Christine Lagarde, price pressures persist due to volatility in energy markets and rising raw material costs. Specifically, the ECB raised its key interest rate by 25 basis points, bringing it to 3%, a decision widely anticipated by financial markets. This measure is expected to increase the euroâ€™s attractiveness against the dollar and slow credit growth, while maintaining a delicate balance to avoid hampering economic activity. Analysts emphasize that the resilience of European banks to rising rates will be crucial. Financial institutions, facing a higher funding cost environment, may adjust lending to businesses and households, potentially impacting consumption and investment. Additionally, European stock markets reacted positively to the announcement, with

In [215]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from gensim.models import Word2Vec

In [216]:
sentences = [text.split() for text in X_train]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
vocab = w2v_model.wv.key_to_index


embedding_dim = 100
vocab_size = len(vocab) + 1
embedding_matrix = torch.zeros((vocab_size, embedding_dim))
for word, i in vocab.items():
    embedding_matrix[i] = torch.tensor(w2v_model.wv[word])


In [222]:
def text_to_seq(text, vocab, max_len=200):
    seq = [vocab.get(w, 0) for w in text.split()]
    if len(seq) < max_len:
        seq += [0]*(max_len - len(seq))
    else:
        seq = seq[:max_len]
    return seq

class NewsDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_len=200):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        seq = text_to_seq(self.texts.iloc[idx], self.vocab, self.max_len)
        return torch.tensor(seq, dtype=torch.long), torch.tensor(self.labels.iloc[idx], dtype=torch.long)

In [218]:
train_dataset = NewsDataset(X_train, y_train, vocab)
test_dataset = NewsDataset(X_test, y_test, vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [219]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_size, n_layers=1, bidirectional=True):
        super().__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim*2 if bidirectional else hidden_dim, output_size)
        
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # on prend le dernier hidden state
        if self.lstm.bidirectional:
            out = torch.cat((lstm_out[:, -1, :self.lstm.hidden_size], lstm_out[:, 0, self.lstm.hidden_size:]), dim=1)
        else:
            out = lstm_out[:, -1, :]
        out = self.fc(out)
        return out

In [220]:
num_classes = len(le.classes_)
hidden_dim = 128
model = BiLSTMClassifier(embedding_matrix, hidden_dim, num_classes)

In [221]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for seqs, labels in train_loader:
        seqs, labels = seqs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(seqs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/5, Loss: 1.2430
Epoch 2/5, Loss: 0.8495
Epoch 3/5, Loss: 0.6786
Epoch 4/5, Loss: 0.5081
Epoch 5/5, Loss: 0.4236


In [223]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for seqs, labels in test_loader:
        seqs, labels = seqs.to(device), labels.to(device)
        outputs = model(seqs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
print(f"Test Accuracy: {correct/total:.4f}")

Test Accuracy: 0.8629
