In [None]:
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import json

# Load the "ag_news" dataset and keep only the first 4000 training rows and
# the first 1000 test rows.
dataset = load_dataset("ag_news")

# Build each subset once and read whole columns from it, instead of calling
# select() twice per split and walking every row once per field.
train_subset = dataset['train'].select(range(4000))
test_subset = dataset['test'].select(range(1000))
X_train = list(train_subset['text'])
y_train = list(train_subset['label'])
X_test = list(test_subset['text'])
y_test = list(test_subset['label'])

# TF-IDF features with the vocabulary capped at 2000 terms. The vectorizer is
# fitted on the training texts only and then reused, unchanged, on the test
# texts.
vectorizer = TfidfVectorizer(max_features=2000)
X_train_vec = vectorizer.fit_transform(X_train).astype(np.float32)
X_test_vec = vectorizer.transform(X_test).astype(np.float32)

# Encode labels using the classes observed in the training subset.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

class NewsDataset(Dataset):
    """In-memory dataset pairing feature rows with integer class labels.

    Args:
        X: Feature matrix. Either an object exposing ``toarray()`` (for
            example a SciPy sparse matrix) or anything ``np.asarray`` can
            convert to a dense array. Stored as a float32 tensor.
        y: One label per row of ``X``. Stored as an int64 tensor.

    Raises:
        ValueError: If ``X`` and ``y`` hold a different number of samples.
    """

    def __init__(self, X, y):
        # Sparse matrices must be densified; dense inputs are used as-is.
        dense = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
        self.X = torch.tensor(dense, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
        # Fail early: __len__ follows y, so a mismatch would otherwise only
        # surface later as an index error or as silently ignored rows.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X has {len(self.X)} samples but y has {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap the vectorized splits as tensor-backed datasets.
train_ds = NewsDataset(X=X_train_vec, y=y_train_enc)
test_ds = NewsDataset(X=X_test_vec, y=y_test_enc)

# Only the training batches are shuffled; the test loader keeps dataset order.
train_loader = DataLoader(dataset=train_ds, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_ds, batch_size=128, shuffle=False)

class NewsClassifier(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Linear.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of output logits per sample.
        hidden_dim: Width of the hidden layer. Defaults to 128.
    """

    def __init__(self, input_dim, num_classes, hidden_dim=128):
        super().__init__()
        # Attribute names are kept stable so saved state-dict keys still match.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return the raw (unnormalized) class logits for ``x``."""
        return self.fc2(self.relu(self.fc1(x)))

# Size the network from the fitted feature matrix and the label encoder.
num_classes = len(le.classes_)
input_dim = X_train_vec.shape[1]

# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NewsClassifier(input_dim, num_classes).to(device)

# Cross-entropy over the logits, optimized with Adam at a 1e-3 learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Train for 5 epochs and print the mean per-sample training loss of each one.
for epoch in range(5):
    model.train()
    total_loss = 0.0
    n_seen = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        # The criterion yields a per-batch mean. Weight it by the batch size
        # so a smaller final batch does not count as much as a full one in
        # the epoch average.
        n_batch = y_batch.size(0)
        total_loss += loss.item() * n_batch
        n_seen += n_batch
    print(f"Epoch {epoch+1} - Loss: {total_loss/n_seen:.4f}")

# Measure accuracy on the test loader with gradient tracking disabled.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        outputs = model(X_batch)
        # The predicted class is the index of the largest logit in each row.
        preds = outputs.argmax(dim=1)
        correct += int((preds == y_batch).sum())
        total += len(y_batch)
print(f"Accuracy sur le test : {correct/total:.2%}")

import torch.onnx

# Export the model to ONNX (opset 13) with axis 0 of both the input and the
# output declared dynamic under the name 'batch_size'.
onnx_path = "news_classifier.onnx"
# Example input handed to the exporter: one random row on the model's device.
dummy_input = torch.randn(1, input_dim, device=device)
export_options = dict(
    input_names=['input'],
    output_names=['output'],
    dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}},
    opset_version=13,
)
torch.onnx.export(model, dummy_input, onnx_path, **export_options)
print(f"Modèle exporté en ONNX : {onnx_path}")

# Save the fitted vectorizer's feature names and IDF weights as JSON, in the
# order the vectorizer reports them.
vocab = vectorizer.get_feature_names_out()
idf = vectorizer.idf_
vectorizer_state = {
    "vocabulary": vocab.tolist(),
    "idf": idf.tolist(),
}
with open("news_vectorizer.json", "w") as f:
    json.dump(vectorizer_state, f)
print("Vectorizer exporté en news_vectorizer.json")


Epoch 1 - Loss: 1.2333
Epoch 2 - Loss: 0.6748
Epoch 3 - Loss: 0.3878
Epoch 4 - Loss: 0.2862
Epoch 5 - Loss: 0.2276
Accuracy sur le test : 83.60%
Modèle exporté en ONNX : news_classifier.onnx
Vectorizer exporté en news_vectorizer.json
