In [None]:
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score
import time

# Preprocessing: every MNIST digit is expanded to 3 channels and resized to
# 224x224 so the same batches can also be fed to VGG16, AlexNet, etc.
preprocessing_steps = [
    transforms.Grayscale(num_output_channels=3),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
]
transform = transforms.Compose(preprocessing_steps)

# MNIST train / test splits, stored under ./data (download requested).
mnist_kwargs = dict(root='./data', transform=transform, download=True)
train_dataset = datasets.MNIST(train=True, **mnist_kwargs)
test_dataset = datasets.MNIST(train=False, **mnist_kwargs)

# Batches of 64 images; only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [None]:
# ------------------------- Partie 1 : CNN Classifier ----------------------------
# ********** 1.1 CNN Custom **************

class CNN(nn.Module):
    """Small two-stage convolutional classifier.

    Two ``Conv2d -> ReLU -> MaxPool2d(2)`` stages (32 then 64 feature maps)
    feed a ``Flatten -> Linear(128) -> ReLU -> Dropout(0.5) -> Linear`` head.
    All layers live in ``self.net``.

    Args:
        in_channels: Number of channels of the input images.
        num_classes: Number of output logits.
        img_size: Height and width of the (square) input images. It fixes
            the input width of the first fully connected layer.

    Raises:
        ValueError: If ``img_size`` is too small to survive the two pooling
            stages.

    With the defaults (3 channels, 10 classes, 224x224 images) the first
    dense layer has ``64 * 56 * 56`` inputs and every layer keeps the same
    index inside ``self.net`` as before the parameters were introduced.
    """

    def __init__(self, in_channels=3, num_classes=10, img_size=224):
        super().__init__()
        # The padded 3x3 convolutions keep the spatial size; each
        # MaxPool2d(2) halves it, rounding down.
        pooled_size = img_size // 2 // 2
        if pooled_size < 1:
            raise ValueError(
                f"img_size={img_size} is too small for two 2x2 pooling stages"
            )

        self.net = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Flatten(),
            nn.Linear(64 * pooled_size * pooled_size, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        """Return the raw class scores produced by ``self.net`` for batch ``x``."""
        return self.net(x)

# Custom CNN on the selected device, plus the module-level loss and Adam
# optimizer that `train_model` reads as globals.
model_cnn = CNN()
model_cnn = model_cnn.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model_cnn.parameters(), lr=1e-3)


In [None]:
# ********** 1.2 Entraînement CNN ************

def train_model(model, name, epochs=5, optimizer=None, criterion=None, loader=None):
    """Train ``model`` and print the last-batch loss after every epoch.

    Args:
        model: ``nn.Module`` to optimise; it is switched to training mode.
        name: Label used as prefix of the printed progress lines.
        epochs: Number of passes over ``loader``.
        optimizer: Optimizer to step. Defaults to the notebook-level
            ``optimizer`` variable.
        criterion: Loss, called as ``criterion(outputs, labels)``. Defaults
            to the notebook-level ``criterion`` variable.
        loader: Iterable of ``(images, labels)`` batches. Defaults to the
            notebook-level ``train_loader``.

    Raises:
        ValueError: If the optimizer holds none of the model's trainable
            parameters, or if ``loader`` yields no batch.
    """
    # The parameters shadow the notebook globals of the same name, so the
    # fall-back values are looked up explicitly in the module namespace.
    notebook_scope = globals()
    if optimizer is None:
        optimizer = notebook_scope["optimizer"]
    if criterion is None:
        criterion = notebook_scope["criterion"]
    if loader is None:
        loader = train_loader

    # The notebook rebinds the global `optimizer` for each model. Stepping an
    # optimizer built for another model would leave `model` untouched without
    # any error, so refuse to start in that case.
    trainable = [p for p in model.parameters() if p.requires_grad]
    owned = {id(p) for group in optimizer.param_groups for p in group["params"]}
    if not any(id(p) in owned for p in trainable):
        raise ValueError(
            f"{name}: the optimizer does not hold any trainable parameter of "
            "this model; build an optimizer from model.parameters() and pass "
            "it via `optimizer=`"
        )

    # Batches are moved to wherever the model's parameters live.
    target_device = trainable[0].device

    start = time.perf_counter()
    model.train()
    for epoch in range(epochs):
        loss = None
        for images, labels in loader:
            images, labels = images.to(target_device), labels.to(target_device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if loss is None:
            raise ValueError(f"{name}: the training loader produced no batch")
        # Loss of the last batch of the epoch, not an epoch average.
        print(f"{name} Epoch {epoch+1}, Loss: {loss.item():.4f}")
    print(f"{name} Training Time: {time.perf_counter() - start:.2f}s")

# Train the custom CNN; progress lines are prefixed with "CNN".
train_model(model=model_cnn, name="CNN")


In [None]:
# ********** 1.3 Évaluation CNN ***************

def evaluate(model, name, loader=None):
    """Print accuracy and weighted F1 of ``model`` over a labelled loader.

    Args:
        model: Classifier returning one row of class scores per sample. It
            is switched to evaluation mode and left in that mode.
        name: Label used as prefix of the printed metrics line.
        loader: Iterable of ``(images, labels)`` batches. Defaults to the
            notebook-level ``test_loader``.
    """
    if loader is None:
        loader = test_loader

    # Feed the batches to wherever the model lives; fall back to the
    # notebook-level `device` for a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for images, labels in loader:
            outputs = model(images.to(target_device))
            # Collect plain Python ints so scikit-learn receives ordinary
            # label lists rather than lists of 0-d tensors.
            y_pred.extend(outputs.argmax(dim=1).tolist())
            y_true.extend(torch.as_tensor(labels).tolist())
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"{name} - Accuracy: {acc:.4f}, F1: {f1:.4f}")

# Report test-set metrics of the custom CNN, prefixed with "CNN".
evaluate(model=model_cnn, name="CNN")


In [None]:
# ********* 3. Fine-tuning VGG16 et AlexNet ************

def fine_tune_model(model_name, num_classes=10, epochs=3, lr=0.0001):
    """Fine-tune a pretrained torchvision classifier, then evaluate it.

    The model is built from ``torchvision.models.<model_name>`` with
    pretrained weights, the last layer of its ``classifier`` is replaced by
    a fresh ``nn.Linear`` with ``num_classes`` outputs, and the whole network
    is trained with Adam on the notebook-level ``train_loader`` before being
    passed to ``evaluate``.

    Args:
        model_name: Name of a builder function in ``torchvision.models``
            (the notebook uses ``"vgg16"`` and ``"alexnet"``).
        num_classes: Size of the replacement output layer.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.

    Raises:
        ValueError: If the model has no ``classifier`` Sequential ending in
            ``nn.Linear``, or if ``train_loader`` yields no batch.
    """
    builder = getattr(models, model_name)
    try:
        # `weights=` replaces the deprecated `pretrained=True` flag
        # (torchvision >= 0.13). For VGG16 and AlexNet, "IMAGENET1K_V1" is
        # the checkpoint the legacy flag loads.
        model = builder(weights="IMAGENET1K_V1")
    except TypeError:
        # Older torchvision releases do not know the `weights` keyword.
        model = builder(pretrained=True)

    head = getattr(model, "classifier", None)
    if not (
        isinstance(head, nn.Sequential)
        and len(head) > 0
        and isinstance(head[-1], nn.Linear)
    ):
        raise ValueError(
            f"{model_name} has no `classifier` Sequential ending in nn.Linear; "
            "its output layer cannot be replaced here"
        )
    head[-1] = nn.Linear(head[-1].in_features, num_classes)
    model = model.to(device)

    # The optimizer receives every parameter of the model, not only the new
    # output layer.
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Training
    model.train()
    for epoch in range(epochs):
        loss = None
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if loss is None:
            raise ValueError(f"{model_name}: the training loader produced no batch")
        # Loss of the last batch of the epoch, not an epoch average.
        print(f"{model_name} Epoch {epoch+1}, Loss: {loss.item():.4f}")

    evaluate(model, model_name)

# Fine-tune both pretrained architectures: VGG16 first, then AlexNet.
for pretrained_arch in ("vgg16", "alexnet"):
    fine_tune_model(pretrained_arch)


In [None]:
# -----------------------🔹 Partie 2 : Vision Transformer (ViT) from Scratch ---------------------------

# *************** 1. Implémentation ViT (extrait du tutoriel Medium) *****************

class PatchEmbedding(nn.Module):
    """Turn an image batch into a token sequence.

    A convolution whose kernel size equals its stride embeds each
    ``patch_size`` x ``patch_size`` patch, a learnable class token is
    prepended, and a learnable positional embedding is added to every token.
    """

    def __init__(self, in_channels=3, patch_size=16, emb_size=768, img_size=224):
        super().__init__()
        num_patches = (img_size // patch_size) ** 2
        # kernel == stride: one convolution window per patch, no overlap.
        self.proj = nn.Conv2d(
            in_channels,
            emb_size,
            kernel_size=patch_size,
            stride=patch_size,
        )
        self.cls_token = nn.Parameter(torch.randn(1, 1, emb_size))
        # One position per patch plus one for the class token.
        self.pos_embed = nn.Parameter(torch.randn(num_patches + 1, emb_size))

    def forward(self, x):
        """Return tokens of shape ``(batch, num_patches + 1, emb_size)``."""
        batch_size = x.size(0)
        # (B, E, H', W') -> (B, E, H'*W') -> (B, H'*W', E)
        patches = self.proj(x).flatten(start_dim=2).transpose(1, 2)
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        tokens = torch.cat([cls_tokens, patches], dim=1)
        return tokens + self.pos_embed

class TransformerEncoderBlock(nn.Module):
    """Post-norm transformer encoder block for ``(batch, tokens, emb_size)`` input.

    Self-attention and a two-layer GELU MLP are each wrapped in a residual
    connection followed by ``LayerNorm``.

    Args:
        emb_size: Embedding width of every token.
        heads: Number of attention heads.
        dropout: Dropout probability, applied to the attention weights and to
            the MLP output. Pass ``0.0`` to disable it.
        forward_expansion: Width multiplier of the hidden MLP layer.
    """

    def __init__(self, emb_size=768, heads=8, dropout=0.1, forward_expansion=4):
        super().__init__()
        self.ln1 = nn.LayerNorm(emb_size)
        # batch_first=True is required: the block receives (batch, tokens,
        # emb) tensors. With the default (tokens, batch, emb) layout the
        # attention would run across the samples of the batch instead of
        # across the tokens of each sample.
        self.attn = nn.MultiheadAttention(
            embed_dim=emb_size,
            num_heads=heads,
            dropout=dropout,
            batch_first=True,
        )
        self.ln2 = nn.LayerNorm(emb_size)
        # The Dropout is appended last so the two Linear layers keep their
        # indices (0 and 2) inside the Sequential.
        self.mlp = nn.Sequential(
            nn.Linear(emb_size, forward_expansion * emb_size),
            nn.GELU(),
            nn.Linear(forward_expansion * emb_size, emb_size),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        """Return the transformed tokens, same shape as ``x``."""
        # The attention weights are never used, so they are not requested.
        attn_out, _ = self.attn(x, x, x, need_weights=False)
        x = self.ln1(x + attn_out)
        x = self.ln2(x + self.mlp(x))
        return x

class ViT(nn.Module):
    """Vision Transformer classifier.

    Images go through ``PatchEmbedding``, then through ``depth``
    ``TransformerEncoderBlock`` layers; the token at position 0 of the
    result is projected to ``num_classes`` scores by a linear head.
    """

    def __init__(self, img_size=224, patch_size=16, emb_size=768, num_classes=10, depth=6, heads=8):
        super().__init__()
        self.patch_embed = PatchEmbedding(
            in_channels=3,
            patch_size=patch_size,
            emb_size=emb_size,
            img_size=img_size,
        )
        encoder_blocks = []
        for _ in range(depth):
            encoder_blocks.append(TransformerEncoderBlock(emb_size, heads))
        self.encoder = nn.Sequential(*encoder_blocks)
        self.cls_head = nn.Linear(emb_size, num_classes)

    def forward(self, x):
        """Return the class scores computed from the first token of each sequence."""
        tokens = self.encoder(self.patch_embed(x))
        first_token = tokens[:, 0]
        return self.cls_head(first_token)

# Build the ViT on the selected device. Note: this rebinds the module-level
# `optimizer` (previously created for the CNN) to the ViT's parameters.
model_vit = ViT()
model_vit = model_vit.to(device)
optimizer = optim.Adam(params=model_vit.parameters(), lr=3e-4)



In [None]:
# 2. Entraînement & Évaluation ViT

# Train the ViT, then report its test-set metrics; both outputs are
# prefixed with "ViT".
train_model(model=model_vit, name="ViT")
evaluate(model=model_vit, name="ViT")
