In [73]:
# Import des bibliothèques nécessaires
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer



import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.base import BaseEstimator, TransformerMixin

# Classe personnalisée pour appliquer LabelEncoder à plusieurs colonnes
class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoders = {}
        
    def fit(self, X, y=None):
        for column in X.columns:
            le = LabelEncoder()
            le.fit(X[column].astype(str))
            self.encoders[column] = le
        return self
    
    def transform(self, X):
        X_transformed = X.copy()
        for column in X.columns:
            X_transformed[column] = self.encoders[column].transform(X[column].astype(str))
        return X_transformed

In [74]:
# Chargement et prétraitement:

# Chargement des données nettoyées
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')



# Sélection des features
numeric_features = ['max_power', 'weight_min', 'weight_max', 'urb_cons', 
                    'exturb_cons', 'overall_cons', 'co', 'hc', 'nox', 'hcnox', 'ptcl']
categorical_features = ['brand', 'car_class', 'range', 'fuel_type', 'hybrid', 'grbx_type_ratios']

# Création des transformers
# remplace les valeurs manquantes par la moyenne puis normalise les données
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# remplace les valeurs manquantes par "missing" et convertit les catégories en nombres
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Création du preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [75]:
# Jeux de données

X_train = train_df[numeric_features + categorical_features]
y_train = train_df['co2']  # Assurez-vous que 'co2' est le bon nom de votre cible
X_test = test_df[numeric_features + categorical_features]

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

cat_columns = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
feature_names = numeric_features + list(cat_columns)

X_train = pd.DataFrame(X_train, columns=feature_names)
X_test = pd.DataFrame(X_test, columns=feature_names)

X_train_nn, X_val, y_train_nn, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

In [None]:

NUM_EPOCHS = 200
BATCH_SIZE = 32
LR = 1e-3
WEIGHT_DECAY = 1e-6

In [77]:
"""
Réseau de neurones avec PyTorch pour la régression:
"""

# Vérifier si CUDA est disponible
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Utilisation de: {device}")

# Créer un dataset PyTorch personnalisé
class CO2Dataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X.values, dtype=torch.float32)
        
        if y is not None:
            self.y = torch.tensor(y.values, dtype=torch.float32).reshape(-1, 1)
            self.has_target = True
        else:
            self.has_target = False
            
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        if self.has_target:
            return self.X[idx], self.y[idx]
        else:
            return self.X[idx]

# Définition du modèle de réseau de neurones
class CO2RegressionNet(nn.Module):
    def __init__(self, input_dim):
        super(CO2RegressionNet, self).__init__()
        
        # Architecture du réseau
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),
            
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.2),
            
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            
            nn.Linear(32, 1)
        )
        
    def forward(self, x):
        return self.model(x)


# Création des datasets et dataloaders
train_dataset = CO2Dataset(X_train_nn, y_train_nn)
val_dataset = CO2Dataset(X_val, y_val)
test_dataset = CO2Dataset(X_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)



Utilisation de: cpu


In [78]:
# Initialisation du modèle, fonction de perte et optimiseur
input_dim = X_train.shape[1]
model = CO2RegressionNet(input_dim).to(device)
criterion = nn.MSELoss()  # Mean Squared Error pour la régression
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)

In [79]:
# Fonction d'entraînement
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=100, early_stopping_patience=15):
    train_losses = []
    val_losses = []
    
    best_val_loss = float('inf')
    best_model_state = None
    no_improve_epochs = 0
    
    for epoch in range(num_epochs):
        # Mode entraînement
        model.train()
        running_loss = 0.0
        
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            # Remise à zéro des gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            
            # Backward pass et optimisation
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * X_batch.size(0)
        
        epoch_train_loss = running_loss / len(train_loader.dataset)
        train_losses.append(epoch_train_loss)
        
        # Mode évaluation
        model.eval()
        running_loss = 0.0
        
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                
                running_loss += loss.item() * X_batch.size(0)
        
        epoch_val_loss = running_loss / len(val_loader.dataset)
        val_losses.append(epoch_val_loss)
        
        # # Ajustement du learning rate
        # scheduler.step(epoch_val_loss)
        
        # Affichage des métriques
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}')
        
        # Sauvegarde du meilleur modèle
        if epoch_val_loss < best_val_loss:
            best_val_loss = epoch_val_loss
            best_model_state = model.state_dict().copy()
    
    # Chargement du meilleur modèle
    model.load_state_dict(best_model_state)
    
    return model, train_losses, val_losses


In [None]:
# Entraînement du modèle
trained_model, train_losses, val_losses = train_model(
    model, train_loader, val_loader, criterion, optimizer, num_epochs=NUM_EPOCHS, early_stopping_patience=15
)

# Graphique des pertes
plt.figure(figsize=(10, 6))
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Courbes d\'apprentissage')
plt.legend()
plt.grid(True)
plt.savefig('nn_training_loss.png')
plt.close()

# Évaluation sur l'ensemble de validation
model.eval()
val_predictions = []
val_targets = []

with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        val_predictions.extend(outputs.cpu().numpy())
        val_targets.extend(y_batch.numpy())

val_predictions = np.array(val_predictions).flatten()
val_targets = np.array(val_targets).flatten()

val_mae = mean_absolute_error(val_targets, val_predictions)
val_r2 = r2_score(val_targets, val_predictions)

print(f"Validation MAE: {val_mae:.4f}")
print(f"Validation R²: {val_r2:.4f}")











# Prédictions sur l'ensemble de test
model.eval()
test_predictions = []

with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        test_predictions.extend(outputs.cpu().numpy())

Y_test_nn = np.array(test_predictions).flatten()

# Sauvegarde des résultats
resultat = [(test_df['id'].iloc[i], Y_test_nn[i]) for i in range(len(Y_test_nn))]
with open('resultat_pytorch.csv', 'w') as f:
    f.write("id,co2\n")
    for id, co2 in resultat:
        f.write(f"{id},{int(round(co2))}\n")


Epoch 1/100, Train Loss: 2236.1250, Val Loss: 10.3136
Epoch 2/100, Train Loss: 65.0220, Val Loss: 12.8343
Epoch 3/100, Train Loss: 66.4627, Val Loss: 5.2645
Epoch 4/100, Train Loss: 59.7013, Val Loss: 15.3340
Epoch 5/100, Train Loss: 55.0323, Val Loss: 2249.5371
Epoch 6/100, Train Loss: 57.3860, Val Loss: 33.4772
Epoch 7/100, Train Loss: 65.2571, Val Loss: 11.9577
Epoch 8/100, Train Loss: 67.3008, Val Loss: 13.8311
Epoch 9/100, Train Loss: 54.3983, Val Loss: 69.2888
Epoch 10/100, Train Loss: 64.8601, Val Loss: 28.2510
Epoch 11/100, Train Loss: 58.9048, Val Loss: 1219.7742
Epoch 12/100, Train Loss: 58.6465, Val Loss: 126955.8423
Epoch 13/100, Train Loss: 53.9228, Val Loss: 3267.0298
Epoch 14/100, Train Loss: 58.4057, Val Loss: 5512.2051
Epoch 15/100, Train Loss: 57.4003, Val Loss: 180.2278
Epoch 16/100, Train Loss: 54.0021, Val Loss: 152.3959
Epoch 17/100, Train Loss: 52.3217, Val Loss: 2873.6886
Epoch 18/100, Train Loss: 48.4488, Val Loss: 2226.9489
Epoch 19/100, Train Loss: 43.1052, V