In [17]:
# Import des bibliothèques nécessaires
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer



import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
# Chargement des données
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Sélection des features
numeric_features = ['max_power', 'weight_min', 'weight_max', 'urb_cons', 
                    'exturb_cons', 'overall_cons', 'co', 'hc', 'nox', 'hcnox', 'ptcl']
categorical_features = ['brand', 'model', 'car_class', 'fuel_type', 'hybrid', 'grbx_type_ratios', 'range']

# Créer une copie pour éviter de modifier les données originales
X_train = train_df.copy()
X_test = test_df.copy()


# 1. Gestion des valeurs manquantes et standardisation des variables numériques
for col in numeric_features:
    combined_values = pd.concat([X_train[col], X_test[col]])
    mean_value = combined_values.mean()
    std_value = combined_values.std()
    
    X_train[col] = X_train[col].fillna(mean_value)
    X_test[col] = X_test[col].fillna(mean_value)
    
    X_train[col] = (X_train[col] - mean_value) / std_value
    X_test[col] = (X_test[col] - mean_value) / std_value

# Standardisation de co2 et création de y_train
mean_value = X_train['co2'].mean()
std_value = X_train['co2'].std()
X_train['co2'] = (X_train['co2'] - mean_value) / std_value
y_train = X_train.pop('co2')







# 2. Encodage des variables catégoriques avec OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

# Initialisation de l'encodeur
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')  # sparse_output=False pour obtenir un array dense

# Combiner les données d'entraînement et de test pour fit l'encodeur
combined_cat = pd.concat([X_train[categorical_features], X_test[categorical_features]], axis=0)
ohe.fit(combined_cat)

# Transformer les données d'entraînement
X_train_ohe = ohe.transform(X_train[categorical_features])
X_train_ohe_df = pd.DataFrame(X_train_ohe, columns=ohe.get_feature_names_out(categorical_features), index=X_train.index)

# Transformer les données de test
X_test_ohe = ohe.transform(X_test[categorical_features])
X_test_ohe_df = pd.DataFrame(X_test_ohe, columns=ohe.get_feature_names_out(categorical_features), index=X_test.index)

# Remplacer les colonnes catégoriques par leurs versions encodées
X_train = pd.concat([X_train[numeric_features], X_train_ohe_df], axis=1)
X_test = pd.concat([X_test[numeric_features], X_test_ohe_df], axis=1)

# Vérification des types
print(X_train.dtypes)  # Assure-toi que tout est numérique (float64, int64, etc.)

# Conversion en tenseurs PyTorch (exemple)
class CO2Dataset:
    def __init__(self, X, y=None):
        self.X = torch.tensor(X.values, dtype=torch.float32)
        if y is not None:
            self.y = torch.tensor(y.values, dtype=torch.float32).reshape(-1, 1)
    

max_power               float64
weight_min              float64
weight_max              float64
urb_cons                float64
exturb_cons             float64
                         ...   
range_LUXE              float64
range_MOY-INFER         float64
range_MOY-INFERIEURE    float64
range_MOY-SUPER         float64
range_SUPERIEURE        float64
Length: 3919, dtype: object


In [19]:


X_train_nn, X_val, y_train_nn, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

In [20]:

NUM_EPOCHS = 50
BATCH_SIZE = 64
LR = 3e-4
WEIGHT_DECAY = 1e-6

In [21]:
"""
Réseau de neurones avec PyTorch pour la régression:
"""

# Vérifier si CUDA est disponible
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Utilisation de: {device}")

# Créer un dataset PyTorch personnalisé
class CO2Dataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X.values, dtype=torch.float32)
        
        if y is not None:
            self.y = torch.tensor(y.values, dtype=torch.float32).reshape(-1, 1)
            self.has_target = True
        else:
            self.has_target = False
            
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        if self.has_target:
            return self.X[idx], self.y[idx]
        else:
            return self.X[idx]

# Définition du modèle de réseau de neurones
class ImprovedCO2RegressionNet(nn.Module):
    def __init__(self, input_dim, hidden_dims=[128, 64, 32], dropout_rates=[0.3, 0.2, 0.1]):
        super(ImprovedCO2RegressionNet, self).__init__()
        
        layers = []
        in_features = input_dim
        
        # Construction dynamique des couches cachées
        for i, hidden_dim in enumerate(hidden_dims):
            layers.append(nn.Linear(in_features, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.BatchNorm1d(hidden_dim))
            
            # Appliquer dropout sauf pour la dernière couche cachée
            if i < len(dropout_rates):
                layers.append(nn.Dropout(dropout_rates[i]))
            
            in_features = hidden_dim
        
        # Couche de sortie
        layers.append(nn.Linear(in_features, 1))
        
        self.model = nn.Sequential(*layers)
        
    def forward(self, x):
        return self.model(x)



# Création des datasets et dataloaders
train_dataset = CO2Dataset(X_train_nn, y_train_nn)
val_dataset = CO2Dataset(X_val, y_val)
test_dataset = CO2Dataset(X_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)



Utilisation de: cpu


In [22]:
# Initialisation du modèle, fonction de perte et optimiseur
input_dim = X_train.shape[1]
model = ImprovedCO2RegressionNet(
    input_dim, 
    hidden_dims=[256, 128, 64],  # Couches plus larges
    dropout_rates=[0.4, 0.3, 0.2]  # Dropout plus agressif
).to(device)

try:
    # Chargement des poids sauvegardés
    model.load_state_dict(torch.load('co2_regression_model.pth'))

except FileNotFoundError:
    print("Pas de modèle sauvegardé trouvé, initialisation d'un nouveau modèle.")

criterion = nn.MSELoss()  # Mean Squared Error pour la régression
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)

RuntimeError: Error(s) in loading state_dict for ImprovedCO2RegressionNet:
	size mismatch for model.0.weight: copying a param with shape torch.Size([256, 19]) from checkpoint, the shape in current model is torch.Size([256, 3919]).

In [None]:
# Fonction d'entraînement
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=100):
    train_losses = []
    val_losses = []
    
    best_val_loss = float('inf')
    best_model_state = None
    
    for epoch in range(num_epochs):
        # Mode entraînement
        model.train()
        running_loss = 0.0
        
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            # Remise à zéro des gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            
            # Backward pass et optimisation
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * X_batch.size(0)
        
        epoch_train_loss = running_loss / len(train_loader.dataset)
        train_losses.append(epoch_train_loss)
        
        # Mode évaluation
        model.eval()
        running_loss = 0.0
        
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                
                running_loss += loss.item() * X_batch.size(0)
        
        epoch_val_loss = running_loss / len(val_loader.dataset)
        val_losses.append(epoch_val_loss)
        
        # # Ajustement du learning rate
        # scheduler.step(epoch_val_loss)
        
        # Affichage des métriques
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}')
        
        # Sauvegarde du meilleur modèle
        if epoch_val_loss < best_val_loss:
            best_val_loss = epoch_val_loss
            best_model_state = model.state_dict().copy()
    
    # Chargement du meilleur modèle
    model.load_state_dict(best_model_state)
    
    return model, train_losses, val_losses


In [None]:
# Entraînement du modèle
trained_model, train_losses, val_losses = train_model(
    model, train_loader, val_loader, criterion, optimizer, num_epochs=NUM_EPOCHS
)

torch.save(trained_model.state_dict(), 'co2_regression_model.pth')
print("Modèle sauvegardé dans 'co2_regression_model.pth'")

# Graphique des pertes
plt.figure(figsize=(10, 6))
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Courbes d\'apprentissage')
plt.legend()
plt.grid(True)
plt.savefig('nn_training_loss.png')
plt.close()

# Évaluation sur l'ensemble de validation
model.eval()
val_predictions = []
val_targets = []

with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        val_predictions.extend(outputs.cpu().numpy())
        val_targets.extend(y_batch.numpy())

val_predictions = np.array(val_predictions).flatten()
val_targets = np.array(val_targets).flatten()

val_mae = mean_absolute_error(val_targets, val_predictions)
val_r2 = r2_score(val_targets, val_predictions)

print(f"Validation MAE: {val_mae:.4f}")
print(f"Validation R²: {val_r2:.4f}")











# Prédictions sur l'ensemble de test
model.eval()
test_predictions = []

with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        test_predictions.extend(outputs.cpu().numpy())

Y_test_nn = np.array(test_predictions).flatten()

# Sauvegarde des résultats
resultat = [(test_df['id'].iloc[i], Y_test_nn[i]) for i in range(len(Y_test_nn))]
with open('resultat_pytorch.csv', 'w') as f:
    f.write("id,co2\n")
    for id, co2 in resultat:
        f.write(f"{id},{int(round(co2))}\n")


Epoch 1/50, Train Loss: 1.0079, Val Loss: 0.8474
Epoch 2/50, Train Loss: 0.9270, Val Loss: 0.8284
Epoch 3/50, Train Loss: 0.7248, Val Loss: 1.3569
Epoch 4/50, Train Loss: 0.4923, Val Loss: 0.9168
Epoch 5/50, Train Loss: 0.4006, Val Loss: 0.3615
Epoch 6/50, Train Loss: 0.3436, Val Loss: 3.5706
Epoch 7/50, Train Loss: 0.3061, Val Loss: 0.6187
Epoch 8/50, Train Loss: 0.2692, Val Loss: 0.3626
Epoch 9/50, Train Loss: 0.2524, Val Loss: 0.3822
Epoch 10/50, Train Loss: 0.2385, Val Loss: 0.4963
Epoch 11/50, Train Loss: 0.2242, Val Loss: 0.8814
Epoch 12/50, Train Loss: 0.2139, Val Loss: 0.7698
Epoch 13/50, Train Loss: 0.2053, Val Loss: 0.7588
Epoch 14/50, Train Loss: 0.2158, Val Loss: 6.6979
Epoch 15/50, Train Loss: 0.2797, Val Loss: 6.9334
Epoch 16/50, Train Loss: 0.2011, Val Loss: 3.8406
Epoch 17/50, Train Loss: 0.1915, Val Loss: 5.9363
Epoch 18/50, Train Loss: 0.1789, Val Loss: 5.5623
Epoch 19/50, Train Loss: 0.1810, Val Loss: 8.6932
Epoch 20/50, Train Loss: 0.1729, Val Loss: 3.8459
Epoch 21/