In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import os


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression

from sklearn.linear_model import Ridge

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold

In [12]:
"""
Chargement des données:
"""
# Read the sample submission, training and test tables from the working directory.
sample_df, train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('sample_submission.csv', 'train.csv', 'test.csv')
)

# Preview the first training rows.
train_df.head()

Unnamed: 0,id,brand,model,car_class,range,fuel_type,hybrid,max_power,grbx_type_ratios,weight_min,weight_max,urb_cons,exturb_cons,overall_cons,co,hc,nox,hcnox,ptcl,co2
0,0,MERCEDES,COMBI 110 CDI,MINIBUS,MOY-INFER,GO,non,70.0,M 6,1976,2075,9.1,6.4,7.4,0.083,,0.229,0.25,0.001,195
1,1,MERCEDES,VIANO 2.0 CDI,MINIBUS,MOY-SUPER,GO,non,100.0,A 5,2186,2355,10.2,7.0,8.2,0.078,,0.224,0.233,0.001,216
2,2,MERCEDES,SPRINTER COMBI 319 CDI,MINIBUS,MOY-INFER,GO,non,140.0,A 5,2586,2869,12.5,9.0,10.3,0.067,0.014,1.846,,0.002,272
3,3,RENAULT,MEGANE Coupé EnergyTCe (115ch) eco2,COUPE,MOY-INFER,ES,non,85.0,M 6,1280,1280,6.4,4.6,5.3,0.167,0.039,0.039,,0.001,119
4,4,MERCEDES,COMBI 116 CDI,MINIBUS,MOY-INFER,GO,non,120.0,A 5,2356,2450,10.1,6.9,8.1,0.042,,0.19,0.201,0.001,214


In [13]:
"""
Gestion des nan:
"""

# Impute missing values in every numeric feature column that contains a NaN
# in EITHER split (the previous version only looked at test_df, so a column
# with gaps only in train_df was never filled).
# Only columns present in both frames are considered, which keeps the target
# ('co2', train-only) out of the imputation.
feature_cols = [col for col in train_df.columns if col in test_df.columns]
numeric_feature_cols = train_df[feature_cols].select_dtypes(include='number').columns
colonnes_avec_nan = [
    col for col in numeric_feature_cols
    if train_df[col].isna().any() or test_df[col].isna().any()
]

if colonnes_avec_nan:
    # Use the training means for both splits so train and test are filled
    # with the same statistics (the scaler below is also fitted on train only).
    train_means = train_df[colonnes_avec_nan].mean()
    train_df[colonnes_avec_nan] = train_df[colonnes_avec_nan].fillna(train_means)
    test_df[colonnes_avec_nan] = test_df[colonnes_avec_nan].fillna(train_means)


# Categorical (string) columns: replace each label by an integer code.
colonnes_string = ['brand', 'model', 'car_class', 'fuel_type', 'hybrid', 'grbx_type_ratios', 'range']

for colonne in colonnes_string:
    le = LabelEncoder()
    # Fit on train + test together so labels that only occur in the test
    # split still receive a code instead of raising at transform time.
    le.fit(pd.concat([train_df[colonne], test_df[colonne]]))
    train_df[colonne] = le.transform(train_df[colonne])
    test_df[colonne] = le.transform(test_df[colonne])


# Standardise the continuous columns; statistics come from the training split only.
# NOTE(review): 'hc' and 'hcnox' are not in this list, so they stay unscaled
# (as do the label-encoded columns) — confirm this is intended.
numeric_colons = ['max_power', 'weight_min', 'weight_max', 'urb_cons', 'exturb_cons', 'overall_cons', 'co', 'nox', 'ptcl']
scaler = StandardScaler()
train_df[numeric_colons] = scaler.fit_transform(train_df[numeric_colons])
test_df[numeric_colons] = scaler.transform(test_df[numeric_colons])


print("\n\n")
train_df.head()






Unnamed: 0,id,brand,model,car_class,range,fuel_type,hybrid,max_power,grbx_type_ratios,weight_min,weight_max,urb_cons,exturb_cons,overall_cons,co,hc,nox,hcnox,ptcl,co2
0,0,26,1355,6,3,10,0,-1.081731,10,-0.431687,-0.631726,-0.226086,-0.317102,-0.262405,-0.675586,0.026455,-0.170442,0.25,0.107472,195
1,1,26,3608,6,5,10,0,-0.419216,1,0.283067,0.029746,0.302808,0.262172,0.31785,-0.710046,0.026455,-0.182388,0.233,0.107472,216
2,2,26,3282,6,3,10,0,0.464137,1,1.644503,1.244019,1.408676,2.193088,1.84102,-0.785857,0.014,3.693046,0.233684,1.138173,272
3,3,33,2325,5,3,2,0,-0.750474,10,-2.800586,-2.509832,-1.524279,-2.054927,-1.785574,-0.096663,0.039,-0.624407,0.233684,0.107472,119
4,4,26,1358,6,3,10,0,0.02246,1,0.861677,0.254174,0.254727,0.165626,0.245319,-0.958156,0.026455,-0.263624,0.201,0.107472,214


In [14]:
""" 
Jeu de données:
"""
# Model inputs: every training column except the target and the row identifier.
X_train = train_df.drop(columns=['co2', 'id'], errors='ignore')
# Regression target.
Y_train = train_df['co2']

# Test inputs (identifier removed) and the identifiers kept for the submission file.
X_test = test_df.drop(columns=['id'], errors='ignore')
id_test = test_df['id']

In [None]:
"""
Réseau de neurones avec PyTorch pour la régression:
"""
# Installation si nécessaire
# !pip install torch scikit-learn numpy matplotlib

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Utilisation de: {device}")

# Custom PyTorch dataset wrapping the feature table (and, optionally, the target).
class CO2Dataset(Dataset):
    """Tensor-backed dataset built from pandas objects.

    Args:
        X: object exposing ``.values`` (e.g. a DataFrame); converted to a
            float32 tensor.
        y: optional object exposing ``.values`` (e.g. a Series); converted to
            a float32 tensor reshaped to a single column. When omitted the
            dataset yields features only.
    """

    def __init__(self, X, y=None):
        self.has_target = y is not None
        self.X = torch.tensor(X.values, dtype=torch.float32)

        if self.has_target:
            targets = torch.tensor(y.values, dtype=torch.float32)
            # Column shape so the target lines up with the network's (batch, 1) output.
            self.y = targets.reshape(-1, 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        # Labelled datasets yield (features, target); unlabelled ones yield features only.
        return (features, self.y[idx]) if self.has_target else features

# Neural-network model definition.
class CO2RegressionNet(nn.Module):
    """Fully connected regressor: input_dim -> 128 -> 64 -> 32 -> 1.

    Each hidden layer is Linear -> ReLU -> BatchNorm1d; the first two hidden
    layers are followed by dropout (0.3 and 0.2). The final layer is a plain
    linear projection to one output.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()

        widths = (input_dim, 128, 64, 32)
        dropout_rates = (0.3, 0.2, None)  # None: no dropout after that hidden layer

        layers = []
        for n_in, n_out, rate in zip(widths[:-1], widths[1:], dropout_rates):
            layers.extend([nn.Linear(n_in, n_out), nn.ReLU(), nn.BatchNorm1d(n_out)])
            if rate is not None:
                layers.append(nn.Dropout(rate))
        layers.append(nn.Linear(widths[-1], 1))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for the batch ``x``."""
        return self.model(x)

# Seed torch so weight initialisation, dropout masks and DataLoader shuffling
# are the same on every Restart & Run All (train_test_split is already seeded).
torch.manual_seed(42)

# Hold out 10% of the training rows for validation.
X_train_nn, X_val, y_train_nn, y_val = train_test_split(
    X_train, Y_train, test_size=0.1, random_state=42
)

# Datasets and dataloaders (only the training loader is shuffled, so
# validation and test predictions keep the row order of their inputs).
train_dataset = CO2Dataset(X_train_nn, y_train_nn)
val_dataset = CO2Dataset(X_val, y_val)
test_dataset = CO2Dataset(X_test)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# Model, loss function and optimiser.
input_dim = X_train.shape[1]
model = CO2RegressionNet(input_dim).to(device)
criterion = nn.MSELoss()  # mean squared error for regression
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
# Halve the learning rate after 5 epochs without validation improvement.
# `verbose=True` was dropped: the argument is deprecated since PyTorch 2.2;
# use scheduler.get_last_lr() to inspect the current learning rate.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

def _run_epoch(model, loader, criterion, run_device, optimizer=None):
    """Run one pass over ``loader`` and return the sample-weighted mean loss.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise the model is put in eval mode and gradients
    are disabled.
    """
    training = optimizer is not None
    model.train(training)
    running_loss = 0.0

    with torch.set_grad_enabled(training):
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(run_device), y_batch.to(run_device)

            if training:
                optimizer.zero_grad()

            loss = criterion(model(X_batch), y_batch)

            if training:
                loss.backward()
                optimizer.step()

            # Weight by batch size so a smaller final batch does not skew the mean.
            running_loss += loss.item() * X_batch.size(0)

    return running_loss / len(loader.dataset)


# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=100, early_stopping_patience=15):
    """Train ``model`` with validation-based LR scheduling and early stopping.

    Args:
        model: network to train; batches are moved to the device its
            parameters live on.
        train_loader: loader yielding ``(X, y)`` training batches.
        val_loader: loader yielding ``(X, y)`` validation batches.
        criterion: loss function called as ``criterion(outputs, targets)``.
        optimizer: optimiser over ``model``'s parameters.
        scheduler: scheduler stepped once per epoch with the validation loss.
        num_epochs: maximum number of epochs.
        early_stopping_patience: stop after this many consecutive epochs
            without a new best validation loss.

    Returns:
        ``(model, train_losses, val_losses)`` where ``model`` holds the weights
        of the epoch with the lowest validation loss (or its current weights
        if no epoch produced a best loss) and the two lists contain one mean
        loss per completed epoch.
    """
    train_losses = []
    val_losses = []

    best_val_loss = float('inf')
    best_model_state = None
    no_improve_epochs = 0

    # Follow the model's own device instead of relying on a notebook global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(num_epochs):
        epoch_train_loss = _run_epoch(model, train_loader, criterion, run_device, optimizer)
        train_losses.append(epoch_train_loss)

        epoch_val_loss = _run_epoch(model, val_loader, criterion, run_device)
        val_losses.append(epoch_val_loss)

        # Learning-rate adjustment driven by the validation loss.
        scheduler.step(epoch_val_loss)

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}')

        if epoch_val_loss < best_val_loss:
            best_val_loss = epoch_val_loss
            # Clone every tensor: state_dict() returns references to the live
            # parameters, so a shallow dict copy would keep changing as
            # training continues and the "best" snapshot would be lost.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            no_improve_epochs = 0
        else:
            no_improve_epochs += 1

        if no_improve_epochs >= early_stopping_patience:
            print(f'Early stopping après {epoch+1} epochs sans amélioration.')
            break

    # Restore the best snapshot; there is none when no epoch ran or every
    # validation loss was NaN, in which case the current weights are kept.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return model, train_losses, val_losses








def _collect_predictions(net, loader):
    """Run ``net`` in eval mode over ``loader``; return its outputs as a flat NumPy array."""
    net.eval()
    chunks = []
    with torch.no_grad():
        for batch in loader:
            # Labelled loaders yield [X, y]; the unlabelled test loader yields X alone.
            X_batch = batch[0] if isinstance(batch, (list, tuple)) else batch
            chunks.append(net(X_batch.to(device)).cpu().numpy())
    if not chunks:
        return np.empty(0, dtype=np.float32)
    return np.concatenate(chunks).ravel()


# Train the model
trained_model, train_losses, val_losses = train_model(
    model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=100, early_stopping_patience=15
)

# Learning curves, written to disk (the figure is closed, not displayed inline)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_losses, label='Train Loss')
ax.plot(val_losses, label='Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Courbes d\'apprentissage')
ax.legend()
ax.grid(True)
fig.savefig('nn_training_loss.png')
plt.close(fig)

# Evaluation on the validation split (val_loader is not shuffled, so the
# targets gathered here line up with the predictions).
val_predictions = _collect_predictions(trained_model, val_loader)
val_targets = np.concatenate([y_batch.numpy() for _, y_batch in val_loader]).ravel()

val_mae = mean_absolute_error(val_targets, val_predictions)
val_r2 = r2_score(val_targets, val_predictions)

print(f"Validation MAE: {val_mae:.4f}")
print(f"Validation R²: {val_r2:.4f}")

# Predictions on the test split
Y_test_nn = _collect_predictions(trained_model, test_loader)

# Save the submission file. Ids are paired with predictions by position,
# which stays correct even if test_df does not have a default 0..n-1 index.
assert len(id_test) == len(Y_test_nn), "one prediction is expected per test row"
resultat = list(zip(id_test.to_numpy(), Y_test_nn))
with open('resultat_pytorch.csv', 'w') as f:
    f.write("id,co2\n")
    for car_id, co2 in resultat:
        f.write(f"{car_id},{int(round(co2))}\n")

# The earlier "comparison with the original model" branch was removed: it was
# guarded by hasattr(model, 'predict'), but `model` is the PyTorch module
# created in this cell and has no `predict`, so the branch could never run.

Utilisation de: cpu




Epoch 1/100, Train Loss: 40411.7193, Val Loss: 35985.1706
Epoch 2/100, Train Loss: 35051.2181, Val Loss: 25754.6268
Epoch 3/100, Train Loss: 26760.2766, Val Loss: 22382.8815
Epoch 4/100, Train Loss: 17305.3764, Val Loss: 11172.5017
Epoch 5/100, Train Loss: 9239.1603, Val Loss: 2437.5294
Epoch 6/100, Train Loss: 3942.5832, Val Loss: 3621.3545
Epoch 7/100, Train Loss: 1286.0781, Val Loss: 443.7314
Epoch 8/100, Train Loss: 345.0432, Val Loss: 263.5792
Epoch 9/100, Train Loss: 117.9364, Val Loss: 724.9064
Epoch 10/100, Train Loss: 73.9666, Val Loss: 553.0291
Epoch 11/100, Train Loss: 70.2136, Val Loss: 75.3806
Epoch 12/100, Train Loss: 63.9276, Val Loss: 90.9578
Epoch 13/100, Train Loss: 67.6891, Val Loss: 299.5958
Epoch 14/100, Train Loss: 65.4787, Val Loss: 124.3691
Epoch 15/100, Train Loss: 63.7484, Val Loss: 194.7418
Epoch 16/100, Train Loss: 66.2151, Val Loss: 1940.9468
Epoch 17/100, Train Loss: 61.4348, Val Loss: 133.7375
Epoch 18/100, Train Loss: 57.3240, Val Loss: 402.3869
Epoch 19