In [39]:
# Import des bibliothèques nécessaires
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [40]:
# Chargement des données
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Remove model:
train_df.drop('model', axis=1, inplace=True)
test_df.drop('model', axis=1, inplace=True)

# Sélection des features
numeric_features = ['max_power', 'weight_min', 'weight_max', 'urb_cons', 
                    'exturb_cons', 'overall_cons', 'co', 'hc', 'nox', 'hcnox', 'ptcl']
categorical_features = ['brand', 'car_class', 'fuel_type', 'hybrid', 'grbx_type_ratios', 'range']

# Créer une copie pour éviter de modifier les données originales
X_train = train_df.copy()
X_test = test_df.copy()


# 1. Gestion des valeurs manquantes et standardisation des variables numériques
for col in numeric_features:
    combined_values = pd.concat([X_train[col], X_test[col]])
    mean_value = combined_values.mean()
    std_value = combined_values.std()
    
    X_train[col] = X_train[col].fillna(mean_value)
    X_test[col] = X_test[col].fillna(mean_value)
    
    X_train[col] = (X_train[col] - mean_value) / std_value
    X_test[col] = (X_test[col] - mean_value) / std_value

# Standardisation de co2 et création de y_train
# mean_value = X_train['co2'].mean()
# std_value = X_train['co2'].std()
# X_train['co2'] = (X_train['co2'] - mean_value) / std_value
y_train = X_train.pop('co2')










# 2. Encodage des variables catégoriques avec OneHotEncoder

# Initialisation du OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit sur les données d'entraînement uniquement
encoder.fit(X_train[categorical_features])

# Transformation des données d'entraînement et de test
encoded_train = encoder.transform(X_train[categorical_features])
encoded_test = encoder.transform(X_test[categorical_features])

# Création de DataFrame à partir des données encodées avec les noms des catégories
encoded_feature_names = []
for i, feature in enumerate(categorical_features):
    categories = encoder.categories_[i]
    for category in categories:
        encoded_feature_names.append(f"{feature}_{category}")
    

# Conversion des données encodées en DataFrame
encoded_train_df = pd.DataFrame(encoded_train, columns=encoded_feature_names, index=X_train.index)
encoded_test_df = pd.DataFrame(encoded_test, columns=encoded_feature_names, index=X_test.index)

# Suppression des colonnes catégoriques originales et ajout des colonnes encodées
X_train_numeric = X_train.drop(categorical_features, axis=1)
X_test_numeric = X_test.drop(categorical_features, axis=1)

# Concaténation des variables numériques et catégoriques encodées
X_train = pd.concat([X_train_numeric, encoded_train_df], axis=1)
X_test = pd.concat([X_test_numeric, encoded_test_df], axis=1)

# Vérification du résultat
print(f"Forme des données d'entraînement finales: {X_train.shape}")
print(f"Forme des données de test finales: {X_test.shape}")

X_train = X_train.drop('id', axis=1)
X_test = X_test.drop('id', axis=1)

print(X_train.info())
X_train.head()
    

Forme des données d'entraînement finales: (41257, 103)
Forme des données de test finales: (13753, 103)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41257 entries, 0 to 41256
Columns: 102 entries, max_power to range_SUPERIEURE
dtypes: float64(102)
memory usage: 32.1 MB
None


Unnamed: 0,max_power,weight_min,weight_max,urb_cons,exturb_cons,overall_cons,co,hc,nox,hcnox,...,grbx_type_ratios_S 6,grbx_type_ratios_V .,grbx_type_ratios_V 0,range_ECONOMIQUE,range_INFERIEURE,range_LUXE,range_MOY-INFER,range_MOY-INFERIEURE,range_MOY-SUPER,range_SUPERIEURE
0,-1.080623,-0.431333,-0.630533,-0.228571,-0.317796,-0.264052,-0.675846,0.0,-0.168272,0.434755,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.419458,0.283602,0.031296,0.299729,0.259863,0.314959,-0.710325,0.0,-0.180264,-0.018698,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.462095,1.645382,1.246225,1.404358,2.185393,1.834861,-0.78618,-0.651084,3.709942,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.75004,-2.80083,-2.509655,-1.525309,-2.050773,-1.783954,-0.096592,0.645676,-0.623968,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.021319,0.862358,0.255845,0.251702,0.163587,0.242583,-0.958577,0.0,-0.261809,-0.872257,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [41]:


X_train_nn, X_val, y_train_nn, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

In [None]:

NUM_EPOCHS = 25
BATCH_SIZE = 64
LR = 1e-3
WEIGHT_DECAY = 1e-5
active_scheduler = False

In [43]:
"""
Réseau de neurones avec PyTorch pour la régression:
"""

# Vérifier si CUDA est disponible
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Utilisation de: {device}")

# Créer un dataset PyTorch personnalisé
class CO2Dataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X.values, dtype=torch.float32)
        
        if y is not None:
            self.y = torch.tensor(y.values, dtype=torch.float32).reshape(-1, 1)
            self.has_target = True
        else:
            self.has_target = False
            
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        if self.has_target:
            return self.X[idx], self.y[idx]
        else:
            return self.X[idx]

# Définition du modèle de réseau de neurones
class CO2RegressionNet(nn.Module):
    def __init__(self, input_dim):
        super(CO2RegressionNet, self).__init__()
        
        # Architecture plus stable
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            # nn.BatchNorm1d(128),
            # nn.Dropout(0.3),  # Ajouter du dropout pour régulariser
            
            nn.Linear(128, 128),
            nn.ReLU(),
            # nn.BatchNorm1d(128),
            # nn.Dropout(0.2),  
            
            nn.Linear(128, 1)
        )
        
    def forward(self, x):
        return self.model(x)


# Création des datasets et dataloaders
train_dataset = CO2Dataset(X_train_nn, y_train_nn)
val_dataset = CO2Dataset(X_val, y_val)
test_dataset = CO2Dataset(X_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)



Utilisation de: cpu


In [44]:
# Initialisation du modèle, fonction de perte et optimiseur
input_dim = X_train.shape[1]
model = CO2RegressionNet(input_dim).to(device)

try:
    # Chargement des poids sauvegardés
    nomModel = input("Prendre un modèle ? (entrez le nom du fichier en entier ou non pour continuer sans) : ")
    if nomModel != "non":
        model.load_state_dict(torch.load(nomModel + '.pth'))
    else:
        print("Vous avez décidé de ne pas charger de modèle. Initialisation d'un nouveau modèle")

except FileNotFoundError:
    print("Pas de modèle sauvegardé trouvé, initialisation d'un nouveau modèle.")


def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
        m.bias.data.fill_(0.01)
        
model.apply(init_weights)



criterion = nn.MSELoss()  # Mean Squared Error pour la régression
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 
    mode='min',
    factor=0.7,       # Réduction plus douce (0.7 au lieu de 0.5)
    patience=5,       # Plus de patience (5 au lieu de 3)
    threshold=0.01,   # Seuil moins restrictif
    min_lr=1e-6,
    verbose=True
)

Vous avez décidé de ne pas charger de modèle. Initialisation d'un nouveau modèle




In [45]:
# Fonction d'entraînement
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=100, scheduler=None):
    train_losses = []
    val_losses = []
    
    best_val_loss = float('inf')
    best_model_state = None
    
    for epoch in range(num_epochs):
        # Mode entraînement
        model.train()
        running_loss = 0.0
        
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            # Remise à zéro des gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            
            # Backward pass et optimisation
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            
            running_loss += loss.item() * X_batch.size(0)
        
        epoch_train_loss = running_loss / len(train_loader.dataset)
        train_losses.append(epoch_train_loss)
        
        # Mode évaluation
        model.eval()
        running_loss = 0.0
        
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                
                running_loss += loss.item() * X_batch.size(0)
        
        epoch_val_loss = running_loss / len(val_loader.dataset)
        val_losses.append(epoch_val_loss)
        
        # Ajustement du learning rate
        if scheduler is not None and active_scheduler:
            # Comme c'est un ReduceLROnPlateau, nous lui passons la perte de validation
            scheduler.step(epoch_val_loss)
        
        # Récupération du learning rate actuel pour l'affichage
        current_lr = optimizer.param_groups[0]['lr']
        
        # Affichage des métriques
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}, LR: {current_lr:.6f}')
        
        # Sauvegarde du meilleur modèle
        if epoch_val_loss < best_val_loss:
            best_val_loss = epoch_val_loss
            best_model_state = model.state_dict().copy()
    
    # Chargement du meilleur modèle
    model.load_state_dict(best_model_state)
    
    return model, train_losses, val_losses

In [None]:
# Entraînement du modèle

print("Démarage de l'entraînement avec les paramètres suivants:")
print(f"-\t Learning Rate: {LR}")
print(f"-\t Weight Decay: {WEIGHT_DECAY}")
print(f"-\t Active Scheduler: {active_scheduler}")
print(f"-\t Batch Size: {BATCH_SIZE}")


print("\n")


trained_model, train_losses, val_losses = train_model(
    model, train_loader, val_loader, criterion, optimizer, num_epochs=NUM_EPOCHS, scheduler=scheduler
)

torch.save(trained_model.state_dict(), 'co2_regression_model.pth')
print("Modèle sauvegardé dans 'co2_regression_model.pth'")

# Graphique des pertes
plt.figure(figsize=(10, 6))
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Courbes d\'apprentissage')
plt.legend()
plt.grid(True)
plt.savefig('nn_training_loss.png')
plt.close()

# Évaluation sur l'ensemble de validation
model.eval()
val_predictions = []
val_targets = []

with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        val_predictions.extend(outputs.cpu().numpy())
        val_targets.extend(y_batch.numpy())

val_predictions = np.array(val_predictions).flatten()
val_targets = np.array(val_targets).flatten()

val_mae = mean_absolute_error(val_targets, val_predictions)
val_r2 = r2_score(val_targets, val_predictions)

print(f"Validation MAE: {val_mae:.4f}")
print(f"Validation R²: {val_r2:.4f}")











# Prédictions sur l'ensemble de test
model.eval()
test_predictions = []

with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        test_predictions.extend(outputs.cpu().numpy())

Y_test_nn = np.array(test_predictions).flatten()

# Sauvegarde des résultats
resultat = [(test_df['id'].iloc[i], Y_test_nn[i]) for i in range(len(Y_test_nn))]
with open('resultat_pytorch.csv', 'w') as f:
    f.write("id,co2\n")
    for id, co2 in resultat:
        f.write(f"{id},{co2}\n")


Démarage de l'entraînement avec les paramètres suivants:
-	 Learning Rate: 0.001
-	 Weight Decay: 1e-05
-	 Active Scheduler: False
-	 Batch Size: 64


Epoch 1/100, Train Loss: 4127.3187, Val Loss: 48.1855, LR: 0.001000
Epoch 2/100, Train Loss: 16.6860, Val Loss: 29.6775, LR: 0.001000
Epoch 3/100, Train Loss: 6.0468, Val Loss: 23.6951, LR: 0.001000
Epoch 4/100, Train Loss: 2.8338, Val Loss: 21.8772, LR: 0.001000
Epoch 5/100, Train Loss: 1.7268, Val Loss: 20.5640, LR: 0.001000
Epoch 6/100, Train Loss: 1.1653, Val Loss: 19.4495, LR: 0.001000
Epoch 7/100, Train Loss: 0.9169, Val Loss: 19.4314, LR: 0.001000
Epoch 8/100, Train Loss: 0.7087, Val Loss: 18.2100, LR: 0.001000
Epoch 9/100, Train Loss: 0.6264, Val Loss: 18.1903, LR: 0.001000
Epoch 10/100, Train Loss: 0.6051, Val Loss: 17.6040, LR: 0.001000
Epoch 11/100, Train Loss: 0.5815, Val Loss: 17.3369, LR: 0.001000
Epoch 12/100, Train Loss: 0.5896, Val Loss: 16.1072, LR: 0.001000
Epoch 13/100, Train Loss: 0.5479, Val Loss: 16.2845, LR: 0.001