In [1]:
from sklearn.datasets import fetch_covtype
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
import torch 
from torch.utils.data import DataLoader , TensorDataset 
import torchvision
import torchvision.transforms.v2 as T
import torchmetrics
import torch.nn as nn
import optuna

In [3]:
cov_type = fetch_covtype(random_state=42)

In [4]:
print(cov_type.DESCR)

.. _covtype_dataset:

Forest covertypes
-----------------

The samples in this dataset correspond to 30Ã—30m patches of forest in the US,
collected for the task of predicting each patch's cover type,
i.e. the dominant species of tree.
There are seven covertypes, making this a multiclass classification problem.
Each sample has 54 features, described on the
`dataset's homepage <https://archive.ics.uci.edu/ml/datasets/Covertype>`__.
Some of the features are boolean indicators,
while others are discrete or continuous measurements.

**Data Set Characteristics:**

Classes                        7
Samples total             581012
Dimensionality                54
Features                     int

:func:`sklearn.datasets.fetch_covtype` will load the covertype dataset;
it returns a dictionary-like 'Bunch' object
with the feature matrix in the ``data`` member
and the target values in ``target``. If optional argument 'as_frame' is
set to 'True', it will return ``data`` and ``target`` as pandas
dat

In [5]:
x_train_full , x_valid , y_train_full , y_valid = train_test_split(cov_type.data , cov_type.target , random_state=42 , 
                                                                   test_size=0.15)

x_train , x_test , y_train , y_test = train_test_split(x_train_full , y_train_full , random_state=42,
                                                       test_size=0.15)

In [6]:
cov_type.target_names

['Cover_Type']

In [7]:
y_train[0]

np.int32(2)

In [8]:
x_train.shape
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

In [9]:
if y_train.min() >0:
    y_test = y_test - 1
    y_train = y_train - 1
    y_valid = y_valid - 1

In [10]:
x_train = torch.FloatTensor(x_train_scaled)
y_train = torch.LongTensor(y_train)
x_valid = torch.FloatTensor(x_valid_scaled)
y_valid = torch.LongTensor(y_valid)
x_test = torch.FloatTensor(x_test_scaled)
y_test = torch.LongTensor(y_test)

In [11]:
print(torch.unique(y_test))

tensor([0, 1, 2, 3, 4, 5, 6])


In [None]:
train_dataset = TensorDataset(x_train , y_train)
train_loader = DataLoader(train_dataset , shuffle=True , pin_memory=True , batch_size=128 , 
                          num_workers=4)

val_dataset = TensorDataset(x_valid , y_valid)
val_loader = DataLoader(val_dataset , shuffle=True , pin_memory=True , batch_size=128 , 
                          num_workers=4)

test_dataset = TensorDataset(x_test , y_test)
test_loader = DataLoader(test_dataset , shuffle=True , pin_memory=True , batch_size=128 , 
                          num_workers=4)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"The train and eval function will use device : {device}" )


def eval(model , val_loader ,metric):
    with torch.no_grad():
        metric.reset()
        model.eval()
        for x_val_batch , y_val_batch in val_loader:
            x_val_batch , y_val_batch = x_val_batch.to(device) , y_val_batch.to(device)
            y_pred_val = model(x_val_batch)
            metric.update(y_pred_val , y_val_batch)
            
    return metric.compute()


def train_and_eval(model , train_loader : DataLoader, valid_loader : DataLoader , 
                   criterion , metric , optimizer) -> dict:
    
    history = {'train_loss' : [],
               'train_accuracy' : [],
               'val_accuracy' : []}
    

    metric.reset()
    model.train()
    total_loss = 0
    
    for x_batch , y_batch in train_loader:
        x_batch , y_batch = x_batch.to(device) , y_batch.to(device)
        
        #forward
        y_pred = model(x_batch)
        
        #loss
        loss = criterion(y_pred , y_batch)
        total_loss += loss.item()
        
        #optimizer
        optimizer.zero_grad()
        
        #back
        loss.backward()
        optimizer.step()
        
        metric.update(y_pred , y_batch)
            
    mean_loss = total_loss/len(train_loader)
    history['train_loss'].append(mean_loss)
    
    history['train_accuracy'].append(metric.compute().item())
    
    history['val_accuracy'].append(eval(model , val_loader , metric).item())
    
    print(f"Training loss : {history['train_loss'][-1]:.4f}  ",
            f"Training accuracy : {history['train_accuracy'][-1]:.4f}  ",
            f"val accuracy : {history['val_accuracy'][-1]:.4f}")
    
    return history
        
            
            
        

The train and eval function will use device : cuda


In [None]:
class CovTypeClassifier(nn.Module):
    def __init__(self, n_inputs, n_hidden, n_layers, n_classes, dropout_rate=0.2):
        super().__init__()

        self.hidden_layers = nn.ModuleList()

        self.hidden_layers.append(nn.Linear(n_inputs, n_hidden))
        self.hidden_layers.append(nn.BatchNorm1d(n_hidden))
        self.hidden_layers.append(nn.ReLU())
        self.hidden_layers.append(nn.Dropout(dropout_rate))
        
        for _ in range(n_layers - 1):
            self.hidden_layers.append(nn.Linear(n_hidden, n_hidden))
            self.hidden_layers.append(nn.BatchNorm1d(n_hidden))
            self.hidden_layers.append(nn.ReLU())
            self.hidden_layers.append(nn.Dropout(dropout_rate))

        self.output = nn.Linear(n_hidden, n_classes)
        
    def forward(self, x):
        x = x.flatten(1) 
        
        for layer in self.hidden_layers:
            x = layer(x)
            
        return self.output(x)


def use_he_init(module):
    if isinstance(module , nn.Linear):
        nn.init.kaiming_uniform_(module.weight)
        nn.init.zeros_(module.bias)
        


In [None]:
n_epoch = 20

def objective(trial):
    learning_rate = trial.suggest_float("learning_rate", 1e-5 , 1e-1 , log=True)
    n_layers = trial.suggest_int("n_layers", 2, 4)
    n_hidden = trial.suggest_int("n_hidden" , 20 , 300)
    dropout_rate = trial.suggest_float("dropout", 0.1, 0.5)
    
    
    model = CovTypeClassifier(n_inputs= 54 , n_hidden=n_hidden , n_layers = n_layers,
                               n_classes=7 , dropout_rate = dropout_rate).to(device)
    model.apply(use_he_init)
    optimizer = torch.optim.Adam(model.parameters() , lr=learning_rate)
    accuracy = torchmetrics.Accuracy(task='multiclass' , num_classes=7).to(device)
    xentropy = nn.CrossEntropyLoss()
    
    best_val_score = 0.0
    
    for epoch in range(n_epoch):
        history = train_and_eval(model , train_loader , val_loader , xentropy, 
                                 accuracy , optimizer)
        
        val_score = max(history['val_accuracy'])
        
        if val_score > best_val_score:
            best_val_score = val_score
            
        trial.report(val_score , epoch)
        
        if trial.should_prune():
            raise optuna.TrialPruned()
        
    
    return best_val_score

In [16]:
torch.manual_seed(42)
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize' , sampler=sampler)
study.optimize(objective , n_trials=5)

[32m[I 2026-02-09 17:10:15,265][0m A new study created in memory with name: no-name-a5c8df73-ccb4-48b2-ad10-f56a86250c9e[0m


Training loss : 1.4293   Training accuracy : 0.4959   val accuracy : 0.5738
Training loss : 1.1221   Training accuracy : 0.6005   val accuracy : 0.6218
Training loss : 0.9909   Training accuracy : 0.6364   val accuracy : 0.6478
Training loss : 0.9162   Training accuracy : 0.6578   val accuracy : 0.6651
Training loss : 0.8691   Training accuracy : 0.6706   val accuracy : 0.6743
Training loss : 0.8364   Training accuracy : 0.6784   val accuracy : 0.6814
Training loss : 0.8118   Training accuracy : 0.6839   val accuracy : 0.6866
Training loss : 0.7926   Training accuracy : 0.6889   val accuracy : 0.6920
Training loss : 0.7772   Training accuracy : 0.6934   val accuracy : 0.6967
Training loss : 0.7645   Training accuracy : 0.6974   val accuracy : 0.7001
Training loss : 0.7538   Training accuracy : 0.7006   val accuracy : 0.7037
Training loss : 0.7446   Training accuracy : 0.7040   val accuracy : 0.7061
Training loss : 0.7365   Training accuracy : 0.7068   val accuracy : 0.7087
Training los

[32m[I 2026-02-09 17:11:33,388][0m Trial 0 finished with value: 0.7217161059379578 and parameters: {'learning_rate': 0.00031489116479568613, 'n_hidden': 287}. Best is trial 0 with value: 0.7217161059379578.[0m


Training loss : 0.6984   Training accuracy : 0.7193   val accuracy : 0.7217
Training loss : 0.8457   Training accuracy : 0.6707   val accuracy : 0.7224
Training loss : 0.6720   Training accuracy : 0.7272   val accuracy : 0.7350
Training loss : 0.6395   Training accuracy : 0.7360   val accuracy : 0.7410
Training loss : 0.6195   Training accuracy : 0.7422   val accuracy : 0.7454
Training loss : 0.6048   Training accuracy : 0.7465   val accuracy : 0.7481
Training loss : 0.5931   Training accuracy : 0.7503   val accuracy : 0.7538
Training loss : 0.5830   Training accuracy : 0.7539   val accuracy : 0.7555
Training loss : 0.5741   Training accuracy : 0.7570   val accuracy : 0.7594
Training loss : 0.5662   Training accuracy : 0.7601   val accuracy : 0.7620
Training loss : 0.5589   Training accuracy : 0.7626   val accuracy : 0.7660
Training loss : 0.5523   Training accuracy : 0.7654   val accuracy : 0.7677
Training loss : 0.5462   Training accuracy : 0.7677   val accuracy : 0.7703
Training los

[32m[I 2026-02-09 17:12:49,228][0m Trial 1 finished with value: 0.7840325236320496 and parameters: {'learning_rate': 0.008471801418819975, 'n_hidden': 188}. Best is trial 1 with value: 0.7840325236320496.[0m


Training loss : 0.5077   Training accuracy : 0.7849   val accuracy : 0.7840
Training loss : 2.1066   Training accuracy : 0.2381   val accuracy : 0.2749
Training loss : 1.9268   Training accuracy : 0.3084   val accuracy : 0.3419
Training loss : 1.8053   Training accuracy : 0.3665   val accuracy : 0.3911
Training loss : 1.7184   Training accuracy : 0.4077   val accuracy : 0.4238
Training loss : 1.6528   Training accuracy : 0.4347   val accuracy : 0.4464
Training loss : 1.6008   Training accuracy : 0.4528   val accuracy : 0.4615
Training loss : 1.5578   Training accuracy : 0.4659   val accuracy : 0.4726
Training loss : 1.5212   Training accuracy : 0.4754   val accuracy : 0.4811
Training loss : 1.4892   Training accuracy : 0.4820   val accuracy : 0.4875
Training loss : 1.4607   Training accuracy : 0.4879   val accuracy : 0.4927
Training loss : 1.4350   Training accuracy : 0.4933   val accuracy : 0.4979
Training loss : 1.4115   Training accuracy : 0.4989   val accuracy : 0.5033
Training los

[32m[I 2026-02-09 17:13:58,384][0m Trial 2 finished with value: 0.5471245646476746 and parameters: {'learning_rate': 4.207988669606632e-05, 'n_hidden': 63}. Best is trial 1 with value: 0.7840325236320496.[0m


Training loss : 1.2721   Training accuracy : 0.5416   val accuracy : 0.5471
Training loss : 1.9642   Training accuracy : 0.2551   val accuracy : 0.2978
Training loss : 1.7427   Training accuracy : 0.3345   val accuracy : 0.3622
Training loss : 1.6341   Training accuracy : 0.3870   val accuracy : 0.4042
Training loss : 1.5684   Training accuracy : 0.4193   val accuracy : 0.4292
Training loss : 1.5215   Training accuracy : 0.4391   val accuracy : 0.4445
Training loss : 1.4842   Training accuracy : 0.4523   val accuracy : 0.4561
Training loss : 1.4525   Training accuracy : 0.4618   val accuracy : 0.4643
Training loss : 1.4242   Training accuracy : 0.4694   val accuracy : 0.4713
Training loss : 1.3985   Training accuracy : 0.4765   val accuracy : 0.4780
Training loss : 1.3746   Training accuracy : 0.4826   val accuracy : 0.4839
Training loss : 1.3521   Training accuracy : 0.4886   val accuracy : 0.4896
Training loss : 1.3311   Training accuracy : 0.4943   val accuracy : 0.4953
Training los

[32m[I 2026-02-09 17:15:08,202][0m Trial 3 finished with value: 0.5367059707641602 and parameters: {'learning_rate': 1.7073967431528103e-05, 'n_hidden': 263}. Best is trial 1 with value: 0.7840325236320496.[0m


Training loss : 1.1976   Training accuracy : 0.5367   val accuracy : 0.5367
Training loss : 1.0258   Training accuracy : 0.6070   val accuracy : 0.6713
Training loss : 0.7648   Training accuracy : 0.6876   val accuracy : 0.7041
Training loss : 0.7092   Training accuracy : 0.7085   val accuracy : 0.7185
Training loss : 0.6820   Training accuracy : 0.7191   val accuracy : 0.7256
Training loss : 0.6650   Training accuracy : 0.7255   val accuracy : 0.7303
Training loss : 0.6528   Training accuracy : 0.7301   val accuracy : 0.7351
Training loss : 0.6432   Training accuracy : 0.7335   val accuracy : 0.7375
Training loss : 0.6350   Training accuracy : 0.7366   val accuracy : 0.7402
Training loss : 0.6279   Training accuracy : 0.7392   val accuracy : 0.7420
Training loss : 0.6215   Training accuracy : 0.7417   val accuracy : 0.7440
Training loss : 0.6158   Training accuracy : 0.7438   val accuracy : 0.7466
Training loss : 0.6104   Training accuracy : 0.7457   val accuracy : 0.7480
Training los

[32m[I 2026-02-09 17:16:29,245][0m Trial 4 finished with value: 0.7599940299987793 and parameters: {'learning_rate': 0.002537815508265664, 'n_hidden': 218}. Best is trial 1 with value: 0.7840325236320496.[0m


Training loss : 0.5780   Training accuracy : 0.7580   val accuracy : 0.7600


In [22]:
learning_rate_best , n_hidden_best = study.best_params['learning_rate'] , study.best_params['n_hidden']

In [23]:
print(learning_rate_best , n_hidden_best)

0.008471801418819975 188


In [25]:
model = CovTypeClassifier(n_inputs=54 , n_hidden1=n_hidden_best , 
                          n_hidden2= n_hidden_best , n_classes=7).to(device)

model.apply(use_he_init)
optimizer = torch.optim.SGD(model.parameters() , lr=learning_rate_best)
accuracy = torchmetrics.Accuracy(task='multiclass' , num_classes=7).to(device)
xentropy = nn.CrossEntropyLoss()

for epoch in range(n_epoch):
    history_best = train_and_eval(model , train_loader , val_loader , xentropy, 
                                 accuracy , optimizer)
    

Training loss : 0.8321   Training accuracy : 0.6777   val accuracy : 0.7161
Training loss : 0.6783   Training accuracy : 0.7228   val accuracy : 0.7324
Training loss : 0.6441   Training accuracy : 0.7342   val accuracy : 0.7405
Training loss : 0.6233   Training accuracy : 0.7408   val accuracy : 0.7452
Training loss : 0.6078   Training accuracy : 0.7461   val accuracy : 0.7494
Training loss : 0.5954   Training accuracy : 0.7502   val accuracy : 0.7532
Training loss : 0.5848   Training accuracy : 0.7538   val accuracy : 0.7581
Training loss : 0.5753   Training accuracy : 0.7573   val accuracy : 0.7607
Training loss : 0.5669   Training accuracy : 0.7608   val accuracy : 0.7641
Training loss : 0.5592   Training accuracy : 0.7642   val accuracy : 0.7669
Training loss : 0.5522   Training accuracy : 0.7674   val accuracy : 0.7708
Training loss : 0.5458   Training accuracy : 0.7701   val accuracy : 0.7734
Training loss : 0.5399   Training accuracy : 0.7730   val accuracy : 0.7729
Training los