In [1]:
!pip install lightning



In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchmetrics.classification import BinaryAccuracy

import pytorch_lightning as pl
from torch.utils.data import TensorDataset, DataLoader
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint

from matplotlib import pyplot as plt
import numpy as np
import pathlib
import shutil
import tempfile
import pandas as pd
import seaborn as sns
import copy
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

In [3]:
torch.cuda.is_available()

True

In [4]:
!pip install xgboost lightgbm catboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier



In [5]:
# Path named "tensorboard_logs" inside a freshly created temporary directory.
logdir = pathlib.Path(tempfile.mkdtemp()).joinpath("tensorboard_logs")
# Delete whatever is at that path; a missing directory is silently ignored.
shutil.rmtree(logdir, ignore_errors=True)

In [6]:
def evaluate_binary_classification(model, dataloader, test = False):
    """Compute the accuracy of a binary classifier over a dataloader.

    The model's outputs are passed through a sigmoid and thresholded at 0.5
    to obtain 0/1 predictions, which are compared with the targets.

    Args:
        model: ``torch.nn.Module`` mapping a batch of inputs to one output
            value per sample.
        dataloader: iterable yielding ``(inputs, targets)`` batches.
        test: when true, additionally print sklearn's ``classification_report``.

    Returns:
        float: fraction of samples whose prediction equals the target.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    model.eval()  # evaluation mode (e.g. disables dropout); the model is left in this mode

    # Feed batches on the device that holds the weights, so CPU batches also
    # work with a model that lives on the GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    batch_predictions = []
    batch_targets = []

    with torch.no_grad():
        for inputs, targets in dataloader:
            outputs = model(inputs.to(device))
            probabilities = torch.sigmoid(outputs)
            predicted_labels = (probabilities > 0.5).float()

            # Keep per-batch tensors (flattened, on CPU) instead of Python lists.
            batch_predictions.append(predicted_labels.reshape(-1).cpu())
            batch_targets.append(targets.reshape(-1).float().cpu())

    if not batch_predictions:
        raise ValueError("dataloader yielded no batches; accuracy is undefined")

    predictions_tensor = torch.cat(batch_predictions)
    targets_tensor = torch.cat(batch_targets)

    if targets_tensor.numel() == 0:
        raise ValueError("dataloader yielded no samples; accuracy is undefined")

    # Fraction of exact matches between predicted and true labels.
    accuracy = (predictions_tensor == targets_tensor).double().mean().item()

    if test:
        # Per-class precision / recall / F1
        print(classification_report(targets_tensor.numpy(), predictions_tensor.numpy()))

    return accuracy

In [7]:
class TinyModel(nn.Module):
    """Fully connected network: input_size -> 16 -> 1 with an ELU in between.

    No activation is applied to the output of the last layer.
    """

    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=16)
        self.fc2 = nn.Linear(in_features=16, out_features=1)
        self.activation = nn.ELU()

    def forward(self, x):
        hidden = self.activation(self.fc1(x))
        return self.fc2(hidden)

class SmallModel(nn.Module):
    """Fully connected network: input_size -> 16 -> 16 -> 1 with ELU activations.

    No activation is applied to the output of the last layer.
    """

    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=16)
        self.fc2 = nn.Linear(in_features=16, out_features=16)
        self.fc3 = nn.Linear(in_features=16, out_features=1)
        self.activation = nn.ELU()

    def forward(self, x):
        # Hidden layers are each followed by the activation; the head is linear.
        for hidden_layer in (self.fc1, self.fc2):
            x = self.activation(hidden_layer(x))
        return self.fc3(x)

class MediumModel(nn.Module):
    """Fully connected network with three 64-unit ELU hidden layers and one output.

    No activation is applied to the output of the last layer.
    """

    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=64)
        self.fc4 = nn.Linear(in_features=64, out_features=1)
        self.activation = nn.ELU()

    def forward(self, x):
        # Hidden layers are each followed by the activation; the head is linear.
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = self.activation(hidden_layer(x))
        return self.fc4(x)

class LargeModel(nn.Module):
    """Fully connected network with four 512-unit ELU hidden layers and one output.

    No activation is applied to the output of the last layer.
    """

    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=512)
        self.fc3 = nn.Linear(in_features=512, out_features=512)
        self.fc4 = nn.Linear(in_features=512, out_features=512)
        self.fc5 = nn.Linear(in_features=512, out_features=1)
        self.activation = nn.ELU()

    def forward(self, x):
        # Hidden layers are each followed by the activation; the head is linear.
        for hidden_layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = self.activation(hidden_layer(x))
        return self.fc5(x)

class DropoutModel(nn.Module):
    """Four 512-unit ELU hidden layers, each followed by dropout (p=0.5), and one output.

    No activation or dropout is applied to the output of the last layer.
    """

    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=512)
        self.fc3 = nn.Linear(in_features=512, out_features=512)
        self.fc4 = nn.Linear(in_features=512, out_features=512)
        self.fc5 = nn.Linear(in_features=512, out_features=1)
        self.activation = nn.ELU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # linear -> ELU -> dropout for every hidden layer, then the linear head.
        for hidden_layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = self.dropout(self.activation(hidden_layer(x)))
        return self.fc5(x)

In [8]:
# Turn off pandas' chained-assignment warning for the whole session (the default is 'warn').
pd.set_option("mode.chained_assignment", None)

In [9]:
# Label column "Y" followed by 28 columns named "0" .. "27".
column_names = ["Y", *(str(index) for index in range(28))]

In [10]:
# The file has no header row, so the column names are supplied explicitly.
df = pd.read_csv(
    "HIGGS.csv.gz",
    header=None,
    names=column_names,
)
df

Unnamed: 0,Y,0,1,2,3,4,5,6,7,8,...,18,19,20,21,22,23,24,25,26,27
0,1.0,0.869293,-0.635082,0.225690,0.327470,-0.689993,0.754202,-0.248573,-1.092064,0.000000,...,-0.010455,-0.045767,3.101961,1.353760,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1.0,0.907542,0.329147,0.359412,1.497970,-0.313010,1.095531,-0.557525,-1.588230,2.173076,...,-1.138930,-0.000819,0.000000,0.302220,0.833048,0.985700,0.978098,0.779732,0.992356,0.798343
2,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.000000,...,1.128848,0.900461,0.000000,0.909753,1.108330,0.985692,0.951331,0.803252,0.865924,0.780118
3,0.0,1.344385,-0.876626,0.935913,1.992050,0.882454,1.786066,-1.646778,-0.942383,0.000000,...,-0.678379,-1.360356,0.000000,0.946652,1.028704,0.998656,0.728281,0.869200,1.026736,0.957904
4,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.000000,...,-0.373566,0.113041,0.000000,0.755856,1.361057,0.986610,0.838085,1.133295,0.872245,0.808487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10999995,1.0,1.159912,1.013847,0.108615,1.495524,-0.537545,2.342396,-0.839740,1.320683,0.000000,...,-0.097068,1.190680,3.101961,0.822136,0.766772,1.002191,1.061233,0.837004,0.860472,0.772484
10999996,1.0,0.618388,-1.012982,1.110139,0.941023,-0.379199,1.004656,0.348535,-1.678593,2.173076,...,-0.216995,1.049177,3.101961,0.826829,0.989809,1.029104,1.199679,0.891481,0.938490,0.865269
10999997,1.0,0.700559,0.774251,1.520182,0.847112,0.211230,1.095531,0.052457,0.024553,2.173076,...,1.585235,1.713962,0.000000,0.337374,0.845208,0.987610,0.883422,1.888438,1.153766,0.931279
10999998,0.0,1.178030,0.117796,-1.276980,1.864457,-0.584370,0.998519,-1.264549,1.276333,0.000000,...,1.399515,-1.313189,0.000000,0.838842,0.882890,1.201380,0.939216,0.339705,0.759070,0.719119


In [11]:
df.describe()

Unnamed: 0,Y,0,1,2,3,4,5,6,7,8,...,18,19,20,21,22,23,24,25,26,27
count,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,...,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0
mean,0.5299203,0.9914658,-8.297618e-06,-1.327225e-05,0.9985364,2.613459e-05,0.9909152,-2.02752e-05,7.716199e-06,0.9999687,...,-5.756954e-06,1.744903e-05,1.0,1.03429,1.024805,1.050554,1.009742,0.9729596,1.033036,0.959812
std,0.499104,0.5653777,1.008827,1.006346,0.6000185,1.006326,0.4749747,1.009303,1.005901,1.027808,...,1.007694,1.006366,1.400209,0.6746354,0.3808074,0.1645763,0.3974453,0.5254063,0.3652556,0.3133378
min,0.0,0.2746966,-2.434976,-1.742508,0.0002370088,-1.743944,0.1375024,-2.969725,-1.741237,0.0,...,-2.497265,-1.742691,0.0,0.07507046,0.1986757,0.08304866,0.1320062,0.04786215,0.2951122,0.3307214
25%,0.0,0.5907533,-0.7383225,-0.8719308,0.5768156,-0.8712081,0.6789927,-0.687245,-0.8680962,0.0,...,-0.7141902,-0.8714789,0.0,0.7906095,0.8462266,0.9857525,0.7675732,0.6738168,0.8193964,0.7703901
50%,1.0,0.8533714,-5.415563e-05,-0.0002410638,0.8916277,0.0002125454,0.8948193,-2.543566e-05,5.813991e-05,1.086538,...,0.000372133,-0.0002642369,0.0,0.8949304,0.9506853,0.9897798,0.916511,0.8733798,0.9473447,0.8719701
75%,1.0,1.236226,0.7382142,0.870994,1.293056,0.8714708,1.17074,0.6871941,0.8683126,2.173076,...,0.7141017,0.8716055,3.101961,1.02473,1.083493,1.020528,1.142226,1.138439,1.140458,1.059248
max,1.0,12.09891,2.434868,1.743236,15.39682,1.743257,9.940391,2.969674,1.741454,2.173076,...,2.498009,1.743372,3.101961,40.19237,20.37278,7.992739,14.26244,17.76285,11.49652,8.374498


In [12]:
#sns.pairplot(data=df, hue="Y")

In [13]:
# Positional split: the last 500,000 rows are held out for testing,
# everything before them is used for training.
train_data = df.iloc[:-500_000]
test_data = df.iloc[-500_000:]

In [14]:
len(train_data), len(test_data)

(10500000, 500000)

In [15]:
# Separate the label column from the features without mutating the frames,
# so the cell can be re-run (popping "Y" would fail the second time).
# Labels are kept as column vectors of shape (n_samples, 1).
Y_train = train_data["Y"].to_numpy().reshape(-1, 1)
X_train = train_data.drop(columns="Y").to_numpy()

Y_test = test_data["Y"].to_numpy().reshape(-1, 1)
X_test = test_data.drop(columns="Y").to_numpy()

In [None]:
# Fit the min-max scaler on the training matrix only, then apply that
# same fitted transformation to both matrices.
scaler = MinMaxScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
batch_size = 512  # samples per batch; tune to the available memory

# Test features and labels as float32 tensors, paired sample by sample.
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(Y_test, dtype=torch.float32)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# The test set is iterated in its stored order (no shuffling).
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [17]:
# Number of columns of the training matrix; used as the input width of the networks.
FEATURES = X_train.shape[1]
X_train.shape

(10500000, 28)

In [18]:
Y_test.shape

(500000, 1)

# Defining the Models

In [19]:
class NeuralNet(pl.LightningModule):
    """Lightning wrapper that trains a network with binary cross-entropy on logits.

    Args:
        model: ``nn.Module`` whose output is treated as one logit per sample.
        lr: learning rate passed to the Adam optimizer (default ``0.001``).

    Logged metrics: ``train_loss``, ``train_acc``, ``val_loss``, ``val_acc``.
    """

    def __init__(self, model, lr=0.001):
        super().__init__()
        self.model = model
        self.lr = lr

        # One metric object per stage, so the state accumulated during
        # training never mixes with the state accumulated during validation.
        self.accuracy = BinaryAccuracy()      # training accuracy
        self.val_accuracy = BinaryAccuracy()  # validation accuracy

    def forward(self, x):
        return self.model(x)

    def _shared_step(self, batch):
        """Return ``(loss, probabilities, targets)`` for one batch."""
        x, y = batch
        y = y.view(-1, 1)  # targets as a column vector, like the model output
        logits = self(x)
        loss = F.binary_cross_entropy_with_logits(logits, y)
        return loss, logits.sigmoid(), y

    def training_step(self, batch, batch_idx):
        loss, probabilities, y = self._shared_step(batch)
        self.log('train_loss', loss)
        self.log('train_acc', self.accuracy(probabilities, y))
        return loss

    def validation_step(self, batch, batch_idx):
        loss, probabilities, y = self._shared_step(batch)
        self.log('val_loss', loss)
        # Logging the metric object lets Lightning compute the accuracy over
        # the whole validation epoch (and reset it afterwards) instead of
        # averaging per-batch accuracies.
        self.val_accuracy(probabilities, y)
        self.log('val_acc', self.val_accuracy, on_step=False, on_epoch=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [20]:
# Seed Python, NumPy and PyTorch so the initial weights (and later shuffling)
# are reproducible; 42 matches the random_state used for the tree models.
pl.seed_everything(42, workers=True)

# Every network takes one input per feature column.
input_size = FEATURES
tiny_model = TinyModel(input_size)
small_model = SmallModel(input_size)
medium_model = MediumModel(input_size)
large_model = LargeModel(input_size)
dropout_model = DropoutModel(input_size)

# Untrained Lightning wrappers, keyed by the name used in the results table.
neuralNetsModels = {
    "tiny_model": NeuralNet(tiny_model),
    "small_model": NeuralNet(small_model),
    "medium_model": NeuralNet(medium_model),
    "large_model": NeuralNet(large_model),
    "dropout_model": NeuralNet(dropout_model)
}

In [21]:
# Tree-based estimators, all created with random_state=42.
rf_model = RandomForestClassifier(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
xgb_model = xgb.XGBClassifier(random_state=42)
lgb_model = lgb.LGBMClassifier(random_state=42, verbose=-1)
cat_model = CatBoostClassifier(random_state=42, verbose=False)

# Name -> estimator; the insertion order is the iteration order used later.
dtModels = {
    "RandomForest": rf_model,
    "DecisionTree": dt_model,
    "XGBoost": xgb_model,
    "LGBM": lgb_model,
    "CatBoost": cat_model,
}

In [22]:
# Hyper-parameter grids, keyed by model name.
param_grid = {
    "DecisionTree": {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    "RandomForest": {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_leaf': [1, 2, 4],
    },
    "XGBoost": {
        'n_estimators': [100, 200, 300],
        'max_depth': [6, 10, 15],
        'learning_rate': [0.01, 0.1, 0.2],
    },
    "LGBM": {
        'n_estimators': [100, 200, 300],
        'max_depth': [-1, 10, 20],
        'learning_rate': [0.01, 0.1, 0.2],
    },
    "CatBoost": {
        'iterations': [100, 200, 300],
        'depth': [4, 6, 10],
        'learning_rate': [0.01, 0.1, 0.2],
    },
}

# Per-model names bound to the very same dict objects stored above.
dt_param_grid = param_grid["DecisionTree"]
rf_param_grid = param_grid["RandomForest"]
xgb_param_grid = param_grid["XGBoost"]
lgbm_param_grid = param_grid["LGBM"]
catboost_param_grid = param_grid["CatBoost"]

# Number of samples and performance


In [23]:
size_histories = {}

In [24]:
# Training-subset sizes to compare; the last entry is the whole training set.
sizes = [1_000, 10_000, 100_000, 1_000_000, len(train_data)]

In [25]:
# Empty results table: one row per model, one column per (size, split) pair.
columns = pd.MultiIndex.from_product(
    [sizes, ["Train", "Validation", "Test"]],
    names=["size", "split"],
)
results = pd.DataFrame(index=[*dtModels, *neuralNetsModels], columns=columns)
results

size,1000,1000,1000,10000,10000,10000,100000,100000,100000,1000000,1000000,1000000,10500000,10500000,10500000
split,Train,Validation,Test,Train,Validation,Test,Train,Validation,Test,Train,Validation,Test,Train,Validation,Test
RandomForest,,,,,,,,,,,,,,,
DecisionTree,,,,,,,,,,,,,,,
XGBoost,,,,,,,,,,,,,,,
LGBM,,,,,,,,,,,,,,,
CatBoost,,,,,,,,,,,,,,,
tiny_model,,,,,,,,,,,,,,,
small_model,,,,,,,,,,,,,,,
medium_model,,,,,,,,,,,,,,,
large_model,,,,,,,,,,,,,,,
dropout_model,,,,,,,,,,,,,,,


In [29]:
for size in sizes:
    # Use the first `size` training rows and hold out 20% of them for validation.
    X_training, X_val, y_training, y_val = train_test_split(X_train[:size], Y_train[:size], test_size=0.2, random_state=42)

    # The labels are column vectors; flatten them once per size for the estimators.
    y_training = y_training.ravel()

    for modelName in dtModels:
        start_time = time.time()

        # 5-fold cross-validated grid search, scored by accuracy
        grid_search = GridSearchCV(dtModels[modelName], param_grid[modelName], cv=5, scoring='accuracy')
        grid_search.fit(X_training, y_training)

        # Print the best parameters
        print(f"Best Parameters for {modelName}: {grid_search.best_params_}")

        # With the default refit=True, best_estimator_ has already been
        # retrained on the whole training split, so it is not fitted again.
        best_model = grid_search.best_estimator_

        predictionsTrain = best_model.predict(X_training)
        predictionsVal = best_model.predict(X_val)
        predictionsTest = best_model.predict(X_test)

        # Write with a single .loc call: chained indexing
        # (results[size]["Train"][modelName] = ...) may assign to a temporary copy.
        results.loc[modelName, (size, "Train")] = accuracy_score(y_training, predictionsTrain)
        results.loc[modelName, (size, "Validation")] = accuracy_score(y_val, predictionsVal)
        results.loc[modelName, (size, "Test")] = accuracy_score(Y_test, predictionsTest)

        print(modelName)
        print("--- %s seconds ---" % (time.time() - start_time))
        print(classification_report(Y_test, predictionsTest))

Best Parameters for RandomForest: {'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 300}
RandomForest
              precision    recall  f1-score   support

         0.0       0.66      0.56      0.61    235493
         1.0       0.66      0.74      0.70    264507

    accuracy                           0.66    500000
   macro avg       0.66      0.65      0.65    500000
weighted avg       0.66      0.66      0.65    500000

Best Parameters for DecisionTree: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
DecisionTree
              precision    recall  f1-score   support

         0.0       0.56      0.53      0.55    235493
         1.0       0.60      0.63      0.62    264507

    accuracy                           0.59    500000
   macro avg       0.58      0.58      0.58    500000
weighted avg       0.58      0.59      0.58    500000

Best Parameters for XGBoost: {'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 200}
XGBoost
              precision  

Traceback (most recent call last):
  File "C:\Users\loris\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\loris\AppData\Local\Temp\ipykernel_12608\1118013885.py", line 9, in <module>
    grid_search.fit(X_training, y_training)
  File "C:\Users\loris\anaconda3\Lib\site-packages\sklearn\model_selection\_search.py", line 874, in fit
  File "C:\Users\loris\anaconda3\Lib\site-packages\sklearn\model_selection\_search.py", line 1388, in _run_search
    }
      
  File "C:\Users\loris\anaconda3\Lib\site-packages\sklearn\model_selection\_search.py", line 821, in evaluate_candidates
    return_times=True,
              ^^^^^^^^
  File "C:\Users\loris\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 63, in __call__
    for delayed_func, args, kwargs in iterable
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\loris\anaconda3\Lib\site-packages\joblib\parallel.py", 

In [None]:
results

In [None]:
for size in sizes:
    # Use the first `size` training rows and hold out 20% of them for validation.
    X_training, X_val, y_training, y_val = train_test_split(X_train[:size], Y_train[:size], test_size=0.2, random_state=42)

    # NumPy splits -> float32 tensors
    X_training_torch = torch.tensor(X_training, dtype=torch.float32)
    y_training_torch = torch.tensor(y_training, dtype=torch.float32)
    X_val_torch = torch.tensor(X_val, dtype=torch.float32)
    y_val_torch = torch.tensor(y_val, dtype=torch.float32)

    # Pair features with labels
    train_dataset = TensorDataset(X_training_torch, y_training_torch)
    val_dataset = TensorDataset(X_val_torch, y_val_torch)

    batch_size = 512  # Adjust batch size according to your needs

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=8, persistent_workers=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, num_workers=8, persistent_workers=True)

    for modelName in neuralNetsModels:
        print(modelName)
        start_time = time.time()

        # One name per (architecture, training-set size) run, shared by the
        # checkpoint file and the TensorBoard run directory.
        run_name = modelName + str(size)

        # Stop when the validation accuracy has stopped improving
        early_stopping_callback = EarlyStopping(
            monitor='val_acc',   # validation accuracy logged by the LightningModule
            min_delta=0.001,     # smallest increase that counts as an improvement
            patience=20,         # validation checks without improvement before stopping
            verbose=True,        # print messages about early stopping
            mode='max'           # higher accuracy is better
        )

        # Keep only the checkpoint with the highest validation accuracy.
        # The file name must be the computed run name, not a literal string,
        # otherwise every run is written under the same name.
        checkpoint_callback = ModelCheckpoint(
            dirpath='checkpoints',
            filename=run_name,
            monitor='val_acc',
            mode='max',
            save_top_k=1
        )

        # Train a copy so the untrained template can be reused for the next size
        model_cloned = copy.deepcopy(neuralNetsModels[modelName])

        # Define TensorBoard logger
        logger = TensorBoardLogger('logs', name=run_name)

        # Create PyTorch Lightning trainer with TensorBoard logger
        trainer = pl.Trainer(accelerator="gpu", max_epochs=300, logger=logger, callbacks=[early_stopping_callback, checkpoint_callback])

        # Train the model
        trainer.fit(model_cloned, train_dataloader, val_dataloader)

        print("--- %s seconds ---" % (time.time() - start_time))

        # NOTE(review): the accuracies below come from the weights at the end of
        # training; the best checkpoint saved above is not reloaded - confirm this is intended.
        # Write with a single .loc call: chained indexing
        # (results[size]["Train"][modelName] = ...) may assign to a temporary copy.
        results.loc[modelName, (size, "Train")] = evaluate_binary_classification(model_cloned, train_dataloader)
        results.loc[modelName, (size, "Validation")] = evaluate_binary_classification(model_cloned, val_dataloader)
        results.loc[modelName, (size, "Test")] = evaluate_binary_classification(model_cloned, test_dataloader, test = True)

In [None]:
results

In [None]:
results.to_csv("result.csv")

In [None]:
# NOTE(review): the previous version of this cell called tfdocs.plots.HistoryPlotter
# on size_histories, but tfdocs is never imported and size_histories is never
# filled in this notebook. Plotting the collected test accuracies instead -
# confirm this is the figure that is wanted here.
test_accuracy = results.xs("Test", axis=1, level="split").astype(float)

fig, ax = plt.subplots(figsize=(10, 6))
for model_name, accuracies in test_accuracy.iterrows():
    ax.plot(accuracies.index, accuracies.values, marker="o", label=model_name)
ax.set_xscale("log")  # the sizes span several orders of magnitude
ax.set_xlabel("Training-set size")
ax.set_ylabel("Test accuracy")
ax.set_title("Test accuracy vs. number of training samples")
ax.legend();

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

# Open an embedded TensorBoard viewer
%tensorboard --logdir {logdir}