# Libs and Installs

In [9]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m142.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as mp

# DATA

## ELECTRON DATASET

In [3]:
with h5py.File('SingleElectronPt50_IMGCROPS_n249k_RHv1.hdf5', 'r') as f:
    print("Keys:", list(f.keys()))
    Xe = f['X'][:]
    ye = f['y'][:]


Keys: ['X', 'y']


In [3]:
ye

array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)

In [4]:
Xe[0]

array([[[0., 0.],
        [0., 0.],
        [0., 0.],
        ...,
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        ...,
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        ...,
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       ...,

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        ...,
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        ...,
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        ...,
        [0., 0.],
        [0., 0.],
        [0., 0.]]], dtype=float32)

## PHOTON DATASET

In [4]:
with h5py.File('SinglePhotonPt50_IMGCROPS_n249k_RHv1.hdf5', 'r') as f:
    print("Keys:", list(f.keys()))
    Xp = f['X'][:]
    yp = f['y'][:]

Keys: ['X', 'y']


In [6]:
yp

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

## MERGING THE DATASET

In [5]:
X = np.vstack([Xe, Xp])  #indp var
y = np.hstack([ye, yp])  # labels

In [5]:
X.shape

(498000, 32, 32, 2)

In [6]:
y[:20]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1.], dtype=float32)

# WE NEED TO SHUFFLE THE DATA AND SPLIT

In [7]:
# test train split 

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)


In [9]:
y_train[:10]

array([0., 0., 0., 0., 0., 1., 1., 0., 1., 0.], dtype=float32)

# MODEL

#LOGISTIC REG

In [10]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train.reshape(len(X_train), -1), y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

X_test_flat = X_test.reshape(len(X_test), -1)

y_pred = clf.predict(X_test_flat)

y_prob = clf.predict_proba(X_test_flat)[:, 1]

acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print("Test Accuracy:", acc)
print("Test ROC AUC:", auc)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Test Accuracy: 0.6092670682730924
Test ROC AUC: 0.6563312218759776

Classification Report:
               precision    recall  f1-score   support

         0.0       0.60      0.66      0.63     49796
         1.0       0.62      0.56      0.59     49804

    accuracy                           0.61     99600
   macro avg       0.61      0.61      0.61     99600
weighted avg       0.61      0.61      0.61     99600



In [10]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

X_flat = X_train.reshape(len(X_train), -1)
X_test_flat = X_test.reshape(len(X_test), -1)

model = lgb.LGBMClassifier()
model.fit(X_flat, y_train)

y_prob = model.predict_proba(X_test_flat)[:, 1]
y_pred = model.predict(X_test_flat)

print("AUC:", roc_auc_score(y_test, y_prob))


[LightGBM] [Info] Number of positive: 199196, number of negative: 199204
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.617474 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 263918
[LightGBM] [Info] Number of data points in the train set: 398400, number of used features: 2047
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499990 -> initscore=-0.000040
[LightGBM] [Info] Start training from score -0.000040
AUC: 0.7785745695461336
