# hybrid-nn-lgbm-agri-insights

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

2025-05-18 04:09:28.601985: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747541368.792776      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747541368.847285      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


#### Ensure TensorFlow is using GPU

In [2]:

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
# TensorFlow should automatically use GPU if available


Num GPUs Available:  1


#### Load datasets

In [3]:
train_df = pd.read_csv('/kaggle/input/hackathon/train.csv')
test_df = pd.read_csv('/kaggle/input/hackathon/test.csv')

#### Map target labels to numeric values for modeling

In [4]:
label_mapping = {'low': 0, 'medium': 1, 'high': 2}
train_df['Target'] = train_df['Target'].map(label_mapping)

#### Separate features and target

In [5]:
X = train_df.drop(columns=['UID', 'Target'])
y = train_df['Target']
test_uid = test_df['UID']
X_test = test_df.drop(columns=['UID'])

#### One-hot encode categorical features

In [6]:
categorical_features = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=categorical_features)
X_test = pd.get_dummies(X_test, columns=categorical_features)

#### Align train and test features to ensure same shape

In [7]:
X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

## Train a simple neural network (NN) for feature selection

#### Split data into train/validation sets (80/20 split)

In [8]:
X_train_nn, X_val_nn, y_train_nn, y_val_nn = train_test_split(X, y, test_size=0.2, random_state=42)


#### Normalize data

In [9]:
scaler = StandardScaler()
X_train_nn = scaler.fit_transform(X_train_nn)
X_val_nn = scaler.transform(X_val_nn)


#### Define a simple neural network model (using GPU)

In [10]:
model_nn = Sequential([
    Dense(64, input_dim=X_train_nn.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(3, activation='softmax')  # 3 classes for multiclass classification
])

model_nn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1747541381.810090      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


#### Train the model on GPU (if available)

In [11]:
model_nn.fit(X_train_nn, y_train_nn, epochs=10, batch_size=64, validation_data=(X_val_nn, y_val_nn), verbose=1)


Epoch 1/10


I0000 00:00:1747541384.889425      90 service.cc:148] XLA service 0x7d09c400b940 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1747541384.889955      90 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1747541385.107240      90 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 118/1408[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 1ms/step - accuracy: 0.1933 - loss: nan

I0000 00:00:1747541385.831593      90 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1408/1408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.1995 - loss: nan - val_accuracy: 0.2010 - val_loss: nan
Epoch 2/10
[1m1408/1408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.1997 - loss: nan - val_accuracy: 0.2010 - val_loss: nan
Epoch 3/10
[1m1408/1408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.1975 - loss: nan - val_accuracy: 0.2010 - val_loss: nan
Epoch 4/10
[1m1408/1408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.2011 - loss: nan - val_accuracy: 0.2010 - val_loss: nan
Epoch 5/10
[1m1408/1408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.2005 - loss: nan - val_accuracy: 0.2010 - val_loss: nan
Epoch 6/10
[1m1408/1408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.2004 - loss: nan - val_accuracy: 0.2010 - val_loss: nan
Epoch 7/10
[1m1408/1408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x7d0aa64a7590>

#### Get the weights of the first layer (input layer) which corresponds to features

In [12]:
input_weights = model_nn.layers[0].get_weights()[0]  # Shape: (input_dim, number_of_neurons_in_first_layer)


#### Get absolute value of weights and rank features

In [13]:
abs_weights = np.abs(input_weights)
mean_weights = np.mean(abs_weights, axis=1)  # Mean magnitude of weights per feature


#### Sort features by weight importance (ascending order)

In [14]:
feature_importance = np.argsort(mean_weights)[::-1]  # Higher values are more important


## Select top N features (e.g., top 40 most important features)

In [15]:
N = 40
selected_features = feature_importance[:N]

#### Filter the dataset to keep only the top N features

In [16]:
X_selected = X.iloc[:, selected_features]
X_test_selected = X_test.iloc[:, selected_features]

## Define model parameters for LightGBM with GPU

In [17]:
params = {
    'objective': 'multiclass',
    'num_class': 3,
    'learning_rate': 0.05,
    'n_estimators': 1000,
    'class_weight': 'balanced',
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'device': 'gpu',  
    'gpu_platform_id': 0,  
    'gpu_device_id': 0,  
    'max_bin': 255,  # GPU optimization setting
}

#### Initialize LightGBM model with GPU support

In [18]:
model_lgb = lgb.LGBMClassifier(**params)

## Cross-validation with LightGBM

In [19]:
f1_scores = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, val_index in skf.split(X_selected, y):
    X_train_cv, X_val_cv = X_selected.iloc[train_index], X_selected.iloc[val_index]
    y_train_cv, y_val_cv = y.iloc[train_index], y.iloc[val_index]
    
    # Train and predict for each fold
    model_lgb.fit(X_train_cv, y_train_cv)
    y_val_pred = model_lgb.predict(X_val_cv)
    f1 = f1_score(y_val_cv, y_val_pred, average='macro')
    f1_scores.append(f1)


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3780
[LightGBM] [Info] Number of data points in the train set: 90055, number of used features: 39
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 24 dense feature groups (2.06 MB) transferred to GPU in 0.003045 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3767
[LightGBM] [Info] Number of data points in the train set: 90055, number of used features: 39
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 24 dense feature groups (2.06 MB) transferred to GPU in 0.002971 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 

#### Output average F1-score across folds

In [20]:
print(f'Average F1-Score across folds: {np.mean(f1_scores):.4f}')

Average F1-Score across folds: 0.4369


## Train final model on the entire dataset

In [21]:
model_lgb.fit(X_selected, y)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3826
[LightGBM] [Info] Number of data points in the train set: 112569, number of used features: 39
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 24 dense feature groups (2.58 MB) transferred to GPU in 0.003387 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


## Make predictions on the test set

In [22]:
test_pred = model_lgb.predict(X_test_selected)
test_pred = [list(label_mapping.keys())[pred] for pred in test_pred]  # Convert numeric labels back to strings

## Create the submission file

In [23]:
submission = pd.DataFrame({'UID': test_uid, 'Target': test_pred})
submission.to_csv('output.csv', index=False)
print("Submission file created.")

Submission file created.
