In [1]:
!pip -q install pytorch-tabnet

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from pytorch_tabnet.tab_model import TabNetClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import torch

In [7]:
path = '/kaggle/input/insect/insect_symptom_synthetic_dataset.csv'
df = pd.read_csv(path)

In [8]:
df.head()

Unnamed: 0,Is the pest in the image an armyworm?,Is the armyworm green in color?,Is the armyworm brown in color?,Is the armyworm found on the leaf top?,Is the armyworm found on the underside of the leaf?,Is the armyworm present on the stem?,Is the armyworm feeding on the crop?,Are visible bite marks present on the leaf?,Are there multiple armyworms in the image?,Is any frass (armyworm waste) visible near the pest?,...,Is the armyworm moving actively?,Are there signs of curled leaves due to feeding?,Has the armyworm damaged more than one section of the same plant?,Is there visible discoloration of the crop due to pest feeding?,Does the armyworm show striping or lines on its body?,Is the length of the armyworm greater than 20 mm?,Are any dead armyworms seen in the area (possibly due to pesticide)?,Is any chewing sound audible during the inspection?,Has any farmer nearby reported armyworm infestation in the last week?,Insect_Present_Label
0,0,1,1,1,1,0,1,1,0,1,...,1,0,1,0,1,1,0,0,1,1
1,0,0,1,0,1,0,1,1,0,0,...,0,0,0,1,0,0,1,1,0,0
2,1,0,0,0,1,0,0,0,0,1,...,1,0,1,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,1,1,1,0,0,1,0,1,0,0
4,1,0,0,0,1,0,1,1,1,0,...,1,1,0,0,1,1,0,0,0,1


In [9]:
X = df.drop('Insect_Present_Label', axis=1)
y = df['Insect_Present_Label']

# Identify categorical features
# Since all features are 0/1, we can treat them as categorical for TabNet
categorical_features_indices = [i for i, col in enumerate(X.columns)]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"\nTraining set shape: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Testing set shape: X_test={X_test.shape}, y_test={y_test.shape}")


Training set shape: X_train=(200, 30), y_train=(200,)
Testing set shape: X_test=(50, 30), y_test=(50,)


In [10]:
clf = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5, gamma=1.5,
    cat_idxs=categorical_features_indices,
    cat_dims=[2 for _ in categorical_features_indices],  # All features are binary (0, 1), so 2 unique values
    cat_emb_dim=1,  # Embedding dimension for categorical features
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size": 50, "gamma": 0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax',  # "sparsemax" or "entmax"
    verbose=0  # Set to 1 to see training progress
)

In [12]:
print("\nTraining TabNet model...")

# Convert data to numpy arrays as expected by TabNet
X_train_np = X_train.values
y_train_np = y_train.values
X_test_np = X_test.values

clf.fit(
    X_train_np, y_train_np,  # Pass X and y as positional arguments
    eval_set=[(X_test_np, y_test.values)],
    eval_metric=['accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=32,
    virtual_batch_size=16
)
print("TabNet model training complete.")



Training TabNet model...

Early stopping occurred at epoch 34 with best_epoch = 24 and best_val_0_accuracy = 0.78
TabNet model training complete.




In [13]:
y_pred = clf.predict(X_test_np)

# Evaluate the model
print("\n--- Model Evaluation ---")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Precision, Recall, F1-Score
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


--- Model Evaluation ---
Accuracy: 0.7800
Precision: 0.8519
Recall: 0.7667
F1-Score: 0.8070
