In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout


In [None]:
# Location of the dataset CSV that the next cell reads.
file_path2 = "/content/data.csv"

In [None]:
# Read the CSV at file_path2 into the working DataFrame used by every later cell.
data = pd.read_csv(filepath_or_buffer=file_path2)

In [None]:
# Render the freshly loaded frame as the cell output for a quick visual check.
data

Unnamed: 0,ctype_catl,ctypel,age,sex,bmi_curc,cig_stat,pack_years,ph_any_trial,diabetes_f,hyperten_f,...,lung_clinstage,lung_stage_t,lung_stage_n,lung_stage_m,lung_histtype_cat,trt_familyl,trt_numl,neoadjuvant,treatment_period,treatment_category
0,Major,Bronchopulmonary Fistula,67,Male,18.5-25,Former Cigarette Smoker,45.0,No,No,No,...,Stage IA,T2,N0,M0,Adenocarcinoma,Chemotherapy,"Systemic treatment, NOS",Not neoadjuvant,-172,pre-treatment
1,Major,Pulmonary Embolus / Emboli,69,Female,25-30,Never Smoked Cigarettes,0.0,No,No,No,...,Stage IA,T1,N0,M0,Bronchiolo-alveolar carcinoma,Non-curative treatment,Lymphadenectomy / lymph node sampling,Not neoadjuvant,20,post-treatment
2,Intermediate,Infectious,68,Male,30+,Former Cigarette Smoker,49.0,No,No,No,...,Stage IA,T1,N0,M0,Squamous cell carcinoma,Non-curative treatment,Lymphadenectomy / lymph node sampling,Not neoadjuvant,36,post-treatment
3,Major,Bronchopulmonary Fistula,71,Male,25-30,Former Cigarette Smoker,69.0,No,No,No,...,Stage IV,T3,N1,M1,Bronchiolo-alveolar carcinoma,Pneumonectomy or bilobectomy,Pneumonectomy,Not neoadjuvant,10,during treatment
4,Major,Cardiac Emergencies,59,Female,18.5-25,Current Cigarette Smoker,120.0,No,No,No,...,Stage IIIB,T3,N1,M0,Squamous cell carcinoma,Non-curative treatment,Lymphadenectomy / lymph node sampling,Not neoadjuvant,92,post-treatment
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26573,Intermediate,Infectious,65,Male,30+,Current Cigarette Smoker,150.0,No,No,Yes,...,Stage IA,T1,N0,M0,Squamous cell carcinoma,Non-curative treatment,Lymphadenectomy / lymph node sampling,Not neoadjuvant,9,during treatment
26574,Intermediate,Cardiac Arrhythmia,63,Male,18.5-25,Current Cigarette Smoker,46.0,No,No,No,...,Stage IA,T1,N0,M0,Squamous cell carcinoma,"Wedge resection, segmental resection, or lobec...",Lobectomy,Not neoadjuvant,0,during treatment
26575,Major,Respiratory Arrest,62,Male,18.5-25,Current Cigarette Smoker,69.0,No,No,No,...,Stage IV,T2,N0,M1,Squamous cell carcinoma,Pneumonectomy or bilobectomy,Pneumonectomy,Not neoadjuvant,17,post-treatment
26576,Major,Cerebral vascular accident (CVA) / Stroke,68,Male,30+,Former Cigarette Smoker,49.0,No,No,No,...,Stage IA,T1,N0,M0,Squamous cell carcinoma,"Wedge resection, segmental resection, or lobec...",Lobectomy,Not neoadjuvant,16,post-treatment


In [None]:

# Columns stored with object dtype are treated as the categorical features.
categorical_cols = data.select_dtypes(include=['object']).columns

# Per-column registries, both keyed by column name:
#   label_encoders   -> the fitted LabelEncoder, kept for later use
#   encoded_mappings -> {original category: integer code}
label_encoders = {}
encoded_mappings = {}

# NOTE: `data` is overwritten in place, so once this cell has run there are no
# object columns left; running it a second time resets both dicts to {}.
for column_name in categorical_cols:
    encoder = LabelEncoder()
    data[column_name] = encoder.fit_transform(data[column_name])
    label_encoders[column_name] = encoder

    # Encode the encoder's own class list to get the category -> code table.
    known_categories = encoder.classes_
    encoded_mappings[column_name] = dict(
        zip(known_categories, encoder.transform(known_categories))
    )

In [None]:

import joblib

# Persist the dict of fitted encoders so it can be reloaded later.
# joblib.dump is the last expression, so its return value (the list of files
# written) becomes the cell output.
joblib.dump(value=label_encoders, filename="label_encoders.pkl")


['label_encoders.pkl']

In [None]:

# Show each encoded column's category -> code table, one line per column.
for col in encoded_mappings:
    mapping = encoded_mappings[col]
    print(f"Encoded values for '{col}': {mapping}")

Encoded values for 'ctype_catl': {'Intermediate': np.int64(0), 'Major': np.int64(1), 'Minor': np.int64(2)}
Encoded values for 'ctypel': {'Acute / Chronic Respiratory Failure': np.int64(0), 'Atelectasis': np.int64(1), 'Bleeding & Wound Healing Issues': np.int64(2), 'Bronchopulmonary Fistula': np.int64(3), 'Bronchospasm': np.int64(4), 'Cardiac Arrhythmia': np.int64(5), 'Cardiac Emergencies': np.int64(6), 'Cerebral vascular accident (CVA) / Stroke': np.int64(7), 'Congestive Heart Failure (CHF)': np.int64(8), 'Deep Venous Thrombosis (DVT)': np.int64(9), 'Fever Requiring Antibiotics': np.int64(10), 'Hospitalization': np.int64(11), 'Hypokalemia': np.int64(12), 'Hypotension / Vasovagal Reaction': np.int64(13), 'Infectious': np.int64(14), 'Other Specify': np.int64(15), 'Pain Requiring Referral to an Anesthesiologist / Pain Specialist': np.int64(16), 'Pneumothorax': np.int64(17), 'Pulmonary Embolus / Emboli': np.int64(18), 'Respiratory Arrest': np.int64(19), 'Rib Fracture(s)': np.int64(20), 'Ur

In [None]:
# Step 3: Prepare data for modeling
# Both complication columns are removed from the feature matrix; the encoded
# 'ctypel' column (not 'ctype_catl') is the prediction target.
X = data.drop(['ctype_catl', 'ctypel'], axis=1)
y = data.loc[:, 'ctypel']

In [None]:
# Render the feature matrix as the cell output to confirm both target columns are gone.
X

Unnamed: 0,age,sex,bmi_curc,cig_stat,pack_years,ph_any_trial,diabetes_f,hyperten_f,emphys_f,bronchit_f,...,lung_clinstage,lung_stage_t,lung_stage_n,lung_stage_m,lung_histtype_cat,trt_familyl,trt_numl,neoadjuvant,treatment_period,treatment_category
0,67,1,1,1,45.0,0,0,0,0,0,...,1,1,0,0,0,0,13,1,-172,2
1,69,0,2,2,0.0,0,0,0,0,0,...,1,0,0,0,1,1,5,1,20,1
2,68,1,3,1,49.0,0,0,0,0,0,...,1,0,0,0,6,1,5,1,36,1
3,71,1,2,1,69.0,0,0,0,0,0,...,7,2,1,1,1,2,9,1,10,0
4,59,0,1,0,120.0,0,0,0,0,0,...,6,2,1,0,6,1,5,1,92,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26573,65,1,3,0,150.0,0,0,1,0,0,...,1,0,0,0,6,1,5,1,9,0
26574,63,1,1,0,46.0,0,0,0,0,0,...,1,0,0,0,6,4,4,1,0,0
26575,62,1,1,0,69.0,0,0,0,0,0,...,7,1,0,1,6,2,9,1,17,1
26576,68,1,3,1,49.0,0,0,0,0,0,...,1,0,0,0,6,4,4,1,16,1


In [None]:
pip install pytorch-tabnet


Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 

In [None]:
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import torch

# Hold out 20% of the rows as the final test set, stratified on the target so
# each class keeps its share in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Convert to numpy arrays if not already
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Carve a validation split out of the training rows. Without an eval_set,
# pytorch-tabnet has no metric to monitor, so `patience` is ignored and the
# last-epoch weights are kept. The test set is deliberately NOT used here so
# it stays untouched for the final report.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=42
)

# Initialize TabNet classifier
clf = TabNetClassifier(
    n_d=16,               # Decision layer width
    n_a=16,               # Attention layer width
    n_steps=5,            # Number of decision steps (controls depth)
    gamma=1.5,            # Relaxation parameter
    lambda_sparse=1e-4,   # Sparsity regularization
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    seed=42,
    verbose=0,
    device_name='cuda' if torch.cuda.is_available() else 'cpu'
)

# Balanced class weights, expanded to one weight per fitted row. They are
# computed on the rows actually used for fitting so the lengths match.
classes = np.unique(y_fit)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_fit)
weights = dict(zip(classes, class_weights))
sample_weights = np.array([weights[label] for label in y_fit])

# Fit with early stopping on validation balanced accuracy.
clf.fit(
    X_fit, y_fit,
    eval_set=[(X_valid, y_valid)],
    eval_name=['valid'],
    eval_metric=['balanced_accuracy'],
    weights=sample_weights,
    max_epochs=100,
    patience=20,
    batch_size=1024,
    virtual_batch_size=128,
)

# Predictions on the untouched test split
y_pred = clf.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))




              precision    recall  f1-score   support

           0       0.69      0.55      0.61       174
           1       0.81      0.71      0.76       181
           2       0.82      0.72      0.77       266
           3       0.91      0.95      0.93       179
           4       0.93      1.00      0.96       220
           5       0.71      0.63      0.67       233
           6       0.91      0.71      0.80       231
           7       0.91      1.00      0.95       193
           8       0.83      0.93      0.88       173
           9       0.87      1.00      0.93       261
          10       0.88      0.95      0.92       182
          11       0.81      0.87      0.84       189
          12       0.97      1.00      0.98       280
          13       0.73      0.76      0.74       278
          14       0.85      0.64      0.73       269
          15       1.00      1.00      1.00       298
          16       0.73      0.98      0.84       178
          17       0.72    

In [None]:
from sklearn.metrics import accuracy_score

# Predict on both splits first, then report plain accuracy for each.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

Train Accuracy: 0.8843006302323394
Test Accuracy: 0.8717080511662905


In [None]:
import joblib

# Serialize the fitted classifier to disk. The call is the last expression,
# so the list of written files is shown as the cell output.
joblib.dump(value=clf, filename="tabnet_clf.pkl")


['tabnet_clf.pkl']

In [None]:
import pickle

# Save the model with the standard-library pickle module.
# The target must not be label_encoders.pkl: when the notebook runs from
# /content, that is the file holding the dict of fitted LabelEncoders, and
# overwriting it with the classifier breaks the later
# `label_encoders['ctypel']` lookup after the encoders are reloaded.
with open("/content/tabnet_clf_pickle.pkl", "wb") as file:
    pickle.dump(clf, file)


In [None]:
# Restore the dict of fitted LabelEncoders from disk.
# This assumes the encoders were saved to their own file, separate from the model.
label_encoders = joblib.load(filename="label_encoders.pkl")


In [None]:
# Convert to DataFrame and match model input shape
# The record is wrapped in a list before being handed to the DataFrame
# constructor — presumably `sample_input` is a single record (dict or row).
# NOTE(review): `sample_input` is not defined in any cell visible here; confirm
# where it comes from, otherwise this cell fails on a fresh kernel.
sample_df = pd.DataFrame([sample_input])


In [None]:

# Decode the predicted class if needed
# Maps the first predicted integer code back to its original label using the
# encoder stored under the 'ctypel' key.
# NOTE(review): `prediction` is not assigned in any visible cell — presumably
# the output of clf.predict on the prepared sample; confirm before re-running.
predicted_class = label_encoders['ctypel'].inverse_transform([prediction[0]])

print("Predicted Class:", predicted_class[0])

Predicted Class: Congestive Heart Failure (CHF)
