<a href="https://www.kaggle.com/code/hamidjazayeriy/breast-cancer-detection-100-acc-pytorch-tabnet?scriptVersionId=184722569" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Installing and Importing Required Python Libraries

In [1]:
!pip install pytorch_tabnet

import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

Collecting pytorch_tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytorch_tabnet
Successfully installed pytorch_tabnet-4.1.0




# Reading the dataset

In [2]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(f" The path to access data is: \n  {os.path.join(dirname, filename)}")
        
data=pd.read_csv("/kaggle/input/breast-cancer/breast-cancer-wisconsin-data_data.csv")

print(f"There are {data.shape[0]} rows and {data.shape[1]} columns in the dataset {data.shape}")

 The path to access data is: 
  /kaggle/input/breast-cancer/breast-cancer-wisconsin-data_data.csv
There are 569 rows and 33 columns in the dataset (569, 33)


# Preprocessing

In [3]:
##Columns
#data.columns

##Info About Data columns
#data.info()

##Head
#data.head()

#Finding Null Values
#data.isnull().mean()*100

##Finding Duplicates
#data.duplicated().sum()

#Removing unnecessary columns
data.drop(columns={'id','Unnamed: 32'},inplace=True)

# Label and Feature Encoding
Target values should be transform to a binary class {0,1}

In [4]:
Target = data['diagnosis']
Features = data.drop(columns={'diagnosis'})

encoder=LabelEncoder()
Target = encoder.fit_transform(Target)
print("class Labels :", encoder.classes_)

scaler = StandardScaler()
Features = scaler.fit_transform(Features).squeeze()
Features.shape, Target.shape

class Labels : ['B' 'M']


((569, 30), (569,))

# Split Train and Test sets

In [5]:
#Train_Test_Split
X_train,X_test,y_train,y_test=train_test_split (Features,Target,test_size=0.3, random_state=41)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((398, 30), (171, 30), (398,), (171,))

# Model definition

In [7]:
# define the model
TN_model= TabNetClassifier(optimizer_fn=torch.optim.Adam,
                       scheduler_params={"step_size":10, 
                                         "gamma":0.95},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                      )

# Fitting the Model

In [8]:
# fit the model 
TN_model.fit(
    X_train ,y_train,
    eval_set=[(X_train, y_train), (X_test , y_test)],
    eval_name=['train', 'test'],
    eval_metric=['auc','balanced_accuracy'],
    max_epochs=200, patience=60,
    batch_size=512, virtual_batch_size=512,
    num_workers=0,
    weights=1,
    drop_last=False
)           

epoch 0  | loss: 1.02706 | train_auc: 0.77942 | train_balanced_accuracy: 0.74184 | test_auc: 0.82116 | test_balanced_accuracy: 0.74255 |  0:00:00s
epoch 1  | loss: 0.71808 | train_auc: 0.82398 | train_balanced_accuracy: 0.80441 | test_auc: 0.86736 | test_balanced_accuracy: 0.82988 |  0:00:00s
epoch 2  | loss: 0.63985 | train_auc: 0.84693 | train_balanced_accuracy: 0.80221 | test_auc: 0.87064 | test_balanced_accuracy: 0.83077 |  0:00:00s
epoch 3  | loss: 0.55169 | train_auc: 0.88693 | train_balanced_accuracy: 0.82669 | test_auc: 0.90656 | test_balanced_accuracy: 0.838   |  0:00:00s
epoch 4  | loss: 0.53392 | train_auc: 0.88868 | train_balanced_accuracy: 0.83092 | test_auc: 0.90134 | test_balanced_accuracy: 0.82526 |  0:00:00s
epoch 5  | loss: 0.52709 | train_auc: 0.90983 | train_balanced_accuracy: 0.85429 | test_auc: 0.9377  | test_balanced_accuracy: 0.86714 |  0:00:01s
epoch 6  | loss: 0.39804 | train_auc: 0.93179 | train_balanced_accuracy: 0.86036 | test_auc: 0.95544 | test_balanced_a



# Test the Model

In [9]:
y_pred = TN_model.predict(X_test)
Acc = accuracy_score(y_test, y_pred)

print("\n================= Test Result ========================")  
print(f"Accuracy: %{100*Acc:6.2f}  ")
print("_______________________________________________________\n Classification Report:")
print(classification_report(y_test, y_pred))
print("_______________________________________________________\n Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: %100.00  
_______________________________________________________
 Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       110
           1       1.00      1.00      1.00        61

    accuracy                           1.00       171
   macro avg       1.00      1.00      1.00       171
weighted avg       1.00      1.00      1.00       171

_______________________________________________________
 Confusion Matrix:
[[110   0]
 [  0  61]]


# Finding the best random state
The best resualt can be gained at random_state = 41