In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import KFold
import torch

In [2]:
data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

data.head()


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,17,CASH_IN,336058.9,C1401303763,10553134.58,10889193.47,C691717464,635888.08,299829.18,0,0
1,540,CASH_IN,166351.66,C1991628344,589271.93,755623.59,C191462571,2212715.1,2046363.43,0,0
2,346,CASH_IN,56937.15,C1893138634,7076.0,64013.15,C571753084,1238133.1,1181195.94,0,0
3,400,CASH_OUT,40887.55,C80769932,0.0,0.0,C21991437,2278589.76,2319477.3,0,0
4,134,CASH_OUT,41289.13,C104957723,114781.0,73491.87,C1026434684,0.0,41289.13,0,0


In [3]:
data_new = data.head(100000)
data_new.info()
test_new = test_data.head(10000)
test_new.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   step            100000 non-null  int64  
 1   type            100000 non-null  object 
 2   amount          100000 non-null  float64
 3   nameOrig        100000 non-null  object 
 4   oldbalanceOrg   100000 non-null  float64
 5   newbalanceOrig  100000 non-null  float64
 6   nameDest        100000 non-null  object 
 7   oldbalanceDest  100000 non-null  float64
 8   newbalanceDest  100000 non-null  float64
 9   isFraud         100000 non-null  int64  
 10  isFlaggedFraud  100000 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 8.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            10000 non-n

In [4]:
#converting Categorical Feature into Numerical Feature
le = LabelEncoder()

#for train data
type_le = le.fit(data_new['type'])
data_new['type'] = type_le.transform(data_new['type'])

name_orig = le.fit(data_new['nameOrig'])
data_new['nameOrig'] = name_orig.transform(data_new['nameOrig'])

name_dest = le.fit(data_new['nameDest'])
data_new['nameDest'] = name_dest.transform(data_new['nameDest'])


"""
#for test data 
test_new['type'] = type_le.fit_transform(test_new['type'])
test_new['nameOrig'] = name_orig.transform(test_new['nameOrig'])
test_new['nameDest'] = name_dest.transform(test_new['nameDest'])
"""

#for test data
type_le = le.fit(test_new['type'])
test_new['type'] = type_le.transform(test_new['type'])

name_orig = le.fit(test_new['nameOrig'])
test_new['nameOrig'] = name_orig.transform(test_new['nameOrig'])

name_dest = le.fit(test_new['nameDest'])
test_new['nameDest'] = name_dest.transform(test_new['nameDest'])

#splitting data into train and test
X = data_new.loc[:, data_new.columns != 'isFraud']
y = data_new.loc[:, data_new.columns == 'isFraud']

X.shape, y.shape

#data_new.info()
#test_new.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_new['type'] = type_le.transform(data_new['type'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_new['nameOrig'] = name_orig.transform(data_new['nameOrig'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_new['nameDest'] = name_dest.transform(data_new['nameDest'])
A value is trying to

((100000, 10), (100000, 1))

In [5]:
X = X.to_numpy()
y = y.to_numpy()

y= y.flatten()

kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    #model
    tab_clf = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                           optimizer_params=dict(lr=1e-3),
                            scheduler_params={"step_size":10, "gamma":0.9}, # how to use learning rate scheduler
                            scheduler_fn=torch.optim.lr_scheduler.StepLR,
                            mask_type='entmax' # "sparsemax"
                            )
    
    tab_clf.fit(X_train, y_train,
                eval_set=[(X_train, y_train), (X_test, y_test)],
                eval_name=['train', 'valid'],
                eval_metric=['accuracy'],
                max_epochs=20, patience=20,
                batch_size = 512,
                drop_last=False
                )
    cv_scores.append(tab_clf.best_cost)

print(cv_scores)



epoch 0  | loss: 0.20875 | train_accuracy: 0.9985  | valid_accuracy: 0.9988  |  0:00:10s
epoch 1  | loss: 0.02643 | train_accuracy: 0.99851 | valid_accuracy: 0.99885 |  0:00:20s
epoch 2  | loss: 0.01618 | train_accuracy: 0.99851 | valid_accuracy: 0.99885 |  0:00:31s
epoch 3  | loss: 0.01358 | train_accuracy: 0.99851 | valid_accuracy: 0.99885 |  0:00:41s
epoch 4  | loss: 0.01278 | train_accuracy: 0.99851 | valid_accuracy: 0.99885 |  0:00:52s
epoch 5  | loss: 0.01205 | train_accuracy: 0.99851 | valid_accuracy: 0.99885 |  0:01:03s
epoch 6  | loss: 0.01196 | train_accuracy: 0.99851 | valid_accuracy: 0.99885 |  0:01:14s
epoch 7  | loss: 0.01149 | train_accuracy: 0.99851 | valid_accuracy: 0.99885 |  0:01:24s
epoch 8  | loss: 0.01124 | train_accuracy: 0.99851 | valid_accuracy: 0.99885 |  0:01:34s
epoch 9  | loss: 0.01058 | train_accuracy: 0.99852 | valid_accuracy: 0.9988  |  0:01:45s
epoch 10 | loss: 0.00988 | train_accuracy: 0.99855 | valid_accuracy: 0.9989  |  0:01:56s
epoch 11 | loss: 0.00



epoch 0  | loss: 0.20895 | train_accuracy: 0.99852 | valid_accuracy: 0.9986  |  0:00:09s
epoch 1  | loss: 0.0261  | train_accuracy: 0.99858 | valid_accuracy: 0.9986  |  0:00:19s
epoch 2  | loss: 0.01575 | train_accuracy: 0.99858 | valid_accuracy: 0.9986  |  0:00:28s
epoch 3  | loss: 0.01321 | train_accuracy: 0.99858 | valid_accuracy: 0.9986  |  0:00:38s
epoch 4  | loss: 0.01197 | train_accuracy: 0.99858 | valid_accuracy: 0.9986  |  0:00:48s
epoch 5  | loss: 0.01182 | train_accuracy: 0.99858 | valid_accuracy: 0.9986  |  0:00:57s
epoch 6  | loss: 0.01137 | train_accuracy: 0.99858 | valid_accuracy: 0.99865 |  0:01:07s
epoch 7  | loss: 0.01027 | train_accuracy: 0.9986  | valid_accuracy: 0.9986  |  0:01:17s
epoch 8  | loss: 0.0099  | train_accuracy: 0.99864 | valid_accuracy: 0.99855 |  0:01:27s
epoch 9  | loss: 0.00965 | train_accuracy: 0.99865 | valid_accuracy: 0.99865 |  0:01:36s
epoch 10 | loss: 0.00913 | train_accuracy: 0.9987  | valid_accuracy: 0.9986  |  0:01:46s
epoch 11 | loss: 0.00



epoch 0  | loss: 0.21407 | train_accuracy: 0.99861 | valid_accuracy: 0.99835 |  0:00:10s
epoch 1  | loss: 0.02642 | train_accuracy: 0.99862 | valid_accuracy: 0.9984  |  0:00:20s
epoch 2  | loss: 0.01561 | train_accuracy: 0.99862 | valid_accuracy: 0.9984  |  0:00:31s
epoch 3  | loss: 0.01323 | train_accuracy: 0.99862 | valid_accuracy: 0.9984  |  0:00:40s
epoch 4  | loss: 0.01243 | train_accuracy: 0.99862 | valid_accuracy: 0.9984  |  0:00:50s
epoch 5  | loss: 0.01181 | train_accuracy: 0.99862 | valid_accuracy: 0.9984  |  0:00:59s
epoch 6  | loss: 0.01124 | train_accuracy: 0.99862 | valid_accuracy: 0.9984  |  0:01:09s
epoch 7  | loss: 0.01103 | train_accuracy: 0.99862 | valid_accuracy: 0.9984  |  0:01:18s
epoch 8  | loss: 0.01078 | train_accuracy: 0.99862 | valid_accuracy: 0.9984  |  0:01:27s
epoch 9  | loss: 0.01016 | train_accuracy: 0.99862 | valid_accuracy: 0.9984  |  0:01:37s
epoch 10 | loss: 0.01023 | train_accuracy: 0.99864 | valid_accuracy: 0.99845 |  0:01:46s
epoch 11 | loss: 0.00

KeyboardInterrupt: 