## Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from imblearn.over_sampling import SMOTE 
from sklearn.metrics import f1_score , roc_auc_score, accuracy_score, precision_score, recall_score, classification_report

## Load data

In [2]:
df = pd.read_csv("output.csv")

In [3]:
df.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,278,CASH_IN,330218.42,20866.0,351084.42,452419.57,122201.15,0
1,15,PAYMENT,11647.08,30370.0,18722.92,0.0,0.0,0
2,10,CASH_IN,152264.21,106589.0,258853.21,201303.01,49038.8,0
3,206,CASH_IN,78172.3,2921331.58,2999503.88,415821.9,337649.6,0
4,45,CASH_OUT,141100.88,80506.0,0.0,89384.09,230484.96,0


## Preprocessing

In [4]:
X = df.loc[(df.type == 'TRANSFER') | (df.type == 'CASH_OUT')]

randomState = 5
np.random.seed(randomState)

#target
Y = X['isFraud']
del X['isFraud']

# Encodage binaire des données étiquetées dans 'type'
X.loc[X.type == 'TRANSFER', 'type'] = 0
X.loc[X.type == 'CASH_OUT', 'type'] = 1
X.type = X.type.astype(int) # convertir dtype('O') en dtype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [5]:
Xfraud = X.loc[Y == 1]
XnonFraud = X.loc[Y == 0]
print('\nLa fraction de transactions frauduleuses avec \'oldBalanceDest\' = \
\'newBalanceDest\' = 0 bien que le \'montant\' traité soit différent de zéro est: {}'.\
format(len(Xfraud.loc[(Xfraud.oldbalanceDest == 0) & \
(Xfraud.newbalanceDest == 0) & (Xfraud.amount)]) / (1.0 * len(Xfraud))))

print('\nLa fraction des transactions authentiques avec \'oldBalanceDest\' = \
newBalanceDest\' = 0 bien que le \'montant\' traité soit différent de zéro est: {}'.\
format(len(XnonFraud.loc[(XnonFraud.oldbalanceDest == 0) & \
(XnonFraud.newbalanceDest == 0) & (XnonFraud.amount)]) / (1.0 * len(XnonFraud))))


La fraction de transactions frauduleuses avec 'oldBalanceDest' = 'newBalanceDest' = 0 bien que le 'montant' traité soit différent de zéro est: 0.5309278350515464

La fraction des transactions authentiques avec 'oldBalanceDest' = newBalanceDest' = 0 bien que le 'montant' traité soit différent de zéro est: 0.0010357252264737203


In [6]:
X['errorBalanceOrig'] = X.newbalanceOrig + X.amount - X.oldbalanceOrg
X['errorBalanceDest'] = X.oldbalanceDest + X.amount - X.newbalanceDest

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['errorBalanceOrig'] = X.newbalanceOrig + X.amount - X.oldbalanceOrg
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['errorBalanceDest'] = X.oldbalanceDest + X.amount - X.newbalanceDest


## Split data

In [7]:
### splitting the train and test sets.
X_train,X_test, y_train, y_test  =  train_test_split(X,Y,test_size=0.3,random_state=69,stratify=Y) 

In [8]:
#OverSampling
smote = SMOTE(random_state=1)
X_train, y_train = smote.fit_resample(X_train,y_train)

In [9]:
y_train.value_counts()

0    47985
1    47985
Name: isFraud, dtype: int64

## Build and train the model

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Create the pipeline with RobustScaler and DecisionTreeClassifier
pipeline_dt = Pipeline(steps=[['scaler', RobustScaler()],
                              ['classifier', DecisionTreeClassifier()]])
                              
# Specify the hyperparameter dictionary
hyperparameters = {
    'classifier__ccp_alpha': 0.01,
    'classifier__criterion': 'entropy',
    'classifier__min_samples_leaf': 1
}

# Set the hyperparameters for the pipeline
pipeline_dt.set_params(**hyperparameters)

# Fit the pipeline on the training data
pipeline_dt.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline_dt.predict(X_test)

In [11]:
print(classification_report(y_test,y_pred))
print(roc_auc_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20566
           1       1.00      0.99      1.00       116

    accuracy                           1.00     20682
   macro avg       1.00      1.00      1.00     20682
weighted avg       1.00      1.00      1.00     20682

0.9956896551724138


In [12]:
import pickle
# save the model to disk
pickle.dump(pipeline_dt, open('model.plk', 'wb'))