In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    make_scorer,
)
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
# Read the transaction table from the CSV file, then preview its first ten rows.
df = pd.read_csv(filepath_or_buffer='card_credit_fraud.csv')
df.head(n=10)

Unnamed: 0,transactionId,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,0,1,PAYMENT,983964,C1231006815,170136,16029636,M1979787155,0,0,0
1,1,1,PAYMENT,186428,C1666544295,21249,1938472,M2044282225,0,0,0
2,2,1,TRANSFER,181,C1305486145,181,0,C553264065,0,0,1
3,3,1,CASH_OUT,181,C840083671,181,0,C38997010,21182,0,1
4,4,1,PAYMENT,1166814,C2048537720,41554,2988586,M1230701703,0,0,0
5,5,1,PAYMENT,781771,C90045638,53860,4604229,M573487274,0,0,0
6,6,1,PAYMENT,710777,C154988899,183195,17608723,M408069119,0,0,0
7,7,1,PAYMENT,786164,C1912850431,17608723,16822559,M633326333,0,0,0
8,8,1,PAYMENT,402436,C1265012928,2671,0,M1176932104,0,0,0
9,9,1,DEBIT,533777,C712410124,41720,3638223,C195600860,41898,4034879,0


In [3]:
# Remove the columns that are not used as model inputs.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: executing it a second time is a no-op rather than a
# KeyError on the already-removed columns.
df = df.drop(columns=['transactionId', 'nameOrig', 'nameDest'], errors='ignore')

In [4]:
# Show the first five rows to confirm which columns remain.
df.head(n=5)

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,983964,170136,16029636,0,0,0
1,1,PAYMENT,186428,21249,1938472,0,0,0
2,1,TRANSFER,181,181,0,0,0,1
3,1,CASH_OUT,181,181,0,21182,0,1
4,1,PAYMENT,1166814,41554,2988586,0,0,0


In [5]:
# Replace each transaction-type label with an integer code.
type_codes = {'CASH_OUT': 0, 'PAYMENT': 1, 'CASH_IN': 2, 'TRANSFER': 3, 'DEBIT': 4}
# Series.map turns every value that is not a key of the dict into NaN, so
# re-running this cell on the already-encoded column would wipe it out.
# Only apply the mapping while the column still holds the text labels.
if not pd.api.types.is_numeric_dtype(df['type']):
    df['type'] = df['type'].map(type_codes)
# NOTE(review): the codes are arbitrary, but a linear model will read them as an
# ordered quantity - consider one-hot encoding instead.

In [6]:
# Show the first five rows to check the encoded `type` column.
df.head(n=5)

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,1,1,983964,170136,16029636,0,0,0
1,1,1,186428,21249,1938472,0,0,0
2,1,3,181,181,0,0,0,1
3,1,0,181,181,0,21182,0,1
4,1,1,1166814,41554,2988586,0,0,0


In [7]:
# Number of rows per encoded transaction type, most frequent first
# (sort=True / ascending=False are the value_counts defaults, spelled out).
tran_type = df['type'].value_counts(sort=True, ascending=False)
tran_type

type
0    366632
1    329867
2    218938
3     86301
4      6475
Name: count, dtype: int64

In [8]:
# Split the data into inputs and outputs.
# The target is selected by name rather than by position so the split stays
# correct even if the column order of the frame changes.
X = df.drop(columns='isFraud')  # the features
y = df['isFraud']               # the results (classes)



In [9]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# NOTE(review): no stratify= is passed, so the class ratio in each half is left
# to chance. Consider stratify=y, changing any other split of X, y in lockstep.

In [10]:
# Baseline model: logistic regression with scikit-learn's default settings.
classifier = LogisticRegression()
# Train the model; the fitted estimator is the cell's displayed value.
# NOTE(review): the features are passed unscaled although their magnitudes differ
# widely in the previewed rows - presumably worth standardising; confirm.
classifier.fit(X=X_train, y=y_train)

In [11]:
# Predict a class for every row of the held-out split.
y_pred = classifier.predict(X=X_test)
print(y_pred)  # what the model predicted

[0 0 0 ... 0 0 0]


In [12]:
# Fraction of held-out rows whose predicted class equals the true class.
# (accuracy_score is already imported in the notebook's first cell.)
# In the saved run only 2365 of the 302464 test rows are class 1, so accuracy on
# its own says little about how well fraud is detected.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9900351777401608

In [13]:
# Confusion matrix of the baseline model: rows are true classes, columns are
# predicted classes.
print(confusion_matrix(y_test, y_pred))

[[299446    653]
 [  2361      4]]


In [14]:
# Per-class precision, recall, F1 and support of the baseline model.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    300099
           1       0.01      0.00      0.00      2365

    accuracy                           0.99    302464
   macro avg       0.50      0.50      0.50    302464
weighted avg       0.98      0.99      0.99    302464



In [15]:
# The SMOTE experiment is evaluated on the same held-out rows as the baseline,
# so reuse the existing split rather than recomputing it. A second
# train_test_split call only stays aligned with the first while its arguments
# and seed are kept identical by hand.
X_test_smote, y_test_smote = X_test, y_test
# The training names start as the plain training split; they are reassigned to
# the oversampled data in the SMOTE cell.
X_train_smote, y_train_smote = X_train, y_train

In [16]:
# Oversample the training split with SMOTE; the test split is not resampled.
seed = 5
smote = SMOTE(sampling_strategy='auto', random_state=seed, k_neighbors=7)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
# NOTE(review): SMOTE interpolates every column numerically, including the
# integer-coded `type` - SMOTENC may be the better fit; confirm.

In [17]:
# Refit the same estimator object on the oversampled training data; this
# replaces the baseline fit previously held in `classifier`.
classifier.fit(X=X_train_smote, y=y_train_smote)

In [18]:
# Predict a class for every held-out row with the refitted model.
y_pred_smote = classifier.predict(X=X_test_smote)
print(y_pred_smote)  # what the model predicted

[0 0 0 ... 0 0 0]


In [19]:
# Accuracy of the SMOTE-trained model on the held-out split.
# (accuracy_score is already imported in the notebook's first cell.)
# Note that this rebinds `accuracy`, which previously held the baseline score.
accuracy = accuracy_score(y_test_smote, y_pred_smote)
accuracy

0.6810132776132035

In [20]:
# Confusion matrix of the SMOTE-trained model: rows are true classes, columns
# are predicted classes.
print(confusion_matrix(y_test_smote, y_pred_smote))

[[203768  96331]
 [   151   2214]]


In [21]:
# Per-class precision, recall, F1 and support of the SMOTE-trained model.
print(classification_report(y_test_smote, y_pred_smote))

              precision    recall  f1-score   support

           0       1.00      0.68      0.81    300099
           1       0.02      0.94      0.04      2365

    accuracy                           0.68    302464
   macro avg       0.51      0.81      0.43    302464
weighted avg       0.99      0.68      0.80    302464

