#Detect Credit Card Fraud

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
data=pd.read_csv('creditcard.csv')
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
data.shape

(284807, 31)

In [4]:
sc = StandardScaler()
data['Amount'] = sc.fit_transform(pd.DataFrame(data['Amount']))

In [5]:
data.duplicated().any()

True

In [6]:
data = data.drop_duplicates()

In [7]:
data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,283253
1,473


In [8]:
X = data.drop('Class', axis = 1)
y=data['Class']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [19]:
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
}

def evaluate_classifier(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
    }

for name, clf in classifiers.items():
    print(f"\n{name}--------")
    results = evaluate_classifier(clf, X_train, X_test, y_train, y_test)
    for metric, value in results.items():
        print(f"\n {metric}: {value}")


Logistic Regression--------

 Accuracy: 0.9368421052631579

 Precision: 0.96875

 Recall: 0.9117647058823529

 F1 Score: 0.9393939393939394

Decision Tree Classifier--------

 Accuracy: 0.9210526315789473

 Precision: 0.9484536082474226

 Recall: 0.9019607843137255

 F1 Score: 0.9246231155778895


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
normal = data[data['Class']==0]
fraud = data[data['Class']==1]

In [12]:
normal.shape

(283253, 31)

In [13]:
fraud.shape

(473, 31)

In [14]:
normal_sample = normal.sample(n=473)
normal_sample.shape

(473, 31)

In [15]:
new_data = pd.concat([normal_sample,fraud], ignore_index=True)
new_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,44833.0,-2.029494,1.042687,1.743612,0.8799,-0.870046,1.199494,-0.742611,1.289239,-0.464883,...,0.25136,0.597577,-0.020535,-0.280208,0.080849,-0.251723,-0.411583,-0.160244,-0.221253,0
1,145145.0,2.067086,0.19014,-1.916538,1.21849,0.794384,-0.710138,0.642386,-0.376175,0.244525,...,0.005329,0.234596,-0.103502,-0.767835,0.558622,-0.443312,-0.016774,-0.068247,-0.287381,0
2,159030.0,-0.656225,1.424349,-1.459591,-0.79126,0.752887,-0.003305,0.166431,0.820667,-0.490879,...,0.446506,1.181409,-0.120002,-0.312155,-0.617581,-0.214917,0.118382,0.170305,-0.349231,0
3,84595.0,-1.513752,0.263057,2.50752,-1.552046,-0.992729,0.775044,-0.389999,0.13782,0.929604,...,-0.732457,-0.726435,-0.345974,-0.389566,0.471506,1.04038,0.295581,0.139927,-0.273468,0
4,145651.0,2.2393,-0.738608,-1.57289,-1.219247,-0.092073,-0.322681,-0.562433,-0.111825,-0.722583,...,0.459691,1.314086,-0.026537,0.279138,0.226938,0.056911,-0.039666,-0.075333,-0.353189,0


In [16]:
new_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,473
1,473


In [17]:
X = new_data.drop('Class', axis = 1)
y= new_data['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [18]:
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
}

def evaluate_classifier(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
    }

for name, clf in classifiers.items():
    print(f"\n{name}-----------")
    results = evaluate_classifier(clf, X_train, X_test, y_train, y_test)
    for metric, value in results.items():
        print(f"\n {metric}: {value}")


Logistic Regression-----------

 Accuracy: 0.9368421052631579

 Precision: 0.96875

 Recall: 0.9117647058823529

 F1 Score: 0.9393939393939394

Decision Tree Classifier-----------

 Accuracy: 0.9263157894736842

 Precision: 0.94

 Recall: 0.9215686274509803

 F1 Score: 0.9306930693069307


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
