<a href="https://colab.research.google.com/github/MuhammadUsman-Khan/Credit_Card_Fraud_Detection/blob/main/Credit_Card_Fraud_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
# importing the dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from warnings import filterwarnings
filterwarnings("ignore")

In [33]:
# loading the dataset
df = pd.read_csv('/content/creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [34]:
# Analyzing the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166510 entries, 0 to 166509
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    166510 non-null  float64
 1   V1      166510 non-null  float64
 2   V2      166510 non-null  float64
 3   V3      166510 non-null  float64
 4   V4      166510 non-null  float64
 5   V5      166510 non-null  float64
 6   V6      166510 non-null  float64
 7   V7      166510 non-null  float64
 8   V8      166509 non-null  float64
 9   V9      166509 non-null  float64
 10  V10     166509 non-null  float64
 11  V11     166509 non-null  float64
 12  V12     166509 non-null  float64
 13  V13     166509 non-null  float64
 14  V14     166509 non-null  float64
 15  V15     166509 non-null  float64
 16  V16     166509 non-null  float64
 17  V17     166509 non-null  float64
 18  V18     166509 non-null  float64
 19  V19     166509 non-null  float64
 20  V20     166509 non-null  float64
 21  V21     16

In [35]:
# Checking the null values
df.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,1
V9,1


In [36]:
# Checking the size of the dataset
df.shape

(166510, 31)

In [37]:
# since all values are non-null so we have to proceed to analyze the dataset more
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,166510.0,166510.0,166510.0,166510.0,166510.0,166510.0,166510.0,166510.0,166509.0,166509.0,...,166509.0,166509.0,166509.0,166509.0,166509.0,166509.0,166509.0,166509.0,166509.0,166509.0
mean,59562.386956,-0.183809,0.047626,0.523413,0.125865,-0.188051,0.06141,-0.086538,0.036431,0.020677,...,-0.031555,-0.089674,-0.023931,0.009833,0.097998,0.013392,0.002173,0.002487,86.912851,0.00215
std,26520.811342,1.845073,1.607238,1.371207,1.367585,1.334149,1.292984,1.205265,1.231459,1.155562,...,0.743529,0.66263,0.581129,0.598004,0.460916,0.491248,0.391714,0.308553,243.303381,0.046319
min,0.0,-56.40751,-72.715728,-33.680984,-5.519697,-42.147898,-26.160506,-43.557242,-73.216718,-13.434066,...,-34.830382,-10.933144,-44.807735,-2.836627,-10.295397,-2.604551,-22.565679,-11.710896,0.0,0.0
25%,40765.0,-0.994264,-0.531828,-0.023844,-0.733079,-0.837511,-0.686988,-0.58772,-0.158796,-0.65971,...,-0.231334,-0.548361,-0.170534,-0.330922,-0.186486,-0.330421,-0.064629,-0.023004,5.38,0.0
50%,59759.5,-0.20215,0.11522,0.644815,0.132825,-0.235912,-0.199559,-0.035196,0.059746,-0.080024,...,-0.056455,-0.071308,-0.037522,0.060633,0.14145,-0.059333,0.009167,0.021554,21.49,0.0
75%,77457.0,1.17966,0.807645,1.309122,0.947324,0.357933,0.451325,0.456219,0.354005,0.646222,...,0.124019,0.353865,0.095999,0.414848,0.402274,0.274142,0.08969,0.078277,76.15,0.0
max,118120.0,2.439207,22.057729,9.382558,16.875344,34.801666,22.529298,36.677268,20.007208,15.594995,...,27.202839,10.50309,19.002942,4.022866,7.519589,3.517346,12.152401,33.847808,19656.53,1.0


In [38]:
# Counting the class columns values
df['Class'].dropna()
df['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,166151
1.0,358


In [39]:
df['Class'].dropna(axis=0)


Unnamed: 0,Class
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
166504,0.0
166505,0.0
166506,0.0
166507,0.0


In [40]:
# loading the data into X & Y variables
X = df.drop(columns='Class', axis=1)
y = df['Class']

In [41]:
# making a numerical pipeline based on our splitted data
X_pipeline = Pipeline([
    ("Scaler", StandardScaler()),
     ("Imputer", SimpleImputer(strategy="median"))
])


In [42]:
# fitting the X data into the numerical pipeline we've created
X_data = X_pipeline.fit_transform(X)

In [43]:
# now checking the df after fitting the pipline
# also the data is in the numpy array so first we have to convert it into DataFrame
pipelined_X = pd.DataFrame(X_data, columns=X.columns)
pipelined_X.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-2.24588,-0.637374,-0.074916,1.468006,0.915698,-0.112634,0.310119,0.270594,0.050564,0.296921,...,0.303657,0.017818,0.554628,-0.148922,0.095476,0.066261,-0.41223,0.335413,-0.076291,0.257733
1,-2.24588,0.745591,0.135964,-0.260306,0.235664,0.185938,-0.111194,0.006418,0.039523,-0.238934,...,-0.140682,-0.261215,-0.828516,0.215476,-0.584747,0.150075,0.229014,-0.028479,0.039661,-0.346165
2,-2.245843,-0.636587,-0.863465,0.91146,0.185667,-0.236217,1.345024,0.728472,0.171541,-1.328649,...,0.682935,0.375983,1.299906,1.60609,-1.169084,-0.923468,-0.310411,-0.146856,-0.201712,1.199112
3,-2.245843,-0.424084,-0.144877,0.925888,-0.723289,0.133225,0.917101,0.268943,0.276912,-1.2182,...,-0.33333,-0.103218,0.14329,-0.286322,-1.982282,1.191928,-0.479028,0.154578,0.191122,0.150377
4,-2.245805,-0.528124,0.516485,0.747742,0.202671,-0.164257,0.026691,0.563761,-0.249269,0.689763,...,0.521505,0.029756,1.340048,-0.195357,0.219788,-0.659575,0.995223,0.554614,0.689241,-0.069555


In [44]:
# Checking the null values as we have applied the simple imputer in the pipeline
pipelined_X.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [45]:
y.isna().sum()
y = y.fillna(1)

In [46]:
# Split into training and testing sets and also applying an stratify on y to to distribute all fraud values in equal ration among all training and test data
X_train, X_test, y_train, y_test = train_test_split(pipelined_X, y,stratify=y ,test_size=0.2, random_state=42)

In [47]:
# printing the ratio of fraud values among all data to check if stratify is applied correctly or not
print("Fraud ratio in full data", y.mean().round(4))
print("Fraud ratio in train data", y_train.mean().round(4))
print("Fraud ratio in test data", y_test.mean().round(4))

Fraud ratio in full data 0.0022
Fraud ratio in train data 0.0022
Fraud ratio in test data 0.0022


In [48]:
# storing the logistic regression model into a variable
model = LogisticRegression()
model.fit(X_train, y_train)

In [49]:
# now checking the accuracy based on our training data
X_train_prediction = model.predict(X_train)
X_train_accuracy = accuracy_score(X_train_prediction, y_train)
print("Accuracy of training data is ", X_train_accuracy)

Accuracy of training data is  0.9989339979580806


In [50]:
# now checking the accuracy based on our testing data
X_test_prediction = model.predict(X_test)
X_test_accuracy = accuracy_score(X_test_prediction, y_test)
print("Accuracy of testing data is ", X_test_accuracy)

Accuracy of testing data is  0.9987988709386824


Building a Predictive System

In [51]:
input_data = ()

# changing the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape the input data to be 2D
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# fitting and transforming the input data in the pipelines
input_data_transformed = X_pipeline.transform(input_data_reshaped)

prediction = model.predict(input_data_transformed)
print(prediction)

if (prediction[0] == 0):
  print("The Credit Card is valid")

else:
    print("The Credit Card is Fraud")

[0.]
The Credit Card is valid
