# Project : Frauding detection

In [190]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,confusion_matrix
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [191]:
# Loading the dataset to a Pandas DataFrame

In [192]:
df = pd.read_csv('creditcard.csv')

In [193]:
# Checking "Missing & Duplicated values"

In [194]:
df.duplicated().sum()

1081

In [195]:
df = df.drop_duplicates()

In [196]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 283726 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    283726 non-null  float64
 1   V1      283726 non-null  float64
 2   V2      283726 non-null  float64
 3   V3      283726 non-null  float64
 4   V4      283726 non-null  float64
 5   V5      283726 non-null  float64
 6   V6      283726 non-null  float64
 7   V7      283726 non-null  float64
 8   V8      283726 non-null  float64
 9   V9      283726 non-null  float64
 10  V10     283726 non-null  float64
 11  V11     283726 non-null  float64
 12  V12     283726 non-null  float64
 13  V13     283726 non-null  float64
 14  V14     283726 non-null  float64
 15  V15     283726 non-null  float64
 16  V16     283726 non-null  float64
 17  V17     283726 non-null  float64
 18  V18     283726 non-null  float64
 19  V19     283726 non-null  float64
 20  V20     283726 non-null  float64
 21  V21     283726 

In [197]:
df.isnull().sum().sum()

0

In [198]:
# Distribution of legit transaction and fradulent transactions



In [199]:
df.Class.value_counts()

Class
0    283253
1       473
Name: count, dtype: int64

In [200]:
# Seperating data into 2 parts.

In [201]:
legit_df = df[df.Class == 0]
fraud_df = df[df.Class ==1]

In [202]:
print(legit_df.shape)
print(fraud_df.shape)

(283253, 31)
(473, 31)


In [203]:
# Statistical measures.

In [204]:
legit_df.Amount.describe()

count    283253.000000
mean         88.413575
std         250.379023
min           0.000000
25%           5.670000
50%          22.000000
75%          77.460000
max       25691.160000
Name: Amount, dtype: float64

In [205]:
df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94835.058093,0.013439,-0.009829,0.012853,-0.01044,0.006769,0.001251,0.010447,-0.002448,0.002613,...,-0.000489,-0.00115,-0.00016,0.00036,0.000393,-0.000301,6.5e-05,0.001409,0.000418,88.413575
1,80450.513742,-4.49828,3.405965,-6.729599,4.472591,-2.957197,-1.432518,-5.175912,0.953255,-2.522124,...,0.405043,0.46655,0.086639,-0.096464,-0.106643,0.040615,0.050456,0.213774,0.07827,123.87186


### Under-Sampling

In [206]:
# Randomlly take 473 data sample from legit_df.Because our fradulent data eqauls 473. We need to make balance before modelling.

In [207]:
legit_sampledf = legit_df.sample(n = 473)

In [208]:
# Concatenating 2 DataFrames

In [209]:
new_df = pd.concat([legit_sampledf,fraud_df],axis =0)

In [210]:
new_df.Class.value_counts()

Class
0    473
1    473
Name: count, dtype: int64

In [211]:
new_df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,93084.463002,-0.0367,-0.036567,-0.029872,0.042712,0.083042,-0.032446,-0.04555,-0.028822,0.007778,...,-0.007074,-0.001948,0.014713,-0.064953,0.058113,-0.018282,0.000751,0.00305,-0.02545,86.077061
1,80450.513742,-4.49828,3.405965,-6.729599,4.472591,-2.957197,-1.432518,-5.175912,0.953255,-2.522124,...,0.405043,0.46655,0.086639,-0.096464,-0.106643,0.040615,0.050456,0.213774,0.07827,123.87186


In [None]:
# After under sampling, we make mean of Classes by features. In that way we make insight how different our sample than before.

In [212]:
#Train & Test


In [213]:
x = new_df.drop(['Class'],axis = 1)
y = new_df.Class

In [214]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=13)

In [None]:
#Scaling

In [275]:
mm = MinMaxScaler()
x_train = mm.fit_transform(x_train)
x_test = mm.fit_transform(x_test)

In [276]:
#Modelling


In [277]:
lgm = LogisticRegression()

In [278]:
model = lgm.fit(x_train,y_train)

In [279]:
y_pred = model.predict(x_test)

In [280]:
#Model Evaluation

In [281]:
model.score(x_train,y_train)

0.9298941798941799

In [282]:
acc_scr = model.score(x_test,y_test)
print("Accuracy score on Test Data :", acc_scr)
recall = recall_score(y_pred,y_test)
print("Recall score on Test Data :",recall)
precision = precision_score(y_pred,y_test)
print("Precision score on Test Data :", precision)




Accuracy score on Test Data : 0.9526315789473684
Recall score on Test Data : 0.9772727272727273
Precision score on Test Data : 0.9247311827956989


In [283]:
cnf= confusion_matrix(y_pred,y_test)
print("Confusion matrix on Test Data :", cnf)

Confusion matrix on Test Data : [[95  7]
 [ 2 86]]
