In [64]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [65]:
data = pd.read_csv("creditcard.csv")

In [66]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


<h3>Time referencing to time between transactions</h3>
<h3>n £ N, Vn refers to numerical values referencing to transaction details. The name is Vn due to privacy protect for credit card owners</h3>
<h3>Amout in US dollars</h3>
<h3>Class is result: 0 : Normal transaction, 1: Fraud transaction</h3>



In [67]:
data.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

<h5>Data understanding and preprocess</h5>


In [68]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [69]:
data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

<h5> => No missing values</h5>

In [70]:
data.groupby('Class').count()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,284315,284315,284315,284315,284315,284315,284315,284315,284315,284315,...,284315,284315,284315,284315,284315,284315,284315,284315,284315,284315
1,492,492,492,492,492,492,492,492,492,492,...,492,492,492,492,492,492,492,492,492,492


<h5>=> Data is unbalanced</h5>

In [71]:
legit = data[data["Class"] == 0]
fraud = data[data["Class"] == 1]

In [72]:
legit.shape

(284315, 31)

In [73]:
fraud.shape

(492, 31)

In [74]:
legit.Amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [75]:
fraud.Amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [76]:
data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


<h5>Under-Sampling to resolve the problem of unbalanced data</h5>

In [77]:
legit_sample = legit.sample(492)

In [78]:
legit_sample.shape

(492, 31)

In [79]:
bdata = pd.concat([legit_sample,fraud], axis=0)

In [80]:
bdata.shape

(984, 31)

In [81]:
data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


<h5>Data splitting</h5>

In [82]:
X = bdata.drop(columns='Class', axis=1)
Y = bdata["Class"]

In [83]:
X

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
57333,47864.0,-0.715584,0.652169,2.096325,0.971325,0.193529,1.038692,-0.068082,0.099158,0.551622,...,-0.001206,0.155796,0.639989,-0.386029,-0.929767,-0.077347,-0.092598,-0.401268,-0.065969,1.00
273172,165459.0,-0.494970,0.373064,0.410466,-1.986520,-1.044735,0.725061,-2.820192,-5.178839,-0.740618,...,1.221939,-2.189379,1.728706,-0.127462,-0.574761,1.018338,0.037727,0.043171,0.261651,7.95
150043,92444.0,-1.163309,0.230199,-0.802871,-0.978703,2.497977,3.652116,-0.628092,1.538586,1.225473,...,0.317659,-0.302959,-1.060334,0.275733,0.500167,-0.187218,-0.000532,0.115069,-0.048043,90.00
253802,156426.0,-1.037862,1.028276,0.731000,-0.790329,-0.801560,-0.773133,-0.320124,0.819421,0.212170,...,-0.383727,-0.064629,-0.438965,0.113735,0.014831,-0.190000,-0.281377,-0.340262,-0.143519,3.70
189604,128479.0,2.026496,-0.258936,-0.523166,0.247482,-0.349581,-0.291949,-0.510631,0.006677,1.198420,...,-0.201278,-0.161938,-0.384397,0.388322,0.525637,-0.400600,-0.637905,0.028110,-0.026223,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.882850,0.697211,-2.064945,...,1.252967,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.292680,0.147968,390.00
280143,169347.0,1.378559,1.289381,-5.004247,1.411850,0.442581,-1.326536,-1.413170,0.248525,-1.127396,...,0.226138,0.370612,0.028234,-0.145640,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76
280149,169351.0,-0.676143,1.126366,-2.213700,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.652250,...,0.247968,0.751826,0.834108,0.190944,0.032070,-0.739695,0.471111,0.385107,0.194361,77.89
281144,169966.0,-3.113832,0.585864,-5.399730,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.306271,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.253700,245.00


In [84]:
Y

57333     0
273172    0
150043    0
253802    0
189604    0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64

In [85]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

<h5>Model</h5>

In [86]:
model = LogisticRegression()

In [87]:
model.fit(X_train, Y_train)

In [88]:
X_train_pred = model.predict(X_train)

In [89]:
Trainacc = accuracy_score(X_train_pred, Y_train)
print(Trainacc)

0.9428208386277002


In [90]:
X_test_pred = model.predict(X_test)
Testacc = accuracy_score(X_test_pred, Y_test)

In [91]:
print(Testacc)

0.9137055837563451


In [92]:
bdata.sample(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
165679,117591.0,-0.789173,-0.590835,-0.062895,-1.479803,1.966075,4.372635,0.783475,0.819759,-0.01076,...,0.377749,0.701276,0.407895,0.708012,-0.201908,0.326229,-0.1166,-0.055032,270.0,0
177195,123078.0,-1.07382,0.415616,-2.273977,1.536844,-0.758697,-1.670381,-2.37714,0.09037,0.004847,...,0.535542,0.863592,0.450743,-0.144228,-0.205609,-0.539073,0.503418,-0.237807,11.0,1
37046,38786.0,1.204908,0.225886,0.625681,0.549949,-0.499266,-0.779597,-0.021088,-0.085416,-0.265978,...,-0.190009,-0.573591,0.147123,0.537075,0.168769,0.063733,-0.030821,0.011287,0.89,0
21826,31880.0,-1.364276,-0.404955,1.237717,1.102449,0.337029,-0.523323,1.04512,-0.051326,-1.133407,...,0.396504,0.497203,0.448393,0.332648,0.216338,-0.29938,0.013806,0.149579,250.0,0
150207,92801.0,-0.464714,2.346534,0.393417,4.443277,1.099339,0.122118,1.243171,-0.446801,-0.196856,...,-0.243269,0.237748,-0.164465,-0.161347,-0.373135,0.343487,0.578645,0.236812,7.5,0
157790,110434.0,1.926939,-0.17067,-0.732108,0.618271,-0.187101,-0.012426,-0.796236,0.089029,2.145785,...,0.091421,0.632797,0.135278,0.607349,-0.275592,0.552918,-0.044619,-0.030794,19.99,0
43061,41353.0,-15.020981,8.07524,-16.298091,5.66482,-11.918153,-4.246957,-14.716668,9.435084,-6.795398,...,2.525115,-0.832074,-0.186117,0.429781,0.697103,0.056031,-1.310888,-0.707403,34.12,1
218442,141320.0,-6.352337,-2.370335,-4.875397,2.335045,-0.809555,-0.413647,-4.082308,2.239089,-1.98636,...,1.325218,1.226745,-1.485217,-1.470732,-0.240053,0.112972,0.910591,-0.650944,195.66,1
215132,139951.0,-2.921944,-0.228062,-5.877289,2.201884,-1.93544,0.631141,-1.245106,1.511348,-1.899987,...,1.441622,0.895528,1.385511,-2.028024,0.509131,0.172643,0.726781,0.234514,723.21,1
181966,125200.0,-0.769172,1.342212,-2.171454,-0.151513,-0.648374,-0.973504,-1.706658,0.313745,-1.982302,...,-0.036122,-0.753591,-0.047113,0.358493,-0.287407,0.476506,0.250531,0.250987,40.0,1


In [93]:
X_test.sample(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
103646,68745.0,-2.118559,1.002092,1.449173,2.715825,0.151568,2.461184,-0.063759,-0.885657,-1.325422,...,-0.765646,1.260185,-0.783653,0.424028,-1.01921,-0.38434,0.084483,0.059434,-0.33651,151.37
59543,48885.0,0.987982,-0.700968,1.152123,0.107703,-1.227214,0.218737,-0.840564,0.349589,0.942452,...,-0.013386,0.004357,0.008936,0.055275,0.291678,-0.072443,0.98544,-0.039573,0.014549,74.95
43160,41397.0,-14.970346,8.401421,-16.867238,8.252334,-13.56513,-2.782438,-14.263735,9.643419,-7.701499,...,-1.61466,2.714045,-0.101355,-0.439666,0.519514,0.789328,0.064357,-1.621386,-1.104819,273.01
251881,155542.0,1.868226,1.363077,-1.994934,4.173516,1.239751,-0.746186,0.572731,-0.131235,-1.551839,...,-0.172216,-0.301001,-0.818972,0.206812,-0.263683,-0.114958,-0.240672,-0.006629,0.017258,3.14
276114,166916.0,-1.583099,0.946614,-0.487788,0.629738,0.516535,-1.143112,0.678413,0.559451,-0.914078,...,-0.308042,0.136978,-0.015899,-0.537042,-0.083533,1.400881,-0.358941,-0.483268,-0.37705,33.29
58761,48533.0,1.243848,0.524526,-0.538884,1.209196,0.479538,-0.197429,0.049166,0.037792,0.128119,...,-0.171541,-0.05166,-0.084089,-0.192846,-0.917392,0.681953,-0.194419,0.045917,0.040136,1.0
141423,84313.0,-0.587964,0.637341,1.577151,0.07801,0.033304,0.255188,0.364271,0.319267,-0.050974,...,0.056836,0.064248,0.477493,-0.132197,0.059726,-0.246688,0.37635,0.363719,0.187754,17.11
49995,44296.0,-1.007648,0.722149,0.779027,-1.466661,0.298457,-0.263997,0.442429,0.400734,-0.591588,...,-0.016372,0.137489,0.171923,-0.205126,-0.650297,0.072522,1.391071,-0.167746,-0.019007,39.99
119298,75399.0,1.305915,-0.342757,-1.011567,-1.227429,1.722429,3.241043,-0.764172,0.827594,0.256746,...,0.095087,-0.176588,-0.684335,0.04603,1.055219,0.271706,0.918115,-0.071391,0.005627,19.21
42784,41243.0,-10.940739,6.261586,-14.182339,7.183602,-9.951363,-3.86082,-13.547302,7.096472,-6.294029,...,-0.088342,2.267448,-0.492029,-0.239303,0.454368,-0.101611,0.446997,0.062293,-0.43977,45.49
