### Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the card-transaction CSV into a DataFrame and preview it
data = pd.read_csv(filepath_or_buffer="card_transdata.csv")
data

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.311140,1.945940,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
999995,2.207101,0.112651,1.626798,1.0,1.0,0.0,0.0,0.0
999996,19.872726,2.683904,2.778303,1.0,1.0,0.0,0.0,0.0
999997,2.914857,1.472687,0.218075,1.0,1.0,0.0,1.0,0.0
999998,4.258729,0.242023,0.475822,1.0,0.0,0.0,1.0,0.0


In [3]:
# dataset information: column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   distance_from_home              1000000 non-null  float64
 1   distance_from_last_transaction  1000000 non-null  float64
 2   ratio_to_median_purchase_price  1000000 non-null  float64
 3   repeat_retailer                 1000000 non-null  float64
 4   used_chip                       1000000 non-null  float64
 5   used_pin_number                 1000000 non-null  float64
 6   online_order                    1000000 non-null  float64
 7   fraud                           1000000 non-null  float64
dtypes: float64(8)
memory usage: 61.0 MB


In [4]:
# summary statistics (count, mean, std, min, quartiles, max) for every column
data.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,26.628792,5.036519,1.824182,0.881536,0.350399,0.100608,0.650552,0.087403
std,65.390784,25.843093,2.799589,0.323157,0.477095,0.300809,0.476796,0.282425
min,0.004874,0.000118,0.004399,0.0,0.0,0.0,0.0,0.0
25%,3.878008,0.296671,0.475673,1.0,0.0,0.0,0.0,0.0
50%,9.96776,0.99865,0.997717,1.0,0.0,0.0,1.0,0.0
75%,25.743985,3.355748,2.09637,1.0,1.0,0.0,1.0,0.0
max,10632.723672,11851.104565,267.802942,1.0,1.0,1.0,1.0,1.0


In [5]:
# peek at the first five rows
data.head(n=5)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


In [6]:
# peek at the last five rows
data.tail(n=5)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
999995,2.207101,0.112651,1.626798,1.0,1.0,0.0,0.0,0.0
999996,19.872726,2.683904,2.778303,1.0,1.0,0.0,0.0,0.0
999997,2.914857,1.472687,0.218075,1.0,1.0,0.0,1.0,0.0
999998,4.258729,0.242023,0.475822,1.0,0.0,0.0,1.0,0.0
999999,58.108125,0.31811,0.38692,1.0,1.0,0.0,1.0,0.0


In [7]:
# count the missing (NaN) entries in each column
data.isna().sum()

distance_from_home                0
distance_from_last_transaction    0
ratio_to_median_purchase_price    0
repeat_retailer                   0
used_chip                         0
used_pin_number                   0
online_order                      0
fraud                             0
dtype: int64

In [8]:
# class balance: number of legitimate vs. fraudulent transactions
data.fraud.value_counts()

0.0    912597
1.0     87403
Name: fraud, dtype: int64

### This dataset is unbalanced

0.0 -> Normal Transaction
1.0 -> Fraudulent Transaction

In [9]:
# split the rows by class label so each group can be inspected on its own
legit = data.loc[data["fraud"] == 0]
fraud = data.loc[data["fraud"] == 1]

In [10]:
# (rows, columns) of the legitimate-transaction subset
legit.shape

(912597, 8)

In [11]:
# (rows, columns) of the fraudulent-transaction subset
fraud.shape

(87403, 8)

In [12]:
# summary statistics of the legitimate transactions only
legit.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,912597.0,912597.0,912597.0,912597.0,912597.0,912597.0,912597.0,912597.0
mean,22.832976,4.301391,1.423642,0.881672,0.359402,0.109944,0.622225,0.0
std,52.828655,22.472359,1.946152,0.322997,0.479825,0.312821,0.484831,0.0
min,0.004874,0.000118,0.004399,0.0,0.0,0.0,0.0,0.0
25%,3.828942,0.293859,0.449768,1.0,0.0,0.0,0.0,0.0
50%,9.673847,0.985074,0.91395,1.0,0.0,0.0,1.0,0.0
75%,24.158057,3.268578,1.788918,1.0,1.0,0.0,1.0,0.0
max,8777.13642,11851.104565,267.802942,1.0,1.0,1.0,1.0,0.0


In [13]:
# summary statistics of the fraudulent transactions only
fraud.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,87403.0,87403.0,87403.0,87403.0,87403.0,87403.0,87403.0,87403.0
mean,66.261876,12.712185,6.006323,0.880119,0.256399,0.003123,0.946318,1.0
std,134.391608,47.997697,5.56432,0.324825,0.436647,0.055801,0.225391,0.0
min,0.025847,0.000407,0.011966,0.0,0.0,0.0,0.0,1.0
25%,4.585729,0.328199,3.50027,1.0,0.0,0.0,1.0,1.0
50%,15.454219,1.157631,5.071294,1.0,0.0,0.0,1.0,1.0
75%,101.110104,4.598504,7.331222,1.0,1.0,0.0,1.0,1.0
max,10632.723672,2160.499922,266.689692,1.0,1.0,1.0,1.0,1.0


In [14]:
# per-class mean of every feature, to see where the two classes differ
data.groupby(by="fraud").mean()

Unnamed: 0_level_0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,22.832976,4.301391,1.423642,0.881672,0.359402,0.109944,0.622225
1.0,66.261876,12.712185,6.006323,0.880119,0.256399,0.003123,0.946318


### under sampling
Build a sample dataset containing an equal number of normal and fraudulent transactions.
#### Number of fraudulent transactions -> 87403

In [15]:
# Draw as many legitimate rows as there are fraudulent ones. The size is taken
# from `fraud` instead of being hard-coded, and random_state pins the draw so
# that Restart & Run All reproduces the same sample.
legit_sample = legit.sample(n=len(fraud), random_state=2)
legit_sample

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
553897,0.868642,0.153422,2.067156,0.0,0.0,0.0,1.0,0.0
252447,5.803407,0.300151,0.633564,1.0,1.0,0.0,0.0,0.0
229070,14.905105,0.211472,3.342466,1.0,1.0,1.0,1.0,0.0
414500,1.201834,3.351519,1.334104,0.0,0.0,0.0,0.0,0.0
224462,22.500800,0.184560,0.336541,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
702773,33.587174,2.229969,0.219423,1.0,0.0,0.0,1.0,0.0
24507,5.199014,0.201063,2.352227,1.0,0.0,0.0,1.0,0.0
317734,55.886999,1.255651,1.453254,1.0,0.0,0.0,1.0,0.0
391948,25.887453,0.908566,0.259493,1.0,1.0,0.0,1.0,0.0


###  Concatenating two dataframes

In [16]:
# stack the sampled legitimate rows on top of all fraudulent rows
new_data = pd.concat(objs=[legit_sample, fraud], axis=0)
new_data

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
553897,0.868642,0.153422,2.067156,0.0,0.0,0.0,1.0,0.0
252447,5.803407,0.300151,0.633564,1.0,1.0,0.0,0.0,0.0
229070,14.905105,0.211472,3.342466,1.0,1.0,1.0,1.0,0.0
414500,1.201834,3.351519,1.334104,0.0,0.0,0.0,0.0,0.0
224462,22.500800,0.184560,0.336541,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
999908,45.296658,0.882736,8.856861,1.0,0.0,0.0,1.0,1.0
999916,167.139756,0.282253,0.308468,1.0,0.0,0.0,1.0,1.0
999919,124.640118,0.004416,0.434885,1.0,0.0,0.0,1.0,1.0
999939,51.412900,3.429330,29.914254,1.0,0.0,0.0,1.0,1.0


In [17]:
# class counts in the under-sampled dataset
new_data["fraud"].value_counts()

1.0    87403
0.0    87403
Name: fraud, dtype: int64

In [18]:
# per-class feature means in the under-sampled dataset
new_data.groupby("fraud").mean()

Unnamed: 0_level_0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,23.220706,4.404421,1.423714,0.881697,0.361143,0.109893,0.623056
1.0,66.261876,12.712185,6.006323,0.880119,0.256399,0.003123,0.946318


### splitting the data into features and target

In [19]:
# Select the target by name rather than by position, so the split stays
# correct even if the column order of the CSV changes.
x = new_data.drop(columns="fraud")
y = new_data["fraud"]

In [20]:
# feature matrix: every column except `fraud`
x

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
553897,0.868642,0.153422,2.067156,0.0,0.0,0.0,1.0
252447,5.803407,0.300151,0.633564,1.0,1.0,0.0,0.0
229070,14.905105,0.211472,3.342466,1.0,1.0,1.0,1.0
414500,1.201834,3.351519,1.334104,0.0,0.0,0.0,0.0
224462,22.500800,0.184560,0.336541,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...
999908,45.296658,0.882736,8.856861,1.0,0.0,0.0,1.0
999916,167.139756,0.282253,0.308468,1.0,0.0,0.0,1.0
999919,124.640118,0.004416,0.434885,1.0,0.0,0.0,1.0
999939,51.412900,3.429330,29.914254,1.0,0.0,0.0,1.0


In [21]:
# target labels (the `fraud` column) of the under-sampled dataset
y

553897    0.0
252447    0.0
229070    0.0
414500    0.0
224462    0.0
         ... 
999908    1.0
999916    1.0
999919    1.0
999939    1.0
999949    1.0
Name: fraud, Length: 174806, dtype: float64

### splitting the data into training data & testing data

In [22]:
# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class ratio, and fix the seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [23]:
# Standardize the features. The scaler's statistics are learned from the
# training split only and then reused on the test split.
# NOTE: x_train / x_test are overwritten here, so re-running this cell on its
# own would refit the scaler on already-scaled data; re-run from the split
# cell instead.
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

#### Model Training
### Logistic Regression

In [24]:
# logistic-regression classifier created with no arguments (library defaults)
model=LogisticRegression()

In [25]:
# fit the classifier on the scaled training features and their labels
model.fit(X=x_train, y=y_train)

LogisticRegression()

#### Model Evaluation
Accuracy Score

In [26]:
# accuracy on training data
# accuracy_score takes (y_true, y_pred): the ground-truth labels go first.
# Accuracy happens to be symmetric, but keeping the documented order avoids
# silently wrong results if the metric is later swapped for an asymmetric one.
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)
training_data_accuracy

0.9401332913818254

In [27]:
# accuracy on test data
# accuracy_score takes (y_true, y_pred): the ground-truth labels go first.
# Accuracy happens to be symmetric, but keeping the documented order avoids
# silently wrong results if the metric is later swapped for an asymmetric one.
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)
test_data_accuracy

0.9394771466163263

In [28]:
# Classify one hand-written transaction. The values are wrapped in a DataFrame
# that carries the training column names (in training order), because the
# scaler was fitted on a DataFrame; this also documents what each number means.
sample_transaction = pd.DataFrame(
    [
        {
            "distance_from_home": 24.956183,
            "distance_from_last_transaction": 0.154885,
            "ratio_to_median_purchase_price": 1.558342,
            "repeat_retailer": 1.0,
            "used_chip": 0.0,
            "used_pin_number": 1.0,
            "online_order": 1.0,
        }
    ]
)
model.predict(sc.transform(sample_transaction))

array([0.])