In [31]:
# Show the result of every top-level expression in a cell, not only the last one.
# The cell that calls value_counts() three times in a row depends on this setting
# to display all three tables.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt

In [32]:
# Load the cleaned transactions table; every later cell works on (and adds columns to) `df`.
# NOTE(review): the `Unnamed: 0` column in the preview looks like a row index that was
# written into the CSV — confirm before relying on or dropping it.
df = pd.read_csv("data/bs_clean.csv")
df.head()

Unnamed: 0,step,customer,age,gender,merchant,category,amount,fraud
0,0,C1093826151,4,M,M348934600,transportation,4.55,0
1,0,C352968107,2,M,M348934600,transportation,39.68,0
2,0,C2054744914,4,F,M1823072687,transportation,26.89,0
3,0,C1760612790,3,M,M348934600,transportation,17.25,0
4,0,C757503768,5,M,M348934600,transportation,35.72,0


In [33]:
# IQR (Tukey fence) outlier flag on `amount`.
# Q1, Q3 and IQR stay module-level because the next cell reuses them.
Q1 = df["amount"].quantile(0.25)
Q3 = df["amount"].quantile(0.75)
IQR = Q3 - Q1

# Both fences are checked. The previous version only tested the upper fence,
# which gives the same labels whenever no amount lies below the lower fence
# but silently misses low outliers otherwise.
iqr_low = Q1 - 1.5 * IQR
iqr_high = Q3 + 1.5 * IQR

# 1 = outside either fence, 0 = inside. Values exactly on a fence and missing
# amounts compare False and therefore stay 0, as before.
df["IQR_outlier"] = ((df["amount"] < iqr_low) | (df["amount"] > iqr_high)).astype(int)
df.head()

Unnamed: 0,step,customer,age,gender,merchant,category,amount,fraud,IQR_outlier
0,0,C1093826151,4,M,M348934600,transportation,4.55,0,0
1,0,C352968107,2,M,M348934600,transportation,39.68,0,0
2,0,C2054744914,4,F,M1823072687,transportation,26.89,0,0
3,0,C1760612790,3,M,M348934600,transportation,17.25,0,0
4,0,C757503768,5,M,M348934600,transportation,35.72,0,0


In [34]:
# Report the two IQR fences for `amount`, built from the quartiles computed earlier.
whisker = 1.5 * IQR
lowerBound, upperBound = Q1 - whisker, Q3 + whisker
# The lower fence is negative in the recorded run, so presumably no amount can fall below it.
print("The lower outlier bound for amount is: ", lowerBound)
print("The upper outlier bound for amount is: ", upperBound)

The lower outlier bound for amount is:  -29.459999999999994
The upper outlier bound for amount is:  85.74


In [35]:
# Shape of the rows above the upper IQR fence. Uses the computed `upperBound`
# instead of a pasted literal so the count follows the data if the file changes.
df[df["amount"] > upperBound].shape

(25798, 9)

In [36]:
# Disabled strip plot: amount per step, coloured by fraud, with the upper IQR fence in red.
#sns.set(rc = {'figure.figsize':(15,8)})
#graph = sns.stripplot(data=df, x="step", y="amount", hue="fraud")
#graph.axhline(Q3 + 1.5*IQR, color = 'r')
#plt.xticks(np.arange(0, 180, 18))
#plt.show()

# Average transaction amount for each merchant category.
df.groupby("category")["amount"].mean()

category
barsandrestaurants      43.461014
content                 44.547571
fashion                 65.666642
food                    37.070405
health                 135.621367
home                   165.670846
hotelservices          205.614249
hyper                   45.970421
leisure                288.911303
otherservices          135.881524
sportsandtoy           215.715280
tech                   120.947937
transportation          26.958187
travel                2250.409190
wellnessandbeauty       65.511221
Name: amount, dtype: float64

In [37]:
# Global z-score screen on `amount`.
# `mean` and `std` stay module-level because the cell that builds the
# `z_score` column reuses them.
mean = df["amount"].mean()
# ddof=0 keeps the population standard deviation that np.std(...) returned here before.
std = df["amount"].std(ddof=0)
print('mean of the dataset is', mean)
print('std. deviation is', std)

# Cut-off in standard deviations; |z| strictly greater than this counts as an outlier.
threshold = 1.5

# Vectorised replacement for the former row-by-row Python loop: same values,
# same order, but computed in one pass by pandas.
amount_z = (df["amount"] - mean) / std
outlier = df.loc[amount_z.abs() > threshold, "amount"].tolist()
print('The number of outliers in the dataset is', len(outlier))


mean of the dataset is 37.89013530807561
std. deviation is 111.40273725877348
The number of outliers in the dataset is 7341


In [38]:
# List the distinct merchant categories, one per line, in order of first appearance.
categories = df["category"].unique()

for category_name in categories:
    print(category_name)

transportation
health
otherservices
food
hotelservices
barsandrestaurants
tech
sportsandtoy
wellnessandbeauty
hyper
fashion
home
content
travel
leisure


In [39]:
# Per-category z-score screen on `amount`.
# For every category: mean, population std (ddof=0, matching np.std) and the
# number of rows whose |z| exceeds `threshold`, collected as a list of dicts.
#
# An earlier note here reported NaN results. A zero or NaN spread makes the
# z-score undefined, so such a category is reported with 0 outliers instead of
# being divided through.

categories = df["category"].unique()
threshold = 1.5
stats = []

for cat in categories:
    cat_amount = df.loc[df["category"] == cat, "amount"]

    cat_mean = cat_amount.mean()
    cat_std = cat_amount.std(ddof=0)

    if cat_std > 0:
        # Vectorised replacement for the former per-row loop.
        cat_z = (cat_amount - cat_mean) / cat_std
        outlier = cat_amount[cat_z.abs() > threshold].tolist()
    else:
        # Constant, empty or all-missing category: no value can exceed the cut-off.
        outlier = []

    # The count is stored once per category. It used to be assigned inside the
    # row loop, which left the key missing whenever a category selected no rows.
    stats.append({
        'category': cat,
        'mean': cat_mean,
        'std': cat_std,
        'num_of_outliers': len(outlier),
    })

print(stats)


[{'category': 'transportation', 'mean': 26.958187001478866, 'std': 17.527860309506565, 'num_of_outliers': 48662}, {'category': 'health', 'mean': 135.6213667637761, 'std': 153.1560169711103, 'num_of_outliers': 887}, {'category': 'otherservices', 'mean': 135.881524122807, 'std': 158.53973184764058, 'num_of_outliers': 74}, {'category': 'food', 'mean': 37.07040489068333, 'std': 24.99849330783595, 'num_of_outliers': 2266}, {'category': 'hotelservices', 'mean': 205.614248853211, 'std': 225.4457247642073, 'num_of_outliers': 181}, {'category': 'barsandrestaurants', 'mean': 43.4610136513416, 'std': 38.400959515453344, 'num_of_outliers': 325}, {'category': 'tech', 'mean': 120.94793670886075, 'std': 127.01787707562144, 'num_of_outliers': 111}, {'category': 'sportsandtoy', 'mean': 215.71527986006998, 'std': 215.64543192606433, 'num_of_outliers': 429}, {'category': 'wellnessandbeauty', 'mean': 65.51122099960229, 'std': 65.40673775291049, 'num_of_outliers': 664}, {'category': 'hyper', 'mean': 45.970

In [40]:
# Add the global z-score of `amount` and a 0/1 outlier flag derived from it.
# `mean` and `std` come from the global z-score cell above.
df["z_score"] = (df["amount"] - mean) / std

# |z| > 1.5 marks an outlier. This replaces an np.select whose fallback condition
# used `|` and was therefore always true; it only produced the right labels
# because np.select takes the first matching condition. Rows with z exactly
# +/-1.5 or a missing z-score stay 0, as before.
z_cutoff = 1.5
df["Z_outlier"] = (df["z_score"].abs() > z_cutoff).astype(int)
df.head()

Unnamed: 0,step,customer,age,gender,merchant,category,amount,fraud,IQR_outlier,z_score,Z_outlier
0,0,C1093826151,4,M,M348934600,transportation,4.55,0,0,-0.299276,0
1,0,C352968107,2,M,M348934600,transportation,39.68,0,0,0.016067,0
2,0,C2054744914,4,F,M1823072687,transportation,26.89,0,0,-0.098742,0
3,0,C1760612790,3,M,M348934600,transportation,17.25,0,0,-0.185275,0
4,0,C757503768,5,M,M348934600,transportation,35.72,0,0,-0.01948,0


In [41]:
# Class balance of the true fraud label next to both outlier flags.
# These are kept as three top-level expressions so that each one is displayed.
df["fraud"].value_counts()
df["IQR_outlier"].value_counts()
df["Z_outlier"].value_counts()

0    587443
1      7200
Name: fraud, dtype: int64

0    568845
1     25798
Name: IQR_outlier, dtype: int64

0    587302
1      7341
Name: Z_outlier, dtype: int64

In [43]:
# Summary statistics of `amount` within each merchant category.
df.groupby("category")["amount"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
barsandrestaurants,6373.0,43.461014,38.403973,0.01,17.17,36.27,61.49,695.63
content,885.0,44.547571,33.155431,0.01,17.5,38.43,65.2,185.13
fashion,6454.0,65.666642,55.487908,0.01,26.655,55.615,91.8175,773.61
food,26254.0,37.070405,24.998969,0.0,16.9625,33.475,53.51,154.91
health,16133.0,135.621367,153.160764,0.03,45.3,97.89,170.55,1972.81
home,1986.0,165.670846,192.664019,0.13,53.675,111.31,197.4975,1540.23
hotelservices,1744.0,205.614249,225.510387,0.02,64.59,131.94,229.895,1429.04
hyper,6098.0,45.970421,45.538458,0.02,17.93,36.75,60.6775,488.02
leisure,499.0,288.911303,95.816247,38.74,233.265,296.66,353.905,592.03
otherservices,912.0,135.881524,158.626722,0.06,38.6325,87.28,154.1625,964.3


In [44]:
# Rows where the IQR flag disagrees with the true fraud label, counted per category.
mfups = df.loc[df["IQR_outlier"].ne(df["fraud"])]
mfups.groupby("category")["fraud"].count()

category
barsandrestaurants     601
content                105
fashion               1750
food                  1091
health                7676
home                   966
hotelservices          749
hyper                  499
leisure                  9
otherservices          302
sportsandtoy          1225
tech                  1099
transportation         869
travel                 142
wellnessandbeauty     3507
Name: fraud, dtype: int64

In [42]:
# Number of transactions per category, split by the true fraud label.
df.groupby("fraud")["category"].value_counts()

fraud  category          
0      transportation        505119
       food                   26254
       health                 14437
       wellnessandbeauty      14368
       fashion                 6338
       barsandrestaurants      6253
       hyper                   5818
       tech                    2212
       sportsandtoy            2020
       home                    1684
       hotelservices           1196
       content                  885
       otherservices            684
       travel                   150
       leisure                   25
1      sportsandtoy            1982
       health                  1696
       wellnessandbeauty        718
       travel                   578
       hotelservices            548
       leisure                  474
       home                     302
       hyper                    280
       otherservices            228
       tech                     158
       barsandrestaurants       120
       fashion                  116
Na

In [47]:
# Number of transactions per category, split by the z-score outlier flag.
df.groupby("Z_outlier")["category"].value_counts()

Z_outlier  category          
0          transportation        505119
           food                   26254
           wellnessandbeauty      14703
           health                 13349
           fashion                 6372
           barsandrestaurants      6340
           hyper                   6000
           sportsandtoy            2592
           tech                    2045
           home                    1522
           hotelservices           1234
           content                  885
           otherservices            747
           leisure                   88
           travel                    52
1          health                  2784
           sportsandtoy            1410
           travel                   676
           hotelservices            510
           home                     464
           leisure                  411
           wellnessandbeauty        383
           tech                     325
           otherservices            165
          

In [45]:
# Rows where the z-score flag disagrees with the true fraud label.
fups = df.loc[df["Z_outlier"].ne(df["fraud"])]

In [46]:
# How many z-score/fraud disagreements fall into each category.
fups.groupby("category")["fraud"].count()

category
barsandrestaurants      87
fashion                 80
health                2096
home                   314
hotelservices          256
hyper                  182
leisure                 63
otherservices           95
sportsandtoy           774
tech                   249
travel                 142
wellnessandbeauty      413
Name: fraud, dtype: int64

In [48]:
# Amount distribution of the z-score/fraud disagreements, per category.
fups.groupby("category")["amount"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
barsandrestaurants,87.0,97.876897,59.005664,2.37,44.06,95.16,142.535,203.28
fashion,80.0,140.061625,76.783754,4.35,62.1925,141.215,211.5425,269.39
health,2096.0,218.509976,84.22643,0.03,205.8175,230.155,265.86,468.81
home,314.0,228.047484,88.928099,0.19,205.5275,229.51,279.015,523.11
hotelservices,256.0,159.368789,87.729099,0.41,80.4825,178.15,225.51,345.87
hyper,182.0,100.178407,61.745284,1.13,45.47,96.35,159.94,203.8
leisure,63.0,161.879365,34.562578,45.46,147.34,170.53,188.695,202.93
otherservices,95.0,125.645579,71.129121,7.66,65.63,127.24,189.605,298.81
sportsandtoy,774.0,119.334457,71.83668,0.42,59.455,116.925,173.925,374.46
tech,249.0,227.136827,72.764216,9.71,211.74,231.52,264.0,454.83


In [51]:
# Column means for the transportation rows only.
# numeric_only=True is required because the frame also holds string columns
# (customer, gender, merchant, category); without it DataFrame.mean() warns on
# older pandas and raises TypeError on pandas 2.x.
df[df["category"] == "transportation"].mean(numeric_only=True)

In [50]:
# TODO: split the dataframe by category, compute z-scores within each category, and assign an outlier label from them
# TODO: count how many z-score outliers are actual frauds (true positives)
# TODO: try sklearn outlier-detection methods that might match the true frauds better