In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the transactions; the converters force Merchnum and Merch_zip to
# Python strings instead of letting pandas infer a numeric dtype.
id_as_text = {'Merchnum': str, 'Merch_zip': str}
df = pd.read_excel('no_missing.xlsx', converters=id_as_text)

In [3]:
# Discard the helper columns that are not used below.
df = df.drop(columns=['Unnamed: 0', 'assign', 'minus'])

In [4]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96397 entries, 0 to 96396
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Merch_description       96397 non-null  object        
 1   Merchnum                96397 non-null  object        
 2   Recnum                  96397 non-null  int64         
 3   Cardnum                 96397 non-null  int64         
 4   Date                    96397 non-null  datetime64[ns]
 5   Merch_state             96397 non-null  object        
 6   Merch_zip               96397 non-null  object        
 7   Amount                  96397 non-null  float64       
 8   Fraud                   96397 non-null  int64         
 9   Merch_description_part  96397 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(3), object(5)
memory usage: 7.4+ MB


## 1. Benford's Law variables for Merchnum and Cardnum

In [5]:
# Work on an independent copy so `df` keeps the full, unmodified record list
# (it is used again when the final Recnum-indexed frame is built).
old = df.copy(deep=True)

In [6]:
# Text form of the amount; the first significant digit is read from it later.
old['stramount'] = old['Amount'].astype(str)

In [7]:
# Remove the decimal point. `regex=False` pins the '.' to a literal dot:
# the default of `regex` differs between pandas versions, and as a regular
# expression '.' is a wildcard that matches every character.
old['stramount'] = old['stramount'].str.replace('.', '', regex=False)
# The int round-trip drops leading zeros (e.g. '005' -> '5'), so the first
# character of the string is the first significant digit of the amount.
old['stramount'] = old['stramount'].astype(int).astype('str')

In [8]:
# Sanity check: list any rows whose digit string still starts with '0'.
leading_zero = old['stramount'].str.startswith('0')
old.loc[leading_zero]

Unnamed: 0,Merch_description,Merchnum,Recnum,Cardnum,Date,Merch_state,Merch_zip,Amount,Fraud,Merch_description_part,stramount


In [9]:
# Lower-case the descriptions so the substring filter that follows is
# case-insensitive.
old = old.assign(Merch_description=old['Merch_description'].str.lower())

In [10]:
# Drop every transaction whose (lower-cased) description contains 'fedex'.
# NOTE(review): presumably excluded because these amounts are not expected
# to follow Benford's law -- confirm the rationale.
# `.copy()` makes the result an independent frame, so the column assignments
# in the following cells do not raise SettingWithCopyWarning.
old = old[~old['Merch_description'].str.contains('fedex')].copy()

In [11]:
# First character of the digit string = leading digit of the amount.
old['digit_1'] = old['stramount'].str.get(0)

In [12]:
# Turn the leading digit into an integer so it can be compared numerically.
old = old.astype({'digit_1': int})

In [13]:
# Keep only the record id, the two entity keys and the leading digit.
# `.copy()` detaches the subset from the frame it was selected from, so the
# `n_low` / `n_high` assignments that follow do not raise
# SettingWithCopyWarning.
old = old[['Recnum','Merchnum','Cardnum','digit_1']].copy()

In [14]:
# Flag each transaction as "low" (leading digit 1 or 2) or "high" (3-9).
# Under Benford's law P(leading digit <= 2) = log10(3) ~= 0.477.
old['n_low'] = old['digit_1'].le(2)
old['n_high'] = old['digit_1'].gt(2)

In [15]:
# One (record, entity) key frame per entity type; each is self-joined with
# `old` below to collect the entity's transaction history.
new_card = old.loc[:, ['Recnum', 'Cardnum']]
new_merch = old.loc[:, ['Recnum', 'Merchnum']]

### 1.1 For each Cardnum

In [16]:
# Pair every record (Recnum_x) with every record of the same card (Recnum_y).
# NOTE(review): this self-join yields k**2 rows for a card with k records,
# so memory grows quadratically for very active cards.
benford_card = new_card.merge(old, on='Cardnum')

In [17]:
# Keep only pairs where the paired record is the record itself or one with a
# smaller Recnum, i.e. the card's history up to and including each record.
not_later = benford_card['Recnum_x'].ge(benford_card['Recnum_y'])
benford_card = benford_card.loc[not_later]

In [18]:
# Preview the filtered (Recnum_x, Recnum_y) pairs for the first card.
benford_card.head()

Unnamed: 0,Recnum_x,Cardnum,Recnum_y,Merchnum,digit_1,n_low,n_high
0,2,5142183973,2,61003026333,3,False,True
207,235,5142183973,2,61003026333,3,False,True
208,235,5142183973,235,456902380335,3,False,True
414,3639,5142183973,2,61003026333,3,False,True
415,3639,5142183973,235,456902380335,3,False,True


In [19]:
# Per record: how many low-digit and high-digit transactions the card has
# made so far (summing the boolean flags counts the True values).
running_totals = {'n_low': 'sum', 'n_high': 'sum'}
benford_card = benford_card.groupby(by=['Recnum_x', 'Cardnum']).agg(running_totals)

In [20]:
# Spot check one (Recnum_x, Cardnum) group: record 235 of card 5142183973
# should count the card's records up to and including 235.
benford_card.loc[(235,5142183973),]

n_low     0.0
n_high    2.0
Name: (235, 5142183973), dtype: float64

In [21]:
# Preview the running low/high counts, indexed by (Recnum_x, Cardnum).
benford_card.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n_low,n_high
Recnum_x,Cardnum,Unnamed: 2_level_1,Unnamed: 3_level_1
2,5142183973,0.0,1.0
3,5142131721,1.0,0.0
8,5142191182,1.0,0.0
9,5142258629,0.0,1.0
14,5142124791,1.0,0.0


In [22]:
# Parameters of the logistic weight applied to the Benford score, where
# t = (n - n_mid) / c and weight = 1 / (1 + exp(-t)):
#   n_mid -- transaction count at which the weight equals 0.5
#   c     -- scale of the transition (larger = more gradual)
n_mid, c = 15, 3

In [23]:
# Benford "unusualness" per card, evaluated column by column (each lambda
# sees the columns assigned before it):
#   n_low / n_high : zero counts are replaced by 1 so the ratio is finite
#   n              : total of the two (smoothed) counts
#   R              : low/high ratio scaled by 1.096, the high/low ratio that
#                    Benford's law predicts ((1 - log10(3)) / log10(3)), so
#                    R ~= 1 for conforming data
#   U              : max(R, 1/R), i.e. the deviation in either direction
#   U_star_card    : U pulled towards 1 by a logistic weight in n, so cards
#                    with few transactions score close to 1
# NOTE(review): n is computed after the zero -> 1 replacement, so it is one
# larger than the real transaction count whenever one bucket was empty --
# confirm this is intended.
benford_card = benford_card.assign(**{
    'n_low': lambda d: d['n_low'].replace(0, 1),
    'n_high': lambda d: d['n_high'].replace(0, 1),
    'n': lambda d: d['n_low'] + d['n_high'],
    'R': lambda d: d['n_low'] / d['n_high'] * 1.096,
    '1/R': lambda d: 1 / d['R'],
    't': lambda d: (d['n'] - n_mid) / c,
    'U': lambda d: d[['R', '1/R']].max(axis=1),
    'U_star_card': lambda d: (d['U'] - 1) / (1 + np.exp(-d['t'])) + 1,
})

In [24]:
# Move (Recnum_x, Cardnum) out of the index and back into ordinary columns.
benford_card = benford_card.reset_index(drop=False)

In [25]:
# Display the per-record card scores together with their intermediate columns.
benford_card

Unnamed: 0,Recnum_x,Cardnum,n_low,n_high,n,R,1/R,t,U,U_star_card
0,2,5142183973,1.0,1.0,2.0,1.096000,0.912409,-4.333333,1.096000,1.001244
1,3,5142131721,1.0,1.0,2.0,1.096000,0.912409,-4.333333,1.096000,1.001244
2,8,5142191182,1.0,1.0,2.0,1.096000,0.912409,-4.333333,1.096000,1.001244
3,9,5142258629,1.0,1.0,2.0,1.096000,0.912409,-4.333333,1.096000,1.001244
4,14,5142124791,1.0,1.0,2.0,1.096000,0.912409,-4.333333,1.096000,1.001244
...,...,...,...,...,...,...,...,...,...,...
84617,96749,5142276053,2.0,2.0,4.0,1.096000,0.912409,-3.666667,1.096000,1.002393
84618,96750,5142225701,89.0,111.0,200.0,0.878775,1.137948,61.666667,1.137948,1.137948
84619,96751,5142226486,60.0,79.0,139.0,0.832405,1.201338,41.333333,1.201338,1.201338
84620,96752,5142244619,26.0,19.0,45.0,1.499789,0.666760,10.000000,1.499789,1.499767


### 1.2 For each Merchnum

In [26]:
# Benford "unusualness" per merchant, for every record.
#
# For each record we need the number of low-digit (1-2) and high-digit (3-9)
# transactions its merchant has made up to and including that record. A
# self-merge on Merchnum followed by a Recnum filter would create k**2 rows
# for a merchant with k records; a running sum over the records in Recnum
# order gives the same counts in a single pass.
# Assumes Recnum identifies a record uniquely -- TODO confirm.
merch_sorted = (
    old.sort_values('Recnum', kind='mergesort')
       .astype({'n_low': 'int64', 'n_high': 'int64'})
)
merch_running = merch_sorted.groupby('Merchnum')[['n_low', 'n_high']].cumsum()
benford_merch = pd.DataFrame({
    'Recnum_x': merch_sorted['Recnum'],
    'Merchnum': merch_sorted['Merchnum'],
    'n_low': merch_running['n_low'],
    'n_high': merch_running['n_high'],
})
# Zero counts are replaced by 1 so the ratio below is always finite.
benford_merch['n_low'] = benford_merch['n_low'].replace(0,1)
benford_merch['n_high'] = benford_merch['n_high'].replace(0,1)
# NOTE(review): n is taken after the zero -> 1 replacement, so it is one
# larger than the real count whenever one bucket was empty -- confirm.
benford_merch['n'] = benford_merch['n_low'] + benford_merch['n_high']
# 1.096 ~= (1 - log10(3)) / log10(3), the high/low ratio Benford's law
# predicts, so R ~= 1 for conforming data.
benford_merch['R'] = benford_merch['n_low']/benford_merch['n_high'] *1.096
benford_merch['1/R'] = 1/benford_merch['R']
benford_merch['t'] = (benford_merch['n'] - n_mid)/c
# U measures the deviation in either direction (too many lows or highs).
benford_merch['U']  = benford_merch[['R','1/R']].max(axis = 1)
# The logistic weight in n pulls U towards 1 for merchants with few records.
benford_merch['U_star_merch'] = (benford_merch['U'] -1)/(1+np.exp(-benford_merch['t'])) +1
benford_merch = benford_merch.reset_index(drop=True)

In [27]:
# Display the per-record merchant scores together with their intermediate columns.
benford_merch

Unnamed: 0,Recnum_x,Merchnum,n_low,n_high,n,R,1/R,t,U,U_star_merch
0,2,61003026333,1.0,1.0,2.0,1.096000,0.912409,-4.333333,1.096000,1.001244
1,3,4503082993600,1.0,1.0,2.0,1.096000,0.912409,-4.333333,1.096000,1.001244
2,8,6098208200062,1.0,1.0,2.0,1.096000,0.912409,-4.333333,1.096000,1.001244
3,9,602608969534,1.0,1.0,2.0,1.096000,0.912409,-4.333333,1.096000,1.001244
4,14,5725000466504,1.0,1.0,2.0,1.096000,0.912409,-4.333333,1.096000,1.001244
...,...,...,...,...,...,...,...,...,...,...
84617,96749,3500000006160,1.0,1.0,2.0,1.096000,0.912409,-4.333333,1.096000,1.001244
84618,96750,8090710030950,17.0,24.0,41.0,0.776333,1.288106,8.666667,1.288106,1.288057
84619,96751,4503057341100,110.0,109.0,219.0,1.106055,0.904114,68.000000,1.106055,1.106055
84620,96752,8834000695412,232.0,247.0,479.0,1.029441,0.971401,154.666667,1.029441,1.029441


In [28]:
# Put the card score and the merchant score of each record side by side
# (inner join on the record number).
card_scores = benford_card[['Recnum_x', 'U_star_card']]
merch_scores = benford_merch[['Recnum_x', 'U_star_merch']]
benford_value = card_scores.merge(merch_scores, on='Recnum_x')

In [29]:
# Rename the key from 'Recnum_x' back to 'Recnum'; score names are unchanged.
final_names = ['Recnum', 'U_star_card', 'U_star_merch']
benford_value.columns = final_names

In [30]:
# Index the scores by record number for the index-aligned merge below.
benford_value = benford_value.set_index(keys='Recnum')

In [31]:
# Column-less frame indexed by every Recnum in the full data set, including
# the records that were filtered out of `old`.
benford = df.loc[:, ['Recnum']].set_index('Recnum')

In [32]:
# Left join on the Recnum index: every record is kept, and records without
# a computed score get NaN.
benford = benford.join(benford_value, how='left')

In [33]:
# Records with no computed score (e.g. the filtered-out 'fedex' rows) get the
# neutral value 1, which is also the lower bound of U*.
benford = benford.fillna({'U_star_card': 1, 'U_star_merch': 1})

In [34]:
# Display the final Recnum-indexed table holding the two Benford scores.
benford

Unnamed: 0_level_0,U_star_card,U_star_merch
Recnum,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,1.000000
2,1.001244,1.001244
3,1.001244,1.001244
4,1.000000,1.000000
5,1.000000,1.000000
...,...,...
96749,1.002393,1.001244
96750,1.137948,1.288057
96751,1.201338,1.106055
96752,1.499767,1.029441


In [35]:
# Persist the two Benford variables, with Recnum written as the index column.
benford.to_csv(path_or_buf='Benford.csv')