# Part 1 Benford’s Law: Simulation 

The purpose of the code of this part is to test whether the number simulated from different distributions follow the Benford's Law

In [120]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
from scipy.stats import chisquare

In [55]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
from scipy.stats import chisquare

In [56]:
from IPython.display import HTML

hide_me = ''
HTML('''<script>
code_show=true; 
function code_toggle() {
  if (code_show) {
    $('div.input').each(function(id) {
      el = $(this).find('.cm-variable:first');
      if (id == 0 || el.text() == 'hide_me') {
        $(this).hide();
      }
    });
    $('div.output_prompt').css('opacity', 0);
  } else {
    $('div.input').each(function(id) {
      $(this).show();
    });
    $('div.output_prompt').css('opacity', 1);
  }
  code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input style="opacity:0" type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [57]:
np.random.seed(5171991)

In [58]:
#Generate 1000 numbers from uniform distribution with min 0 max 1
x = abs(np.random.uniform(0,1,1000))

In [59]:
#Generate 1000 numbers from normal distribution with mean equals to 1 and std equals to 1
y= abs(np.random.normal(1, 1, 1000))

In [60]:
#Generate 1000 numbers from normal distribution with mean equals to 0 and std equals to 1
z=abs(np.random.normal(0, 1, 1000))

In [61]:
index=range(0,1000)
df=pd.DataFrame(data=x,index=index,columns=['X'])
df['Y']=y
df['Z']=z
df['XYZ']=df['X']*df['Y']*df['Z']
df['XYZ_round']=df['XYZ'].round(1)

In [62]:
# Get the first digit of all the numbers generated
df['X']=df['X']*100000
df['Y']=df['Y']*100000
df['Z']=df['Z']*100000
df['XYZ']=df['XYZ']*100000
df['XYZ_round']=df['XYZ_round']*10000

df['X']=df['X'].astype(str)
df['Y']=df['Y'].astype(str)
df['Z']=df['Z'].astype(str)
df['XYZ']=df['XYZ'].astype(str)
df['XYZ_round']=df['XYZ_round'].astype(str)

df['X_first']=df['X'].str[0]
df['Y_first']=df['Y'].str[0]
df['Z_first']=df['Z'].str[0]
df['XYZ_first']=df['XYZ'].str[0]
df['XYZ_first_round']=df['XYZ_round'].str[0]

In [63]:
# Get the count of first digit of all the numbers generated
X_counts=df['X_first'].value_counts().sort_index()
Y_counts=df['Y_first'].value_counts().sort_index()
Z_counts=df['Z_first'].value_counts().sort_index()
XYZ_counts=df['XYZ_first'].value_counts().sort_index()
XYZ_round_counts=df['XYZ_first_round'].value_counts().sort_index()
XYZ_round_counts=XYZ_round_counts[1:]

In [64]:
#Get the proportion of each digit
X_per=X_counts/float(sum(X_counts))
Y_per=Y_counts/float(sum(Y_counts))
Z_per=Z_counts/float(sum(Z_counts))
XYZ_per=XYZ_counts/float(sum(XYZ_counts))
XYZ_round_per=XYZ_round_counts/float(sum(XYZ_round_counts))

In [65]:
# Benchmark for Benford's Law, '0.301' means the proportion of 1 should be 0.301 if the distribution follows 
# the Benford's Law
Benchmark=[0.301,0.176,0.125,0.097,0.079,0.067,0.058,0.051,0.046]

In [66]:
# Get the KS value for each distribution, the KS value is to test whether the distribution follows the Benford's Law
ks_x_list=list()
ks_y_list=list()
ks_z_list=list()
ks_xyz_list=list()
ks_xyz_round_list=list()

for i in range(9):
    a=X_per[0:i+1].sum()-sum(Benchmark[0:i+1])
    ks_x_list.append(abs(a))
    b=Y_per[0:i+1].sum()-sum(Benchmark[0:i+1])
    ks_y_list.append(abs(b))
    c=Z_per[0:i+1].sum()-sum(Benchmark[0:i+1])
    ks_z_list.append(abs(c))
    d=XYZ_per[0:i+1].sum()-sum(Benchmark[0:i+1])
    ks_xyz_list.append(abs(d))
    e=XYZ_round_per[0:i+1].sum()-sum(Benchmark[0:i+1])
    ks_xyz_round_list.append(abs(e))
    
ks_x=max(ks_x_list)
ks_y=max(ks_y_list)
ks_z=max(ks_z_list)
ks_xyz=max(ks_xyz_list)
ks_xyz_round=max(ks_xyz_round_list)

In [67]:
# The benchmark for one distribution follows the Benford's Law
KS_benchmark=1.36/np.sqrt(1000)
KS_benchmark

0.043006976178289961

In [68]:
ks_x

0.27999999999999992

We can see that the numbers generated from uniform distribution don't follow the Benford's Law, 
the reason is that the number generated from real world is not uniform distributed.

In [69]:
ks_y

0.1150000000000001

We can see that the numbers generated from normal distribution with mean equals to 1 and std equals to 1 don't follow the Benford's Law, the reason I believe is that the number generated from real world is like normal distribution however the mean is not 1 but 0

In [70]:
ks_z

0.058999999999999997

The numbers distribution generated from normal distribution with mean equals to 0 and std equals to 1 does not pass the
benchmark test, however, it is already very near the benchmark.

In [71]:
ks_xyz

0.017000000000000015

For the x*y*z, it follows the Benford's Law. The reason is it is randomly generated.

# Part 2 Benford’s Law: Real use

The second part is test the Benford's Law in a real case. Data is from financial statement of a company, the data dictionary is shown below.

In [72]:
hide_me
from tabulate import tabulate
print tabulate([['atq', 'Total asset'], ['revtq', 'Total revenue'],['niq', 'Net income'],['oancfy', 'net operating cash flows']], headers=['Field', 'Description'])

Field    Description
-------  ------------------------
atq      Total asset
revtq    Total revenue
niq      Net income
oancfy   net operating cash flows


In [73]:
df_data=pd.read_csv('/Users/zeyuanli/Summer/Accounting/HW_03_data.csv')

In [74]:
df_data=df_data[['atq','revtq','niq','oancfy']]

In [75]:
df_data.head(10)

Unnamed: 0,atq,revtq,niq,oancfy
0,411.362,109.593,3.226,7.026
1,421.45,121.261,3.691,4.452
2,429.271,136.065,4.089,13.745
3,437.846,138.071,5.006,24.76
4,449.645,136.037,4.848,10.242
5,468.55,135.675,5.144,11.113
6,523.852,154.135,5.94,3.982
7,529.584,163.481,7.093,9.531
8,542.819,170.906,7.31,1.489
9,587.136,180.156,8.411,-17.902


In [76]:
def chisquare_test(column_name):
    expected=list()
    total=abs(df_data[column_name]*1000)
    total=total.astype(str)
    total=total[total!='nan']
    total=total.str[0]
    total=total.astype(int)
    total=total[total!=0]
    total_first_digit=total.value_counts()
    for i in range(9):
        expected.append(total_first_digit.sum()*Benchmark[i])
    result=chisquare(f_obs=total_first_digit,f_exp=expected,axis=0)
    return result.statistic

print 'statistics of total assets is {:}'.format(chisquare_test('atq'))
print 'statistics of total revenue is {:}'.format(chisquare_test('revtq'))
print 'statistics of net income is {:}'.format(chisquare_test('niq'))
print 'statistics of net operating cash flows is {:}'.format(chisquare_test('oancfy'))

statistics of total assets is 91.7237141628
statistics of total revenue is 79.9212924459
statistics of net income is 22.5422915693
statistics of net operating cash flows is 13.1624081965


Conclusion: The CEO of the companies more like to manipulate total assets and total revenue than net income and 
net operating cash flows. The reason probably is that when evaluate the performance of the CEOs, 
total assets and total revenue are more important than net income and net operating cash flows.

In [77]:
from IPython.display import HTML

hide_me = ''
HTML('''<script>
code_show=true; 
function code_toggle() {
  if (code_show) {
    $('div.input').each(function(id) {
      el = $(this).find('.cm-variable:first');
      if (id == 0 || el.text() == 'hide_me') {
        $(this).hide();
      }
    });
    $('div.output_prompt').css('opacity', 0);
  } else {
    $('div.input').each(function(id) {
      $(this).show();
    });
    $('div.output_prompt').css('opacity', 1);
  }
  code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input style="opacity:0" type="submit" value="Click here to toggle on/off the raw code."></form>''')