In [5]:
import os
import numpy as np
import pandas as pd
import sys
import math
import matplotlib.pyplot as plt

In [6]:
df = pd.read_csv(os.path.join('LIBOR_GOFO_GLR.CSV'))

In [7]:
df.head()

Unnamed: 0,Date,GOFO_1M,GOFO_2M,GOFO_3M,GOFO_6M,GOFO_12M,LIBOR_1M,LIBOR_2M,LIBOR_3M,LIBOR_6M,LIBOR_12M,GLR_1M,GLR_2M,GLR_3M,GLR_6M,GLR_12M
0,2012-12-31,0.294,0.324,0.348,0.418,0.466,0.2087,0.2535,0.306,0.50825,0.8435,-0.0853,-0.0705,-0.042,0.09025,0.3775
1,2012-12-28,0.284,0.32,0.346,0.424,0.474,0.2097,0.2535,0.308,0.50825,0.8435,-0.0743,-0.0665,-0.038,0.08425,0.3695
2,2012-12-27,0.306,0.332,0.358,0.428,0.474,0.2117,0.255,0.311,0.51025,0.8435,-0.0943,-0.077,-0.047,0.08225,0.3695
3,2012-12-24,0.322,0.346,0.378,0.436,0.484,0.2097,0.254,0.31,0.51025,0.843,-0.1123,-0.092,-0.068,0.07425,0.359
4,2012-12-21,0.32333,0.36,0.38833,0.44667,0.48167,0.2097,0.254,0.31,0.51025,0.843,-0.11363,-0.106,-0.07833,0.06358,0.36133


In [8]:
#Data exploratory       
df.describe()
df.info()
df.describe().transpose
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2779 entries, 0 to 2778
Data columns (total 16 columns):
Date         2779 non-null object
GOFO_1M      2779 non-null float64
GOFO_2M      2779 non-null float64
GOFO_3M      2779 non-null float64
GOFO_6M      2779 non-null float64
GOFO_12M     2779 non-null float64
LIBOR_1M     2779 non-null float64
LIBOR_2M     2779 non-null float64
LIBOR_3M     2779 non-null float64
LIBOR_6M     2779 non-null float64
LIBOR_12M    2779 non-null float64
GLR_1M       2779 non-null float64
GLR_2M       2779 non-null float64
GLR_3M       2779 non-null float64
GLR_6M       2779 non-null float64
GLR_12M      2020 non-null float64
dtypes: float64(15), object(1)
memory usage: 347.5+ KB


Date           0
GOFO_1M        0
GOFO_2M        0
GOFO_3M        0
GOFO_6M        0
GOFO_12M       0
LIBOR_1M       0
LIBOR_2M       0
LIBOR_3M       0
LIBOR_6M       0
LIBOR_12M      0
GLR_1M         0
GLR_2M         0
GLR_3M         0
GLR_6M         0
GLR_12M      759
dtype: int64

In [9]:
#we create a function which output is the final counts, and the frequency of each count as a
#percentage, are returned as lists to use in subsequent functions.
def count_first_digit(data_str):#TAKE AS AN ARGUMENT A STR-COLUMN NAME
    mask=df[data_str]>1.
    data=list(df[mask][data_str])
    for i in range(len(data)):
        while data[i]>10:
            data[i]=data[i]/10
    first_digits=[int(x) for x in sorted(data)]
    unique=(set(first_digits))#a list with unique values of first_digit list
    data_count=[]
    for i in unique:
        count=first_digits.count(i)
        data_count.append(count)
    total_count=sum(data_count)
    data_percentage=[(i/total_count)*100 for i in data_count]
    return  total_count,data_count, data_percentage
    
    # Benford's Law percentages for leading digits 1-9
BENFORD = [30.1, 17.6, 12.5, 9.7, 7.9, 6.7, 5.8, 5.1, 4.6]

In [10]:
total_count,data_count, data_percentage=count_first_digit('GOFO_1M')

In [11]:

def get_expected_counts(total_count):

    """Return list of expected Benford's Law counts for total sample count."""

    return [round(p * total_count / 100) for p in BENFORD]
expected_counts=get_expected_counts(total_count)

def chi_square_test(data_count,expected_counts):

    """Return boolean on chi-square test (8 degrees of freedom & P-val=0.05)."""

    chi_square_stat = 0  # chi square test statistic

    for data, expected in zip(data_count,expected_counts):

        chi_square = math.pow(data - expected, 2)

        chi_square_stat += chi_square / expected

    print("\nChi-squared Test Statistic = {:.3f}".format(chi_square_stat))

    print("Critical value at a P-value of 0.05 is 15.51.")    

    return chi_square_stat < 15.51
chi_square_test(data_count,expected_counts)


Chi-squared Test Statistic = 445.463
Critical value at a P-value of 0.05 is 15.51.


False

In [12]:
   #1st_bar_chart
def bar_chart(data_pct):

    """Make bar chart of observed vs expected 1st digit frequency in percent."""

    fig, ax = plt.subplots()
    index = [i + 1 for i in range(len(data_pct))]  # 1st digits for x-axis
# text for labels, title and ticks

    fig.canvas.set_window_title('Percentage First Digits')

    ax.set_title('Data vs. Benford Values', fontsize=15)

    ax.set_ylabel('Frequency (%)', fontsize=16)

    ax.set_xticks(index)

    ax.set_xticklabels(index, fontsize=14)
    # build bars    

    rects = ax.bar(index, data_pct, width=0.95, color='black', label='Data')



# attach a text label above each bar displaying its height

    for rect in rects:

        height = rect.get_height()

        ax.text(rect.get_x() + rect.get_width()/2, height,

                '{:0.1f}'.format(height), ha='center', va='bottom', 

                fontsize=13)
        # plot Benford values as red dots

    ax.scatter(index, BENFORD, s=150, c='red', zorder=2, label='Benford')



    # Hide the right and top spines & add legend

    ax.spines['right'].set_visible(False)

    ax.spines['top'].set_visible(False)

    ax.legend(prop={'size':15}, frameon=False)

    

    plt.show()
    
    #2nd_bar_chart
    labels=list(data_percentage)
    width = 0.35 
    x = np.arange(len(data_percentage)) # the label locations
    width = 0.35  # the width of the bars
    fig, ax = plt.subplots()
    rects1 = ax.bar(x - width, data_percentage, width=0.95, color='black', label='Data')
    rects2 = ax.bar(x + width, BENFORD,width,label='Benford')
    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel('Frequency (%)', fontsize=16)
    ax.set_title('Benford')
    ax.set_xticks(x)
    ax.legend()
    plt.show()

In [13]:
#specify the main() function and runs the program & prints some statistics.
def main(data_list):
    total_count,data_count, data_percentage= count_first_digit(data_list)

    expected_counts = get_expected_counts(total_count)

    print("\nobserved counts = {}".format(data_count))

    print("expected counts = {}".format(expected_counts), "\n")


    print("First Digit Probabilities:")

    for i in range(1, len(data_percentage)+1):

        print("{}: observed: {:.3f}  expected: {:.3f}".

              format(i, data_percentage[i - 1] / 100, BENFORD[i - 1] / 100))



    if chi_square_test(data_count, expected_counts):

        print("Observed distribution matches expected distribution.")

    else:

        print("Observed distribution does not match expected.", file=sys.stderr)       



    ##bar_chart(data_percentage)
    

In [18]:
main('1M_NEW')


observed counts = [770, 561, 541, 354, 481, 36, 9, 9, 11, 2]
expected counts = [835, 488, 347, 269, 219, 186, 161, 141, 128] 

First Digit Probabilities:
1: observed: 0.278  expected: 0.301
2: observed: 0.202  expected: 0.176
3: observed: 0.195  expected: 0.125
4: observed: 0.128  expected: 0.097
5: observed: 0.173  expected: 0.079
6: observed: 0.013  expected: 0.067
7: observed: 0.003  expected: 0.058
8: observed: 0.003  expected: 0.051
9: observed: 0.004  expected: 0.046


IndexError: list index out of range

In [15]:
df['1M_NEW']= df['GOFO_1M']*10000

In [16]:
df.columns

Index(['Date', 'GOFO_1M', 'GOFO_2M', 'GOFO_3M', 'GOFO_6M', 'GOFO_12M',
       'LIBOR_1M', 'LIBOR_2M', 'LIBOR_3M', 'LIBOR_6M', 'LIBOR_12M', 'GLR_1M',
       'GLR_2M', 'GLR_3M', 'GLR_6M', 'GLR_12M', '1M_NEW'],
      dtype='object')

In [17]:
for col in df.columns:
    print(col)
    if col =='Date': continue
    df['new']= df[col]*10
    main(df['new'])

Date
GOFO_1M


KeyError: "None of [Float64Index([              2.94,               2.84,               3.06,\n                            3.22,             3.2333, 3.4600000000000004,\n              3.4799999999999995, 3.4200000000000004, 3.4499999999999997,\n              3.4000000000000004,\n              ...\n                            12.5,               12.4,               12.9,\n                            12.8, 13.200000000000001, 13.200000000000001,\n              13.600000000000001,               13.4, 13.200000000000001,\n                            12.5],\n             dtype='float64', length=2779)] are in the [columns]"

In [19]:
main(df['1M 2012 2/2']*100)

KeyError: '1M 2012 2/2'