In [43]:
import pandas as pd
import numpy as np
import json
from results.data_processing import DataProcessor

In [44]:
data = pd.read_csv('data/data_FraudDetection_JAR2020.csv')
data_obj = DataProcessor(data,(1990,1999), (2000,2001), (2003,2014), 5)

In [3]:
with open('results/features.json') as json_file:
    features_comp = json.load(json_file)

In [13]:
from imblearn.under_sampling import RandomUnderSampler
def under_sample(X_train, y_train):
    rus = RandomUnderSampler(random_state=42)
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)
    return X_train_resampled, y_train_resampled

#### Stats

In [4]:
data.shape

(146045, 46)

In [45]:
data['misstate'].value_counts()

0    145081
1       964
Name: misstate, dtype: int64

**Records : 146045**

**Features: 28+14+misstate(prdeictive Variable)+year+gvkey+cik+p_aaer**

In [5]:
data.columns

Index(['fyear', 'gvkey', 'p_aaer', 'misstate', 'act', 'ap', 'at', 'ceq', 'che',
       'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao',
       'ivst', 'lct', 'lt', 'ni', 'ppegt', 'pstk', 're', 'rect', 'sale',
       'sstk', 'txp', 'txt', 'xint', 'prcc_f', 'dch_wc', 'ch_rsst', 'dch_rec',
       'dch_inv', 'soft_assets', 'ch_cs', 'ch_cm', 'ch_roa', 'issue', 'bm',
       'dpi', 'reoa', 'EBIT', 'ch_fcf'],
      dtype='object')

In [7]:
train_period,validation_period = (1990,1999),(2000,2001)
test_periods = [(2003,2005), (2003,2008), (2003,2011), (2003,2014)]

train_data, validation_data, test_data = data_obj.split_data_periods(train_period,(2003,2005))

print("Train Period:",train_period)
print(train_data.shape)
print(train_data['misstate'].value_counts())
print('-----------------------------')
print("Val_period:",validation_period)
print(validation_data.shape)
print(validation_data['misstate'].value_counts())

Train Period: (1990, 1999)
(58634, 46)
0    58287
1      347
Name: misstate, dtype: int64
-----------------------------
Val_period: (2000, 2001)
(13114, 46)
0    12947
1      167
Name: misstate, dtype: int64


In [14]:
print("After Under Sampling")
X_train_resampled, y_train_resampled = under_sample(train_data.drop(columns=['misstate']), train_data['misstate'])
print(X_train_resampled.shape)
print(y_train_resampled.shape)
print(y_train_resampled.value_counts())

After Under Sampling
(694, 45)
(694,)
0    347
1    347
Name: misstate, dtype: int64


In [9]:
for tp in test_periods:
    train_data, validation_data, test_data = data_obj.split_data_periods(train_period,tp)
    print(tp)
    print('--------------')
    print(test_data.shape)
    print(test_data['misstate'].value_counts())
    print('-----------')
    

(2003, 2005)
--------------
(17778, 46)
0    17606
1      172
Name: misstate, dtype: int64
-----------
(2003, 2008)
--------------
(35166, 46)
0    34905
1      261
Name: misstate, dtype: int64
-----------
(2003, 2011)
--------------
(51326, 46)
0    50987
1      339
Name: misstate, dtype: int64
-----------
(2003, 2014)
--------------
(68230, 46)
0    67857
1      373
Name: misstate, dtype: int64
-----------


### Null Values

In [26]:
print("Null values from 28 raw financial items:,\n")
print(data[features_comp['28 Raw Financial Items']].isna().sum())
print("\n\n\nNull values from 14 financial ratios:")
data[features_comp['14 Financial Ratios']].isna().sum()


Null values from 28 raw financial items:,

act       0
ap        0
at        0
ceq       0
che       0
cogs      0
csho      0
dlc       0
dltis     0
dltt      0
dp        0
ib        0
invt      0
ivao      0
ivst      0
lct       0
lt        0
ni        0
ppegt     0
pstk      0
re        0
rect      0
sale      0
sstk      0
txp       0
txt       0
xint      0
prcc_f    0
dtype: int64



Null values from 14 financial ratios:


dch_wc          4759
ch_rsst         4851
dch_rec         4743
dch_inv         4615
soft_assets      592
ch_cs          15918
ch_cm          17107
ch_roa         12678
bm                18
dpi             9228
reoa             591
EBIT             591
ch_fcf          5407
issue              0
dtype: int64

### Batch 

In [24]:
train_period,validation_period = (1990,1999),(2000,2001)
test_periods = [(2003,2005), (2003,2008), (2003,2011), (2003,2014)]

In [29]:
train_data, validation_data, test_data = data_obj.split_data_periods(train_period,(2003,2005))

print("Train Period:",train_period)
print("Null values from 28 raw financial items:,\n")
print(train_data[features_comp['28 Raw Financial Items']].isna().sum())
print("\n\n\nNull values from 14 financial ratios:")
print(train_data[features_comp['14 Financial Ratios']].isna().sum())

print("Val_period:",validation_period)
print("Null values from 28 raw financial items:,\n")
print(validation_data[features_comp['28 Raw Financial Items']].isna().sum())
print("\n\n\nNull values from 14 financial ratios:")
print(validation_data[features_comp['14 Financial Ratios']].isna().sum())


Train Period: (1990, 1999)
Null values from 28 raw financial items:,

act       0
ap        0
at        0
ceq       0
che       0
cogs      0
csho      0
dlc       0
dltis     0
dltt      0
dp        0
ib        0
invt      0
ivao      0
ivst      0
lct       0
lt        0
ni        0
ppegt     0
pstk      0
re        0
rect      0
sale      0
sstk      0
txp       0
txt       0
xint      0
prcc_f    0
dtype: int64



Null values from 14 financial ratios:
dch_wc         1622
ch_rsst        1649
dch_rec        1545
dch_inv        1551
soft_assets      59
ch_cs          6537
ch_cm          6839
ch_roa         5909
bm                9
dpi            2405
reoa             59
EBIT             59
ch_fcf         1856
issue             0
dtype: int64
Val_period: (2000, 2001)
Null values from 28 raw financial items:,

act       0
ap        0
at        0
ceq       0
che       0
cogs      0
csho      0
dlc       0
dltis     0
dltt      0
dp        0
ib        0
invt      0
ivao      0
ivst      0

In [28]:
for tp in test_periods:
    print(tp)
    train_data, validation_data, test_data = data_obj.split_data_periods(train_period,tp)
    print("Null values from 28 raw financial items:,\n")
    print(test_data[features_comp['28 Raw Financial Items']].isna().sum())
    print("\n\n\nNull values from 14 financial ratios:")
    print(test_data[features_comp['14 Financial Ratios']].isna().sum())

(2003, 2005)
Null values from 28 raw financial items:,

act       0
ap        0
at        0
ceq       0
che       0
cogs      0
csho      0
dlc       0
dltis     0
dltt      0
dp        0
ib        0
invt      0
ivao      0
ivst      0
lct       0
lt        0
ni        0
ppegt     0
pstk      0
re        0
rect      0
sale      0
sstk      0
txp       0
txt       0
xint      0
prcc_f    0
dtype: int64



Null values from 14 financial ratios:
dch_wc          445
ch_rsst         455
dch_rec         448
dch_inv         425
soft_assets     130
ch_cs          1429
ch_cm          1642
ch_roa          911
bm                1
dpi            1192
reoa            130
EBIT            130
ch_fcf          549
issue             0
dtype: int64
(2003, 2008)
Null values from 28 raw financial items:,

act       0
ap        0
at        0
ceq       0
che       0
cogs      0
csho      0
dlc       0
dltis     0
dltt      0
dp        0
ib        0
invt      0
ivao      0
ivst      0
lct       0
lt        0
n

**we filled all null values with zeroes**

----------


### Window Processing

-------------

In [47]:
train_batches,test_batches = data_obj.create_batches()

for train_period,test_period in zip(train_batches[:-1],test_batches[:-1]):
    print(train_period,test_period)
    train_data, validation_data, test_data = data_obj.split_data_periods(train_period,test_period)
    print('--------------')
    print(train_data.shape)
    print(train_data['misstate'].value_counts())
    print('-----------')
    print('--------------')
    print(test_data.shape)
    print(test_data['misstate'].value_counts())
    print('-----------')
    
    print('----------'*50)

(1990, 1995) (1996, 2001)
--------------
(31556, 46)
0    31413
1      143
Name: misstate, dtype: int64
-----------
--------------
(40192, 46)
0    39821
1      371
Name: misstate, dtype: int64
-----------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1996, 2001) (2002, 2006)
--------------
(40192, 46)
0    39821
1      371
Name: misstate, dtype: int64
-----------
--------------
(29753, 46)
0    29471
1      282
Name: misstate, dtype: int64
-----------
---------------------------------------------------------------------------------------

#### Records

In [36]:
import ast

In [34]:
fc_data = pd.read_csv("data/data_fraud_constraint.csv")

In [38]:
ast.literal_eval(fc_data['target_word_frequency'][0])

{'recklessness': 0,
 'guarantor': 0,
 'hazard': 0,
 'constrains': 0,
 'despicable': 0,
 'refusal': 0,
 'restitution': 0,
 'dependance': 0,
 'constrain': 0,
 'prevented': 0,
 'pledges': 0,
 'fiasco': 0,
 'conspiracy': 0,
 'illegal': 0,
 'prohibits': 0,
 'restrictive': 0,
 'entailed': 0,
 'circumstance': 0,
 'immorality': 0,
 'prosecution': 0,
 'ingratitude': 0,
 'publicity': 0,
 'imposture': 0,
 'noncancelable': 3,
 'compensation': 2,
 'disruption': 0,
 'dispute': 0,
 'solicitor': 0,
 'offence': 0,
 'distress': 0,
 'depending': 1,
 'alienation': 0,
 'inconvenience': 0,
 'complaint': 0,
 'constraining': 0,
 'policy': 6,
 'tres': 0,
 'precludes': 0,
 'helplessness': 0,
 'vexatious': 0,
 'prohibit': 0,
 'prudence': 0,
 'restrict': 0,
 'compulsory': 0,
 'unscrupulous': 0,
 'unlawful': 0,
 'depend': 2,
 'coercion': 0,
 'injury': 0,
 'owner': 0,
 'blindness': 0,
 'involvement': 0,
 'debtor': 0,
 'mistreatment': 0,
 'indifference': 0,
 'accidental': 0,
 'confines': 0,
 'satisfaction': 0,
 'res

In [44]:
def filter_(string):
    my_dict = ast.literal_eval(string)
    filtered_dict = {key: value for key, value in my_dict.items() if value > 0}

    total_count = sum(filtered_dict.values())
    unique_count = len(filtered_dict)
    return total_count,unique_count


In [49]:
result = fc_data['target_word_frequency'].apply(filter_)
total = result.apply(lambda x: x[0])
unique = result.apply(lambda x: x[1]) 

In [50]:
fc_data['total'] = total
fc_data['unique'] = unique

In [52]:
fc_data.to_csv('data/text_feat_count.csv',index=False)

---------------

### Fraud Count

In [2]:
import pandas as pd

In [25]:
bao_data2 = pd.read_csv("data/bao_all_data2.csv")

In [40]:
bao_data1.columns

Index(['fyear', 'gvkey', 'p_aaer', 'misstate', 'act', 'ap', 'at', 'ceq', 'che',
       'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao',
       'ivst', 'lct', 'lt', 'ni', 'ppegt', 'pstk', 're', 'rect', 'sale',
       'sstk', 'txp', 'txt', 'xint', 'prcc_f', 'dch_wc', 'ch_rsst', 'dch_rec',
       'dch_inv', 'soft_assets', 'ch_cs', 'ch_cm', 'ch_roa', 'issue', 'bm',
       'dpi', 'reoa', 'EBIT', 'ch_fcf'],
      dtype='object')

In [13]:
bao_data1 = pd.read_csv("data/data_FraudDetection_JAR2020.csv")

In [39]:
bao_data1.isna().sum()

fyear               0
gvkey               0
p_aaer         145081
misstate            0
act                 0
ap                  0
at                  0
ceq                 0
che                 0
cogs                0
csho                0
dlc                 0
dltis               0
dltt                0
dp                  0
ib                  0
invt                0
ivao                0
ivst                0
lct                 0
lt                  0
ni                  0
ppegt               0
pstk                0
re                  0
rect                0
sale                0
sstk                0
txp                 0
txt                 0
xint                0
prcc_f              0
dch_wc           4759
ch_rsst          4851
dch_rec          4743
dch_inv          4615
soft_assets       592
ch_cs           15918
ch_cm           17107
ch_roa          12678
issue               0
bm                 18
dpi              9228
reoa              591
EBIT              591
ch_fcf    

In [42]:
for year in range(1991,2023):
    print(year, bao_data1[bao_data1['fyear']==year]['gvkey'].count(), bao_data1[(bao_data1['fyear']==year) & (bao_data1['misstate']==1)]['gvkey'].count())

#     print(bao_data1[bao_data1['fyear']==year]['gvkey'].unique())
#     print(bao_data1[bao_data1['fyear']==year]['gvkey'].count())


1991 4713 27
1992 4970 26
1993 5377 30
1994 5684 23
1995 6230 22
1996 6745 33
1997 6789 42
1998 6716 56
1999 6828 73
2000 6752 86
2001 6362 81
2002 6067 77
2003 5981 69
2004 5934 58
2005 5863 45
2006 5908 33
2007 5868 30
2008 5612 26
2009 5367 31
2010 5389 26
2011 5404 21
2012 5630 19
2013 5647 11
2014 5627 4
2015 0 0
2016 0 0
2017 0 0
2018 0 0
2019 0 0
2020 0 0
2021 0 0
2022 0 0


In [33]:
main_1990 = set(bao_data1[bao_data1['fyear']==1990]['gvkey'])

In [34]:
recenet_1990 = set(bao_data2[bao_data2['fyear']==1990]['gvkey'])

In [35]:
recenet_1990.difference(main_1990)

{61873,
 8207,
 16402,
 16403,
 8215,
 16407,
 24607,
 24608,
 16417,
 8226,
 16419,
 24610,
 24612,
 24614,
 24617,
 24620,
 8237,
 24621,
 8239,
 8240,
 24623,
 24625,
 8245,
 24629,
 16439,
 24634,
 24636,
 8253,
 16445,
 24639,
 24640,
 24643,
 24645,
 24646,
 8265,
 24651,
 24652,
 16462,
 8271,
 24654,
 24655,
 8274,
 24656,
 24657,
 24659,
 24660,
 24661,
 24662,
 16473,
 24664,
 24667,
 16476,
 16477,
 24668,
 8287,
 16479,
 24669,
 8290,
 8291,
 16484,
 24671,
 24675,
 24676,
 24677,
 24678,
 24679,
 8299,
 16492,
 24680,
 24685,
 8303,
 16496,
 24686,
 24689,
 24690,
 24692,
 8310,
 16503,
 16504,
 24695,
 24697,
 24699,
 24701,
 24703,
 24704,
 24706,
 24707,
 16516,
 24708,
 8326,
 24710,
 24711,
 24712,
 8330,
 24713,
 24715,
 24716,
 24720,
 24721,
 24723,
 24725,
 8344,
 24728,
 24731,
 16551,
 24743,
 16553,
 8363,
 24747,
 16557,
 8366,
 16559,
 24749,
 24750,
 24751,
 24754,
 24755,
 24758,
 24760,
 24761,
 24762,
 16571,
 16572,
 16573,
 24764,
 16575,
 16576,
 24766